Commit db87d753 authored by Gervaise Henry

Initial commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# nextflow analysis folders/files
/test_data/*
/workflow/.nextflow/*
/workflow/work/*
/workflow/output/*
pipeline_trace*.txt*
.nextflow*.log*
report.html*
timeline*.html*
*~
before_script:
- module load python/3.6.1-2-anaconda
- module load nextflow/0.27.6
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/cellranger_mkfastq/*tar.gz test_data/
stages:
- integration
simple_test:
stage: integration
script:
- nextflow run workflow/main.nf
10x Genomics scRNA-Seq (cellranger) count Pipeline
========================================
Introduction
------------
This pipeline is a wrapper for the cellranger count tool from 10x Genomics. It takes fastq files from 10x Genomics Single Cell Gene Expression libraries and performs alignment, filtering, barcode counting, and UMI counting. It uses the Chromium cellular barcodes to generate gene-barcode matrices, determine clusters, and perform gene expression analysis.
The pipeline uses Nextflow, a bioinformatics workflow tool.
This pipeline is primarily run with the SLURM scheduler on the BioHPC cluster; however, it should be able to run on any system that Nextflow supports.
Additionally, the pipeline is designed to work with the Astrocyte workflow system, which provides a simple web interface.
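For example, the workflow can be launched directly with Nextflow. The sketch below is illustrative only (it is not part of the repository): it assumes `nextflow` is on the PATH, uses hypothetical fastq filenames, and writes a design file with the `Sample`, `fastq_R1`, and `fastq_R2` columns expected by the design check.

```python
# Minimal sketch: write a toy design.csv and launch the workflow.
# Sample names and file paths are hypothetical; adjust to your data.
import csv
import subprocess

rows = [
    {'Sample': 'sampleA',
     'fastq_R1': 'sampleA_S1_L001_R1_001.fastq.gz',
     'fastq_R2': 'sampleA_S1_L001_R2_001.fastq.gz'},
]
with open('design.csv', 'w', newline='') as handle:
    writer = csv.DictWriter(handle, fieldnames=['Sample', 'fastq_R1', 'fastq_R2'])
    writer.writeheader()
    writer.writerows(rows)

# Parameter names correspond to params.fastq, params.designFile, params.genome,
# and params.expectCells defined in workflow/main.nf.
subprocess.run([
    'nextflow', 'run', 'workflow/main.nf',
    '--fastq', 'test_data/*.fastq.gz',
    '--designFile', 'design.csv',
    '--genome', 'GRCh38',
    '--expectCells', '3000',
], check=True)
```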
#
# metadata for the Astrocyte CellRanger count workflow package
#
# -----------------------------------------------------------------------------
# BASIC INFORMATION
# -----------------------------------------------------------------------------
# A unique identifier for the workflow package, text/underscores only
name: 'cellranger_count'
# Who wrote this?
author: 'Gervaise Henry and Venkat Malladi'
# A contact email address for questions
email: 'bicf@utsouthwestern.edu'
# A more informative title for the workflow package
title: 'BICF CellRanger count Workflow'
# A summary of the workflow package in plain text
description: |
  This is a workflow package for the BICF/Strand Lab CellRanger count workflow.
  It implements the 10x Genomics CellRanger count analysis workflow.
# -----------------------------------------------------------------------------
# DOCUMENTATION
# -----------------------------------------------------------------------------
# A list of documentation files in .md format that should be viewable from the
# web interface. These files are in the 'docs' subdirectory. The first file
# listed will be used as the documentation index; by convention this is index.md.
documentation_files:
- 'index.md'
# -----------------------------------------------------------------------------
# NEXTFLOW WORKFLOW CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The workflow file is always named 'workflow/main.nf'
# The workflow must publish all final output into $baseDir
# A list of cluster environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducibility.
workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'cellranger/2.1.1'
- 'bcl2fastq/2.17.1.14'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
#
# REQUIRED INFORMATION
# id: The name of the parameter in the NEXTFLOW workflow
# type: The type of the parameter, one of:
# string - A free-format string
# integer - An integer
# real - A real number
# file - A single file from user data
# files - One or more files from user data
# select - A selection from a list of values
# required: true/false, must the parameter be entered/chosen?
# description: A user-friendly description of the meaning of the parameter
#
# OPTIONAL INFORMATION
# default: A default value for the parameter (optional)
# min: Minimum value/characters/files for number/string/files types
# max: Maximum value/characters/files for number/string/files types
# regex: A regular expression that describes valid entries / filenames
#
# SELECT TYPE
# choices: A set of choices presented to the user for the parameter.
# Each choice is a pair of value and description, e.g.
#
# choices:
# - [ 'myval1', 'The first option']
# - [ 'myval2', 'The second option']
#
# NOTE - All parameters are passed to NEXTFLOW as strings... but they
# are validated by astrocyte using the information provided above
# (a sketch of this kind of check appears after this package file)
workflow_parameters:
- id: fastq
type: files
required: true
description: |
      Pairs (read1 and read2) of fastq.gz files from a 10x Genomics single-cell gene expression sequencing experiment. Index fastqs are not required.
regex: ".*fastq.gz"
min: 2
- id: designFile
type: file
required: true
regex: ".*csv"
description: |
      A design file listing each sample with its corresponding read1 and read2 filenames. A sample may appear on multiple rows if it has multiple fastq pairs.
- id: genome
type: select
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'hg19', 'Human GRCh37 (hg19)']
- [ 'mm10', 'Mouse GRCm38 (mm10)']
      - [ 'hg19.mm10', 'Human GRCh37 (hg19) + Mouse GRCm38 (mm10)']
- [ 'ercc92', 'ERCC.92 Spike-In']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
- id: expectCells
type: integer
default: 3000
min: 0
max: 10000
required: false
description: |
Expected number of recovered cells.
- id: forceCells
type: integer
default: 0
min: 0
max: 10000
required: false
description: |
      Force the pipeline to use this number of cells, bypassing the cell detection algorithm. Use this if the number of cells estimated by Cell Ranger is not consistent with the barcode rank plot. A value of 0 disables this option.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The vizapp is always 'vizapp/server.R' and 'vizapp/ui.R'
# The workflow must publish all final output into $baseDir
# Name of the R module that the vizapp will run against
vizapp_r_module: 'R/3.2.1-intel'
# List of any CRAN packages, not provided by the modules, that must be made
# available to the vizapp
vizapp_cran_packages:
- shiny
- shinyFiles
# List of any Bioconductor packages, not provided by the modules,
# that must be made available to the vizapp
vizapp_bioc_packages:
- chipseq
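The note in the parameter section above says that Astrocyte hands every parameter to Nextflow as a string and validates it against the definitions in the package file. The following is an illustrative sketch only, not Astrocyte's actual code: it assumes the file above is saved as astrocyte_pkg.yml and that PyYAML is installed, and shows how the workflow_parameters block could be turned into the corresponding `--id value` command-line strings.

```python
# Illustrative sketch: derive "--id value" arguments from workflow_parameters.
# Not Astrocyte's implementation; file name and user values are assumptions.
import yaml

with open('astrocyte_pkg.yml') as handle:
    pkg = yaml.safe_load(handle)

# Hypothetical user-supplied values, keyed by parameter id.
values = {'fastq': '*.fastq.gz', 'designFile': 'design.csv', 'genome': 'GRCh38'}

args = []
for param in pkg['workflow_parameters']:
    value = values.get(param['id'], param.get('default'))
    if value is None:
        if param.get('required'):
            raise ValueError('missing required parameter: %s' % param['id'])
        continue
    # Everything is handed to Nextflow as a string.
    args += ['--' + param['id'], str(value)]

print(args)
```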
# Astrocyte CellRanger 10x Workflow Package
## Workflow SOP
# This example implements a simple file browser for accessing results.
library(shiny)
library(shinyFiles)
# Results are available in the directory specified by the outputDir environment
# variable, read by Sys.getenv().
rootdir <- Sys.getenv('outputDir')
shinyServer(function(input, output, session) {
# The backend for a simple file chooser, restricted to the
# rootdir we obtained above.
# See https://github.com/thomasp85/shinyFiles
shinyFileChoose(input, 'files', roots=c('workflow'=rootdir), filetypes=c('', 'bed', 'xls','wig'), session=session)
})
library(shiny)
library(shinyFiles)
shinyUI(fluidPage(
verticalLayout(
# Application title
titlePanel("Astrocyte Example"),
wellPanel(
helpText("This is a minimal example, demonstrating how
a Shiny visualization application can access the output of a workflow.
Here we provide a file browser using the shinyFiles package. Real
Astrocyte vizapps would provide custom methods to access and visualize
output."),
helpText("The workflow output is in the directory set in the
outputDir environment variable. this can be retrieved in R with the
command Sys.getenv('outputDir')"),
# A simple file browser within the workflow output directory
# See https://github.com/thomasp85/shinyFiles
shinyFilesButton('files', label='Browse workflow output', title='Please select a file', multiple=FALSE)
)
)
))
process {
executor = 'slurm'
queue = 'super'
// Process specific configuration
$checkDesignFile {
module = ['python/3.6.1-2-anaconda']
executor = 'local'
}
$count {
module = ['cellranger/2.1.1']
}
}
params {
// Reference file paths on BioHPC
genomes {
'ercc92' {
ref = '/project/apps_database/cellranger/refdata-cellranger-ercc92-1.2.0'
}
'GRCh38' {
ref = '/project/apps_database/cellranger/refdata-cellranger-GRCh38-1.2.0'
}
'hg19' {
ref = '/project/apps_database/cellranger/refdata-cellranger-hg19-1.2.0'
}
'mm10' {
ref = '/project/apps_database/cellranger/refdata-cellranger-mm10-1.2.0'
}
'hg19.mm10' {
ref = '/project/apps_database/cellranger/refdata-cellranger-hg19_and_mm10-1.2.0'
}
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
#!/usr/bin/env nextflow
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
// Define Input variables
params.fastq = "$baseDir/../test_data/*.fastq.gz"
params.designFile = "$baseDir/../test_data/design.csv"
params.genome = 'GRCh38'
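// params.genomes defaults to an empty map here and is populated by conf/biohpc.config;
// params.ref resolves the reference path for the chosen genome, or false if it is not defined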
params.genomes = [:]
params.ref = params.genome ? params.genomes[ params.genome ]?.ref ?: false : false
params.expectCells = 10000
params.forceCells = 0
// Check inputs
if( params.ref ){
refLocation = Channel
.fromPath(params.ref)
.ifEmpty { exit 1, "reference not found: ${params.ref}" }
} else {
exit 1, "No reference genome specified."
}
// Define List of Files
fastqList = Channel
.fromPath(params.fastq)
.flatten()
.map { file -> [ file.getFileName().toString(), file.toString() ].join("\t") }
.collectFile(name: 'fileList.tsv', newLine: true)
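// The resulting fileList.tsv has one "filename<TAB>full path" line per fastq and is consumed by scripts/check_design.py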
// Define regular variables (parameters may arrive as strings, e.g. from Astrocyte)
expectCells = params.expectCells.toInteger()
forceCells = params.forceCells.toInteger()
process checkDesignFile {
  publishDir "$baseDir/output", mode: 'copy'

  input:
  file fastqList

  output:
  file("design.csv") into designPaths

  script:
  """
  module load python/3.6.1-2-anaconda
  python3 $baseDir/scripts/check_design.py -d $params.designFile -f $fastqList
  """
}
// Parse design file
samples = designPaths
.splitCsv (sep: ',', header: true)
.map { row -> [ row.Sample, file(row.fastq_R1), file(row.fastq_R2) ] }
.groupTuple()
//.subscribe { println it }
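// After groupTuple, each item is [ sample, [R1 fastqs...], [R2 fastqs...] ], so all lanes for a sample are counted in one task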
process count {
tag "$sample"
publishDir "$baseDir/output", mode: 'copy'
  input:
  set sample, file("${sample}_S1_L00?_R1_001.fastq.gz"), file("${sample}_S1_L00?_R2_001.fastq.gz") from samples
  file ref from refLocation.first()
output:
file("**/outs/**") into outPaths
  script:
  if (forceCells == 0) {
    """
    module load cellranger/2.1.1
    cellranger count --id="$sample" --transcriptome="$ref" --fastqs=. --sample="$sample" --expect-cells=$expectCells
    """
  } else {
    """
    module load cellranger/2.1.1
    cellranger count --id="$sample" --transcriptome="$ref" --fastqs=. --sample="$sample" --expect-cells=$expectCells --force-cells=$forceCells
    """
  }
}
profiles {
standard {
includeConfig 'conf/biohpc.config'
}
}
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True )
parser.add_argument('-f', '--fastq',
help="File with list of fastq files (tsv format).",
required=True )
args = parser.parse_args()
return args
def check_design_headers(design):
    '''Check that the design file contains the required column headers.'''
# Default headers
design_template = [
'Sample',
'fastq_R1',
'fastq_R2']
design_headers = list(design.columns.values)
# Check if headers
logger.info("Running header check.")
missing_headers = set(design_template) - set(design_headers)
if len(missing_headers) > 0:
logger.error('Missing column headers: %s', list(missing_headers))
raise Exception("Missing column headers: %s" % list(missing_headers))
return design
def check_files(design, fastq):
    '''Check that the files listed in the design file are present in the fastq list.'''
logger.info("Running file check.")
files = list(design['fastq_R1']) + list(design['fastq_R2'])
files_found = fastq['name']
missing_files = set(files) - set(files_found)
if len(missing_files) > 0:
logger.error('Missing files from design file: %s', list(missing_files))
raise Exception("Missing files from design file: %s" %
list(missing_files))
else:
file_dict = fastq.set_index('name').T.to_dict()
design['fastq_R1'] = design['fastq_R1'].apply(lambda x: file_dict[x]['path'])
design['fastq_R2'] = design['fastq_R2'].apply(lambda x: file_dict[x]['path'])
return design
def main():
args = get_args()
design = args.design
# Create a file handler
handler = logging.FileHandler('design.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(args.design, sep=',')
fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
# Check design file
    new_design_df = check_design_headers(design_df)
    new_design_df = check_files(new_design_df, fastq_df)
    new_design_df.to_csv('design.csv', header=True, sep=',', index=False)
if __name__ == '__main__':
main()
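As a usage sketch, assuming the script above is importable as `check_design` (for example from workflow/scripts/) and using made-up filenames: `check_design_headers` raises if a required column is missing, and `check_files` replaces the fastq filenames in the design with the full paths taken from the file list.

```python
# Illustrative only; sample names and paths are hypothetical.
import pandas as pd
import check_design  # the script above, importable from workflow/scripts/

design = pd.DataFrame({
    'Sample': ['sampleA', 'sampleA'],
    'fastq_R1': ['sampleA_S1_L001_R1_001.fastq.gz', 'sampleA_S1_L002_R1_001.fastq.gz'],
    'fastq_R2': ['sampleA_S1_L001_R2_001.fastq.gz', 'sampleA_S1_L002_R2_001.fastq.gz'],
})
fastq = pd.DataFrame({
    'name': ['sampleA_S1_L001_R1_001.fastq.gz', 'sampleA_S1_L002_R1_001.fastq.gz',
             'sampleA_S1_L001_R2_001.fastq.gz', 'sampleA_S1_L002_R2_001.fastq.gz'],
    'path': ['/data/sampleA_S1_L001_R1_001.fastq.gz', '/data/sampleA_S1_L002_R1_001.fastq.gz',
             '/data/sampleA_S1_L001_R2_001.fastq.gz', '/data/sampleA_S1_L002_R2_001.fastq.gz'],
})

check_design.check_design_headers(design)       # raises on missing column headers
resolved = check_design.check_files(design, fastq)  # maps filenames to full paths
print(resolved[['Sample', 'fastq_R1', 'fastq_R2']])
```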