Initial commit

54b35b23 · Gervaise Henry · 54b35b23 · 54b35b23 · 54b35b23 · 54b35b23
Commit 54b35b23 authored 6 years ago by Gervaise Henry
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# nextflow analysis folders/files
+/workflow/.nextflow/
+/test_data/*
+/workflow/work
+/workflow/output/design/
+/workflow/output/bcl/
+/workflow/output/fastq/
+pipeline_trace*.txt*
+.nextflow*.log*
+report.html
+
+*~
--- a/README.md
+++ b/README.md
+# CellRanger Pipeline
+
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
+// Default executor and queue for every process, followed by per-process
+// overrides.
+process {
+  executor = 'slurm'
+  queue='super'
+
+  // Process specific configuration
+  // (each $name selector matches a process defined in workflow/main.nf)
+  $checkDesignFile {
+    module = ['python/3.6.1-2-anaconda']
+    // Overrides the slurm default above for this process only.
+    executor = 'local'
+  }
+  $untarBCL {
+    cpus = 32
+  }
+  $mkfastq {
+    module = ['cellranger/2.1.1', 'bcl2fastq/2.17.1.14']
+    cpus = 128
+  }
+}
+
+// Lookup table of reference locations, keyed by genome name.
+// NOTE(review): presumably indexed by params.genome from main.nf; nothing in
+// the visible workflow reads it yet -- confirm before relying on it.
+params {
+  // Reference file paths on BioHPC
+  genomes {
+    'ercc92' {
+      ref = '/project/apps_database/cellranger/refdata-cellranger-ercc92-1.2.0'
+    }
+    'GRCh38' {
+      ref = '/project/apps_database/cellranger/refdata-cellranger-GRCh38-1.2.0'
+    }
+    'hg19' {
+	ref = '/project/apps_database/cellranger/refdata-cellranger-hg19-1.2.0'
+    }
+    'mm10' {
+      ref = '/project/apps_database/cellranger/refdata-cellranger-mm10-1.2.0'
+    }
+    // Combined hg19 + mm10 reference.
+    'hg19.mm10' {
+      ref = '/project/apps_database/cellranger/refdata-cellranger-hg19_and_mm10-1.2.0'
+    }
+  }
+}
+
+// Write a per-task execution trace to pipeline_trace.txt with the columns
+// listed in `fields`.
+trace {
+  enabled = true
+  file = 'pipeline_trace.txt'
+  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
+}
+
+// Write the execution timeline to timeline.html.
+timeline {
+  enabled = true
+  file = 'timeline.html'
+}
+
+// Write the execution report to report.html.
+report {
+  enabled = true
+  file = 'report.html'
+}
--- a/workflow/main.nf
+++ b/workflow/main.nf
+#!/usr/bin/env nextflow
+
+// Path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file main.nf
+
+// Define Input variables
+// These are defaults; they point at the test data next to the workflow folder.
+params.bcl = "$baseDir/../test_data/*.tar"
+params.designFile = "$baseDir/../test_data/design.csv"
+// NOTE(review): 'GRCm38' is not one of the keys under params.genomes in
+// conf/biohpc.config (ercc92, GRCh38, hg19, mm10, hg19.mm10) -- confirm the
+// intended default before this value is used for a lookup.
+params.genome = 'GRCm38'
+
+// Define List of Files
+// One channel item per tar file matched by params.bcl.
+tarList = Channel.fromPath( params.bcl )
+
+
+// Define regular variables
+
+
+// Run check_design.py on the design file and publish the design.csv it
+// writes to output/design.
+process checkDesignFile {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  // NOTE(review): the line under `input:` is a bare expression rather than a
+  // `<qualifier> <name> from <channel>` declaration, so the design file does
+  // not appear to be staged as a process input; the script below reads it
+  // directly from the params path instead -- confirm this is intended.
+  input:
+
+  params.designFile
+
+  // design.csv is the file check_design.py writes to its working directory.
+  output:
+
+  file("design.csv") into designPaths
+
+  script:
+
+  """
+  python $baseDir/scripts/check_design.py -d $params.designFile
+  """
+}
+
+
+// Unpack each tar file received from tarList and publish the extracted
+// content to output/bcl.
+process untarBCL {
+  tag "$tar"
+
+  publishDir "$baseDir/output/bcl", mode: 'copy'
+
+  input:
+
+  file tar from tarList
+
+  // NOTE(review): the "*" glob is not restricted to a particular folder name;
+  // mkfastq calls .baseName on whatever is emitted here, which presumably
+  // assumes each tarball unpacks to a single top-level run folder -- confirm.
+  output:
+
+  file("*") into bclPaths
+
+  script:
+
+  """
+  
+  tar -xvf $tar
+  """
+}
+
+
+// Run `cellranger mkfastq` on an unpacked BCL run, passing the checked design
+// file as --csv, and publish the FASTQ output under output/fastq/<run name>.
+process mkfastq {
+
+  tag "${bcl.baseName}"
+  publishDir "$baseDir/output/fastq/${bcl.baseName}", mode: 'copy' 
+
+  // bcl is received with the `val` qualifier, so it is passed to the script
+  // as a value rather than declared as a staged file.
+  // NOTE(review): designPaths carries the single design.csv emitted by
+  // checkDesignFile; if it behaves as a queue channel this process may run
+  // only once even when bclPaths emits several runs -- verify with more than
+  // one tarball.
+  input:
+
+  val bcl from bclPaths
+  file designPaths
+
+  output:
+
+  file("**/outs/fastq_path/**/*") into fastqPaths
+
+  script:
+
+  """
+  cellranger mkfastq --id="${bcl.baseName}" --run=$bcl --csv=$designPaths
+  """
+}
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
+// Configuration profiles; 'standard' pulls in the BioHPC settings file.
+profiles {
+  standard {
+    includeConfig 'conf/biohpc.config'
+  }
+}
--- a/workflow/output/.gitkeep
+++ b/workflow/output/.gitkeep
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
+#!/usr/bin/env python3
+
+'''Check if design file is correctly formatted and matches files list.'''
+
+import argparse
+import logging
+import pandas as pd
+
+# Passed to argparse as the epilog of the --help output.
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+# Module logger at INFO level with propagation to ancestor loggers disabled.
+# The NullHandler is its only handler until main() attaches a file handler.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file to run QC (tsv format).",
+                        required=True)
+
+    args = parser.parse_args()
+    return args
+
+
+def check_design_headers(design):
+    '''Check if design file conforms to sequencing type.'''
+
+    # Default headers
+    design_template = [
+        'Lane',
+        'Sample',
+        'Index']
+
+    design_headers = list(design.columns.values)
+
+    # Check if headers
+    logger.info("Running header check.")
+
+    missing_headers = set(design_template) - set(design_headers)
+
+    if len(missing_headers) > 0:
+        logger.error('Missing column headers: %s', list(missing_headers))
+        raise Exception("Missing column headers: %s" % list(missing_headers))
+    
+    return design
+
+def main():
+    args = get_args()
+    design = args.design
+
+    # Create a file handler
+    handler = logging.FileHandler('design.log')
+    logger.addHandler(handler)
+
+    # Read files as dataframes
+    design_df = pd.read_csv(args.design, sep=',')
+
+    # Check design file
+    new_design_df = check_design_headers(design_df)
+
+    new_design_df.to_csv('design.csv', header=True, sep=',', index=False)
+
+if __name__ == '__main__':
+    main()