Add feature-count option; start fixing check_design.py for the case where feature is 'yes'

4f99d15c · Gervaise Henry · fd2352e2 · 4f99d15c · 4f99d15c · 4f99d15c
Commit 4f99d15c authored 6 years ago by Gervaise Henry
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -150,6 +150,15 @@ workflow_parameters:
    description: |
      10x cellranger version.
+  # Yes/no switch for counting additional features; defaults to 'no'.
+  # NOTE(review): presumably surfaces in the workflow as params.feature - confirm.
+  - id: feature
+    type: select
+    default: 'no'
+    choices:
+      - [ 'no', 'No']
+      - [ 'yes', 'Yes']
+    required: true
+    description: |
+      Additional features to count (only used in cellranger version 3+, ignored otherwise).
 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION

--- a/workflow/main.test.nf
+++ b/workflow/main.test.nf
+#!/usr/bin/env nextflow
+// Path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file main.nf
+// Define Input variables
+params.fastq = "$baseDir/../test_data/*.fastq.gz"
+params.designFile = "$baseDir/../test_data/design.csv"
+params.genome = 'GRCh38-3.0.0'
+// NOTE(review): defaults to an empty list, yet the next line indexes it by genome name;
+// presumably a config file supplies the real genome table - confirm.
+params.genomes = []
+// Location ('loc' entry) of the selected genome; false when no genome is set or 'loc' is empty
+params.genomeLocation = params.genome ? params.genomes[ params.genome ].loc ?: false : false
+params.expectCells = 10000
+params.forceCells = 0
+params.kitVersion = '3'
+// NOTE(review): same pattern as params.genomes - an empty default that is indexed by kit version below.
+params.chemistry = []
+// Chemistry parameter ('param' entry) for the selected kit version; false when unset or empty
+params.chemistryParam = params.kitVersion ? params.chemistry[ params.kitVersion ].param ?: false : false
+params.version = '3.0.2'
+// 'yes' or 'no'; forwarded to the design check through the -t option
+params.feature = 'yes'
+params.outDir = "$baseDir/output"
+// Define regular variables
+// Design file channel; abort the run if the file does not exist
+designLocation = Channel
+  .fromPath(params.designFile)
+  .ifEmpty { exit 1, "design file not found: ${params.designFile}" }
+// Write one "<file name>\t<full path>" line per fastq file into fileList.tsv
+fastqList = Channel
+  .fromPath(params.fastq)
+  .flatten()
+  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t") }
+  .collectFile(name: 'fileList.tsv', newLine: true)
+// Reference channel: genome location concatenated with the genome name; abort if nothing matches
+refLocation = Channel
+  .fromPath(params.genomeLocation+params.genome)
+  .ifEmpty { exit 1, "reference not found: ${params.genome}" }
+// Plain (non-channel) copies of the remaining parameters
+expectCells = params.expectCells
+forceCells = params.forceCells
+chemistryParam = params.chemistryParam
+version = params.version
+feature = params.feature
+// Separate name for the value handed to checkDesignFile
+featurechk = feature
+outDir = params.outDir
+// Validate the design file against the list of fastq files and emit the checked design
+process checkDesignFile {
+  publishDir "$outDir/${task.process}", mode: 'copy'
+  input:
+  file designLocation
+  file fastqList
+  // Plain value rather than a file, so it is declared with the 'val' qualifier
+  val featurechk
+  output:
+  file("*.checked.csv") into designPaths
+  script:
+  """
+  python3 $baseDir/scripts/check_design.test.py -d $designLocation -f $fastqList -t "$featurechk"
+  """
+}
+// Parse design file
+// Each row of the checked design becomes [ Sample, R1 file, R2 file ]; groupTuple() then
+// collects the rows that share a grouping key (by default the first element, i.e. Sample).
+samples = designPaths
+  .splitCsv (sep: ',', header: true)
+  .map { row -> [ row.Sample, file(row.fastq_R1), file(row.fastq_R2) ] }
+  .groupTuple()
+  //.subscribe { println it }
+// Duplicate variables
+// A channel is split into one copy per consumer with into{}.
+// NOTE(review): the 211/301/302 suffixes presumably stand for cellranger versions 2.1.1, 3.0.1 and 3.0.2 - confirm.
+samples.into {
+  samples211
+  samples301
+  samples302
+}
+refLocation.into {
+  refLocation211
+  refLocation301
+  refLocation302
+}
+// Plain values are simply assigned to per-suffix names
+expectCells211 = expectCells
+expectCells301 = expectCells
+expectCells302 = expectCells
+forceCells211 = forceCells
+forceCells301 = forceCells
+forceCells302 = forceCells
+// Chemistry and feature copies exist only for the 301 and 302 suffixes
+chemistryParam301 = chemistryParam
+chemistryParam302 = chemistryParam
+feature301 = feature
+feature302 = feature
--- a/workflow/scripts/check_design.test.py
+++ b/workflow/scripts/check_design.test.py
+#!/usr/bin/env python3
+'''Check if design file is correctly formatted and matches files list.'''
+import argparse
+import logging
+import pandas as pd
+# Text passed to argparse as the epilog of the help output (see get_args).
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+# SETTINGS
+# Module-level logger at INFO level. propagate=False keeps its records away from
+# ancestor loggers; the NullHandler is a placeholder until main() attaches a
+# file handler.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('-d', '--design',
+                        help="The design file to run QC (tsv format).",
+                        required=True )
+    parser.add_argument('-f', '--fastq',
+                        help="File with list of fastq files (tsv format).",
+                        required=True )
+    parser.add_argument('-t', '--feature',
+                        help="Additional features to count?",
+                        required=True )
+    args = parser.parse_args()
+    return args
+def check_design_headers_n(design):
+    '''Check if design file conforms to sequencing type.'''
+    # Default headers
+    design_template = [
+        'Sample',
+	    'fastq_R1',
+	    'fastq_R2']
+    design_headers = list(design.columns.values)
+    # Check if headers
+    logger.info("Running header check.")
+    missing_headers = set(design_template) - set(design_headers)
+    if len(missing_headers) > 0:
+        logger.error('Missing column headers: %s', list(missing_headers))
+        raise Exception("Missing column headers: %s" % list(missing_headers))
+    return design
+def check_design_headers_y(design):
+    '''Check if design file conforms to sequencing type.'''
+    # Default headers
+    design_template = [
+        'Sample',
+	    'fastq_R1',
+	    'fastq_R2',
+	    'library_type']
+    design_headers = list(design.columns.values)
+    # Check if headers
+    logger.info("Running header check.")
+    missing_headers = set(design_template) - set(design_headers)
+    if len(missing_headers) > 0:
+        logger.error('Missing column headers: %s', list(missing_headers))
+        raise Exception("Missing column headers: %s" % list(missing_headers))
+    return design
+def check_files(design, fastq):
+    '''Check if design file has the files found.'''
+    logger.info("Running file check.")
+    files = list(design['fastq_R1']) + list(design['fastq_R2'])
+    files_found = fastq['name']
+    missing_files = set(files) - set(files_found)
+    if len(missing_files) > 0:
+        logger.error('Missing files from design file: %s', list(missing_files))
+        raise Exception("Missing files from design file: %s" %
+            list(missing_files))
+    else:
+        file_dict = fastq.set_index('name').T.to_dict()
+    design['fastq_R1'] = design['fastq_R1'].apply(lambda x: file_dict[x]['path'])
+    design['fastq_R2'] = design['fastq_R2'].apply(lambda x: file_dict[x]['path'])
+    return design
+def main():
+    args = get_args()
+    design = args.design
+    # Create a file handler
+    handler = logging.FileHandler('design.log')
+    logger.addHandler(handler)
+    # Read files as dataframes
+    design_df = pd.read_csv(args.design, sep=',')
+    fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
+    # Check design file
+    if args.feature == 'no':
+    	new_design_df = check_design_headers_n(design_df)
+    else:
+    	new_design_df = check_design_headers_y(design_df)
+	#new_design_df[['sample']].to_csv('library.checked.csv', header=True, sep=',', index=False)
+    check_files(design_df, fastq_df)
+    new_design_df.drop('library_type', 1).to_csv('design.checked.csv', header=True, sep=',', index=False)
+if __name__ == '__main__':
+    main()