diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml
index 8417664465fb8691f52e97960b6acda0b54594c8..57a83b333141e9056fa00b894bc76e43010ee341 100755
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -150,6 +150,16 @@ workflow_parameters:
     description: |
       10x cellranger version.
 
+  - id: feature
+    type: select
+    default: 'no'
+    choices:
+      - [ 'no', 'No']
+      - [ 'yes', 'Yes']
+    required: true
+    description: |
+      Additional features to count (only used in cellranger version 3+, ignored otherwise).
+
   - id: astrocyte
     type: select
     choices:
diff --git a/workflow/main.test.nf b/workflow/main.test.nf
new file mode 100644
index 0000000000000000000000000000000000000000..581f1777764f5d67b7dc352d17bb9b3e2e350065
--- /dev/null
+++ b/workflow/main.test.nf
@@ -0,0 +1,89 @@
+#!/usr/bin/env nextflow
+
+// params.fastq is a path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file (main.test.nf)
+
+// Define Input variables (each can be overridden on the command line)
+params.fastq = "$baseDir/../test_data/*.fastq.gz"
+params.designFile = "$baseDir/../test_data/design.csv"
+params.genome = 'GRCh38-3.0.0'
+params.genomes = []  // NOTE(review): presumably populated from a config file - confirm
+params.genomeLocation = params.genome ? params.genomes[ params.genome ].loc ?: false : false
+params.expectCells = 10000
+params.forceCells = 0
+params.kitVersion = '3'
+params.chemistry = []  // NOTE(review): presumably populated from a config file - confirm
+params.chemistryParam = params.kitVersion ? params.chemistry[ params.kitVersion ].param ?: false : false
+params.version = '3.0.2'
+params.feature = 'yes'
+params.outDir = "$baseDir/output"
+
+// Define regular variables (channels and plain values derived from params)
+designLocation = Channel
+  .fromPath(params.designFile)
+  .ifEmpty { exit 1, "design file not found: ${params.designFile}" }
+fastqList = Channel
+  .fromPath(params.fastq)
+  .flatten()
+  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t") }
+  .collectFile(name: 'fileList.tsv', newLine: true)  // one "<name>\t<path>" row per fastq
+refLocation = Channel
+  .fromPath(params.genomeLocation+params.genome)
+  .ifEmpty { exit 1, "reference not found: ${params.genome}" }
+expectCells = params.expectCells
+forceCells = params.forceCells
+chemistryParam = params.chemistryParam
+version = params.version
+feature = params.feature
+featurechk = feature
+outDir = params.outDir
+
+process checkDesignFile {
+
+  publishDir "$outDir/${task.process}", mode: 'copy'
+
+  input:
+
+  file designLocation
+  file fastqList
+  val featurechk  // plain value, so it needs the 'val' qualifier; passed to the script via -t
+
+  output:
+
+  file("*.checked.csv") into designPaths
+
+  script:
+
+  """
+  python3 $baseDir/scripts/check_design.test.py -d $designLocation -f $fastqList -t "$featurechk"
+  """
+}
+
+// Parse the checked design file into [ Sample, [R1 files], [R2 files] ] tuples
+samples = designPaths
+  .splitCsv (sep: ',', header: true)
+  .map { row -> [ row.Sample, file(row.fastq_R1), file(row.fastq_R2) ] }
+  .groupTuple()
+  //.subscribe { println it }
+
+// Duplicate variables: one copy per suffix (211 / 301 / 302)
+samples.into {
+  samples211
+  samples301
+  samples302
+}
+refLocation.into {
+  refLocation211
+  refLocation301
+  refLocation302
+}
+expectCells211 = expectCells
+expectCells301 = expectCells
+expectCells302 = expectCells
+forceCells211 = forceCells
+forceCells301 = forceCells
+forceCells302 = forceCells
+chemistryParam301 = chemistryParam
+chemistryParam302 = chemistryParam
+feature301 = feature
+feature302 = feature
diff --git a/workflow/scripts/check_design.test.py b/workflow/scripts/check_design.test.py
new file mode 100755
index 
#!/usr/bin/env python3

'''Check if design file is correctly formatted and matches files list.'''

import argparse
import logging

import pandas as pd

EPILOG = '''
For more details:
        %(prog)s --help
'''

# SETTINGS

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)

# Columns every design file must provide.
BASE_HEADERS = ['Sample', 'fastq_R1', 'fastq_R2']

# Columns required when additional features are counted.
FEATURE_HEADERS = BASE_HEADERS + ['library_type']


def get_args():
    '''Parse and return the command-line arguments.

    All three options (-d/--design, -f/--fastq, -t/--feature) are required.
    '''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # The design file is parsed with sep=',' in main(), so it is a csv.
    parser.add_argument('-d', '--design',
                        help="The design file to run QC (csv format).",
                        required=True)

    parser.add_argument('-f', '--fastq',
                        help="File with list of fastq files (tsv format).",
                        required=True)

    parser.add_argument('-t', '--feature',
                        help="Additional features to count?",
                        required=True)

    return parser.parse_args()


def _check_design_headers(design, required_headers):
    '''Return `design` unchanged, or raise if a required column is missing.'''

    logger.info("Running header check.")

    # Sorted so the log/error message is deterministic.
    missing_headers = sorted(set(required_headers) - set(design.columns))

    if missing_headers:
        logger.error('Missing column headers: %s', missing_headers)
        raise ValueError("Missing column headers: %s" % missing_headers)

    return design


def check_design_headers_n(design):
    '''Check the design headers when no additional features are counted.

    Requires the columns Sample, fastq_R1 and fastq_R2.
    Returns the design dataframe unchanged; raises ValueError (an
    Exception subclass) listing the missing columns otherwise.
    '''

    return _check_design_headers(design, BASE_HEADERS)


def check_design_headers_y(design):
    '''Check the design headers when additional features are counted.

    Requires the columns Sample, fastq_R1, fastq_R2 and library_type.
    Returns the design dataframe unchanged; raises ValueError (an
    Exception subclass) listing the missing columns otherwise.
    '''

    return _check_design_headers(design, FEATURE_HEADERS)


def check_files(design, fastq):
    '''Check that every fastq named in the design is in the file list.

    `fastq` must have the columns 'name' and 'path'. On success the
    fastq_R1 / fastq_R2 columns of `design` are replaced in place by the
    matching 'path' values and `design` is returned. Raises ValueError
    listing the names that are not present in `fastq`.
    '''

    logger.info("Running file check.")

    referenced = set(design['fastq_R1']) | set(design['fastq_R2'])

    # key=str keeps sorting safe if a cell is empty (NaN) rather than text.
    missing_files = sorted(referenced - set(fastq['name']), key=str)

    if missing_files:
        logger.error('Missing files from design file: %s', missing_files)
        raise ValueError("Missing files from design file: %s" %
                         missing_files)

    path_by_name = dict(zip(fastq['name'], fastq['path']))

    for column in ('fastq_R1', 'fastq_R2'):
        design[column] = design[column].map(path_by_name)

    return design


def main():
    '''Validate the design file and write design.checked.csv.

    Log messages go to design.log in the current working directory.
    '''

    args = get_args()

    # Create a file handler
    handler = logging.FileHandler('design.log')
    logger.addHandler(handler)

    try:
        # Read files as dataframes
        design_df = pd.read_csv(args.design, sep=',')
        fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])

        # Check design file
        if args.feature == 'no':
            design_df = check_design_headers_n(design_df)
        else:
            design_df = check_design_headers_y(design_df)

        design_df = check_files(design_df, fastq_df)

        # library_type is optional when feature == 'no', so a missing
        # column must not abort the run.
        checked_df = design_df.drop(columns=['library_type'], errors='ignore')
        checked_df.to_csv('design.checked.csv', header=True, sep=',',
                          index=False)
    finally:
        # Release design.log even when a check fails.
        logger.removeHandler(handler)
        handler.close()


if __name__ == '__main__':
    main()