Skip to content
Snippets Groups Projects
Commit 8f536aab authored by Gervaise Henry's avatar Gervaise Henry 🤠
Browse files

Merge branch '10-count.features' into 'develop'

# Conflicts:
#   astrocyte_pkg.yml
parents b69acc56 4f99d15c
2 merge requests!32Develop,!29Develop
Pipeline #3330 passed with stages
in 25 minutes and 31 seconds
This commit is part of merge request !29. Comments created here will be created in the context of that merge request.
......@@ -150,6 +150,16 @@ workflow_parameters:
description: |
10x cellranger version.
- id: feature
type: select
default: 'no'
choices:
- [ 'no', 'No']
- [ 'yes', 'Yes']
required: true
description: |
Additional features to count (only used in cellranger version 3+, ignored otherwise).
- id: astrocyte
type: select
choices:
......
#!/usr/bin/env nextflow
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
// Define Input variables
// Input fastq files (glob pattern) and the design file describing them
params.fastq = "$baseDir/../test_data/*.fastq.gz"
params.designFile = "$baseDir/../test_data/design.csv"
// Name of the reference genome; used as the key into params.genomes below
params.genome = 'GRCh38-3.0.0'
// NOTE(review): defaults to an empty list, yet the next line indexes it by
// params.genome — presumably a config file supplies the real table; confirm
params.genomes = []
// The .loc entry of the selected genome, or false when no genome is set or
// the entry has no .loc value
params.genomeLocation = params.genome ? params.genomes[ params.genome ].loc ?: false : false
params.expectCells = 10000
params.forceCells = 0
// Kit version; used as the key into params.chemistry below
params.kitVersion = '3'
// NOTE(review): same pattern as params.genomes — empty default that is
// indexed on the next line; presumably populated by configuration
params.chemistry = []
// The .param entry of the selected kit version, or false when unavailable
params.chemistryParam = params.kitVersion ? params.chemistry[ params.kitVersion ].param ?: false : false
params.version = '3.0.2'
// Passed to the design-file check as its -t argument ('no' selects the
// check without the library_type column).
// NOTE(review): the astrocyte_pkg.yml definition of `feature` defaults to
// 'no' while this default is 'yes' — confirm which is intended
params.feature = 'yes'
params.outDir = "$baseDir/output"
// Define regular variables
// Channel emitting the design file; abort the run if it cannot be found
designLocation = Channel
    .fromPath(params.designFile)
    .ifEmpty { exit 1, "design file not found: ${params.designFile}" }
// Collect one "<file name>\t<full path>" line per input fastq into fileList.tsv
fastqList = Channel
    .fromPath(params.fastq)
    .flatten()
    .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t") }
    .collectFile(name: 'fileList.tsv', newLine: true)
// Channel emitting the reference path (genome location followed by genome
// name); abort the run if it cannot be found
refLocation = Channel
    .fromPath(params.genomeLocation+params.genome)
    .ifEmpty { exit 1, "reference not found: ${params.genome}" }
// Plain (non-channel) copies of the remaining parameters
expectCells = params.expectCells
forceCells = params.forceCells
chemistryParam = params.chemistryParam
version = params.version
feature = params.feature
// Separate copy of the feature flag consumed by checkDesignFile
featurechk = feature
outDir = params.outDir
// Validate the design file against the list of discovered fastq files and
// emit the checked design (*.checked.csv) on the designPaths channel.
process checkDesignFile {

  publishDir "$outDir/${task.process}", mode: 'copy'

  input:
    file designLocation
    file fastqList
    // Declared with the `val` qualifier so the flag is a real process input;
    // a bare variable name is not an input declaration
    val featurechk

  output:
    file("*.checked.csv") into designPaths

  script:
    """
    python3 $baseDir/scripts/check_design.test.py -d $designLocation -f $fastqList -t "$featurechk"
    """
}
// Parse design file
// Each row of the checked design becomes [ Sample, R1 file, R2 file ];
// rows are then grouped with groupTuple()
// (presumably keyed on Sample, the first element — confirm)
samples = designPaths
.splitCsv (sep: ',', header: true)
.map { row -> [ row.Sample, file(row.fastq_R1), file(row.fastq_R2) ] }
.groupTuple()
//.subscribe { println it }
// Duplicate variables
// One copy of each channel/value per suffix 211 / 301 / 302
// (presumably one per supported cellranger version — confirm against the
// processes that consume them)
samples.into {
samples211
samples301
samples302
}
refLocation.into {
refLocation211
refLocation301
refLocation302
}
expectCells211 = expectCells
expectCells301 = expectCells
expectCells302 = expectCells
forceCells211 = forceCells
forceCells301 = forceCells
forceCells302 = forceCells
// chemistryParam and feature only get 301/302 copies (no 211 copy)
chemistryParam301 = chemistryParam
chemistryParam302 = chemistryParam
feature301 = feature
feature302 = feature
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
# Text passed to argparse as the epilog of the help output (see get_args)
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
# Module logger: starts with only a NullHandler and does not propagate to
# ancestor loggers; main() attaches a FileHandler writing to design.log.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
    '''Parse and return the command-line arguments.

    Returns:
        argparse.Namespace with the attributes ``design`` (path to the
        design file), ``fastq`` (path to the fastq list file) and
        ``feature`` (the additional-features flag).  All three options
        are required.
    '''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # The design file is read with sep=',' in main(), so it is described
    # as csv here (the help text previously said tsv).
    parser.add_argument('-d', '--design',
                        help="The design file to run QC (csv format).",
                        required=True)

    parser.add_argument('-f', '--fastq',
                        help="File with list of fastq files (tsv format).",
                        required=True)

    parser.add_argument('-t', '--feature',
                        help="Additional features to count?",
                        required=True)

    return parser.parse_args()
def check_design_headers_n(design):
    '''Verify the design columns for a run without additional features.

    The design must contain the columns ``Sample``, ``fastq_R1`` and
    ``fastq_R2``; extra columns are allowed.

    Args:
        design: DataFrame holding the design file.

    Returns:
        The same ``design`` object, unchanged.

    Raises:
        Exception: if any required column is absent.
    '''
    required = ['Sample', 'fastq_R1', 'fastq_R2']
    present = list(design.columns.values)

    logger.info("Running header check.")

    absent = set(required) - set(present)
    if absent:
        names = list(absent)
        logger.error('Missing column headers: %s', names)
        raise Exception("Missing column headers: %s" % names)

    return design
def check_design_headers_y(design):
    '''Verify the design columns for a run that counts additional features.

    On top of ``Sample``, ``fastq_R1`` and ``fastq_R2`` the design must
    also provide a ``library_type`` column; extra columns are allowed.

    Args:
        design: DataFrame holding the design file.

    Returns:
        The same ``design`` object, unchanged.

    Raises:
        Exception: if any required column is absent.
    '''
    expected_columns = ['Sample', 'fastq_R1', 'fastq_R2', 'library_type']
    found_columns = list(design.columns.values)

    logger.info("Running header check.")

    not_found = set(expected_columns) - set(found_columns)
    if len(not_found) == 0:
        return design

    # Report through the log first, then abort with the same message.
    not_found_names = list(not_found)
    logger.error('Missing column headers: %s', not_found_names)
    raise Exception("Missing column headers: %s" % not_found_names)
def check_files(design, fastq):
    '''Match the fastq names in the design against the files found.

    Every name in the ``fastq_R1`` and ``fastq_R2`` columns must occur in
    the ``name`` column of ``fastq``.  When they all do, both design
    columns are overwritten in place with the corresponding ``path``
    values from ``fastq``.

    Args:
        design: DataFrame with ``fastq_R1`` and ``fastq_R2`` columns.
        fastq: DataFrame with ``name`` and ``path`` columns.

    Returns:
        The ``design`` object, with file names replaced by paths.

    Raises:
        Exception: if the design names a file absent from ``fastq``.
    '''
    logger.info("Running file check.")

    listed = list(design['fastq_R1']) + list(design['fastq_R2'])
    available = fastq['name']

    absent = set(listed) - set(available)
    if absent:
        names = list(absent)
        logger.error('Missing files from design file: %s', names)
        raise Exception("Missing files from design file: %s" % names)

    # name -> {'path': ...} lookup built from the fastq listing
    lookup = fastq.set_index('name').T.to_dict()
    for column in ('fastq_R1', 'fastq_R2'):
        design[column] = design[column].apply(
            lambda name: lookup[name]['path'])

    return design
def main():
    '''Validate the design file and write ``design.checked.csv``.

    Reads the design (comma-separated) and the fastq list (tab-separated,
    columns ``name`` and ``path``), checks the design headers according
    to the ``--feature`` flag, replaces fastq names with their paths and
    writes the result without the ``library_type`` column.  Log messages
    are written to ``design.log``.

    Raises:
        Exception: propagated from the header or file checks.
    '''
    args = get_args()

    # Create a file handler
    handler = logging.FileHandler('design.log')
    logger.addHandler(handler)

    # Read files as dataframes
    design_df = pd.read_csv(args.design, sep=',')
    fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])

    # Check design file: only the value 'no' selects the check without
    # library_type; any other value requires that column.
    if args.feature == 'no':
        new_design_df = check_design_headers_n(design_df)
    else:
        new_design_df = check_design_headers_y(design_df)

    new_design_df = check_files(new_design_df, fastq_df)

    # library_type may be absent when feature is 'no', so ignore a missing
    # column instead of raising KeyError; the keyword form also avoids the
    # positional axis argument that newer pandas versions reject.
    checked_df = new_design_df.drop(columns='library_type', errors='ignore')
    checked_df.to_csv('design.checked.csv', header=True, sep=',', index=False)


if __name__ == '__main__':
    main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment