Skip to content
Snippets Groups Projects
Commit 717de4dc authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge branch 'revert-8f536aab' into 'develop'

Revert "Merge branch '10-count.features' into 'develop'"

See merge request !30
parents 8f536aab bc6c09e8
Branches
Tags
3 merge requests!35Develop,!32Develop,!30Revert "Merge branch '10-count.features' into 'develop'"
Pipeline #3333 passed with stages
in 23 minutes and 47 seconds
...@@ -150,16 +150,6 @@ workflow_parameters: ...@@ -150,16 +150,6 @@ workflow_parameters:
description: | description: |
10x cellranger version. 10x cellranger version.
# Workflow parameter "feature": a required yes/no select (default 'no')
# asking whether additional features should be counted.
- id: feature
type: select
default: 'no'
choices:
- [ 'no', 'No']
- [ 'yes', 'Yes']
required: true
description: |
Additional features to count (only used in cellranger version 3+, ignored otherwise).
- id: astrocyte - id: astrocyte
type: select type: select
choices: choices:
......
#!/usr/bin/env nextflow
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
// Define Input variables
// Default inputs point at the bundled test data next to the workflow directory.
params.fastq = "$baseDir/../test_data/*.fastq.gz"
params.designFile = "$baseDir/../test_data/design.csv"
params.genome = 'GRCh38-3.0.0'
// NOTE(review): empty default; presumably replaced by a genome table from configuration - confirm.
params.genomes = []
// Look up the `loc` entry for the selected genome; falls back to false when
// no genome is selected or the entry has no (truthy) `loc`.
params.genomeLocation = params.genome ? params.genomes[ params.genome ].loc ?: false : false
params.expectCells = 10000
params.forceCells = 0
params.kitVersion = '3'
// NOTE(review): empty default; presumably replaced by a chemistry table from configuration - confirm.
params.chemistry = []
// Look up the `param` entry for the selected kit version; falls back to false
// when no kit version is selected or the entry has no (truthy) `param`.
params.chemistryParam = params.kitVersion ? params.chemistry[ params.kitVersion ].param ?: false : false
params.version = '3.0.2'
// Passed to the design check as the -t/--feature argument (via featurechk).
params.feature = 'yes'
params.outDir = "$baseDir/output"
// Define regular variables

// Channel emitting the design file; the run aborts if the path matches nothing.
designLocation = Channel
  .fromPath(params.designFile)
  .ifEmpty { exit 1, "design file not found: ${params.designFile}" }

// Collect every fastq matched by params.fastq into a single two-column TSV
// (file name <TAB> full path), one line per file, named fileList.tsv.
fastqList = Channel
  .fromPath(params.fastq)
  .flatten()
  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t") }
  .collectFile(name: 'fileList.tsv', newLine: true)

// Channel emitting the reference path (genome location + genome name);
// the run aborts if the path matches nothing.
// NOTE(review): params.genomeLocation may be `false` when no location is
// configured, in which case this concatenation is not a usable path - confirm
// that configuration always supplies it.
refLocation = Channel
  .fromPath(params.genomeLocation+params.genome)
  .ifEmpty { exit 1, "reference not found: ${params.genome}" }

// Plain (non-channel) copies of the remaining parameters.
expectCells = params.expectCells
forceCells = params.forceCells
chemistryParam = params.chemistryParam
version = params.version
feature = params.feature
// Separate name for the copy handed to the checkDesignFile process.
featurechk = feature
outDir = params.outDir
// Validate the design file against the list of fastq files found.
// Runs the Python design checker and publishes its *.checked.csv output
// (copied) under "$outDir/checkDesignFile".
process checkDesignFile {

  publishDir "$outDir/${task.process}", mode: 'copy'

  input:

  file designLocation
  file fastqList
  // Process inputs need a qualifier; featurechk is a plain value, so `val`.
  val featurechk

  output:

  file("*.checked.csv") into designPaths

  script:

  """
  python3 $baseDir/scripts/check_design.test.py -d $designLocation -f $fastqList -t "$featurechk"
  """
}
// Parse design file
// Each CSV row becomes [Sample, fastq_R1 file, fastq_R2 file]; rows are then
// grouped by the Sample value so each sample carries all of its fastq pairs.
samples = designPaths
.splitCsv (sep: ',', header: true)
.map { row -> [ row.Sample, file(row.fastq_R1), file(row.fastq_R2) ] }
.groupTuple()
//.subscribe { println it }
// Duplicate variables
// Fork each channel into three copies, one per consumer.
// NOTE(review): the 211/301/302 suffixes presumably correspond to cellranger
// versions 2.1.1/3.0.1/3.0.2 - confirm against the processes that use them.
samples.into {
samples211
samples301
samples302
}
refLocation.into {
refLocation211
refLocation301
refLocation302
}
// Plain values are simply re-assigned under the suffixed names.
expectCells211 = expectCells
expectCells301 = expectCells
expectCells302 = expectCells
forceCells211 = forceCells
forceCells301 = forceCells
forceCells302 = forceCells
// chemistryParam and feature only get 301/302 copies (no 211 copy).
chemistryParam301 = chemistryParam
chemistryParam302 = chemistryParam
feature301 = feature
feature302 = feature
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
# Epilog text handed to argparse; %(prog)s is expanded by argparse itself.
EPILOG = '''
For more details:
%(prog)s --help
'''

# SETTINGS
# Module-level logger: INFO and above, not propagated to ancestor loggers,
# with a NullHandler attached until main() adds a file handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.propagate = False
logger.addHandler(logging.NullHandler())
def get_args():
    '''Build the command-line parser and return the parsed arguments.

    Three options are accepted and all are mandatory:
    -d/--design, -f/--fastq and -t/--feature.
    '''
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
        epilog=EPILOG)

    # (short flag, long flag, help text) for every required option.
    required_options = (
        ('-d', '--design', "The design file to run QC (tsv format)."),
        ('-f', '--fastq', "File with list of fastq files (tsv format)."),
        ('-t', '--feature', "Additional features to count?"),
    )
    for short_flag, long_flag, help_text in required_options:
        cli.add_argument(short_flag, long_flag, help=help_text, required=True)

    return cli.parse_args()
def check_design_headers_n(design):
    '''Validate the design columns for a run without additional features.

    The design must contain the columns Sample, fastq_R1 and fastq_R2.
    Returns the design unchanged when they are all present; otherwise logs
    the problem and raises an Exception naming the absent columns.
    '''
    # Columns every design file must provide
    required_columns = ('Sample', 'fastq_R1', 'fastq_R2')

    logger.info("Running header check.")
    missing_headers = set(required_columns) - set(design.columns.values)

    # Nothing missing: the design passes as-is
    if not missing_headers:
        return design

    logger.error('Missing column headers: %s', list(missing_headers))
    raise Exception("Missing column headers: %s" % list(missing_headers))
def check_design_headers_y(design):
    '''Validate the design columns for a run that counts additional features.

    The design must contain the columns Sample, fastq_R1, fastq_R2 and
    library_type. Returns the design unchanged when they are all present;
    otherwise logs the problem and raises an Exception naming the absent
    columns.
    '''
    # Columns every design file must provide in this mode
    required_columns = ('Sample', 'fastq_R1', 'fastq_R2', 'library_type')

    logger.info("Running header check.")
    missing_headers = set(required_columns) - set(design.columns.values)

    # Nothing missing: the design passes as-is
    if not missing_headers:
        return design

    logger.error('Missing column headers: %s', list(missing_headers))
    raise Exception("Missing column headers: %s" % list(missing_headers))
def check_files(design, fastq):
    '''Confirm every fastq named in the design exists in the fastq listing.

    design: dataframe with fastq_R1 and fastq_R2 columns holding file names.
    fastq: dataframe with name and path columns.

    Raises an Exception listing any design file names absent from the
    listing. Otherwise replaces the names in fastq_R1/fastq_R2 with the
    matching paths (the design is modified in place) and returns the design.
    '''
    logger.info("Running file check.")

    named_in_design = [*design['fastq_R1'], *design['fastq_R2']]
    missing_files = set(named_in_design) - set(fastq['name'])
    if missing_files:
        logger.error('Missing files from design file: %s', list(missing_files))
        raise Exception("Missing files from design file: %s" %
                        list(missing_files))

    # name -> {'path': ...} lookup built from the fastq listing
    file_dict = fastq.set_index('name').T.to_dict()
    for read_column in ('fastq_R1', 'fastq_R2'):
        design[read_column] = design[read_column].apply(
            lambda fname: file_dict[fname]['path'])
    return design
def main():
    '''Check the design file against the fastq list and write the result.

    Reads the design (comma-separated) and the fastq list (tab-separated,
    columns name and path), validates the design headers according to the
    --feature argument, swaps fastq names for their paths, and writes
    design.checked.csv without the library_type column. Log messages go to
    design.log.

    Raises whatever the header or file checks raise when validation fails.
    '''
    args = get_args()

    # Create a file handler
    handler = logging.FileHandler('design.log')
    logger.addHandler(handler)

    # Read files as dataframes
    design_df = pd.read_csv(args.design, sep=',')
    fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])

    # Check design file; 'no' means library_type is not a required column
    if args.feature == 'no':
        new_design_df = check_design_headers_n(design_df)
    else:
        new_design_df = check_design_headers_y(design_df)

    # Replaces fastq names with paths in place (new_design_df is design_df)
    check_files(design_df, fastq_df)

    # library_type may legitimately be absent when feature is 'no', so a
    # missing column is ignored rather than raising KeyError. The keyword
    # form is used because pandas 2.0 no longer accepts a positional axis.
    checked_df = new_design_df.drop(columns='library_type', errors='ignore')
    checked_df.to_csv('design.checked.csv', header=True, sep=',', index=False)


if __name__ == '__main__':
    main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment