diff --git a/CHANGELOG.md b/CHANGELOG.md index c90e24a68ed21255944f7cdcccba3377c9bfc91a..94f20980b833a1803b5120c0dfd8fb04bc83201d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * Add seqwho * Add seqwho results to multiqc report * Modify repository structure to allow for use with XPACK-DNANEXUS +* Add override for endness **Background** * Add memory limit (75%) per thread for samtools sort (#108) diff --git a/README.md b/README.md index 04f49d7d7d3ebe739023ccfa30a099136e48b19d..33fb73d09c9dc8668f61a71c5029940fbc690404 100644 --- a/README.md +++ b/README.md @@ -56,12 +56,14 @@ To Run: * `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input) * eg: `--inputBagForce test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip` (must be the expected bag structure, this example will not work because it is a test bag) * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input) - * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order) + * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must be named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order, also consider using `--endsForce` if the endness doesn't match the submitted value) * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses a metadata mismatch or an ambiguous species error * eg: `--speciesForce 'Mus musculus'` + * `--endsForce` forces the endness to be "se" or "pe", it bypasses a metadata mismatch error + * eg: `--endsForce 'pe'` * `--strandedForce` forces the strandedness to be "forward", "reverse" or "unstranded", it bypasses a metadata mismatch error * eg: `--strandedForce 'unstranded'` - * 
`--spikeForce` forces the spike-in to be "false" or "true", it bypasses a metadata mismatch error + * `--spikeForce` forces the spike-in to be "false", or "true", it bypasses a metadata mismatch error * eg: `--spikeForce 'true'` * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)): * `--ci` boolean (default = false) diff --git a/rna-seq.nf b/rna-seq.nf index f7619777cd7104c4e5ab954ded7c8521fa72f387..78d24fa22be5b1cdacf2cb89884df7e56113cf00 100644 --- a/rna-seq.nf +++ b/rna-seq.nf @@ -26,6 +26,7 @@ params.track = false params.refSource = "biohpc" params.inputBagForce = "" params.fastqsForce = "" +params.endsForce = "" params.speciesForce = "" params.strandedForce = "" params.spikeForce = "" @@ -64,6 +65,7 @@ logsDir = "${outDir}/Logs" upload = params.upload inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce +endsForce = params.endsForce speciesForce = params.speciesForce strandedForce = params.strandedForce spikeForce = params.spikeForce @@ -1469,6 +1471,7 @@ process inferMetadata { path sampledBam path reference_inferMetadata path script_inferMeta + val endsForce val strandedForce val fastqCountError from fastqCountError_inferMetadata val fastqReadError from fastqReadError_inferMetadata @@ -1489,41 +1492,47 @@ process inferMetadata { hostname > ${repRID}.inferMetadata.log ulimit -a >> ${repRID}.inferMetadata.log - # infer experimental setting from dedup bam - echo -e "LOG: infer experimental setting from bam" >> ${repRID}.inferMetadata.log - infer_experiment.py -r ./genome.bed -i ${sampledBam} 1>> ${repRID}.infer_experiment.txt - echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log + # infer experimental setting from dedup bam + echo -e "LOG: infer experimental setting from bam" >> ${repRID}.inferMetadata.log + infer_experiment.py -r ./genome.bed -i ${sampledBam} 1>> ${repRID}.infer_experiment.txt + echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log - ended=`bash ${script_inferMeta} 
endness ${repRID}.infer_experiment.txt` - fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` - if [ \${ended} == "PairEnd" ] - then - ends="pe" - percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` - percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` - elif [ \${ended} == "SingleEnd" ] - then - ends="se" - percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` - percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` - fi - echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log - echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log - if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] - then - stranded="forward" - elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] - then - stranded="reverse" - else - stranded="unstranded" - fi - echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log - if [ "${strandedForce}" != "" ] - then - stranded=${strandedForce} - echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log - fi + ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` + fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` + if [ \${ended} == "PairEnd" ] + then + ends="pe" + percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` + elif [ \${ended} == "SingleEnd" ] + then + ends="se" + percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` + fi + echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log + echo -e "LOG: percentage reads in the 
opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log + if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] + then + stranded="forward" + elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] + then + stranded="reverse" + else + stranded="unstranded" + fi + echo -e "LOG: ends set to: \${ends}" >> ${repRID}.inferMetadata.log + if [ "${endsForce}" != "" ] + then + ends=${endsForce} + echo -e "LOG: ends metadata forced: \${ends}" >> ${repRID}.inferMetadata.log + fi + echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log + if [ "${strandedForce}" != "" ] + then + stranded=${strandedForce} + echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log + fi # write inferred metadata to file echo "\${ends},\${stranded},\${percentF},\${percentR},\${fail}" > infer.csv @@ -1632,9 +1641,16 @@ process checkMetadata { fi if [ "${endsMeta}" != "${endsInfer}" ] then - pipelineError=true - pipelineError_ends=true - echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + if [ "${params.endsForce}" != "" ] + then + pipelineError=false + pipelineError_ends=false + echo -e "LOG: ends forced: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_ends=true + echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + fi else pipelineError_ends=false echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log