From 59cb654aa2d1484fe1f218b9475accaf1421dbfa Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Tue, 26 Jan 2021 15:59:07 -0600 Subject: [PATCH] Add force params for stranded and spike --- .gitlab-ci.yml | 84 +++++++++++++++++++++++------- CHANGELOG.md | 1 + README.md | 6 ++- workflow/rna-seq.nf | 123 ++++++++++++++++++++++++++++++++------------ 4 files changed, 159 insertions(+), 55 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 75b4793..2b0b70a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -660,11 +660,11 @@ integration_se: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source dev --upload true -with-dag dag.png --dev false --ci true --email 'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu' + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./SE_report.html - find . 
-type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -684,11 +684,11 @@ integration_pe: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source dev --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./PE_report.html - find . -type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -710,11 +710,11 @@ failAmbiguousSpecies: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source dev --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./failAmbiguousSpecies_report.html retry: max: 0 when: @@ -725,11 +725,11 @@ failTrunkation: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source dev --upload true -with-dag dag.png --dev false --ci true + - 
nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./failTrunkation_report.html retry: max: 0 when: @@ -740,11 +740,11 @@ failMismatchR1R2: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source dev --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./failMismatchR1R2_report.html retry: max: 0 when: @@ -755,11 +755,11 @@ failUnexpectedMeta: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source dev --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./failUnexpectedMeta_report.html retry: max: 0 when: @@ -770,11 +770,11 @@ failFileStructure: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt 
--repRID Q-Y5HT --source dev --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source dev --upload true -with-dag dag.png --dev false --ci true -with-report ./failFileStructure_report.html retry: max: 0 when: @@ -785,11 +785,11 @@ override_inputBag: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source dev --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source dev --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true -with-report ./inputBagOverride_report.html - find . 
-type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -807,11 +807,11 @@ override_fastq: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source dev --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source dev --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true -with-report ./fastqOverride_report.html - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -829,11 +829,11 @@ override_species: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source dev --speciesForce 'Homo sapiens' --upload false --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source dev --speciesForce 'Homo sapiens' --upload false --dev false --ci true -with-report ./speciesOverride_report.html - find . 
-type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -846,6 +846,50 @@ override_species: when: - always +override_stranded: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source dev --strandedForce unstranded --upload false --dev false --ci true -with-report ./strandedOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./strandedOverride_PE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - strandedOverride_PE_multiqc_data.json + expire_in: 7 days + retry: + max: 0 + when: + - always + +override_spike: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source dev --spikeForce t --upload false --dev false --ci true -with-report ./spikeOverride_report.html + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./spikeOverride_PE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - spikeOverride_PE_multiqc_data.json + expire_in: 7 days + retry: + max: 0 + when: + - always + consistency: stage: consistency diff --git a/CHANGELOG.md b/CHANGELOG.md index d66bbc8..e45a78d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ * Endness metadata "Single Read" changed to "Single End" in data-hub, pipeline updated to handle (#110) ("Single Read" still acceptable for backwards compatibility) * Strandedness metadata "yes"/"no" changed to boolean "t"/"f" in data-hub, pipeline updated to handle (#70) ("yes"/"no" still acceptable for backwards compatibility) * Upload empty mRNA_QC entry if data error (#111) +* Allow forcing of strandedness and spike (#100) **Background** * Add memory limit (75%) per thread for samtools sort (#108) diff --git a/README.md b/README.md index 90878a2..38d978f 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,12 @@ To Run: * eg: `--inputBagForce test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip` (must be the expected bag structure, this example will not work because it is a test bag) * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input) * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order) - * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error + * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses a metadata mismatch or an ambiguous species error + * eg: `--speciesForce 'Mus musculus'` + * `--strandedForce` forces the strandedness to be "forward", "reverse" or "unstranded", it bypasses a metadata mismatch error + * eg: `--strandedForce 'unstranded'` + * 
`--spikeForce` forces the spike-in to be "f" or "t", it bypasses a metadata mismatch error + * eg: `--spikeForce 't'` * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)): * `--ci` boolean (default = false) * `--dev` boolean (default = true) diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 73f293d..42030a1 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -27,6 +27,8 @@ params.refSource = "biohpc" params.inputBagForce = "" params.fastqsForce = "" params.speciesForce = "" +params.strandedForce = "" +params.spikeForce = "" // Define tracking input variables params.ci = false @@ -64,6 +66,8 @@ upload = params.upload inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce speciesForce = params.speciesForce +strandedForce = params.strandedForce +spikeForce = params.spikeForce email = params.email // Define fixed files and variables @@ -311,6 +315,8 @@ process parseMetadata { path experiment from experimentMeta path (fastq) from fastqs_parseMetadata.collect() val fastqCount + val strandedForce + val spikeForce output: path "design.csv" into metadata_fl @@ -364,10 +370,20 @@ process parseMetadata { # get strandedness metadata stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded) echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log + if [ "${strandedForce}" != "" ] + then + stranded=${strandedForce} + echo -e "LOG: strandedness metadata forced: \${stranded}" >> ${repRID}.parseMetadata.log + fi # get spike-in metadata spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike) echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log + if [ "${spikeForce}" != "" ] + then + spike=${spikeForce} + echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.parseMetadata.log + fi if [ "\${spike}" == "f" ] then spike="false" @@ -1170,51 +1186,68 @@ process 
checkMetadata { pipelineError=false # check if submitted metadata matches inferred - if [ "${endsMeta}" != "${endsInfer}" ] - then - pipelineError=true - pipelineError_ends=true - echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError_ends=false - echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log - fi if [ "${strandedMeta}" != "${strandedInfer}" ] then - pipelineError=true - pipelineError_stranded=true - if [ "${strandedMeta}" == "stranded" ] + if [ "${params.strandedForce}" != "" ] then - if [[ "${strandedInfer}" == "forward" ]] || [[ "${strandedInfer}" == "reverse" ]] + pipelineError_stranded=false + echo -e "LOG: stranded forced: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_stranded=true + if [ "${strandedMeta}" == "stranded" ] then - pipelineError=false - pipelineError_stranded=false - echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + if [[ "${strandedInfer}" == "forward" ]] || [[ "${strandedInfer}" == "reverse" ]] + then + pipelineError=false + pipelineError_stranded=false + echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + else + echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + fi else echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log fi - else - echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log fi else - pipelineError=false pipelineError_stranded=false echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log fi - if [ "${spikeMeta}" != 
"${spikeInfer}" ] + if [ "${endsMeta}" != "${endsInfer}" ] then pipelineError=true - pipelineError_spike=true - echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + pipelineError_ends=true + echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError_ends=false + echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + fi + if [ "${spikeMeta}" != "${spikeInfer}" ] + then + if [[ "${params.spikeForce}" != "" ]] + then + pipelineError_spike=false + echo -e "LOG: spike forced: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_spike=true + echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + fi else pipelineError_spike=false - echo -e "LOG: stranded matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + echo -e "LOG: spike matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log fi if [ "${speciesMeta}" != "${speciesInfer}" ] then - pipelineError=true - pipelineError_species=true - echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + if [[ "${params.speciesForce}" != "" ]] + then + pipelineError_species=false + echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_species=true + echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + fi else pipelineError_species=false echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log @@ -1935,7 +1968,7 @@ process aggrQC { ulimit -a >> 
${repRID}.aggrQC.log # make run table - if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] + if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] && [ "${params.strandedForce}" == "" ] && [ "${params.spikeForce}" == "" ] then input="default" else @@ -1952,6 +1985,14 @@ process aggrQC { then input=\$(echo \${input} species) fi + if [ "${params.strandedForce}" != "" ] + then + input=\$(echo \${input} stranded) + fi + if [ "${params.spikeForce}" != "" ] + then + input=\$(echo \${input} spike) + fi fi echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log echo -e "Session\tSession ID\tStart Time\tPipeline Version\tInput" > run.tsv @@ -1969,10 +2010,24 @@ process aggrQC { echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv if [ "${params.speciesForce}" == "" ] then - echo -e "Inferred\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + inferredRow="Inferred\t${speciesI}\t" + else + inferredRow="Inferred\t${speciesI} (FORCED)\t" + fi + inferredRow="\${inferredRow}${endsI}\t" + if [ "${params.strandedForce}" == "" ] + then + inferredRow="\${inferredRow}${strandedI}\t" else - echo -e "Inferred\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + inferredRow="\${inferredRow}${strandedI} (FORCED)\t" fi + if [ "${params.spikeForce}" == "" ] + then + inferredRow="\${inferredRow}${spikeI}\t-\t-\t-\t-" + else + inferredRow="\${inferredRow}${spikeI} (FORCED)\t-\t-\t-\t-" + fi + echo -e "\${inferredRow}" >> metadata.tsv echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv # make reference table @@ -2231,11 +2286,11 @@ process uploadOutputBag { echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log rid=\${outputBag_rid} else - exist=\$(echo \${exist} | 
grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:8:-6} - outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -o ${source} -c \${cookie} -u \${exist}) - echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log - rid=\${exist} + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:8:-6} + outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -o ${source} -c \${cookie} -u \${exist}) + echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log + rid=\${exist} fi echo "\${rid}" > outputBagRID.csv -- GitLab