From 03e05d4c9786930656b1dff206d456b8be34c8f7 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Wed, 26 Aug 2020 12:56:17 -0500 Subject: [PATCH] Add species override --- .gitlab-ci.yml | 14 ++++++++ CHANGELOG.md | 7 ++-- README.md | 2 ++ workflow/conf/multiqc_config.yaml | 2 ++ workflow/rna-seq.nf | 56 +++++++++++++++++++++++++++---- 5 files changed, 72 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8f4c222..7aff1aa 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -201,6 +201,20 @@ override_fastq: paths: - fastqOverride_PE_multiqc_data.json expire_in: 7 days + +override_species: + stage: integration + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true + - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - speciesOverride_PE_multiqc_data.json + expire_in: 7 days consistency: stage: consistency diff --git a/CHANGELOG.md b/CHANGELOG.md index 104cc84..3db82b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,12 +6,15 @@ * MultiQC output custom talbes (html+JSON): * Run table: *Session ID* and *Pipeline Version* * Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (ouputs both human and mouse versions) -* Add inputBag override param (`inputBagForce`) +* Add inputBag override param (`inputBagForce`) [`*.zip`] * Uses provided inputBag instead of downloading from data-hub * Still requires matching repRID input param * Add fastq override param (`fastqsForce`) [`R1`,`R2`] * Uses provided fastq instead of downloading from data-hub * Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting 
+* Add species override param (`speciesForce`) [`Mus musculus` or `Homo sapiens`] + * forces the use of the provided species + * ignores inferred ambiguous species **Background** * Add GeneSymbol/EnsemblID/EntrezID translation files to references @@ -19,7 +22,7 @@ *Known Bugs* * outputBag does not contain fetch for processed data * Does not include automatic data upload -* Override params (inputBag and fastq) are't checked for integrity +* Override params (inputBag, fastq, species) aren't checked for integrity <hr> diff --git a/README.md b/README.md index e96460d..49069c5 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,8 @@ To Run: * eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure) * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input) * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order) + * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens"; it bypasses the ambiguous species error + * eg: `--speciesForce 'Mus musculus'` * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)): * `--ci` boolean (default = false) * `--dev` boolean (default = false) diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml index e6e85c6..0c780d9 100644 --- a/workflow/conf/multiqc_config.yaml +++ b/workflow/conf/multiqc_config.yaml @@ -73,6 +73,7 @@ custom_data: Session Session ID Pipeline Version + Input rid: file_format: 'tsv' section_name: 'RID' @@ -83,6 +84,7 @@ custom_data: scale: false format: '{}' headers: + Replicate Replicate RID Experiment RID Study RID diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 387e7f1..b6687df 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -22,6 +22,7 @@ 
params.outDir = "${baseDir}/../output" // Define override input variable params.inputBagForce = "" params.fastqsForce = "" +params.speciesForce = "" // Parse input variables deriva = Channel @@ -38,7 +39,7 @@ outDir = params.outDir logsDir = "${outDir}/Logs" inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce - +speciesForce = params.speciesForce // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") @@ -562,7 +563,24 @@ process inferMetadata { bed="./GRCm/bed/genome.bed" else echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log - exit 1 + if [ "${speciesForce}" == "" ] + then + exit 1 + fi + fi + if [ "${speciesForce}" != "" ] + then + echo -e "LOG: species overridden to: ${speciesForce}" + species="${speciesForce}" + if [ "${speciesForce}" == "Homo sapiens" ] + then + bam="GRCh.sampled.sorted.bam" + bed="./GRCh/bed/genome.bed" + elif [ "${speciesForce}" == "Mus musculus" ] + then + bam="GRCm.sampled.sorted.bam" + bed="./GRCm/bed/genome.bed" + fi fi echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log @@ -1064,20 +1082,44 @@ process aggrQC { ulimit -a >> ${repRID}.aggrQC.log # make run table + if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] + then + input="default" + else + input="override:" + if [ "${params.inputBagForce}" != "" ] + then + input=\$(echo \${input} inputBag) + fi + if [ "${params.fastqsForce}" != "" ] + then + input=\$(echo \${input} fastq) + fi + if [ "${params.speciesForce}" != "" ] + then + input=\$(echo \${input} species) + fi + fi echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log - echo -e "Session\tSession ID\tPipeline Version" > run.tsv - echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}" >> run.tsv + echo -e "Session\tSession ID\tPipeline Version\tInput" > run.tsv + 
echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}\t\${input}" >> run.tsv + # make RID table echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log - echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv - echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv + echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv + echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv # make metadata table echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv - echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + if [ "${params.speciesForce}" == "" ] + then + echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + else + echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + fi echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv # make reference table -- GitLab