From 03e05d4c9786930656b1dff206d456b8be34c8f7 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Wed, 26 Aug 2020 12:56:17 -0500
Subject: [PATCH] Add species override

---
 .gitlab-ci.yml                    | 14 ++++++++
 CHANGELOG.md                      |  7 ++--
 README.md                         |  2 ++
 workflow/conf/multiqc_config.yaml |  2 ++
 workflow/rna-seq.nf               | 56 +++++++++++++++++++++++++++----
 5 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8f4c222..7aff1aa 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -201,6 +201,20 @@ override_fastq:
     paths:
       - fastqOverride_PE_multiqc_data.json
     expire_in: 7 days
+    
+override_species:
+  stage: integration
+  script:
+  - hostname
+  - ulimit -a
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true
+  - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \;
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - speciesOverride_PE_multiqc_data.json
+    expire_in: 7 days
 
 consistency:
   stage: consistency
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 104cc84..3db82b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,12 +6,15 @@
 * MultiQC output custom talbes (html+JSON):
   * Run table: *Session ID* and *Pipeline Version*
   * Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (ouputs both human and mouse versions)
-* Add inputBag override param (`inputBagForce`)
+* Add inputBag override param (`inputBagForce`) [`*.zip`]
   * Uses provided inputBag instead of downloading from data-hub
   * Still requires matching repRID input param
 * Add fastq override param (`fastqsForce`) [`R1`,`R2`]
   * Uses provided fastq instead of downloading from data-hub
   * Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting
+* Add species override param (`speciesForce`) [`Mus musculus` or `Homo sapiens`]
+  * forces the use of the provided species
+  * ignores inferred ambiguous species
 
 **Background**
 * Add GeneSymbol/EnsemblID/EntrezID translation files to references
@@ -19,7 +22,7 @@
 *Known Bugs*
 * outputBag does not contain fetch for processed data
 * Does not include automatic data upload
-* Override params (inputBag and fastq) are't checked for integrity
+* Override params (inputBag, fastq, species) aren't checked for integrity
 
 <hr>
 
diff --git a/README.md b/README.md
index e96460d..49069c5 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,8 @@ To Run:
     * eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure)
   * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
     * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order)
+  * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens" and bypasses the ambiguous species error
+    * eg: `--speciesForce 'Mus musculus'`
 * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
   * `--ci` boolean (default = false)
   * `--dev` boolean (default = false)
diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
index e6e85c6..0c780d9 100644
--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
@@ -73,6 +73,7 @@ custom_data:
             Session
             Session ID
             Pipeline Version
+            Input
     rid:
         file_format: 'tsv'
         section_name: 'RID'
@@ -83,6 +84,7 @@ custom_data:
             scale: false
             format: '{}'
         headers:
+            Replicate
             Replicate RID
             Experiment RID
             Study RID
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 387e7f1..b6687df 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -22,6 +22,7 @@ params.outDir = "${baseDir}/../output"
 // Define override input variable
 params.inputBagForce = ""
 params.fastqsForce = ""
+params.speciesForce = ""
 
 // Parse input variables
 deriva = Channel
@@ -38,7 +39,7 @@ outDir = params.outDir
 logsDir = "${outDir}/Logs"
 inputBagForce = params.inputBagForce
 fastqsForce = params.fastqsForce
-
+speciesForce = params.speciesForce
 
 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
@@ -562,7 +563,24 @@ process inferMetadata {
       bed="./GRCm/bed/genome.bed"
     else
       echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
-      exit 1
+      if [ "${speciesForce}" == "" ]
+      then
+        exit 1
+      fi
+    fi
+    if [ "${speciesForce}" != "" ]  # a non-empty --speciesForce replaces the inferred species
+    then
+      echo -e "LOG: species overridden to: ${speciesForce}" >> ${repRID}.inferMetadata.log
+      species="${speciesForce}"
+      if [ "${speciesForce}" == "Homo sapiens" ]
+      then
+        bam="GRCh.sampled.sorted.bam"
+        bed="./GRCh/bed/genome.bed"
+      elif [ "${speciesForce}" == "Mus musculus" ]
+      then
+        bam="GRCm.sampled.sorted.bam"
+        bed="./GRCm/bed/genome.bed"
+      fi  # NOTE(review): any other value leaves bam/bed as the inference above left them - validate upstream
+    fi
     echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
 
@@ -1064,20 +1082,44 @@ process aggrQC {
     ulimit -a >> ${repRID}.aggrQC.log
 
     # make run table
+    if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ]
+    then
+      input="default"
+    else
+      input="override:"
+      if [ "${params.inputBagForce}" != "" ]
+      then
+        input=\$(echo \${input} inputBag)
+      fi
+      if [ "${params.fastqsForce}" != "" ]
+      then
+        input=\$(echo \${input} fastq)
+      fi
+      if [ "${params.speciesForce}" != "" ]
+      then
+        input=\$(echo \${input} species)
+      fi
+    fi
     echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log
-    echo -e "Session\tSession ID\tPipeline Version" > run.tsv
-    echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}" >> run.tsv
+    echo -e "Session\tSession ID\tPipeline Version\tInput" > run.tsv
+    echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}\t\${input}" >> run.tsv
+    
     
     # make RID table
     echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
-    echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv
-    echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
+    echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv
+    echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
 
     # make metadata table
     echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
     echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
     echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
-    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    if [ "${params.speciesForce}" == "" ]
+    then
+      echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    else
+      echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    fi
     echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv
 
     # make reference table
-- 
GitLab