From 5f48c1c116f2fb4ccdfca5df712e3c2f190801d5 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Tue, 25 Aug 2020 17:46:17 -0500
Subject: [PATCH] Add fastq override #71

---
 .gitlab-ci.yml      | 16 +++++++++++++++-
 CHANGELOG.md        |  7 +++++++
 README.md           |  5 +++++
 workflow/rna-seq.nf | 42 +++++++++++++++++++++++++++---------------
 4 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4589f16..c2d779e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -206,4 +206,18 @@ inputBag:
     when: always
     paths:
       - inputBagOverride_PE_multiqc_data.json
-    expire_in: 7 days
\ No newline at end of file
+    expire_in: 7 days
+
+fastq:  # CI job exercising the local-fastq override; top-level key so GitLab registers it as its own job
+  stage: override
+  script:  # the override flag below must match params.fastqsForce declared in workflow/rna-seq.nf
+    - hostname
+    - ulimit -a
+    - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true
+    - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \;
+  artifacts:  # sibling of script (not nested in its list) so the YAML parses
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - fastqOverride_PE_multiqc_data.json
+    expire_in: 7 days
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 07869f4..104cc84 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,12 @@
 * MultiQC output custom talbes (html+JSON):
   * Run table: *Session ID* and *Pipeline Version*
   * Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (ouputs both human and mouse versions)
+* Add inputBag override param (`inputBagForce`)
+  * Uses provided inputBag instead of downloading from data-hub
+  * Still requires matching repRID input param
+* Add fastq override param (`fastqsForce`) [`R1`,`R2`]
+  * Uses provided fastq instead of downloading from data-hub
+  * Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting
 
 **Background**
 * Add GeneSymbol/EnsemblID/EntrezID translation files to references
@@ -13,6 +19,7 @@
 *Known Bugs*
 * outputBag does not contain fetch for processed data
 * Does not include automatic data upload
+* Override params (inputBag and fastq) aren't checked for integrity
 
 <hr>
 
diff --git a/README.md b/README.md
index 2216c27..b2aa90a 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,11 @@ To Run:
   * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
     * *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
     * *current human* **38.p6.v31** = GRCh38.p12 with GENCODE annotation release 31
+* ***Optional*** input overrides
+  * `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input)
+    * eg: `--inputBagForce test_data/bagit/Replicate_Q-Y5F6.zip` (must be the expected bagit structure)
+  * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
+    * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must be named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order)
 * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
   * `--ci` boolean (default = false)
   * `--dev` boolean (default = false)
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index f1a7c65..2cf5364 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -21,6 +21,7 @@ params.outDir = "${baseDir}/../output"
 
 // Define override input variable
 params.inputBagForce = ""
+params.fastqsForce = ""
 
 // Parse input variables
 deriva = Channel
@@ -36,6 +37,7 @@ refERCCVersion = params.refERCCVersion
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
 inputBagForce = params.inputBagForce
+fastqsForce = params.fastqsForce
 
 
 // Define fixed files
@@ -122,10 +124,10 @@ process getBag {
     path derivaConfig
 
   output:
-    path ("Replicate_*.zip") into bagit
+    path ("Replicate_*.zip") into bag
 
   when:
-    params.inputBagForce == ""
+    inputBagForce == ""
 
   script:
     """
@@ -139,7 +141,7 @@ process getBag {
     echo -e "LOG: linked" >> ${repRID}.getBag.log
 
     # deriva-download replicate RID
-    echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
+    echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
     deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID}
     echo -e "LOG: fetched" >> ${repRID}.getBag.log
     """
@@ -149,9 +151,9 @@ process getBag {
 if (inputBagForce != "") {
   inputBag = Channel
     .fromPath(inputBagForce)
-    .ifEmpty { exit 1, "override inputBagit file not found: ${inputBagForce}" }
+    .ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" }
 } else {
-  inputBag = bagit
+  inputBag = bag
 }
 
 /*
@@ -182,26 +184,36 @@ process getData {
     ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
     echo -e "LOG: linked" >> ${repRID}.getData.log
     
-    # get bagit basename
-    replicate=\$(basename "\${inputBag}" | cut -d "." -f1)
-    echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log
+    # get bag basename
+    replicate=\$(basename "${inputBag}" | cut -d "." -f1)
+    echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
     
-    # unzip bagit
-    echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log
+    # unzip bag
+    echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
     unzip ${inputBag}
     echo -e "LOG: unzipped" >> ${repRID}.getData.log
     
-    # bagit fetch fastq's only and rename by repRID
+    # bag fetch fastq's only and rename by repRID
     echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
     sh ${script_bdbagFetch} \${replicate} ${repRID}
     echo -e "LOG: fetched" >> ${repRID}.getData.log
     """
 }
 
-// Replicate raw fastq's for multiple process inputs
-fastqs.into {
-  fastqs_trimData
-  fastqs_fastqc
+// Set raw fastq to downloaded or forced (local override) input and replicate them for multiple process inputs
+if (fastqsForce != "") {
+  Channel
+    .fromPath(fastqsForce)
+    .ifEmpty { exit 1, "override fastq file(s) not found: ${fastqsForce}" }
+    .collect().into {  // collect() gathers every matched file into a single list item before duplicating
+      fastqs_trimData
+      fastqs_fastqc
+    }
+} else {
+  fastqs.into {  // no override given: duplicate the existing fastqs channel unchanged
+    fastqs_trimData
+    fastqs_fastqc
+  }
+}
 
 /*
-- 
GitLab