Merge branch '75-no.upload' into '11-deriva.upload'

Resolve "Add an option to not upload" See merge request !54

Merge branch '75-no.upload' into '11-deriva.upload'
Resolve "Add an option to not upload" See merge request !54
10cc87d9 · Gervaise Henry · e253651f · 8792bf4b · 10cc87d9 · 10cc87d9
Commit 10cc87d9 authored 4 years ago by Gervaise Henry
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -494,7 +494,7 @@ integration_se:
      - SE_multiqc_data.json
    expire_in: 7 days
  retry:
-    max: 1
+    max: 0
    when:
      - always

@@ -519,7 +519,7 @@ integration_pe:
      - PE_multiqc_data.json
    expire_in: 7 days
  retry:
-    max: 1
+    max: 0
    when:
      - always

@@ -533,7 +533,7 @@ override_inputBag:
  script:
  - hostname
  - ulimit -a
-  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/staging/Replicate_Q-Y5F6.zip --ci true
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/staging/Replicate_Q-Y5F6.zip --upload false --ci true
  - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \;
  artifacts:
    name: "$CI_JOB_NAME"
@@ -542,7 +542,7 @@ override_inputBag:
      - inputBagOverride_PE_multiqc_data.json
    expire_in: 7 days
  retry:
-    max: 1
+    max: 0
    when:
      - always

@@ -555,7 +555,7 @@ override_fastq:
  script:
  - hostname
  - ulimit -a
-  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --ci true
  - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \;
  artifacts:
    name: "$CI_JOB_NAME"
@@ -564,7 +564,7 @@ override_fastq:
      - fastqOverride_PE_multiqc_data.json
    expire_in: 7 days
  retry:
-    max: 1
+    max: 0
    when:
      - always

@@ -577,7 +577,7 @@ override_species:
  script:
  - hostname
  - ulimit -a
-  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --upload false --ci true
  - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \;
  artifacts:
    name: "$CI_JOB_NAME"
@@ -586,7 +586,7 @@ override_species:
      - speciesOverride_PE_multiqc_data.json
    expire_in: 7 days
  retry:
-    max: 1
+    max: 0
    when:
      - always


--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 * Upload execution run
 * Upload mRNA QC
 * Create and upload output bag
+* Add option to not upload

 **Background**
 * Remove (comment out) option to pull references from S3

--- a/README.md
+++ b/README.md
@@ -34,9 +34,12 @@ To Run:
    * **dev** = [dev.gudmap.org](https://dev.gudmap.org) (default, does not contain all data)
    * **staging** = [staging.gudmap.org](https://staging.gudmap.org) (does not contain all data)
    * **production** = [www.gudmap.org](https://www.gudmap.org) (***does contain all data***)
-  * `--refMoVersion` mouse reference version ***(optional)***
-  * `--refHuVersion` human reference version ***(optional)***
-  * `--refERCCVersion` human reference version ***(optional)***
+  * `--refMoVersion` mouse reference version ***(optional, default = 38.p6.vM22)***
+  * `--refHuVersion` human reference version ***(optional, default = 38.p12.v31)***
+  * `--refERCCVersion` ERCC reference version ***(optional, default = 92)***
+  * `--upload` whether to upload outputs back to the data-hub ***(optional, default = true)***
+    * **true** = upload outputs to the data-hub
+    * **false** = do *NOT* upload outputs to the data-hub
  * `-profile` config profile to use ***(optional)***:
    * default = processes on BioHPC cluster
    * **biohpc** = process on BioHPC cluster

--- a/workflow/conf/replicate_export_config.json
+++ b/workflow/conf/replicate_export_config.json
 {
  "bag": {
-    "bag_name": "Replicate_{rid}",
+    "bag_name": "{rid}_inputBag",
    "bag_algorithms": [
      "md5"
    ],

--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -18,6 +18,7 @@ params.refMoVersion = "38.p6.vM22"
 params.refHuVersion = "38.p12.v31"
 params.refERCCVersion = "92"
 params.outDir = "${baseDir}/../output"
+params.upload = true
 params.email = ""


@@ -51,6 +52,7 @@ refHuVersion = params.refHuVersion
 refERCCVersion = params.refERCCVersion
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
+upload = params.upload
 inputBagForce = params.inputBagForce
 fastqsForce = params.fastqsForce
 speciesForce = params.speciesForce
@@ -155,7 +157,7 @@ process getBag {
    path replicateExportConfig

  output:
-    path ("Replicate_*.zip") into bag
+    path ("*.zip") into bag

  when:
    inputBagForce == ""
@@ -220,7 +222,7 @@ process getData {
    echo -e "LOG: linked" >> ${repRID}.getData.log

    # get bag basename
-    replicate=\$(basename "${inputBag}" | cut -d "." -f1)
+    replicate=\$(basename "${inputBag}" | cut -d "_" -f1)
    echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log

    # unzip bag
@@ -1284,6 +1286,9 @@ process uploadInputBag {
  output:
    path ("inputBagRID.csv") into inputBagRID_fl

+  when:
+    upload
+
  script:
  """
  hostname > ${repRID}.uploadInputBag.log
@@ -1350,6 +1355,9 @@ process uploadExecutionRun {
  output:
    path ("executionRunRID.csv") into executionRunRID_fl

+  when:
+    upload
+
  script:
  """
  hostname > ${repRID}.uploadExecutionRun.log
@@ -1431,6 +1439,9 @@ process uploadQC {
  output:
    path ("qcRID.csv") into qcRID_fl

+  when:
+    upload
+
  script:
  """
  hostname > ${repRID}.uploadQC.log
@@ -1469,7 +1480,6 @@ qcRID_fl.splitCsv(sep: ",", header: false).separate(
  qcRID
 )

-
 /*
 *outputBag: create outputBag
 */
@@ -1493,40 +1503,27 @@ process outputBag {
  output:
    path ("${repRID}_Output_Bag.zip") into outputBag

+  when:
+    upload
+
  script:
  """
  hostname > ${repRID}.outputBag.log
  ulimit -a >> ${repRID}.outputBag.log

-  mkdir -p ./deriva/Seq/Workflow_Runs/${studyRID}/${executionRunRID}/
-  cp ${bam} ./deriva/Seq/Workflow_Runs/${studyRID}/${executionRunRID}/
-  cp ${bigwig} ./deriva/Seq/Workflow_Runs/${studyRID}/${executionRunRID}/
-  cp ${counts} ./deriva/Seq/Workflow_Runs/${studyRID}/${executionRunRID}/
+  mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
+  cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
+  cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
+  cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
+  cp ${counts} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/

  cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
  cookie=\${cookie:20:-1}
-  deriva-upload-cli --catalog 2 --token \${cookie} ${source} ./deriva --purge-state
-
-  fileBam=\$(basename -a ${bam})
-  md5Bam=\$(md5sum ./\${fileBam} | awk '{ print \$1 }')
-  fileBigwig=\$(basename -a ${bigwig})
-  md5Bigwig=\$(md5sum ./\${fileBigwig} | awk '{ print \$1 }')
-  fileCounts=\$(basename -a ${counts})
-  md5Counts=\$(md5sum ./\${fileCounts} | awk '{ print \$1 }')
-  urlBam=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/File_MD5=\${md5Bam})
-  urlBam=\$(echo \${urlBam} | grep -o '\\"File_URL\\":\\".*\\",\\"File_Name')
-  urlBam=\${urlBam:12:-12}
-  urlBigwig=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/File_MD5=\${md5Bigwig})
-  urlBigwig=\$(echo \${urlBigwig} | grep -o '\\"File_URL\\":\\".*\\",\\"File_Name')
-  urlBigwig=\${urlBigwig:12:-12}
-  urlCounts=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/File_MD5=\${md5Counts})
-  urlCounts=\$(echo \${urlCounts} | grep -o '\\"File_URL\\":\\".*\\",\\"File_Name')
-  urlCounts=\${urlCounts:12:-12}
-  echo \${urlBam} > url.txt
-  echo \${urlBigwig} >> url.txt
-  echo \${urlCounts} >> url.txt
+  deriva-upload-cli --catalog 2 --token \${cookie} ${source} ./deriva
+  echo LOG: processed files uploaded >> ${repRID}.outputBag.log

  deriva-download-cli --catalog 2 --token \${cookie} ${source} ${executionRunExportConfig} . rid=${executionRunRID}
+  echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log

  echo -e "### Run Details" >> runDetails.md
  echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md
@@ -1544,6 +1541,7 @@ process outputBag {
  echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md
  echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md
  echo -e "**Run ID:** ${repRID}" >> runDetails.md
+  echo LOG: runDetails.md created >> ${repRID}.outputBag.log

  unzip Execution_Run_${executionRunRID}.zip 
  mv Execution_Run_${executionRunRID} ${repRID}_Output_Bag
@@ -1554,6 +1552,7 @@ process outputBag {
  cp ${multiqcJSON} \${loc}

  bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug
+  echo LOG: output bag created >> ${repRID}.outputBag.log
  """
 }

@@ -1572,6 +1571,9 @@ process uploadOutputBag {
  output:
    path ("outputBagRID.csv") into outputBagRID_fl

+  when:
+    upload
+
  script:
  """
  hostname > ${repRID}.uploadOutputBag.log

--- a/workflow/scripts/bdbagFetch.sh
+++ b/workflow/scripts/bdbagFetch.sh
@@ -2,7 +2,7 @@

 if [ -z "${3}" ]
 then
-    bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1}
+    bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1}_inputBag
    for i in $(find */ -name "*R*.fastq.gz")
    do
        path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")