From ab0e4bb698f0bf36bb0012ca4f235c9add718be9 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Mon, 7 Dec 2020 10:16:43 -0600
Subject: [PATCH] Add upload output bag

---
 .gitlab-ci.yml                      | 66 +++++++++++------------
 workflow/conf/aws.config            |  8 +++
 workflow/conf/biohpc.config         |  3 ++
 workflow/nextflow.config            |  3 ++
 workflow/rna-seq.nf                 | 81 +++++++++++++++++++++++++++--
 workflow/scripts/uploadOutputBag.py | 48 +++++++++++++++++
 6 files changed, 173 insertions(+), 36 deletions(-)
 create mode 100644 workflow/scripts/uploadOutputBag.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e2df40a..f62f838 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -288,7 +288,7 @@ uploadInputBag:
   script:
   - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
   - >
-    if [ ! `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q '/TEST$'` ]; then
+    if [ `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q '/TEST$'` ]; then
       singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/input_bag/TEST
       echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST created
     else
@@ -311,7 +311,7 @@ uploadInputBag:
       echo ${rid} test input bag already exists
     fi
 
-    uploadInputBag:
+uploadExecutionRun:
   stage: unit
   only:
     - push
@@ -321,31 +321,20 @@ uploadInputBag:
   script:
   - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
   - >
-    hatrac=`singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o '/TEST$'` &&
-    if [ -z "${hatrac}" ]; then
-      singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/input_bag/TEST
-      echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST created
-    else
-      echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST already exists
-    fi
-  - echo THIS IS A TEST FILE > test.txt
-  - >
-    md5=$(md5sum ./test.txt | awk '{ print $1 }') &&
-    size=$(wc -c < ./test.txt) &&
-    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) &&
+    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=18-DT5T/Reference_Genome=17-BPD4/Input_Bag=18-MHWC/Replicate=18-MJ3A) &&
+    cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
+    cookie=${cookie:11:-1} &&
     if [ "${exist}" == "[]" ]; then
-      cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
-      cookie=${cookie:11:-1} &&
-      loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt) &&
-      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadInputBag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -o dev.gudmap.org -c ${cookie}) &&
-      echo ${rid} test input bag created
+      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u F) &&
+      echo ${rid} test execution run created
     else
       rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
       rid=${rid:7:-6} &&
-      echo ${rid} test input bag already exists
+      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u ${rid}) &&
+      echo ${rid} test execution run already exists
     fi
 
-uploadExecutionRun:
+uploadQC:
   stage: unit
   only:
     - push
@@ -355,20 +344,20 @@ uploadExecutionRun:
   script:
   - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
   - >
-    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=18-DT5T/Reference_Genome=17-BPD4/Input_Bag=18-MHWC/Replicate=18-MJ3A) &&
+    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=18-MJ3A/Execution_Run=18-MJ3C) &&
     cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
     cookie=${cookie:11:-1} &&
     if [ "${exist}" == "[]" ]; then
-      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u F) &&
+      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u F) &&
       echo ${rid} test execution run created
     else
       rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
       rid=${rid:7:-6} &&
-      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u ${rid}) &&
+      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u ${rid}) &&
       echo ${rid} test execution run already exists
     fi
 
-uploadQC:
+uploadOutputBag:
   stage: unit
   only:
     - push
@@ -378,19 +367,30 @@ uploadQC:
   script:
   - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
   - >
-    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=18-MJ3A/Execution_Run=18-MJ3C) &&
-    cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
-    cookie=${cookie:11:-1} &&
+    if [ `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/output_bag/ | grep -q '/TEST$'` ]; then
+      singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/output_bag/TEST
+      echo /hatrac/resources/rnaseq/pipeline/output_bag/TEST created
+    else
+      echo /hatrac/resources/rnaseq/pipeline/output_bag/TEST already exists
+    fi
+  - echo THIS IS A TEST FILE > test.txt
+  - >
+    md5=$(md5sum ./test.txt | awk '{ print $1 }') &&
+    size=$(wc -c < ./test.txt) &&
+    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) &&
     if [ "${exist}" == "[]" ]; then
-      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u F) &&
-      echo ${rid} test execution run created
+      cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
+      cookie=${cookie:11:-1} &&
+      loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt) &&
+      rid=singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadOutputBag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -o dev.gudmap.org -c ${cookie} &&
+      echo ${rid} test output bag created
     else
       rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
-      rid=${rid:7:-6} &&
-      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u ${rid}) &&
-      echo ${rid} test execution run already exists
+      rid=${rid:8:-6} &&
+      echo ${rid} test output bag already exists
     fi
 
+
 generateVersions:
   stage: aggregation
   only:
diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config
index 27e5609..1c6359d 100644
--- a/workflow/conf/aws.config
+++ b/workflow/conf/aws.config
@@ -96,4 +96,12 @@ process {
     cpus = 1
     memory = '1 GB'
   }
+  withName: uploadQC {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: uploadOutputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
 }
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index d092914..181b356 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -70,6 +70,9 @@ process {
   withName: uploadQC {
     executor = 'local'
   }
+  withName: uploadOutputBag {
+    executor = 'local'
+  }
 }
 
 singularity {
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index 109eb24..93d2b8c 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -79,6 +79,9 @@ process {
   withName:uploadQC {
     container = 'gudmaprbk/deriva1.3:1.0.0'
   }
+  withName:uploadOutputBag {
+    container = 'gudmaprbk/deriva1.3:1.0.0'
+  }
 }
 
 trace {
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 662cfce..4db5b0e 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -39,6 +39,7 @@ deriva.into {
   deriva_uploadInputBag
   deriva_uploadExecutionRun
   deriva_uploadQC
+  deriva_uploadOutputBag
 }
 bdbag = Channel
   .fromPath(params.bdbag)
@@ -88,6 +89,7 @@ script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
 script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py")
 script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/uploadExecutionRun.py")
 script_uploadQC = Channel.fromPath("${baseDir}/scripts/uploadQC.py")
+script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/uploadOutputBag.py")
 
 /*
  * trackStart: track start of pipeline
@@ -1321,7 +1323,7 @@ process uploadInputBag {
   mn=\$(date +'%m')
   dy=\$(date +'%d')
 
-  if [ ! `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q \${yr}_\${mn}_\${dy}` ]
+  if [ `deriva-hatrac-cli --host ${source} ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q \${yr}_\${mn}_\${dy}` ]
   then
     deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy}
     echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log
@@ -1347,7 +1349,7 @@ process uploadInputBag {
       rid=\${inputBag_rid}
   else
       exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-      exist=\${exist:8:-6}
+      exist=\${exist:7:-6}
       echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log
       rid=\${exist}
   fi
@@ -1410,6 +1412,7 @@ process uploadExecutionRun {
   cookie=\${cookie:11:-1}
 
   exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Reference_Genome=\${genome}/Input_Bag=${inputBagRID}/Replicate=${repRID})
+  echo \${exist} >> ${repRID}.uploadExecutionRun.log
   if [ "\${exist}" == "[]" ]
   then
     executionRun_rid=\$(python3 uploadExecutionRun.py -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d 'Run in process' -o ${source} -c \${cookie} -u F)
@@ -1417,6 +1420,7 @@ process uploadExecutionRun {
   else
     rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
     rid=\${rid:7:-6}
+    echo \${rid} >> ${repRID}.uploadExecutionRun.log
     executionRun_rid=\$(python3 uploadExecutionRun.py -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d 'Run in process' -o ${source} -c \${cookie} -u \${rid})
     echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log
   fi
@@ -1431,6 +1435,12 @@ executionRunRID_fl.splitCsv(sep: ",", header: false).separate(
   executionRunRID
 )
 
+//
+executionRunRID.into {
+  executionRunRID_uploadQC
+  executionRunRID_uploadOutputBag
+}
+
 /* 
  * uploadQC: uploads the mRNA QC
 */
@@ -1440,7 +1450,7 @@ process uploadQC {
   input:
     path script_uploadQC
     path credential, stageAs: "credential.json" from deriva_uploadQC
-    val executionRunRID
+    val executionRunRID from executionRunRID_uploadQC
     val ends from endsInfer_uploadQC
     val stranded from strandedInfer_uploadQC
     val length from readLengthInfer_uploadQC
@@ -1488,6 +1498,71 @@ qcRID_fl.splitCsv(sep: ",", header: false).separate(
   qcRID
 )
 
+/* 
+ * uploadOutputBag: uploads the output bag
+*/
+process uploadOutputBag {
+  tag "${repRID}"
+
+  input:
+    path script_uploadOutputBag
+    path outputBag
+    val executionRunRID from executionRunRID_uploadOutputBag
+    path credential, stageAs: "credential.json" from deriva_uploadOutputBag
+
+  output:
+    path ("outputBagRID.csv") into outputBagRID_fl
+
+  script:
+  """
+  hostname > ${repRID}.uploadOutputBag.log
+  ulimit -a >> ${repRID}.uploadOutputBag.log
+
+  yr=\$(date +'%Y')
+  mn=\$(date +'%m')
+  dy=\$(date +'%d')
+
+  if [ `deriva-hatrac-cli --host ${source} ls /hatrac/resources/rnaseq/pipeline/output_bag/ | grep -q \${yr}_\${mn}_\${dy}` ]
+  then
+    deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy}
+    echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadOutputBag.log
+  else
+    echo LOG: hatrac folder already exists - /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadOutputBag.log
+  fi
+
+  file=\$(basename -a ${outputBag})
+  md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
+  echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log
+  size=\$(wc -c < ./\${file})
+  echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log
+  
+  exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5})
+  if [ "\${exist}" == "[]" ]
+  then
+      cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+      cookie=\${cookie:11:-1}
+
+      loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy}/\${file})
+      outputBag_rid=\$(python3 uploadOutputBag.py -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
+      echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log
+      rid=\${outputBag_rid}
+  else
+      exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+      exist=\${exist:8:-6}
+      echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log
+      rid=\${exist}
+  fi
+
+  echo \${rid} > outputBagRID.csv
+  """
+}
+
+// Extract output bag RID into channel
+outputBagRID = Channel.create()
+outputBagRID_fl.splitCsv(sep: ",", header: false).separate(
+  outputBagRID
+)
+
 
 workflow.onError = {
   subject = "$workflow.manifest.name FAILED: $params.repRID"
diff --git a/workflow/scripts/uploadOutputBag.py b/workflow/scripts/uploadOutputBag.py
new file mode 100644
index 0000000..6eb7699
--- /dev/null
+++ b/workflow/scripts/uploadOutputBag.py
@@ -0,0 +1,48 @@
+import argparse
+from deriva.core import ErmrestCatalog, get_credential, BaseCLI
+import sys
+import csv
+from datetime import datetime
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True)
+    parser.add_argument('-f', '--file', help="file name", required=True)
+    parser.add_argument('-l', '--loc', help="datahub location", required=True)
+    parser.add_argument('-s', '--md5', help="md5 sum", required=True)
+    parser.add_argument('-b', '--bytes', help="size in bytes", required=True)
+    parser.add_argument('-n', '--notes', help="notes", default="", required=False)
+    parser.add_argument('-o', '--host', help="datahub host", required=True)
+    parser.add_argument('-c', '--cookie', help="cookie token", required=True)
+    args = parser.parse_args()
+    return args
+
+def main(hostname, catalog_number, credential):
+    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
+    pb = catalog.getPathBuilder()
+    outputBag_table = pb.RNASeq.Output_Bag
+
+    outputBag_data = {
+        "Execution_Run": args.executionRunRID,
+        "File_Name": args.file,
+        "File_URL": args.loc,
+        "File_MD5": args.md5,
+        "File_Bytes": args.bytes,
+        "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
+        "Notes": args.notes,
+        "Bag_Type": "Replicate_mRNA_Analysis"
+        }
+
+    entities = outputBag_table.insert([outputBag_data])
+    rid = entities[0]["RID"]
+
+    print(rid)
+
+
+if __name__ == '__main__':
+    args = get_args()
+    cli = BaseCLI("Custom RNASeq query", None, 1)
+    cli.remove_options(["--config-file"])
+    host = args.host
+    credential = {"cookie": args.cookie}
+    main(host, 2, credential)
\ No newline at end of file
-- 
GitLab