From a503d1e23520e04426812fd59b7cdfa3915826cf Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Mon, 25 Jan 2021 20:00:28 -0600
Subject: [PATCH] Change logic for uploadOutputBag

---
 CHANGELOG.md                             |  2 ++
 workflow/rna-seq.nf                      | 20 ++++++++-------
 workflow/scripts/upload_execution_run.py |  1 -
 workflow/scripts/upload_output_bag.py    | 31 +++++++++++++++---------
 4 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 360cfd6..d66bbc8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,10 +21,12 @@
 * Data-hub column title change from "Sequencing_Type" to "Experiment_Type" (#114)
 * Data-hub column title change from "Has_Strand_Specific_Information" to "Strandedness" (#115)
 * Merge data error pre-inference execution run upload/finalize to 1 process
+* Change uploadOutputBag logic to reuse the hatrac file if it already exists (re-uses Output_Bag entry by reassigning Execution_Run RID) (#112)
 
 *Known Bugs*
 * Override params (inputBag, fastq, species) aren't checked for integrity
 * Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included)
+* Check for an existing outputBag in hatrac doesn't account for bags uploaded by chaise
 
 <hr>
 
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index d0a6bf9..75df6fe 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -2213,20 +2213,22 @@ process uploadOutputBag {
   echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log
   size=\$(wc -c < ./\${file})
   echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log
-    
-  exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5})
+  
+  loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents)
+
+  cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+  cookie=\${cookie:11:-1}
+
+  exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_URL=\${loc})
   if [ "\${exist}" == "[]" ]
   then
-    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-      cookie=\${cookie:11:-1}
-
-      loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents)
-      outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
-      echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log
-      rid=\${outputBag_rid}
+    outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie} -u F)
+    echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log
+    rid=\${outputBag_rid}
   else
       exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
       exist=\${exist:8:-6}
+      outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -o ${source} -c \${cookie} -u \${exist})
       echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log
       rid=\${exist}
   fi
diff --git a/workflow/scripts/upload_execution_run.py b/workflow/scripts/upload_execution_run.py
index 2e8ea8d..405c81a 100644
--- a/workflow/scripts/upload_execution_run.py
+++ b/workflow/scripts/upload_execution_run.py
@@ -48,7 +48,6 @@ def main(hostname, catalog_number, credential):
         }
         entities = run_table.update([run_data])
         rid = args.update
-    
 
     print(rid)
 
diff --git a/workflow/scripts/upload_output_bag.py b/workflow/scripts/upload_output_bag.py
index 397658c..e1e1fc1 100644
--- a/workflow/scripts/upload_output_bag.py
+++ b/workflow/scripts/upload_output_bag.py
@@ -14,6 +14,7 @@ def get_args():
     parser.add_argument('-n', '--notes', help="notes", default="", required=False)
     parser.add_argument('-o', '--host', help="datahub host", required=True)
     parser.add_argument('-c', '--cookie', help="cookie token", required=True)
+    parser.add_argument('-u', '--update', help="update?", default="F", required=True)
     args = parser.parse_args()
     return args
 
@@ -22,19 +23,27 @@ def main(hostname, catalog_number, credential):
     pb = catalog.getPathBuilder()
     outputBag_table = pb.RNASeq.Output_Bag
 
-    outputBag_data = {
-        "Execution_Run": args.executionRunRID,
-        "File_Name": args.file,
-        "File_URL": args.loc,
-        "File_MD5": args.md5,
-        "File_Bytes": args.bytes,
-        "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
-        "Notes": args.notes,
-        "Bag_Type": "mRNA_Replicate_Analysis"
+    if args.update == "F":
+        outputBag_data = {
+            "Execution_Run": args.executionRunRID,
+            "File_Name": args.file,
+            "File_URL": args.loc,
+            "File_MD5": args.md5,
+            "File_Bytes": args.bytes,
+            "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
+            "Notes": args.notes,
+            "Bag_Type": "mRNA_Replicate_Analysis"
         }
+        entities = outputBag_table.insert([outputBag_data])
+        rid = entities[0]["RID"]
 
-    entities = outputBag_table.insert([outputBag_data])
-    rid = entities[0]["RID"]
+    else:  # args.update holds the RID of an existing Output_Bag row to re-link
+        outputBag_data = {
+            "RID": args.update,
+            "Execution_Run": args.executionRunRID
+        }
+        entities = outputBag_table.update([outputBag_data])  # update, not insert: the row already exists
+        rid = args.update
 
     print(rid)
 
-- 
GitLab