From a503d1e23520e04426812fd59b7cdfa3915826cf Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 25 Jan 2021 20:00:28 -0600 Subject: [PATCH] Change logic for uploadOutputBag --- CHANGELOG.md | 2 ++ workflow/rna-seq.nf | 20 ++++++++------- workflow/scripts/upload_execution_run.py | 1 - workflow/scripts/upload_output_bag.py | 31 +++++++++++++++--------- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 360cfd6..d66bbc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,10 +21,12 @@ * Data-hub column title change from "Sequencing_Type" to "Experiment_Type" (#114) * Data-hub column title change from "Has_Strand_Specific_Information" to "Strandedness" (#115) * Merge data error pre-inference execution run upload/finalize to 1 process +* Change uploadOutputBag logic to reuse hatrac file if it already exists (re-uses Output_Bag entry by reassigning Execution_Run RID) (#112) *Known Bugs* * Override params (inputBag, fastq, species) aren't checked for integrity * Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included) +* Check for outputBag in hatrac doesn't check for any uploaded by chaise <hr> diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index d0a6bf9..75df6fe 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -2213,20 +2213,22 @@ process uploadOutputBag { echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log size=\$(wc -c < ./\${file}) echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5}) + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + + cookie=\$(cat credential.json | grep -A 1 
'\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_URL=\${loc}) if [ "\${exist}" == "[]" ] then - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) - outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) - echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log - rid=\${outputBag_rid} + outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie} -u F) + echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log + rid=\${outputBag_rid} else exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') exist=\${exist:8:-6} + outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -o ${source} -c \${cookie} -u \${exist}) echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log rid=\${exist} fi diff --git a/workflow/scripts/upload_execution_run.py b/workflow/scripts/upload_execution_run.py index 2e8ea8d..405c81a 100644 --- a/workflow/scripts/upload_execution_run.py +++ b/workflow/scripts/upload_execution_run.py @@ -48,7 +48,6 @@ def main(hostname, catalog_number, credential): } entities = run_table.update([run_data]) rid = args.update - print(rid) diff --git a/workflow/scripts/upload_output_bag.py b/workflow/scripts/upload_output_bag.py index 397658c..e1e1fc1 100644 --- a/workflow/scripts/upload_output_bag.py +++ b/workflow/scripts/upload_output_bag.py @@ -14,6 +14,7 @@ def get_args(): parser.add_argument('-n', '--notes', help="notes", 
default="", required=False) parser.add_argument('-o', '--host', help="datahub host", required=True) parser.add_argument('-c', '--cookie', help="cookie token", required=True) + parser.add_argument('-u', '--update', help="update?", default="F", required=False) args = parser.parse_args() return args @@ -22,19 +23,27 @@ def main(hostname, catalog_number, credential): pb = catalog.getPathBuilder() outputBag_table = pb.RNASeq.Output_Bag - outputBag_data = { - "Execution_Run": args.executionRunRID, - "File_Name": args.file, - "File_URL": args.loc, - "File_MD5": args.md5, - "File_Bytes": args.bytes, - "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), - "Notes": args.notes, - "Bag_Type": "mRNA_Replicate_Analysis" + if args.update == "F": + outputBag_data = { + "Execution_Run": args.executionRunRID, + "File_Name": args.file, + "File_URL": args.loc, + "File_MD5": args.md5, + "File_Bytes": args.bytes, + "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), + "Notes": args.notes, + "Bag_Type": "mRNA_Replicate_Analysis" } + entities = outputBag_table.insert([outputBag_data]) + rid = entities[0]["RID"] - entities = outputBag_table.insert([outputBag_data]) - rid = entities[0]["RID"] + else: + outputBag_data = { + "RID": args.update, + "Execution_Run": args.executionRunRID + } + entities = outputBag_table.update([outputBag_data]) + rid = args.update print(rid) -- GitLab