From ab0e4bb698f0bf36bb0012ca4f235c9add718be9 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 7 Dec 2020 10:16:43 -0600 Subject: [PATCH] Add upload output bag --- .gitlab-ci.yml | 66 +++++++++++------------ workflow/conf/aws.config | 8 +++ workflow/conf/biohpc.config | 3 ++ workflow/nextflow.config | 3 ++ workflow/rna-seq.nf | 81 +++++++++++++++++++++++++++-- workflow/scripts/uploadOutputBag.py | 48 +++++++++++++++++ 6 files changed, 173 insertions(+), 36 deletions(-) create mode 100644 workflow/scripts/uploadOutputBag.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e2df40a..f62f838 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -288,7 +288,7 @@ uploadInputBag: script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - > - if [ ! `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q '/TEST$'` ]; then + if [ `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q '/TEST$'` ]; then singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/input_bag/TEST echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST created else @@ -311,7 +311,7 @@ uploadInputBag: echo ${rid} test input bag already exists fi - uploadInputBag: +uploadExecutionRun: stage: unit only: - push @@ -321,31 +321,20 @@ uploadInputBag: script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - > - hatrac=`singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o '/TEST$'` && - if [ -z "${hatrac}" ]; then - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/input_bag/TEST - echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST created - else - echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST already exists - fi - - echo THIS IS A TEST FILE > test.txt - - > - md5=$(md5sum ./test.txt | awk '{ print $1 }') && - size=$(wc -c < ./test.txt) && - exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=18-DT5T/Reference_Genome=17-BPD4/Input_Bag=18-MHWC/Replicate=18-MJ3A) && + cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && if [ "${exist}" == "[]" ]; then - cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && - cookie=${cookie:11:-1} && - loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt) && - rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadInputBag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -o dev.gudmap.org -c ${cookie}) && - echo ${rid} test input bag created + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u F) && + echo ${rid} test execution run created else rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && rid=${rid:7:-6} && - echo ${rid} test input bag already exists + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u ${rid}) && + echo ${rid} test execution run already exists fi -uploadExecutionRun: +uploadQC: stage: unit only: - push @@ -355,20 +344,20 @@ uploadExecutionRun: script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - > - exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=18-DT5T/Reference_Genome=17-BPD4/Input_Bag=18-MHWC/Replicate=18-MJ3A) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=18-MJ3A/Execution_Run=18-MJ3C) && cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && cookie=${cookie:11:-1} && if [ "${exist}" == "[]" ]; then - rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u F) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u F) && echo ${rid} test execution run created else rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && rid=${rid:7:-6} && - rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u ${rid}) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u ${rid}) && echo ${rid} test execution run already exists fi -uploadQC: +uploadOutputBag: stage: unit only: - push @@ -378,19 +367,30 @@ uploadQC: script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - > - exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=18-MJ3A/Execution_Run=18-MJ3C) && - cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && - cookie=${cookie:11:-1} && + if [ `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/output_bag/ | grep -q '/TEST$'` ]; then + singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/output_bag/TEST + echo /hatrac/resources/rnaseq/pipeline/output_bag/TEST created + else + echo /hatrac/resources/rnaseq/pipeline/output_bag/TEST already exists + fi + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && if [ "${exist}" == "[]" ]; then - rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u F) && - echo ${rid} test execution run created + cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt) && + rid=singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadOutputBag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -o dev.gudmap.org -c ${cookie} && + echo ${rid} test output bag created else rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && - rid=${rid:7:-6} && - rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u ${rid}) && - echo ${rid} test execution run already exists + rid=${rid:8:-6} && + echo ${rid} test output bag already exists fi + generateVersions: stage: aggregation only: diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config index 27e5609..1c6359d 100644 --- a/workflow/conf/aws.config +++ b/workflow/conf/aws.config @@ -96,4 +96,12 @@ process { cpus = 1 memory = '1 GB' } + withName: uploadQC { + cpus = 1 + memory = '1 GB' + } + withName: uploadOutputBag { + cpus = 1 + memory = '1 GB' + } } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index d092914..181b356 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -70,6 +70,9 @@ process { withName: uploadQC { executor = 'local' } + withName: uploadOutputBag { + executor = 'local' + } } singularity { diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 109eb24..93d2b8c 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -79,6 +79,9 @@ process { withName:uploadQC { container = 'gudmaprbk/deriva1.3:1.0.0' } + withName:uploadOutputBag { + container = 'gudmaprbk/deriva1.3:1.0.0' + } } trace { diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 662cfce..4db5b0e 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -39,6 +39,7 @@ deriva.into { deriva_uploadInputBag deriva_uploadExecutionRun deriva_uploadQC + deriva_uploadOutputBag } bdbag = Channel .fromPath(params.bdbag) @@ -88,6 +89,7 @@ script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py") script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/uploadExecutionRun.py") script_uploadQC = Channel.fromPath("${baseDir}/scripts/uploadQC.py") +script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/uploadOutputBag.py") /* * trackStart: track start of pipeline @@ -1321,7 +1323,7 @@ process uploadInputBag { mn=\$(date +'%m') dy=\$(date +'%d') - if [ ! `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q \${yr}_\${mn}_\${dy}` ] + if [ `deriva-hatrac-cli --host ${source} ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q \${yr}_\${mn}_\${dy}` ] then deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log @@ -1347,7 +1349,7 @@ process uploadInputBag { rid=\${inputBag_rid} else exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:8:-6} + exist=\${exist:7:-6} echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log rid=\${exist} fi @@ -1410,6 +1412,7 @@ process uploadExecutionRun { cookie=\${cookie:11:-1} exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Reference_Genome=\${genome}/Input_Bag=${inputBagRID}/Replicate=${repRID}) + echo \${exist} >> ${repRID}.uploadExecutionRun.log if [ "\${exist}" == "[]" ] then executionRun_rid=\$(python3 uploadExecutionRun.py -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d 'Run in process' -o ${source} -c \${cookie} -u F) @@ -1417,6 +1420,7 @@ process uploadExecutionRun { else rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.uploadExecutionRun.log executionRun_rid=\$(python3 uploadExecutionRun.py -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log fi @@ -1431,6 +1435,12 @@ executionRunRID_fl.splitCsv(sep: ",", header: false).separate( executionRunRID ) +// +executionRunRID.into { + executionRunRID_uploadQC + executionRunRID_uploadOutputBag +} + /* * uploadQC: uploads the mRNA QC */ @@ -1440,7 +1450,7 @@ process uploadQC { input: path script_uploadQC path credential, stageAs: "credential.json" from deriva_uploadQC - val executionRunRID + val executionRunRID from executionRunRID_uploadQC val ends from endsInfer_uploadQC val stranded from strandedInfer_uploadQC val length from readLengthInfer_uploadQC @@ -1488,6 +1498,71 @@ qcRID_fl.splitCsv(sep: ",", header: false).separate( qcRID ) +/* + * uploadOutputBag: uploads the output bag +*/ +process uploadOutputBag { + tag "${repRID}" + + input: + path script_uploadOutputBag + path outputBag + val executionRunRID from executionRunRID_uploadOutputBag + path credential, stageAs: "credential.json" from deriva_uploadOutputBag + + output: + path ("outputBagRID.csv") into outputBagRID_fl + + script: + """ + hostname > ${repRID}.uploadOutputBag.log + ulimit -a >> ${repRID}.uploadOutputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + if [ `deriva-hatrac-cli --host ${source} ls /hatrac/resources/rnaseq/pipeline/output_bag/ | grep -q \${yr}_\${mn}_\${dy}` ] + then + deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy} + echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadOutputBag.log + else + echo LOG: hatrac folder already exists - /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadOutputBag.log + fi + + file=\$(basename -a ${outputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/\${yr}_\${mn}_\${dy}/\${file}) + outputBag_rid=\$(python3 uploadOutputBag.py -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log + rid=\${outputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:8:-6} + echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log + rid=\${exist} + fi + + echo \${rid} > outputBagRID.csv + """ +} + +// Extract output bag RID into channel +outputBagRID = Channel.create() +outputBagRID_fl.splitCsv(sep: ",", header: false).separate( + outputBagRID +) + workflow.onError = { subject = "$workflow.manifest.name FAILED: $params.repRID" diff --git a/workflow/scripts/uploadOutputBag.py b/workflow/scripts/uploadOutputBag.py new file mode 100644 index 0000000..6eb7699 --- /dev/null +++ b/workflow/scripts/uploadOutputBag.py @@ -0,0 +1,48 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv +from datetime import datetime + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True) + parser.add_argument('-f', '--file', help="file name", required=True) + parser.add_argument('-l', '--loc', help="datahub location", required=True) + parser.add_argument('-s', '--md5', help="md5 sum", required=True) + parser.add_argument('-b', '--bytes', help="size in bytes", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + outputBag_table = pb.RNASeq.Output_Bag + + outputBag_data = { + "Execution_Run": args.executionRunRID, + "File_Name": args.file, + "File_URL": args.loc, + "File_MD5": args.md5, + "File_Bytes": args.bytes, + "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), + "Notes": args.notes, + "Bag_Type": "Replicate_mRNA_Analysis" + } + + entities = outputBag_table.insert([outputBag_data]) + rid = entities[0]["RID"] + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credential = {"cookie": args.cookie} + main(host, 2, credential) \ No newline at end of file -- GitLab