diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7264e8c3b819f41e5b6753f9d3605a1adb4d8698..9d090337f72592e743e38eb71e55acdb075393b6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -288,8 +288,7 @@ uploadInputBag: script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - > - hatrac=`singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o '/TEST$'` && - if [ -z "${hatrac}" ]; then + if [ ! `deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q '/TEST$'` ]; then singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/input_bag/TEST echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST created else @@ -312,6 +311,62 @@ uploadInputBag: echo ${rid} test input bag already exists fi + uploadInputBag: + stage: unit + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + hatrac=`singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o '/TEST$'` && + if [ -z "${hatrac}" ]; then + singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org mkdir /hatrac/resources/rnaseq/pipeline/input_bag/TEST + echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST created + else + echo /hatrac/resources/rnaseq/pipeline/input_bag/TEST already exists + fi + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' 
| grep -o '\"cookie\": \".*\"') &&
+      cookie=${cookie:11:-1} &&
+      loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host dev.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt) &&
+      rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadInputBag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -o dev.gudmap.org -c ${cookie}) &&
+      echo ${rid} test input bag created
+    else
+      rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
+      rid=${rid:7:-6} &&
+      echo ${rid} test input bag already exists
+    fi
+
+uploadExecutionRun:
+  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
+  script:
+  - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
+  - >
+    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=18-DT5T/Reference_Genome=17-BPD4/Input_Bag=18-MHWC/Replicate=18-MJ3A) &&
+    cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
+    cookie=${cookie:11:-1} &&
+    if [ "${exist}" == "[]" ]; then
+      rid=$(python3 uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u F)
+      echo ${rid} test execution run created
+    else
+      rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT')
+      rid=${rid:7:-6}
+      rid=$(python3 uploadExecutionRun.py -r 18-MJ3A -w 18-DT5T -g 17-BPD4 -i 18-MHWC -s Error -d 'Run in process' -o dev.gudmap.org -c ${cookie} -u ${rid})
+      echo ${rid} test execution run already exists
+    fi

 generateVersions:
   stage: aggregation
diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config
index 231c86f8f6acd450a49199cd7ae71221df8a20bf..27e5609e9758d97570ae7621cdf06d911b07d22b 100644
--- a/workflow/conf/aws.config
+++ b/workflow/conf/aws.config
@@ -92,4 +92,8 @@ process {
     cpus = 1
     memory = '1 GB'
   }
+  withName: uploadExecutionRun {
+
cpus = 1 + memory = '1 GB' + } } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 55d09d1f860678e731e4ff416ce47b4306b4ef47..f61243262b256576bf9aeff0df79ca82444d1ccc 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -64,6 +64,9 @@ process { withName: uploadInputBag { executor = 'local' } + withName: uploadExecutionRun { + executor = 'local' + } } singularity { diff --git a/workflow/nextflow.config b/workflow/nextflow.config index c5d97e9d5dab1a32efad246efb3006782fe8701a..4a6b812832401d08f05edeac5fb07b6be875d5db 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -73,6 +73,9 @@ process { withName:uploadInputBag { container = 'gudmaprbk/deriva1.3:1.0.0' } + withName:uploadExecutionRun { + container = 'gudmaprbk/deriva1.3:1.0.0' + } } trace { @@ -101,6 +104,6 @@ manifest { homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' 
mainScript = 'rna-seq.nf' - version = 'v1.0.0_indev' + version = 'v0.0.4_indev' nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index c661ab024d7b797cc9a3089f6e3feabfe3724e18..45799478fb1d64b7b48514f4101696765e06c652 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -37,6 +37,7 @@ deriva.into { deriva_getRefInfer deriva_getRef deriva_uploadInputBag + deriva_uploadExecutionRun } bdbag = Channel .fromPath(params.bdbag) @@ -84,6 +85,7 @@ script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py") +script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/uploadExecutionRun.py") /* * trackStart: track start of pipeline @@ -717,11 +719,13 @@ strandedInfer.into { spikeInfer.into{ spikeInfer_getRef spikeInfer_aggrQC + spikeInfer_uploadExecutionRun } speciesInfer.into { speciesInfer_getRef speciesInfer_aggrQC speciesInfer_outputBag + speciesInfer_uploadExecutionRun } @@ -1295,8 +1299,7 @@ process uploadInputBag { mn=\$(date +'%m') dy=\$(date +'%d') - hatrac=\$(deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o \${yr}_\${mn}_\${dy}) - if [ -z "\${hatrac}" ] + if [ ! 
`deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -q \${yr}_\${mn}_\${dy}` ] then deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log @@ -1309,11 +1312,13 @@ process uploadInputBag { echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log size=\$(wc -c < ./\${file}) echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) if [ "\${exist}" == "[]" ] then cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') cookie=\${cookie:11:-1} + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy}/\${file}) inputBag_rid=\$(python3 uploadInputBag.py -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log @@ -1334,6 +1339,74 @@ inputBagRIDfl.splitCsv(sep: ",", header: false).separate( inputBagRID ) +/* + * uploadExecutionRun: uploads the execution run +*/ +process uploadExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun + path credential, stageAs: "credential.json" from deriva_uploadExecutionRun + val spike from spikeInfer_uploadExecutionRun + val species from speciesInfer_uploadExecutionRun + val inputBagRID + + output: + path ("executionRunRID.csv") into executionRunRIDfl + + script: + """ + hostname > ${repRID}.uploadExecutionRun.log + ulimit -a >> ${repRID}.uploadExecutionRun.log + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log + workflow=\$(curl -s 
https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "yes" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log + genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}_indev) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Reference_Genome=\${genome}/Input_Bag=${inputBagRID}/Replicate=${repRID}) + if [ "\${exist}" == "[]" ] + then + executionRun_rid=\$(python3 uploadExecutionRun.py -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d 'Run in process' -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + executionRun_rid=\$(python3 uploadExecutionRun.py -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + fi + + echo \${executionRun_rid} > executionRunRID.csv + """ +} + 
+executionRunRID = Channel.create()
+executionRunRIDfl.splitCsv(sep: ",", header: false).separate(
+  executionRunRID
+)
+
 workflow.onError = {
   subject = "$workflow.manifest.name FAILED: $params.repRID"
diff --git a/workflow/scripts/uploadExecutionRun.py b/workflow/scripts/uploadExecutionRun.py
new file mode 100644
index 0000000000000000000000000000000000000000..5af8565ab0426bd32dc886188a0347360ff4b42c
--- /dev/null
+++ b/workflow/scripts/uploadExecutionRun.py
@@ -0,0 +1,66 @@
+"""Insert or update an RNASeq Execution_Run entry in the data-hub catalog."""
+import argparse
+from deriva.core import ErmrestCatalog, get_credential, BaseCLI
+import sys
+import csv
+
+def get_args():
+    # -u/--update takes "F" for a fresh insert, or an existing RID to update.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID', help="replicate RID", required=True)
+    parser.add_argument('-w', '--workflowRID', help="workflow RID", required=True)
+    parser.add_argument('-g', '--referenceRID', help="reference genome RID", required=True)
+    parser.add_argument('-i', '--inputBagRID', help="inputBag RID", required=True)
+    parser.add_argument('-n', '--notes', help="notes", default="", required=False)
+    parser.add_argument('-s', '--status', help="run status", default="", required=False)
+    parser.add_argument('-d', '--statusDetail', help="status detail", default="", required=False)
+    parser.add_argument('-o', '--host', help="datahub host", required=True)
+    parser.add_argument('-c', '--cookie', help="cookie token", required=True)
+    parser.add_argument('-u', '--update', help="update?", default="F", required=False)
+    args = parser.parse_args()
+    return args
+
+def main(hostname, catalog_number, credential):
+    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
+    pb = catalog.getPathBuilder()
+    run_table = pb.RNASeq.Execution_Run
+
+    if args.update == "F":
+        # no RID supplied: create a brand-new Execution_Run record
+        run_data = {
+            "Replicate": args.repRID,
+            "Workflow": args.workflowRID,
+            "Reference_Genome": args.referenceRID,
+            "Input_Bag": args.inputBagRID,
+            "Notes": args.notes,
+            "Execution_Status": args.status,
+            "Execution_Status_Detail": args.statusDetail
+        }
+        entities = run_table.insert([run_data])
+        rid = entities[0]["RID"]
+    else:
+        # RID supplied via --update: overwrite the existing record in place
+        run_data = {
+            "RID": args.update,
+            "Replicate": args.repRID,
+            "Workflow": args.workflowRID,
+            "Reference_Genome": args.referenceRID,
+            "Input_Bag": args.inputBagRID,
+            "Notes": args.notes,
+            "Execution_Status": args.status,
+            "Execution_Status_Detail": args.statusDetail
+        }
+        entities = run_table.update([run_data])
+        rid = args.update
+
+
+    print(rid)
+
+
+if __name__ == '__main__':
+    args = get_args()
+    cli = BaseCLI("Custom RNASeq query", None, 1)
+    cli.remove_options(["--config-file"])
+    host = args.host
+    credentials = {"cookie": args.cookie}
+    main(host, 2, credentials)