diff --git a/CHANGELOG.md b/CHANGELOG.md
index 57b95447b438ca5d213c8ddc32d0917975aa618a..ed8348111c4107fff79f07dbb47babb41977823b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,9 @@
-# v0.0.4 (in development)
+# v1.0.0 (in development)
 **User Facing**
 * Add option to pull references from datahub
 * Add option to send email on workflow error, with pipeline error message
 * Add versions and paper references of software used to report
+* Upload input bag
 
 **Background**
 * Remove (comment out) option to pull references from S3
diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config
index c26e1a4a7e44318c53f4baa561ee6c5cb3020798..231c86f8f6acd450a49199cd7ae71221df8a20bf 100644
--- a/workflow/conf/aws.config
+++ b/workflow/conf/aws.config
@@ -88,4 +88,8 @@ process {
     cpus = 1
     memory = '1 GB'
   }
+  withName: uploadInputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
 }
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 57b72fd929898a976d9d077ff1b23470388cc37a..55d09d1f860678e731e4ff416ce47b4306b4ef47 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -61,6 +61,9 @@ process {
   withName: outputBag {
     executor = 'local'
   }
+  withName: uploadInputBag {
+    executor = 'local'
+  }
 }
 
 singularity {
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index f9fcef964a79aafb697c020defa67e68b93f5ec0..c5d97e9d5dab1a32efad246efb3006782fe8701a 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -70,6 +70,9 @@ process {
   withName:outputBag {
     container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
   }
+  withName:uploadInputBag {
+    container = 'gudmaprbk/deriva1.3:1.0.0'
+  }
 }
 
 trace {
@@ -98,6 +101,6 @@ manifest {
   homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
   description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
   mainScript = 'rna-seq.nf'
-  version = 'v0.0.4_indev'
+  version = 'v1.0.0_indev'
   nextflowVersion = '>=19.09.0'
 }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 04709a14369741efdafe5f4fbc0dbc7bb06a7627..c661ab024d7b797cc9a3089f6e3feabfe3724e18 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -36,6 +36,7 @@ deriva.into {
   deriva_getBag
   deriva_getRefInfer
   deriva_getRef
+  deriva_uploadInputBag
 }
 bdbag = Channel
   .fromPath(params.bdbag)
@@ -82,7 +83,7 @@
 script_refData = Channel.fromPath("${baseDir}/scripts/extractRefData.py")
 script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
 script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
 script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
-
+script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py")
 /*
  * trackStart: track start of pipeline
@@ -177,6 +178,10 @@ if (inputBagForce != "") {
 } else {
   inputBag = bag
 }
+inputBag.into {
+  inputBag_getData
+  inputBag_uploadInputBag
+}
 /*
  * getData: fetch study files from consortium with downloaded bdbag.zip
 */
@@ -187,7 +192,7 @@ process getData {
   input:
     path script_bdbagFetch
     path cookies, stageAs: "deriva-cookies.txt" from bdbag
-    path inputBag
+    path inputBag from inputBag_getData
 
   output:
     path ("*.R{1,2}.fastq.gz") into fastqs
@@ -1267,6 +1272,67 @@ process outputBag {
     """
 }
 
+/*
+ * uploadInputBag: uploads the input bag
+*/
+process uploadInputBag {
+  tag "${repRID}"
+
+  input:
+    path script_uploadInputBag
+    path inputBag from inputBag_uploadInputBag
+    path credential, stageAs: "credential.json" from deriva_uploadInputBag
+
+  output:
+    path ("inputBagRID.csv") into inputBagRIDfl
+
+  script:
+    """
+    hostname > ${repRID}.uploadInputBag.log
+    ulimit -a >> ${repRID}.uploadInputBag.log
+
+    yr=\$(date +'%Y')
+    mn=\$(date +'%m')
+    dy=\$(date +'%d')
+
+    hatrac=\$(deriva-hatrac-cli --host ${source} ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o \${yr}_\${mn}_\${dy})
+    if [ -z "\${hatrac}" ]
+    then
+      deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy}
+      echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log
+    else
+      echo LOG: hatrac folder already exists - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log
+    fi
+
+    file=\$(basename -a ${inputBag})
+    md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
+    echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log
+    size=\$(wc -c < ./\${file})
+    echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log
+    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5})
+    if [ "\${exist}" == "[]" ]
+    then
+      cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+      cookie=\${cookie:11:-1}
+      loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy}/\${file})
+      inputBag_rid=\$(python3 uploadInputBag.py -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
+      echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log
+      rid=\${inputBag_rid}
+    else
+      exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+      exist=\${exist:8:-6}
+      echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log
+      rid=\${exist}
+    fi
+
+    echo \${rid} > inputBagRID.csv
+    """
+}
+
+inputBagRID = Channel.create()
+inputBagRIDfl.splitCsv(sep: ",", header: false).separate(
+  inputBagRID
+)
 workflow.onError = {
   subject = "$workflow.manifest.name FAILED: $params.repRID"
 
diff --git a/workflow/scripts/uploadInputBag.py b/workflow/scripts/uploadInputBag.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9276e7f096f710583167a1b48028d0bf8448a9e
--- /dev/null
+++ b/workflow/scripts/uploadInputBag.py
@@ -0,0 +1,45 @@
+import argparse
+from deriva.core import ErmrestCatalog, get_credential, BaseCLI
+import sys
+import csv
+from datetime import datetime
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--file', help="file", required=True)
+    parser.add_argument('-l', '--loc', help="location", required=True)
+    parser.add_argument('-s', '--md5', help="md5", required=True)
+    parser.add_argument('-b', '--bytes', help="bytes", required=True)
+    parser.add_argument('-o', '--host', help="host", required=True)
+    parser.add_argument('-c', '--cookie', help="cookie", required=True)
+    args = parser.parse_args()
+    return args
+
+def main(hostname, catalog_number, credential):
+    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
+    pb = catalog.getPathBuilder()
+    inputBag_table = pb.RNASeq.Input_Bag
+
+    inputBag_data = {
+        "File_Name": args.file,
+        "File_URL": args.loc,
+        "File_MD5": args.md5,
+        "File_Bytes": args.bytes,
+        "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
+        "Notes": "TEST",
+        "Bag_Type": "Replicate_Input_Seq"
+    }
+
+    entities = inputBag_table.insert([inputBag_data])
+    rid = entities[0]["RID"]
+
+    print(rid)
+
+
+if __name__ == '__main__':
+    args = get_args()
+    cli = BaseCLI("Custom RNASeq query", None, 1)
+    cli.remove_options(["--config-file"])
+    host = args.host
+    credential = {"cookie": args.cookie}
+    main(host, 2, credential)
\ No newline at end of file
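The new helper can also be exercised by hand for testing. The lines below are only a sketch of the same flow the uploadInputBag process runs; the host, bag file name, dated hatrac folder, and cookie value are placeholders rather than values taken from the pipeline, and deriva-py with its deriva-hatrac-cli is assumed to be installed.

    # sketch only: host, bag file, folder date, and ${cookie} are placeholder values
    file=example_inputBag.zip
    md5=$(md5sum ./${file} | awk '{ print $1 }')
    size=$(wc -c < ./${file})
    # stage the bag in hatrac, then register it in RNASeq:Input_Bag via the new script
    loc=$(deriva-hatrac-cli --host dev.gudmap.org put ./${file} /hatrac/resources/rnaseq/pipeline/input_bag/2021_01_01/${file})
    python3 workflow/scripts/uploadInputBag.py -f ${file} -l ${loc} -s ${md5} -b ${size} -o dev.gudmap.org -c "${cookie}"
    # prints the RID of the newly created Input_Bag record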