diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a6a703ea194e02fe4f20785d112fd5c39e8f0fb2..e2df40af967ac84d22cf7fcd260f0e26656037cb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -368,6 +368,29 @@ uploadExecutionRun:
     echo ${rid} test execution run already exists
     fi
 
+uploadQC:
+  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
+  script:
+  - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
+  - >
+    exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=18-MJ3A/Execution_Run=18-MJ3C) &&
+    cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
+    cookie=${cookie:11:-1} &&
+    if [ "${exist}" == "[]" ]; then
+    rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u F) &&
+    echo ${rid} test mRNA QC created
+    else
+    rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
+    rid=${rid:7:-6} &&
+    rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u ${rid}) &&
+    echo ${rid} test mRNA QC already exists
+    fi
+
 generateVersions:
   stage: aggregation
   only:
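Note on the cookie handling in the job above: the auth cookie is pulled out of credential.json with a grep pipeline, and ${cookie:11:-1} then slices off the 11-character '"cookie": "' prefix and the trailing quote. A minimal Python sketch of the same extraction follows; it assumes, as the grep pattern implies, that credential.json maps a hostname to an object with a "cookie" field. This is an illustration, not pipeline code.

# cookie_sketch.py - illustrative only, not part of the pipeline.
# Assumed credential.json layout (implied by the grep -A 1 pipeline above):
#   {"dev.gudmap.org": {"cookie": "..."}}
import json

def read_cookie(path, host):
    # JSON parsing replaces the grep/substring dance (${cookie:11:-1}).
    with open(path) as f:
        creds = json.load(f)
    return creds[host]["cookie"]

if __name__ == "__main__":
    print(read_cookie("credential.json", "dev.gudmap.org"))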
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ed8348111c4107fff79f07dbb47babb41977823b..5c022e36cf4b7bea4760fd2e5c809fb2f32fd0a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 * Add option to send email on workflow error, with pipeline error message
 * Add versions and paper references of software used to report
 * Upload input bag
+* Upload execution run
+* Upload mRNA QC
 
 **Background**
 * Remove (comment out) option to pull references from S3
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index f61243262b256576bf9aeff0df79ca82444d1ccc..d0929149c7f47c4283f8f2d3015e5086cde95ea9 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -67,6 +67,9 @@ process {
   withName: uploadExecutionRun {
     executor = 'local'
   }
+  withName: uploadQC {
+    executor = 'local'
+  }
 }
 
 singularity {
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index 4a6b812832401d08f05edeac5fb07b6be875d5db..109eb2483ce7198321c6d5296e1e8af59db67e5f 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -76,6 +76,9 @@ process {
   withName:uploadExecutionRun {
     container = 'gudmaprbk/deriva1.3:1.0.0'
   }
+  withName:uploadQC {
+    container = 'gudmaprbk/deriva1.3:1.0.0'
+  }
 }
 
 trace {
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 45799478fb1d64b7b48514f4101696765e06c652..662cfce88d70b2cbd0966a2ed80cd5e212bc9c13 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -38,6 +38,7 @@ deriva.into {
   deriva_getRef
   deriva_uploadInputBag
   deriva_uploadExecutionRun
+  deriva_uploadQC
 }
 bdbag = Channel
   .fromPath(params.bdbag)
@@ -86,6 +87,7 @@ script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymb
 script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
 script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py")
 script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/uploadExecutionRun.py")
+script_uploadQC = Channel.fromPath("${baseDir}/scripts/uploadQC.py")
 
 /*
  * trackStart: track start of pipeline
@@ -256,7 +258,7 @@ process parseMetadata {
     path experiment from experimentMeta
 
   output:
-    path "design.csv" into metadata
+    path "design.csv" into metadata_fl
 
   script:
     """
@@ -317,7 +319,7 @@ speciesMeta = Channel.create()
 readLengthMeta = Channel.create()
 expRID = Channel.create()
 studyRID = Channel.create()
-metadata.splitCsv(sep: ",", header: false).separate(
+metadata_fl.splitCsv(sep: ",", header: false).separate(
   endsMeta,
   endsManual,
   strandedMeta,
@@ -327,6 +329,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
   expRID,
   studyRID
 )
+
 // Replicate metadata for multiple process inputs
 endsManual.into {
   endsManual_trimData
@@ -350,7 +353,7 @@ process trimData {
     path ("*.fq.gz") into fastqsTrim
     path ("*.fastq.gz", includeInputs:true) into fastqs_fastqc
     path ("*_trimming_report.txt") into trimQC
-    path ("readLength.csv") into inferMetadata_readLength
+    path ("readLength.csv") into readLengthInfer_fl
 
   script:
     """
@@ -378,11 +381,16 @@ process trimData {
 
 // Extract calculated read length metadata into channel
 readLengthInfer = Channel.create()
-inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
+readLengthInfer_fl.splitCsv(sep: ",", header: false).separate(
   readLengthInfer
 )
 
-// Replicate trimmed fastq's
+// Replicate inferred read length for multiple process inputs
+readLengthInfer.into {
+  readLengthInfer_aggrQC
+  readLengthInfer_uploadQC
+}
+// Replicate trimmed fastq's for multiple process inputs
 fastqsTrim.into {
   fastqsTrim_alignData
   fastqsTrim_downsampleData
@@ -582,7 +590,7 @@ process inferMetadata {
     path alignSummary from alignSampleQC_inferMetadata.collect()
 
   output:
-    path "infer.csv" into inferMetadata
+    path "infer.csv" into inferMetadata_fl
     path "${repRID}.infer_experiment.txt" into inferExperiment
 
   script:
@@ -691,7 +699,7 @@ align_moInfer = Channel.create()
 percentFInfer = Channel.create()
 percentRInfer = Channel.create()
 failInfer = Channel.create()
-inferMetadata.splitCsv(sep: ",", header: false).separate(
+inferMetadata_fl.splitCsv(sep: ",", header: false).separate(
   endsInfer,
   strandedInfer,
   spikeInfer,
@@ -710,11 +718,13 @@ endsInfer.into {
   endsInfer_countData
   endsInfer_dataQC
   endsInfer_aggrQC
+  endsInfer_uploadQC
 }
 strandedInfer.into {
   strandedInfer_alignData
   strandedInfer_countData
   strandedInfer_aggrQC
+  strandedInfer_uploadQC
 }
 spikeInfer.into{
   spikeInfer_getRef
@@ -986,7 +996,7 @@ process countData {
   output:
     path ("*.tpmTable.csv") into counts
     path ("*.countData.summary") into countsQC
-    path ("assignedReads.csv") into inferMetadata_assignedReads
+    path ("assignedReads.csv") into assignedReadsInfer_fl
 
   script:
     """
@@ -1035,10 +1045,16 @@ process countData {
 
 // Extract number of assigned reads metadata into channel
 assignedReadsInfer = Channel.create()
-inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate(
+assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
   assignedReadsInfer
 )
 
+// Replicate inferred assigned reads for multiple process inputs
+assignedReadsInfer.into {
+  assignedReadsInfer_aggrQC
+  assignedReadsInfer_uploadQC
+}
+
 /*
  *fastqc: run fastqc on untrimmed fastq's
 */
@@ -1050,7 +1066,7 @@ process fastqc {
 
   output:
     path ("*_fastqc.zip") into fastqc
-    path ("rawReads.csv") into inferMetadata_rawReads
+    path ("rawReads.csv") into rawReadsInfer_fl
 
   script:
     """
@@ -1068,10 +1084,16 @@ process fastqc {
 
 // Extract number of raw reads metadata into channel
 rawReadsInfer = Channel.create()
-inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate(
+rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
   rawReadsInfer
 )
 
+// Replicate inferred raw reads for multiple process inputs
+rawReadsInfer.into {
+  rawReadsInfer_aggrQC
+  rawReadsInfer_uploadQC
+}
+
 /*
  *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates
 */
@@ -1087,7 +1109,7 @@ process dataQC {
   output:
     path "${repRID}.tin.hist.tsv" into tinHist
-    path "${repRID}.tin.med.csv" into inferMetadata_tinMed
+    path "${repRID}.tin.med.csv" into tinMedInfer_fl
     path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
 
   script:
@@ -1122,7 +1144,7 @@ process dataQC {
 
 // Extract median TIN metadata into channel
 tinMedInfer = Channel.create()
-inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
+tinMedInfer_fl.splitCsv(sep: ",", header: false).separate(
   tinMedInfer
 )
 
@@ -1158,9 +1180,9 @@ process aggrQC {
     val spikeI from spikeInfer_aggrQC
     val speciesI from speciesInfer_aggrQC
     val readLengthM from readLengthMeta
-    val readLengthI from readLengthInfer
-    val rawReadsI from rawReadsInfer
-    val assignedReadsI from assignedReadsInfer
+    val readLengthI from readLengthInfer_aggrQC
+    val rawReadsI from rawReadsInfer_aggrQC
+    val assignedReadsI from assignedReadsInfer_aggrQC
     val tinMedI from tinMedInfer
     val expRID
     val studyRID
@@ -1288,7 +1310,7 @@ process uploadInputBag {
     path credential, stageAs: "credential.json" from deriva_uploadInputBag
 
   output:
-    path ("inputBagRID.csv") into inputBagRIDfl
+    path ("inputBagRID.csv") into inputBagRID_fl
 
   script:
     """
@@ -1334,8 +1356,9 @@ process uploadInputBag {
     """
 }
 
+// Extract input bag RID into channel
 inputBagRID = Channel.create()
-inputBagRIDfl.splitCsv(sep: ",", header: false).separate(
+inputBagRID_fl.splitCsv(sep: ",", header: false).separate(
   inputBagRID
 )
 
@@ -1353,7 +1376,7 @@ process uploadExecutionRun {
     val inputBagRID
 
   output:
-    path ("executionRunRID.csv") into executionRunRIDfl
+    path ("executionRunRID.csv") into executionRunRID_fl
 
   script:
     """
@@ -1402,11 +1425,70 @@ process uploadExecutionRun {
     """
 }
 
+// Extract execution run RID into channel
 executionRunRID = Channel.create()
-executionRunRIDfl.splitCsv(sep: ",", header: false).separate(
+executionRunRID_fl.splitCsv(sep: ",", header: false).separate(
   executionRunRID
 )
 
+/*
+ * uploadQC: uploads the mRNA QC metrics
+*/
+process uploadQC {
+  tag "${repRID}"
+
+  input:
+    path script_uploadQC
+    path credential, stageAs: "credential.json" from deriva_uploadQC
+    val executionRunRID
+    val ends from endsInfer_uploadQC
+    val stranded from strandedInfer_uploadQC
+    val length from readLengthInfer_uploadQC
+    val rawCount from rawReadsInfer_uploadQC
+    val finalCount from assignedReadsInfer_uploadQC
+
+  output:
+    path ("qcRID.csv") into qcRID_fl
+
+  script:
+    """
+    hostname > ${repRID}.uploadQC.log
+    ulimit -a >> ${repRID}.uploadQC.log
+
+    if [ "${ends}" == "pe" ]
+    then
+      end="Paired End"
+    elif [ "${ends}" == "se" ]
+    then
+      end="Single Read"
+    fi
+
+    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+    cookie=\${cookie:11:-1}
+
+    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}/Execution_Run=${executionRunRID})
+    if [ "\${exist}" == "[]" ]
+    then
+      qc_rid=\$(python3 uploadQC.py -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u F)
+      echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log
+    else
+      rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+      rid=\${rid:7:-6}
+      qc_rid=\$(python3 uploadQC.py -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u \${rid})
+      echo LOG: mRNA QC RID updated - \${qc_rid} >> ${repRID}.uploadQC.log
+    fi
+
+    echo \${qc_rid} > qcRID.csv
+    """
+}
+
+// Extract mRNA QC RID into channel
+qcRID = Channel.create()
+qcRID_fl.splitCsv(sep: ",", header: false).separate(
+  qcRID
+)
+
 
 workflow.onError = {
   subject = "$workflow.manifest.name FAILED: $params.repRID"
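Note on the insert-or-update logic in the uploadQC process above: it curls the ermrest entity API, treats an empty JSON array as "no mRNA_QC row yet" (so uploadQC.py is called with -u F to insert), and otherwise scrapes the existing RID out of the response with grep plus ${rid:7:-6} (so -u <RID> triggers an update). Below is a standard-library Python sketch of the same check with real JSON parsing; the function name is illustrative, and the optional Cookie header is an assumption (the curl above sends none, so read access is presumably anonymous).

# qc_exists_sketch.py - illustrative only, not pipeline code.
import json
import urllib.request

def existing_qc_rid(host, rep_rid, run_rid, cookie=None):
    # Same endpoint the uploadQC process curls; an empty JSON array means
    # no mRNA_QC row exists yet for this Replicate/Execution_Run pair.
    url = ("https://{}/ermrest/catalog/2/entity/"
           "RNASeq:mRNA_QC/Replicate={}/Execution_Run={}").format(host, rep_rid, run_rid)
    headers = {"Cookie": cookie} if cookie else {}
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as resp:
        rows = json.load(resp)
    # JSON indexing replaces grep -o '"RID":".*","RCT' plus ${rid:7:-6}.
    return rows[0]["RID"] if rows else None

A caller would mirror the process's if/else: pass -u F to uploadQC.py when this returns None, otherwise pass the returned RID via -u to update in place.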
diff --git a/workflow/scripts/uploadQC.py b/workflow/scripts/uploadQC.py
new file mode 100644
index 0000000000000000000000000000000000000000..930896d3abce8882aca7985a4ad304904f6b3a44
--- /dev/null
+++ b/workflow/scripts/uploadQC.py
@@ -0,0 +1,65 @@
+import argparse
+from deriva.core import ErmrestCatalog, get_credential, BaseCLI
+import sys
+import csv
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID', help="replicate RID", required=True)
+    parser.add_argument('-e', '--executionRunRID', help="execution run RID", required=True)
+    parser.add_argument('-p', '--ends', help="single/paired ends", required=True)
+    parser.add_argument('-s', '--stranded', help="stranded?", required=True)
+    parser.add_argument('-l', '--length', help="median read length", required=True)
+    parser.add_argument('-w', '--rawCount', help="raw count", required=True)
+    parser.add_argument('-f', '--assignedCount', help="final assigned count", required=True)
+    parser.add_argument('-n', '--notes', help="notes", default="", required=False)
+    parser.add_argument('-o', '--host', help="datahub host", required=True)
+    parser.add_argument('-c', '--cookie', help="cookie token", required=True)
+    parser.add_argument('-u', '--update', help="update?", default="F", required=True)
+    args = parser.parse_args()
+    return args
+
+def main(hostname, catalog_number, credential):
+    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
+    pb = catalog.getPathBuilder()
+    run_table = pb.RNASeq.mRNA_QC
+
+    if args.update == "F":
+        run_data = {
+            "Execution_Run": args.executionRunRID,
+            "Replicate": args.repRID,
+            "Paired_End": args.ends,
+            "Strandedness": args.stranded,
+            "Median_Read_Length": args.length,
+            "Raw_Count": args.rawCount,
+            "Final_Count": args.assignedCount,
+            "Notes": args.notes
+        }
+        entities = run_table.insert([run_data])
+        rid = entities[0]["RID"]
+    else:
+        run_data = {
+            "RID": args.update,
+            "Execution_Run": args.executionRunRID,
+            "Replicate": args.repRID,
+            "Paired_End": args.ends,
+            "Strandedness": args.stranded,
+            "Median_Read_Length": args.length,
+            "Raw_Count": args.rawCount,
+            "Final_Count": args.assignedCount,
+            "Notes": args.notes
+        }
+        entities = run_table.update([run_data])
+        rid = args.update
+
+
+    print(rid)
+
+
+if __name__ == '__main__':
+    args = get_args()
+    cli = BaseCLI("Custom RNASeq query", None, 1)
+    cli.remove_options(["--config-file"])
+    host = args.host
+    credentials = {"cookie": args.cookie}
+    main(host, 2, credentials)
\ No newline at end of file
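uploadQC.py drives the deriva-py datapath API (getPathBuilder, insert, update), with the insert-vs-update decision made by the caller through -u. For comparison, here is a hedged sketch of how that decision could live inside the script instead, using the datapath filter/entities calls from deriva-py alongside the same insert/update calls the script already makes; the function and its signature are illustrative, not part of the pipeline.

# upsert_qc_sketch.py - illustrative only, not pipeline code.
from deriva.core import ErmrestCatalog

def upsert_qc(host, cookie, run_data):
    # Same connection setup as uploadQC.py's main(); catalog 2 as above.
    catalog = ErmrestCatalog('https', host, 2, {"cookie": cookie})
    table = catalog.getPathBuilder().RNASeq.mRNA_QC
    # Look up the row the workflow's curl existence check would find.
    matches = list(table.path
                        .filter(table.Replicate == run_data["Replicate"])
                        .filter(table.Execution_Run == run_data["Execution_Run"])
                        .entities())
    if matches:
        run_data["RID"] = matches[0]["RID"]    # update the existing row
        table.update([run_data])
        return run_data["RID"]
    return table.insert([run_data])[0]["RID"]  # create a fresh row

This would shrink the bash wrapper to a single invocation, at the cost of folding the existence check into the script; the committed version keeps the check in the Nextflow process so the log can distinguish "uploaded" from "updated".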