Commit e4b99530 authored by Gervaise Henry

Add upload mRNA QC

parent 765286c5
2 merge requests: !58 Develop, !53 Resolve "process_derivaUpload"
Pipeline #8466 passed with stages in 2 minutes and 29 seconds
@@ -368,6 +368,29 @@ uploadExecutionRun:
echo ${rid} test execution run already exists
fi
uploadQC:
  stage: unit
  only:
    - push
    - tags
  except:
    - merge_requests
  script:
    - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
    - >
      exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=18-MJ3A/Execution_Run=18-MJ3C) &&
      cookie=$(cat credential.json | grep -A 1 '\"dev.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
      cookie=${cookie:11:-1} &&
      if [ "${exist}" == "[]" ]; then
        rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u F) &&
        echo ${rid} test mRNA QC created
      else
        rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
        rid=${rid:7:-6} &&
        rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward -l 35 -w 5 -f 1 -o dev.gudmap.org -c ${cookie} -u ${rid}) &&
        echo ${rid} test mRNA QC already exists
      fi
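The cookie handling above relies on bash substring expansion rather than a JSON parser. A minimal sketch of the idiom, using a placeholder cookie value (the real value comes from credential.json; negative length offsets require bash 4.2+):

# grep leaves a line of the form:  "cookie": "webauthn=abc123"
cookie='"cookie": "webauthn=abc123"'
cookie=${cookie:11:-1}   # drop the 11-character prefix '"cookie": "' and the trailing quote
echo ${cookie}           # webauthn=abc123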
generateVersions:
stage: aggregation
only:
...
@@ -4,6 +4,8 @@
* Add option to send email on workflow error, with pipeline error message
* Add versions and paper references of software used to the report
* Upload input bag
* Upload execution run
* Upload mRNA QC
**Background**
* Remove (comment out) option to pull references from S3
...
@@ -67,6 +67,9 @@ process {
withName: uploadExecutionRun {
executor = 'local'
}
withName: uploadQC {
executor = 'local'
}
}
singularity {
...
@@ -76,6 +76,9 @@ process {
withName:uploadExecutionRun {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName:uploadQC {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
}
trace {
...
@@ -38,6 +38,7 @@ deriva.into {
deriva_getRef
deriva_uploadInputBag
deriva_uploadExecutionRun
deriva_uploadQC
}
bdbag = Channel
.fromPath(params.bdbag)
@@ -86,6 +87,7 @@ script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymb
script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py")
script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/uploadExecutionRun.py")
script_uploadQC = Channel.fromPath("${baseDir}/scripts/uploadQC.py")
/*
 * trackStart: track start of pipeline
@@ -256,7 +258,7 @@ process parseMetadata {
path experiment from experimentMeta
output:
path "design.csv" into metadata_fl
script:
"""
@@ -317,7 +319,7 @@ speciesMeta = Channel.create()
readLengthMeta = Channel.create()
expRID = Channel.create()
studyRID = Channel.create()
metadata_fl.splitCsv(sep: ",", header: false).separate(
endsMeta,
endsManual,
strandedMeta,
@@ -327,6 +329,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
expRID,
studyRID
)

// Replicate metadata for multiple process inputs
endsManual.into {
endsManual_trimData
@@ -350,7 +353,7 @@ process trimData {
path ("*.fq.gz") into fastqsTrim
path ("*.fastq.gz", includeInputs:true) into fastqs_fastqc
path ("*_trimming_report.txt") into trimQC
path ("readLength.csv") into readLengthInfer_fl
script:
"""
@@ -378,11 +381,16 @@ process trimData {
// Extract calculated read length metadata into channel
readLengthInfer = Channel.create()
readLengthInfer_fl.splitCsv(sep: ",", header: false).separate(
readLengthInfer
)

// Replicate inferred read length for multiple process inputs
readLengthInfer.into {
readLengthInfer_aggrQC
readLengthInfer_uploadQC
}
// Replicate trimmed fastq's for multiple process inputs
fastqsTrim.into {
fastqsTrim_alignData
fastqsTrim_downsampleData
@@ -582,7 +590,7 @@ process inferMetadata {
path alignSummary from alignSampleQC_inferMetadata.collect()
output:
path "infer.csv" into inferMetadata_fl
path "${repRID}.infer_experiment.txt" into inferExperiment
script:
@@ -691,7 +699,7 @@ align_moInfer = Channel.create()
percentFInfer = Channel.create()
percentRInfer = Channel.create()
failInfer = Channel.create()
inferMetadata_fl.splitCsv(sep: ",", header: false).separate(
endsInfer,
strandedInfer,
spikeInfer,
@@ -710,11 +718,13 @@ endsInfer.into {
endsInfer_countData
endsInfer_dataQC
endsInfer_aggrQC
endsInfer_uploadQC
}
strandedInfer.into {
strandedInfer_alignData
strandedInfer_countData
strandedInfer_aggrQC
strandedInfer_uploadQC
}
spikeInfer.into{
spikeInfer_getRef
@@ -986,7 +996,7 @@ process countData {
output:
path ("*.tpmTable.csv") into counts
path ("*.countData.summary") into countsQC
path ("assignedReads.csv") into assignedReadsInfer_fl
script:
"""
@@ -1035,10 +1045,16 @@ process countData {
// Extract number of assigned reads metadata into channel
assignedReadsInfer = Channel.create()
assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
assignedReadsInfer
)
// Replicate inferred assigned reads for multiple process inputs
assignedReadsInfer.into {
assignedReadsInfer_aggrQC
assignedReadsInfer_uploadQC
}
/*
 *fastqc: run fastqc on untrimmed fastq's
 */
@@ -1050,7 +1066,7 @@ process fastqc {
output:
path ("*_fastqc.zip") into fastqc
path ("rawReads.csv") into rawReadsInfer_fl
script:
"""
@@ -1068,10 +1084,16 @@ process fastqc {
// Extract number of raw reads metadata into channel
rawReadsInfer = Channel.create()
rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
rawReadsInfer
)
// Replicate inferred raw reads for multiple process inputs
rawReadsInfer.into {
rawReadsInfer_aggrQC
rawReadsInfer_uploadQC
}
/*
 *dataQC: calculate transcript integrity numbers (TIN) and bin, as well as calculate inner distance of PE replicates
 */
@@ -1087,7 +1109,7 @@ process dataQC {
output:
path "${repRID}.tin.hist.tsv" into tinHist
path "${repRID}.tin.med.csv" into tinMedInfer_fl
path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
script:
@@ -1122,7 +1144,7 @@ process dataQC {
// Extract median TIN metadata into channel
tinMedInfer = Channel.create()
tinMedInfer_fl.splitCsv(sep: ",", header: false).separate(
tinMedInfer
)
@@ -1158,9 +1180,9 @@ process aggrQC {
val spikeI from spikeInfer_aggrQC
val speciesI from speciesInfer_aggrQC
val readLengthM from readLengthMeta
val readLengthI from readLengthInfer_aggrQC
val rawReadsI from rawReadsInfer_aggrQC
val assignedReadsI from assignedReadsInfer_aggrQC
val tinMedI from tinMedInfer
val expRID
val studyRID
@@ -1288,7 +1310,7 @@ process uploadInputBag {
path credential, stageAs: "credential.json" from deriva_uploadInputBag
output:
path ("inputBagRID.csv") into inputBagRID_fl
script:
"""
@@ -1334,8 +1356,9 @@ process uploadInputBag {
"""
}
// Extract input bag RID into channel
inputBagRID = Channel.create()
inputBagRID_fl.splitCsv(sep: ",", header: false).separate(
inputBagRID
)
@@ -1353,7 +1376,7 @@ process uploadExecutionRun {
val inputBagRID
output:
path ("executionRunRID.csv") into executionRunRID_fl
script:
"""
@@ -1402,11 +1425,70 @@ process uploadExecutionRun {
"""
}
// Extract execution run RID into channel
executionRunRID = Channel.create()
executionRunRID_fl.splitCsv(sep: ",", header: false).separate(
executionRunRID
)
/*
 * uploadQC: uploads the mRNA QC metrics
 */
process uploadQC {
  tag "${repRID}"

  input:
    path script_uploadQC
    path credential, stageAs: "credential.json" from deriva_uploadQC
    val executionRunRID
    val ends from endsInfer_uploadQC
    val stranded from strandedInfer_uploadQC
    val length from readLengthInfer_uploadQC
    val rawCount from rawReadsInfer_uploadQC
    val finalCount from assignedReadsInfer_uploadQC

  output:
    path ("qcRID.csv") into qcRID_fl

  script:
    """
    hostname > ${repRID}.uploadQC.log
    ulimit -a >> ${repRID}.uploadQC.log

    if [ "${ends}" == "pe" ]
    then
      end="Paired End"
    elif [ "${ends}" == "se" ]
    then
      end="Single Read"
    fi

    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
    cookie=\${cookie:11:-1}

    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}/Execution_Run=${executionRunRID})
    if [ "\${exist}" == "[]" ]
    then
      qc_rid=\$(python3 uploadQC.py -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u F)
      echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log
    else
      rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
      rid=\${rid:7:-6}
      qc_rid=\$(python3 uploadQC.py -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u \${rid})
      echo LOG: mRNA QC RID updated - \${qc_rid} >> ${repRID}.uploadQC.log
    fi
    echo \${qc_rid} > qcRID.csv
    """
}

// Extract mRNA QC RID into channel
qcRID = Channel.create()
qcRID_fl.splitCsv(sep: ",", header: false).separate(
  qcRID
)
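The insert-or-update branch recovers the RID of an existing record from the raw ERMrest JSON with grep plus the same substring idiom used for the cookie. A standalone sketch with a placeholder payload (the RID and timestamp here are illustrative):

# ERMrest returns a JSON array of matching entities; RCT immediately follows RID
exist='[{"RID":"18-XXXX","RCT":"2021-01-01T00:00:00.000000+00:00"}]'
rid=$(echo ${exist} | grep -o '"RID":".*","RCT')
rid=${rid:7:-6}   # strip the 7-character '"RID":"' prefix and the 6-character '","RCT' suffix
echo ${rid}       # 18-XXXX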
workflow.onError = {
subject = "$workflow.manifest.name FAILED: $params.repRID"
...

workflow/scripts/uploadQC.py (new file):
import argparse
from deriva.core import ErmrestCatalog, get_credential, BaseCLI
import sys
import csv
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--repRID', help="replicate RID", required=True)
    parser.add_argument('-e', '--executionRunRID', help="execution run RID", required=True)
    parser.add_argument('-p', '--ends', help="single/paired ends", required=True)
    parser.add_argument('-s', '--stranded', help="strandedness", required=True)
    parser.add_argument('-l', '--length', help="median read length", required=True)
    parser.add_argument('-w', '--rawCount', help="raw count", required=True)
    parser.add_argument('-f', '--assignedCount', help="final assigned count", required=True)
    parser.add_argument('-n', '--notes', help="notes", default="", required=False)
    parser.add_argument('-o', '--host', help="datahub host", required=True)
    parser.add_argument('-c', '--cookie', help="cookie token", required=True)
    parser.add_argument('-u', '--update', help="update? (F, or the RID of the record to update)", default="F", required=True)
    args = parser.parse_args()
    return args
def main(hostname, catalog_number, credential):
    # Connect to the DERIVA catalog and resolve the RNASeq.mRNA_QC table
    # (args is populated at module level in the __main__ block below)
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    run_table = pb.RNASeq.mRNA_QC

    if args.update == "F":
        # Insert a new mRNA_QC record and report the RID assigned by the catalog
        run_data = {
            "Execution_Run": args.executionRunRID,
            "Replicate": args.repRID,
            "Paired_End": args.ends,
            "Strandedness": args.stranded,
            "Median_Read_Length": args.length,
            "Raw_Count": args.rawCount,
            "Final_Count": args.assignedCount,
            "Notes": args.notes
        }
        entities = run_table.insert([run_data])
        rid = entities[0]["RID"]
    else:
        # Update the existing record whose RID was passed via --update
        run_data = {
            "RID": args.update,
            "Execution_Run": args.executionRunRID,
            "Replicate": args.repRID,
            "Paired_End": args.ends,
            "Strandedness": args.stranded,
            "Median_Read_Length": args.length,
            "Raw_Count": args.rawCount,
            "Final_Count": args.assignedCount,
            "Notes": args.notes
        }
        entities = run_table.update([run_data])
        rid = args.update

    print(rid)
if __name__ == '__main__':
    args = get_args()
    cli = BaseCLI("Custom RNASeq query", None, 1)
    cli.remove_options(["--config-file"])
    host = args.host
    credentials = {"cookie": args.cookie}
    main(host, 2, credentials)
\ No newline at end of file
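For reference, a hypothetical invocation of the script, mirroring the CI test job above (the cookie value is a placeholder; the pipeline extracts the real one from credential.json):

python3 uploadQC.py -r 18-MJ3A -e 18-MJ3C -p "Single Read" -s forward \
    -l 35 -w 5 -f 1 -o dev.gudmap.org -c "webauthn=abc123" -u F
# prints the RID of the inserted mRNA_QC record; pass -u <RID> instead of -u F to update an existing one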