diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 63a773a70a31266f96bf54a1bdc61c2f8fc94098..20659ed35f42727c0f2284be96318dc1d7162220 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -130,7 +130,7 @@ dedupData: - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai - > - for i in {"chr8","chr4","chrY"}; do + for i in {"chr8","chr4","chrY"}; do echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k - pytest -m dedupData @@ -145,7 +145,7 @@ countData: script: - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv - - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R 
--count ./test_data/counts/small/Q-Y5F6_1M.se.countData - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') @@ -283,7 +283,7 @@ integration_se: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 -with-dag dag.png --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 -with-dag dag.png --ci true --email 'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu' - find . -type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -366,7 +366,7 @@ override_fastq: max: 1 when: - always - + override_species: stage: integration only: [merge_requests] @@ -388,7 +388,7 @@ override_species: max: 1 when: - always - + consistency: stage: consistency @@ -413,4 +413,4 @@ consistency: - assignedPE.txt - assignedExpectSE.txt - assignedExpectPE.txt - expire_in: 7 days \ No newline at end of file + expire_in: 7 days diff --git a/CHANGELOG.md b/CHANGELOG.md index 21e53310538eb6756056fa6a43f59313817c12a8..d9de614cb1b0a85c3442a9f9f8db16fdd000aaeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # v0.0.4 (in development) **User Facing** * Add option to pull references from datahub +* Add option to send email on workflow error, with pipeline error message **Background** * Remove (comment out) option to pull references from S3 diff --git a/README.md b/README.md index e3134bf4b3b6d7cbe0521fe58d7567ec44f34da4..acfeb4dd1074aa34d09199a19642367f36255ec0 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,9 @@ To Run: * **biohpc_max** = process on high power BioHPC cluster nodes (=> 128GB nodes), for resource testing * **aws_ondemand** = AWS Batch on-demand instant requests * 
**aws_spot** = AWS Batch spot instance requests + * `--email` email address(es) to notify when the pipeline fails (comma-separated) ***(optional)***: + * e.g.: `--email 'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'` + * NOTES: * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials) * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version) @@ -126,4 +129,4 @@ Please cite in publications: Pipeline was developed by BICF from funding provide Pipeline Directed Acyclic Graph ------------------------------- - \ No newline at end of file + diff --git a/docs/dag.png b/docs/dag.png old mode 100755 new mode 100644 index 836be2de2281072566a79d1c3f1d74ac11a2aa69..bbc8bffc0fed5b15c7542c8170caacec76a57727 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 218cf2ae2e6c306be80f1bc0ccfd17da9125e9a8..f9fcef964a79aafb697c020defa67e68b93f5ec0 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -82,7 +82,7 @@ timeline { enabled = false file = 'timeline.html' } - + report { enabled = false file = 'report.html' @@ -94,6 +94,7 @@ tower { } manifest { + name = 'gudmap_rbk/rna-seq' homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' 
mainScript = 'rna-seq.nf' diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index a94077891768522aac7320ab35922c7ab3361bd4..7a7207ec05d8e44d4f1ab8824c8ba6a20490295c 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -// ######## #### ###### ######## -// ## ## ## ## ## ## -// ## ## ## ## ## -// ######## ## ## ###### -// ## ## ## ## ## -// ## ## ## ## ## ## -// ######## #### ###### ## +// ######## #### ###### ######## +// ## ## ## ## ## ## +// ## ## ## ## ## +// ######## ## ## ###### +// ## ## ## ## ## +// ## ## ## ## ## ## +// ######## #### ###### ## // Define input variables params.deriva = "${baseDir}/../test_data/auth/credential.json" @@ -18,6 +18,8 @@ params.refMoVersion = "38.p6.vM22" params.refHuVersion = "38.p12.v31" params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" +params.email = "" + // Define override input variable params.refSource = "biohpc" @@ -25,6 +27,7 @@ params.inputBagForce = "" params.fastqsForce = "" params.speciesForce = "" + // Parse input variables deriva = Channel .fromPath(params.deriva) @@ -46,6 +49,7 @@ logsDir = "${outDir}/Logs" inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce speciesForce = params.speciesForce +email = params.email // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") @@ -89,7 +93,7 @@ process trackStart { """ hostname ulimit -a - + curl -H 'Content-Type: application/json' -X PUT -d \ '{ \ "sessionId": "${workflow.sessionId}", \ @@ -199,16 +203,16 @@ process getData { mkdir -p ~/.bdbag ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt echo -e "LOG: linked" >> ${repRID}.getData.log - + # get bag basename replicate=\$(basename "${inputBag}" | cut -d "." 
-f1) echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log - + # unzip bag echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log unzip ${inputBag} echo -e "LOG: unzipped" >> ${repRID}.getData.log - + # bag fetch fastq's only and rename by repRID echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log sh ${script_bdbagFetch} \${replicate} ${repRID} @@ -259,7 +263,7 @@ process parseMetadata { # get experiment RID metadata exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID) echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log - + # get study RID metadata study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID) echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log @@ -267,7 +271,7 @@ process parseMetadata { # get endedness metadata endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta) echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log - + # ganually get endness endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual) echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log @@ -275,11 +279,11 @@ process parseMetadata { # get strandedness metadata stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded) echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log - + # get spike-in metadata spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike) echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log - + # get species metadata species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species) echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log @@ -358,7 +362,7 @@ process trimData { fi echo -e "LOG: trimmed" >> ${repRID}.trimData.log echo 
-e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log - + # save read length file echo -e "\${readLength}" > readLength.csv """ @@ -381,7 +385,7 @@ getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refD /* * getRefInfer: dowloads appropriate reference for metadata inference -*/ +*/ process getRefInfer { tag "${refName}" @@ -391,7 +395,7 @@ process getRefInfer { output: tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer path ("${refName}", type: 'dir') into bedInfer - + script: """ hostname > ${repRID}.${refName}.getRefInfer.log @@ -532,14 +536,14 @@ process alignSampleData { echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log if [ "${ends}" == "se" ] then - + hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary elif [ "${ends}" == "pe" ] then hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary fi echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log - + # convert the output sam file to a sorted bam file using Samtools echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam @@ -639,7 +643,7 @@ process inferMetadata { ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt` fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt` - if [ \${ended} == "PairEnd" ] + if [ \${ended} == "PairEnd" ] then ends="pe" percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt` @@ -728,7 +732,7 @@ process getRef { output: tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference - + script: """ hostname > 
${repRID}.getRef.log @@ -847,7 +851,7 @@ process alignData { strandedParam="--rna-strandness R" elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ] then - strandedParam="--rna-strandness RF" + strandedParam="--rna-strandness RF" fi # align the reads with Hisat2 @@ -860,7 +864,7 @@ process alignData { hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary fi echo -e "LOG: alignined" >> ${repRID}.align.log - + # convert the output sam file to a sorted bam file using Samtools echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam @@ -892,7 +896,7 @@ process dedupData { output: tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam - tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam + tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam path ("*.deduped.Metrics.txt") into dedupQC script: @@ -908,7 +912,7 @@ process dedupData { # sort the bam file using Samtools echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam - + # index the sorted bam using Samtools echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai @@ -1004,7 +1008,7 @@ process countData { featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log - + # extract assigned reads grep -m 1 'Assigned' 
*.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv @@ -1069,12 +1073,12 @@ process dataQC { tuple path (bam), path (bai) from dedupBam_dataQC tuple path (chrBam), path (chrBai) from dedupChrBam val ends from endsInfer_dataQC - + output: path "${repRID}.tin.hist.tsv" into tinHist path "${repRID}.tin.med.csv" into inferMetadata_tinMed path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance - + script: """ hostname > ${repRID}.dataQC.log @@ -1179,8 +1183,8 @@ process aggrQC { echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log echo -e "Session\tSession ID\tStart Time\tPipeline Version\tInput" > run.tsv echo -e "Session\t${workflow.sessionId}\t${workflow.start}\t${workflow.manifest.version}\t\${input}" >> run.tsv - - + + # make RID table echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv @@ -1224,11 +1228,11 @@ process aggrQC { process outputBag { tag "${repRID}" publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip" - + input: path multiqc path multiqcJSON - + output: path ("Replicate_*.zip") into outputBag @@ -1239,4 +1243,25 @@ process outputBag { cp ${multiqcJSON} Replicate_${repRID}.outputBag bdbag Replicate_${repRID}.outputBag --archiver zip """ -} \ No newline at end of file +} + + +workflow.onError = { + subject = "$workflow.manifest.name FAILED: $params.repRID" + + def msg = """\ + + Pipeline error summary + --------------------------- + RID : ${params.repRID} + Version : ${workflow.manifest.version} + Duration : ${workflow.duration} + Nf Version : ${workflow.nextflow.version} + Message : ${workflow.errorMessage} + exit status : ${workflow.exitStatus} + """ + .stripIndent() + if (email != '') { + sendMail(to: email, subject: subject , body: msg) + } +}