From d0be69dab81e2ab982f32220298d6b3d0bb464c9 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 3 Aug 2020 15:27:48 -0500 Subject: [PATCH] Measure median trimmed read length and display in multiqc table report --- .gitlab-ci.yml | 2 +- workflow/rna-seq.nf | 27 ++++++++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2e48991..29f4925 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -39,7 +39,7 @@ parseMetadata: - spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) - species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength) - - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv + - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv - pytest -m parseMetadata inferMetadata: diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 1443869..ee012c2 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -241,7 +241,7 @@ process parseMetadata { readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p readLength) echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log - # gave design file + # save design file echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv """ } @@ -287,6 +287,7 @@ process trimData { output: path ("*.fq.gz") into fastqsTrim path ("*_trimming_report.txt") into trimQC + path 
("readLength.csv") into inferMetadata_readLength script: """ @@ -298,14 +299,27 @@ process trimData { if [ "${ends}" == "se" ] then trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} + readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') elif [ "${ends}" == "pe" ] then trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} + readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') fi echo -e "LOG: trimmed" >> ${repRID}.trimData.log + echo -e "LOG: median trimmed read length: \${readLength}" >> ${repRID}.trimData.log + + # save read length file + echo -e "\${readLength}" > readLength.csv """ } +// Split metadata into separate channels +readLengthInfer = Channel.create() +inferMetadata_readLength.splitCsv(sep: ",", header: false).separate( + readLengthInfer +) + + // Replicate trimmed fastq's fastqsTrim.into { fastqsTrim_alignData @@ -962,6 +976,8 @@ process aggrQC { val strandedI from strandedInfer_aggrQC val spikeI from spikeInfer_aggrQC val speciesI from speciesInfer_aggrQC + val readLengthM from readLengthMeta + val readLengthI from readLengthInfer val expRID val studyRID @@ -980,10 +996,11 @@ process aggrQC { # make metadata table echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log - echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in" > metadata.tsv - echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}" >> metadata.tsv - echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}" >> metadata.tsv - echo -e "Manual\t-\t${endsManual}\t-\t-" >> metadata.tsv + echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRead Length" > metadata.tsv + echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-" >> metadata.tsv + echo -e 
"Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t${readLengthM}" >> metadata.tsv + echo -e "Manual\t-\t${endsManual}\t-\t-\t-" >> metadata.tsv + echo -e "Measured\t-\t-\t-\t-\t${readLengthI}" >> metadata.tsv # remove inner distance report if it is empty (SE repRID) echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log -- GitLab