From d0be69dab81e2ab982f32220298d6b3d0bb464c9 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Mon, 3 Aug 2020 15:27:48 -0500
Subject: [PATCH] Measure median trimmed read length and display in multiqc
 table report

---
 .gitlab-ci.yml      |  2 +-
 workflow/rna-seq.nf | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2e48991..29f4925 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -39,7 +39,7 @@ parseMetadata:
   - spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
   - species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
   - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength)
-  -  echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
+  - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
   - pytest -m parseMetadata
 
 inferMetadata:
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 1443869..ee012c2 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -241,7 +241,7 @@ process parseMetadata {
     readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p readLength)
     echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log
 
-    # gave design file
+    # save design file
     echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv
     """
 }
@@ -287,6 +287,7 @@ process trimData {
   output:
     path ("*.fq.gz") into fastqsTrim
     path ("*_trimming_report.txt") into trimQC
+    path ("readLength.csv") into inferMetadata_readLength
 
   script:
     """
@@ -298,14 +299,27 @@ process trimData {
     if [ "${ends}" == "se" ]
     then
       trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
+      readLength=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     elif [ "${ends}" == "pe" ]
     then
       trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
+      readLength=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     fi
     echo -e "LOG: trimmed" >> ${repRID}.trimData.log
+    echo -e "LOG: average trimmed read length: /${readLength}" >> ${repRID}.trimData.log
+    
+    # save read length file
+    echo -e "\${readLength}" > readLength.csv
     """
 }
 
+// Split metadata into separate channels
+readLengthInfer = Channel.create()
+inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
+  readLengthInfer
+}
+
+
 // Replicate trimmed fastq's
 fastqsTrim.into {
   fastqsTrim_alignData
@@ -962,6 +976,8 @@ process aggrQC {
     val strandedI from strandedInfer_aggrQC
     val spikeI from spikeInfer_aggrQC
     val speciesI from speciesInfer_aggrQC
+    val readLengthM from readLengthMeta
+    val readLengthI from readLengthInfer
     val expRID
     val studyRID
 
@@ -980,10 +996,11 @@ process aggrQC {
 
     # make metadata table
     echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
-    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in" > metadata.tsv
-    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}" >> metadata.tsv
-    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}" >> metadata.tsv
-    echo -e "Manual\t-\t${endsManual}\t-\t-" >> metadata.tsv
+    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRead Length" > metadata.tsv
+    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-" >> metadata.tsv
+    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t${readLengthM}" >> metadata.tsv
+    echo -e "Manual\t-\t${endsManual}\t-\t-\t-" >> metadata.tsv
+    echo -e "Measured\t-\t-\t-\t-\t${readLengthI}"
 
     # remove inner distance report if it is empty (SE repRID)
     echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
-- 
GitLab