From fa11871a5efbc5211f8366743369876763241242 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Wed, 5 Aug 2020 20:09:31 -0500
Subject: [PATCH] Extract raw reads

---
 workflow/conf/multiqc_config.yaml |  5 +++--
 workflow/rna-seq.nf               | 19 +++++++++++++++----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
index 3c1c535..99a861c 100644
--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
@@ -80,9 +80,10 @@ custom_data:
             Ends
             Stranded
             Spike-in
+            Raw Reads
             Assigned Reads
-            Read Length
-            TIN
+            Median Read Length
+            Median TIN
 
         file_format: 'tsv'
         section_name: 'TIN'
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 1871139..b1cf47e 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -902,6 +902,7 @@ process fastqc {
 
   output:
     path ("*_fastqc.zip") into fastqc
+    path ("rawReads.csv") into inferMetadata_rawReads // name must match the file written by the script block
 
   script:
     """
@@ -911,9 +912,18 @@ process fastqc {
     # run fastqc
     echo -e "LOG: running fastq on raw fastqs" >> ${repRID}.fastqc.log
     fastqc *.fastq.gz -o .
+
+    # count raw reads (4 fastq lines per read); \$ is escaped so bash, not Groovy, evaluates the arithmetic
+    echo \$(( \$(zcat ${fastq[0]} | wc -l) / 4 )) > rawReads.csv
     """
 }
 
+// Extract number of raw reads metadata into channel
+rawReadsInfer = Channel.create()
+inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate(
+  rawReadsInfer
+)
+
 /*
  *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates
 */
@@ -998,6 +1008,7 @@ process aggrQC {
     val speciesI from speciesInfer_aggrQC
     val readLengthM from readLengthMeta
     val readLengthI from readLengthInfer
+    val rawReadsI from rawReadsInfer
     val assignedReadsI from assignedReadsInfer
     val tinMedI from tinMedInfer
     val expRID
@@ -1018,10 +1029,10 @@ process aggrQC {
 
     # make metadata table
     echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
-    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tAssigned Reads\tRead Length\tTIN" > metadata.tsv
-    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-" >> metadata.tsv
-    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t${readLengthM}\t-" >> metadata.tsv
-    echo -e "Measured\t-\t${endsManual}\t-\t-\t${assignedReadsI}\t${readLengthI}\t${tinMedI}" >> metadata.tsv
+    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
+    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t${readLengthM}\t-" >> metadata.tsv
+    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    echo -e "Measured\t-\t${endsManual}\t-\t-\t${rawReadsI}\t${assignedReadsI}\t${readLengthI}\t${tinMedI}" >> metadata.tsv
 
     # remove inner distance report if it is empty (SE repRID)
     echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
-- 
GitLab