Extract assigned reads count

788a539f · Gervaise Henry · 740d2eff · 788a539f · 788a539f
Commit 788a539f authored 4 years ago by Gervaise Henry
--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
@@ -80,10 +80,12 @@ custom_data:
            Ends
            Stranded
            Spike-in
+            Assigned Reads:
+		        format: '{:,.0f}'
            Read Length:
-		format: '{:,.0f}'
+		        format: '{:,.0f}'
            TIN:
-		format: '{:+.1f}'
+		        format: '{:+.1f}'

        file_format: 'tsv'
        section_name: 'TIN'

--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -842,6 +842,7 @@ process countData {
  output:
    path ("*.countTable.csv") into counts
    path ("*.countData.summary") into countsQC
+    path ("assignedReads") into inferMetadata_assignedReads

  script:
    """
@@ -861,7 +862,7 @@ process countData {
    elif [ "${stranded}" == "reverse" ]
    then
      stranding=2
-      echo -e "LOG: strandedness set to forward stranded [2]" >> ${repRID}.countData.log
+      echo -e "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.log
    fi

    # run featureCounts
@@ -875,6 +876,11 @@ process countData {
    fi
    echo -e "LOG: counted" >> ${repRID}.countData.log

+    assignedReads=grep -m 1 'Assigned' *.countData.summary | grep -oe '\([0-9.]*\)'
+    echo -e \${assignedReads} > assignedReads.csv
+    echo -e "LOG: assigned reads: "\${assignedReads} >> ${repRID}.countData.log
+
+
    # calculate TPM from the resulting countData table
    echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log
    Rscript calculateTPM.R --count "${repRID}.countData"
@@ -905,6 +911,12 @@ process fastqc {
    """
 }

+// Extract number of assigned reads metadata into channel
+assignedReadsInfer = Channel.create()
+inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate(
+  assignedReads
+)
+
 /*
 *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates
 */
@@ -920,7 +932,7 @@ process dataQC {
    
  output:
    path "${repRID}.tin.hist.tsv" into tinHist
-    path "${repRID}.tin.med.csv" into tinMed
+    path "${repRID}.tin.med.csv" into inferMetadata_tinMed
    path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
  
  script:
@@ -955,7 +967,7 @@ process dataQC {

 // Extract median TIN metadata into channel
 tinMedInfer = Channel.create()
-tinMed.splitCsv(sep: ",", header: false).separate(
+inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
  tinMedInfer
 )

@@ -989,6 +1001,7 @@ process aggrQC {
    val speciesI from speciesInfer_aggrQC
    val readLengthM from readLengthMeta
    val readLengthI from readLengthInfer
+    val assignedReadsI from assignedReadsInfer
    val tinMedI from tinMedInfer
    val expRID
    val studyRID
@@ -1008,11 +1021,10 @@ process aggrQC {

    # make metadata table
    echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
-    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRead Length\tTIN" > metadata.tsv
-    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-" >> metadata.tsv
-    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t${readLengthM}\t-" >> metadata.tsv
-    echo -e "Manual\t-\t${endsManual}\t-\t-\t-\t-" >> metadata.tsv
-    echo -e "Measured\t-\t-\t-\t-\t${readLengthI}\t${tinMedI}" >> metadata.tsv
+    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tAssigned Reads\tRead Length\tTIN" > metadata.tsv
+    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-" >> metadata.tsv
+    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t${readLengthM}\t-\t-" >> metadata.tsv
+    echo -e "Measured\t-\t${endsManual}\t-\t-\t${assignedReadsI}\t${readLengthI}\t${tinMedI}" >> metadata.tsv

    # remove inner distance report if it is empty (SE repRID)
    echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log