From 82bac4e793f958f9b10641e63f3f6adf5b77d0c4 Mon Sep 17 00:00:00 2001
From: s181706 <jonathan.gesell@utsouthwestern.edu>
Date: Thu, 26 Mar 2020 18:06:00 -0500
Subject: [PATCH] Parallelized inferMetadata's TIN calculation.

---
 workflow/rna-seq.nf | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 5c18610..b4111e5 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -347,7 +347,8 @@ process dedupData {
     set path (inBam), path (inBai) from rawBam_dedupData
 
   output:
-    tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai"), path ("${repRID}.sorted.deduped.*.bam") into dedupBam
+    tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
+    tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam 
     path ("*.deduped.Metrics.txt") into dedupQC
     path ("${repRID}.dedup.out")
     path ("${repRID}.dedup.err")
@@ -364,7 +365,7 @@ process dedupData {
     # Split the deduped BAM file for multi-threaded tin calculation
     for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`;
       do
-      echo "echo \"LOG: splitting each chromosome into its own BAM file with Samtools\" >> ${repRID}.dedup.err; samtools view -b ${repRID}.sorted.deduped.bam \${i} > ${repRID}.sorted.deduped.\${i}.bam"
+      echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\" >> ${repRID}.dedup.err; samtools view -b ${repRID}.sorted.deduped.bam \${i} > ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai"
     done | parallel -j `nproc` -k 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
     """
 }
@@ -428,7 +429,8 @@ process inferMetadata {
   input:
     path script_inferMeta
     path reference_inferMeta
-    set path (inBam), path (inBai), path (inBamChr) from dedupBam_inferMeta
+    set path (inBam), path (inBai) from dedupBam_inferMeta
+    set path (inChrBam), path (inChrBai) from dedupChrBam
 
   output:
     path "infer.tsv" into inferedMetadata
@@ -480,13 +482,11 @@ process inferMetadata {
     fi
 
     # calcualte TIN values per feature on each chromosome
-    for i in `find sorted.deduped.*.bam`;
-      do 
-      echo "\"LOG: running tin.py on \${i}\" >> ${repRID}.rseqc.err\"; tin.py -i \"\${i}\" -r ./bed/genome.bed 1>>${repRID}.rseqc.log 2>>${repRID}.rseqc.err"
-    done | shuf | parallel -j `nproc` -k
-
+    for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do
+      echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.rseqc.err; tin.py -i ${repRID}.sorted.deduped.\${i}.bam  -r ./bed/genome.bed 1>>${repRID}.rseqc.log 2>>${repRID}.rseqc.err; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \\\"[:blank:]\\\" \\\"\\\\t\\\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
+    done | parallel -j `nproc` -k > ${repRID}.sorted.deduped.tin.xls 2>>${repRID}.rseqc.err
 
     # write infered metadata to file
-    echo -e \${endness}'\\t'\${stranded}'\\t'\${strategy}'\\t'\${percentF}'\\t'\${percentR}'\\t'\${fail} > infer.tsv
+    echo -e "\${endness}\\t\${stranded}\\t\${strategy}\\t\${percentF}\\t\${percentR}\\t\${fail}" > infer.tsv
     """
 }
-- 
GitLab