From 12996d717d1720bef95679f6e017475bcfacb366 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Thu, 12 Mar 2020 19:39:07 -0500
Subject: [PATCH] Clean inference, add tin

---
 workflow/conf/biohpc.config           |  3 ++
 workflow/rna-seq.nf                   | 58 ++++++++++++++++++---------
 workflow/scripts/aggregateInference.R | 43 ++++++++++++++++++++
 3 files changed, 86 insertions(+), 18 deletions(-)
 create mode 100644 workflow/scripts/aggregateInference.R

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index b09cba3..ea9d137 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -27,6 +27,9 @@ process {
   withName:fastqc {
     queue = 'super'
   }
+  withName:inferMetadata {
+    queue = 'super'
+  }
 }
 
 singularity {
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index e8c1fce..15e8eff 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -39,6 +39,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
 script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
 script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
+script_aggregateInference = Channel.fromPath("${baseDir}/scripts/aggregateInference.R")
 
 /*
  * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
@@ -195,11 +196,9 @@ stranded.into {
 }
 spike.into{
   spike_getRef
-  spike_rseqc
 }
 species.into {
   species_getRef
-  species_rseqc
 }
 
 /*
@@ -417,12 +416,13 @@ process inferMetadata {
 
   input:
   path script_inferMeta
+  path script_aggregateInference
   path reference_rseqc
   set val (repRID), path (inBam), path (inBai) from dedupBam_rseqc
-  val species_rseqc
-  val spike_rseqc
 
   output:
+  path "infer.csv" into inferMetadata
+
 
   script:
     """
@@ -434,29 +434,28 @@ process inferMetadata {
 
     endness=`bash inferMeta.sh endness ${repRID}.rseqc.log`
     fail=`bash inferMeta.sh fail ${repRID}.rseqc.log`
-    echo \${endness}
-    echo \${fail}
     if [ \${endness} == "PairEnd" ] 
     then
       percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log`
       percentR=`bash inferMeta.sh per ${repRID}.rseqc.log`
       inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed
-      junction_saturation.py -i "${inBam}" -r ./bed/genome.bed -o ${repRID}.junctionSaturation
     elif [ \${endness} == "SingleEnd" ]
     then
       percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log`
       percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log`
     fi
-    if [ \$percentF > 0.25 ] && [ \$percentR < 0.25 ]
+    if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ]
     then
+      stranded="forward"
       if [ \$endness == "PairEnd" ]
       then
         strategy="1++,1--,2+-,2-+"
       else
         strategy="++,--"
       fi
-    elif [ \$percentR > 0.25 ] && [ \$percentF < 0.25 ]
+    elif [ \$percentR -gt 0.25 ] && [ \$percentF -lt 0.25 ]
     then
+      stranded="reverse"
       if [ \$endness == "PairEnd" ]
       then
         strategy="1+-,1-+,2++,2--"
@@ -464,15 +463,38 @@ process inferMetadata {
         strategy="+-,-+"
       fi
     else
+      stranded="unstranded"
       strategy="us"
     fi
-    if [ \$strategy != "us" ]
-    then
-      FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy
-      RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy
-    else
-      FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed
-      RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed
-    fi
+
+    # calculate TIN values per feature
+    tin.py -i "${inBam}" -r ./bed/genome.bed
+
+    # aggregate inferred metadata (including generating TIN stats)
+    Rscript aggregateInference.R --endness "\${endness}" --stranded "\${stranded}" --strategy "\${strategy}" --percentF \${percentF} --percentR \${percentR} --percentFail \${fail} --tin "${inBam.baseName}.tin.xls"
     """
-}
\ No newline at end of file
+}
+
+// Split inferred metadata into separate channels
+endsMetaI = Channel.create()
+strandedI = Channel.create()
+strategyI = Channel.create()
+percentFI = Channel.create()
+percentRI = Channel.create()
+percentFailI = Channel.create()
+tinMinI = Channel.create()
+tinMedI = Channel.create()
+tinMaxI = Channel.create()
+tinSDI = Channel.create()
+inferMetadata.splitCsv(sep: ",", header: false).separate(
+  endsMetaI,
+  strandedI,
+  strategyI,
+  percentFI,
+  percentRI,
+  percentFailI,
+  tinMinI,
+  tinMedI,
+  tinMaxI,
+  tinSDI
+)
diff --git a/workflow/scripts/aggregateInference.R b/workflow/scripts/aggregateInference.R
new file mode 100644
index 0000000..3e8388d
--- /dev/null
+++ b/workflow/scripts/aggregateInference.R
@@ -0,0 +1,43 @@
+gc()
+library(optparse)
+
+option_list=list(
+  make_option("--endness",action="store",type='character',help="Infered Endness"),
+  make_option("--stranded",action="store",type='character',help="Infered Strandedness"),
+  make_option("--strategy",action="store",type='character',help="Infered Sequencing Strategy"),
+  make_option("--percentF",action="store",type='double',help="Percent Forward Aligned Reads"),
+  make_option("--percentR",action="store",type='double',help="Percent Reverse Aligned Reads"),
+  make_option("--percentFail",action="store",type='double',help="Percent Failed Aligned Reads"),
+  make_option("--tin",action="store",type='character',help="TIN File")
+)
+opt=parse_args(OptionParser(option_list=option_list))
+rm(option_list)
+
+if (length(setdiff(c("endness","stranded","strategy","percentF","percentR","percentFail","tin","help"),names(opt))) != 0){
+  stop(paste0("No input missing ",setdiff(c("endness","stranded","strategy","percentF","percentR","percentFail","tin","help"),names(opt)),", exiting."))
+} else if (!file.exists(opt$tin)){
+  stop("No tin file passed, exiting.")
+}
+
+if (opt$endness == "PairEnd"){
+  endness <- "pe"
+} else if (opt$endness == "SingleEnd"){
+  endness <- "se"
+}
+
+percentF <- round(opt$percentF,digits=2)
+percentR <- round(opt$percentR,digits=2)
+percentFail <- round(opt$percentFail,digits=2)
+
+repRID <- basename(gsub(".sorted.deduped.tin.xls","",opt$tin))
+
+tin <- read.delim(opt$tin,sep="\t")
+
+tin.min <- round(min(tin$TIN),digits=2)
+tin.med <- round(median(tin$TIN),digits=2)
+tin.max <- round(max(tin$TIN),digits=2)
+tin.sd <- round(sd(tin$TIN),digits=2)
+
+output <- paste(endness,opt$stranded,opt$strategy,percentF,percentR,percentFail,tin.min,tin.med,tin.max,tin.sd,sep=",")
+
+write(output,file="infer.csv")
-- 
GitLab