Clean inference, add tin

12996d71 · Gervaise Henry · 4a68865f · 12996d71 · 12996d71 · 12996d71
Commit 12996d71 authored 5 years ago by Gervaise Henry
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -27,6 +27,9 @@ process {
  withName:fastqc {
    queue = 'super'
  }
+  withName:inferMetadata {
+    queue = 'super'
+  }
 }

 singularity {

--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -39,6 +39,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
 script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
 script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
+script_aggregateInference = Channel.fromPath("${baseDir}/scripts/aggregateInference.R")

 /*
 * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
@@ -195,11 +196,9 @@ stranded.into {
 }
 spike.into{
  spike_getRef
-  spike_rseqc
 }
 species.into {
  species_getRef
-  species_rseqc
 }

 /*
@@ -417,12 +416,13 @@ process inferMetadata {

  input:
  path script_inferMeta
+  path script_aggregateInference
  path reference_rseqc
  set val (repRID), path (inBam), path (inBai) from dedupBam_rseqc
-  val species_rseqc
-  val spike_rseqc

  output:
+  path "infer.csv" into inferMetadata
+

  script:
    """
@@ -434,29 +434,28 @@ process inferMetadata {

    endness=`bash inferMeta.sh endness ${repRID}.rseqc.log`
    fail=`bash inferMeta.sh fail ${repRID}.rseqc.log`
-    echo \${endness}
-    echo \${fail}
    if [ \${endness} == "PairEnd" ] 
    then
      percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log`
      percentR=`bash inferMeta.sh per ${repRID}.rseqc.log`
      inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed
-      junction_saturation.py -i "${inBam}" -r ./bed/genome.bed -o ${repRID}.junctionSaturation
    elif [ \${endness} == "SingleEnd" ]
    then
      percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log`
      percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log`
    fi
-    if [ \$percentF > 0.25 ] && [ \$percentR < 0.25 ]
+    if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ]
    then
+      stranded="forward"
      if [ \$endness == "PairEnd" ]
      then
        strategy="1++,1--,2+-,2-+"
      else
        strategy="++,--"
      fi
-    elif [ \$percentR > 0.25 ] && [ \$percentF < 0.25 ]
+    elif [ \$percentR -gt 0.25 ] && [ \$percentF -lt 0.25 ]
    then
+      stranded="reverse"
      if [ \$endness == "PairEnd" ]
      then
        strategy="1+-,1-+,2++,2--"
@@ -464,15 +463,38 @@ process inferMetadata {
        strategy="+-,-+"
      fi
    else
+      stranded="unstranded"
      strategy="us"
    fi
-    if [ \$strategy != "us" ]
-    then
-      FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy
-      RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy
-    else
-      FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed
-      RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed
-    fi
+
+    # calcualte TIN values per feature
+    tin.py -i "${inBam}" -r ./bed/genome.bed
+
+    # aggregate infered metadata (including generate TIN stats)
+    Rscript aggregateInference.R --endness "\${endness}" --stranded "\${stranded}" --strategy "\${strategy}" --percentF \${percentF} --percentR \${percentR} --percentFail \${fail} --tin "${inBam.baseName}.tin.xls"
    """
-}
\ No newline at end of file
+}
+
+// Split infered metadata into separate channels
+endsMetaI = Channel.create()
+strandedI = Channel.create()
+strategyI = Channel.create()
+percentFI = Channel.create()
+percentRI = Channel.create()
+percentFailI = Channel.create()
+tinMinI = Channel.create()
+tinMedI = Channel.create()
+tinMaxI = Channel.create()
+tinSDI = Channel.create()
+inferMetadata.splitCsv(sep: ",", header: false).separate(
+  endsMetaI,
+  strandedI,
+  strategyI,
+  percentFI,
+  percentRI,
+  percentFailI,
+  tinMinI,
+  tinMedI,
+  tinMaxI,
+  tinSDI
+)
--- a/workflow/scripts/aggregateInference.R
+++ b/workflow/scripts/aggregateInference.R
+gc()
+library(optparse)
+
+option_list=list(
+  make_option("--endness",action="store",type='character',help="Infered Endness"),
+  make_option("--stranded",action="store",type='character',help="Infered Strandedness"),
+  make_option("--strategy",action="store",type='character',help="Infered Sequencing Strategy"),
+  make_option("--percentF",action="store",type='double',help="Percent Forward Aligned Reads"),
+  make_option("--percentR",action="store",type='double',help="Percent Reverse Aligned Reads"),
+  make_option("--percentFail",action="store",type='double',help="Percent Failed Aligned Reads"),
+  make_option("--tin",action="store",type='character',help="TIN File")
+)
+opt=parse_args(OptionParser(option_list=option_list))
+rm(option_list)
+
+if (length(setdiff(c("endness","stranded","strategy","percentF","percentR","percentFail","tin","help"),names(opt))) != 0){
+  stop(paste0("No input missing ",setdiff(c("endness","stranded","strategy","percentF","percentR","percentFail","tin","help"),names(opt)),", exiting."))
+} else if (!file.exists(opt$tin)){
+  stop("No tin file passed, exiting.")
+}
+
+if (opt$endness == "PairEnd"){
+  endness <- "pe"
+} else if (opt$endness == "SingleEnd"){
+  endness <- "se"
+}
+
+percentF <- round(opt$percentF,digits=2)
+percentR <- round(opt$percentR,digits=2)
+percentFail <- round(opt$percentFail,digits=2)
+
+repRID <- basename(gsub(".sorted.deduped.tin.xls","",opt$tin))
+
+tin <- read.delim(opt$tin,sep="\t")
+
+tin.min <- round(min(tin$TIN),digits=2)
+tin.med <- round(median(tin$TIN),digits=2)
+tin.max <- round(max(tin$TIN),digits=2)
+tin.sd <- round(sd(tin$TIN),digits=2)
+
+output <- paste(endness,opt$stranded,opt$strategy,percentF,percentR,percentFail,tin.min,tin.med,tin.max,tin.sd,sep=",")
+
+write(output,file="infer.csv")