From 75c3181000c3a71cc1b6093b8f1ca7e20e17dc19 Mon Sep 17 00:00:00 2001
From: s181706 <jonathan.gesell@utsouthwestern.edu>
Date: Wed, 29 Jan 2020 17:50:18 -0600
Subject: [PATCH] Initial branch commit, tested and working.

---
 workflow/conf/biohpc.config |  3 +++
 workflow/nextflow.config    |  3 +++
 workflow/rna-seq.nf         | 36 ++++++++++++++++++++++++++++++++----
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index e2b1335..dbc4e53 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -18,6 +18,9 @@ process {
   withName:alignReads {
     queue = '256GB,256GBv1,384GB'
   }
+  withName: dedupReads {
+    queue = '128GB,256GB,256GBv1,384GB'
+  }
 }
 
 singularity {
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index acb710d..67cb95e 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -26,6 +26,9 @@ process {
   withName: alignReads {
     container = 'bicf/gudmaprbkaligner:2.0.0'
   }
+  withName: dedupReads {
+    container = 'bicf/picard2.21.7:2.0.0'
+  }
 }
 
 trace {
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index ff34829..a15e439 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -244,17 +244,45 @@ process alignReads {
     val referenceDir_alignReads
 
   output:
-    set repRID, file ("${repRID}.unal.gz"), file ("${repRID}.sorted.bam"), file ("${repRID}.sorted.bai")
-    set repRID, file ("${repRID}.align.out"), file ("${repRID}.align.err")
+    tuple repRID, file ("${repRID}.unal.gz"), file ("${repRID}.sorted.bam"), file ("${repRID}.sorted.bai") into dedupReads
+    tuple repRID, file ("${repRID}.align.out"), file ("${repRID}.align.err")
 
   script:
     """
+    hostname >${repRID}.align.err
+    ulimit -a >>${repRID}.align.err
+
+    # align reads
     if [ "${endsManual_alignReads}" == 'pe' ]; then
-    hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} --no-mixed --no-discordant -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2>${repRID}.align.err;
-    else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} -U ${fqs[0]} 1>${repRID}.align.out 2>${repRID}.align.err;
+    hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} --no-mixed --no-discordant -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2>>${repRID}.align.err;
+    else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} -U ${fqs[0]} 1>${repRID}.align.out 2>>${repRID}.align.err;
     fi;
     samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
     samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
     samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
     """
 }
+
+/*
+ * dedupReads: remove duplicate reads (Picard MarkDuplicates run with REMOVE_DUPLICATES=true), specifically focused on PCR or optical duplicates
+*/
+process dedupReads {  // runs Picard MarkDuplicates on the sorted BAM received from the `dedupReads` channel
+  tag "${repRID}"
+  publishDir "${outDir}/deduped", mode: 'copy', pattern: "${repRID}.deduped.{bam,Metrics.txt}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}"
+
+  input:  // only sortedBam is named on the command line below; unal and sortedBai are received but not referenced
+    tuple repRID, file (unal), file (sortedBam), file (sortedBai) from dedupReads
+
+  output:  // no `into` channel is declared, so these files are only published, not passed to a later process here
+    tuple repRID, file ("${repRID}.deduped.bam"), file ("${repRID}.deduped.Metrics.txt")
+
+  script:  // logs hostname and ulimit settings to the .err file, then runs Picard with REMOVE_DUPLICATES=true
+    """
+    hostname >${repRID}.dedup.err
+    ulimit -a >>${repRID}.dedup.err
+
+    #Remove duplicated reads
+    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${sortedBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
+    """
+}
-- 
GitLab