From 3d8cc5f784200482eb4d8aefeadb32e7e4eb1263 Mon Sep 17 00:00:00 2001
From: s181706 <jonathan.gesell@utsouthwestern.edu>
Date: Mon, 11 Nov 2019 14:31:54 -0600
Subject: [PATCH] Added trimming step using trimGalore and logging.

---
 workflow/conf/biohpc.config |  5 +++
 workflow/rna-seq.nf         | 73 +++++++++++++++++++++++++------------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index d221fee..5842b58 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -1,6 +1,7 @@
 process {
   executor = 'slurm'
   queue = 'super'
+  clusterOptions = '--hold'
 
   // Process specific configuration
   withName:splitData {
@@ -9,6 +10,10 @@ process {
   withName:getData {
     container = 'docker://bicf/bdbag:1.0'
   }
+  withName:trimData {
+    container = 'docker://bicf/trimgalore:1.0'
+    queue = '256GB,256GBv1,384GB'
+  }
 }
 
 
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 4f1fd5f..035faa8 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -13,12 +13,15 @@ bdbag = Channel
   .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
 
 outDir = params.outDir
+logsDir = "${outDir}/Logs"
 
 /*
  * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
  */
 process splitData {
   tag "${bdbag.baseName}"
+  executor 'local'
+  publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err"
 
   input:
     file bdbag
@@ -29,23 +32,24 @@ process splitData {
     file("${bdbag.baseName}/data/File.csv") into fileMeta
     file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
     file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
+    file ("${bdbag.baseName}.splitData.err")
 
   script:
     """
-    hostname
-    ulimit -a
-    ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt
-    echo "LOG: deriva cookie linked"
-    study=`echo "${bdbag}" | cut -d '.' -f1`
-    echo "LOG: \${study}"
-    unzip ${bdbag}
-    echo "LOG: bdgag unzipped"
-    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
-    echo "LOG: fetch file filtered for only .fastq.gz"
-    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
-    echo "LOG: fetch file split by replicates"
-    sh ${baseDir}/scripts/splitBag.sh \${study}
-    echo "LOG: bag recreated with replicate split fetch file"
+    hostname >> ${bdbag.baseName}.splitData.err
+    ulimit -a >> ${bdbag.baseName}.splitData.err
+    ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err 
+    study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err
+    unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err
+    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err
+    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err
+    sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
     """
 }
 
@@ -54,24 +58,45 @@ process splitData {
  */
 process getData {
   tag "${rep.baseName}"
-  publishDir "${outDir}/tempOut/fastqs", mode: "symlink"
+  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err" // NOTE(review): the .err file is not declared under output:, so it is likely never published - confirm
 
   input:
     each rep from bdbagSplit
 
   output:
-    path ("*.R*.fastq.gz", type: 'file', maxDepth: '0') into fastq
+    set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming // (replicate bag basename, fetched R1/R2 fastq files), consumed by trimData
 
   script:
     """
-    hostname
-    ulimit -a
+    hostname >>${rep.baseName}.getData.err
+    ulimit -a >>${rep.baseName}.getData.err
     export https_proxy=\${http_proxy}
     replicate=\$(basename "${rep}" | cut -d '.' -f1)
-    echo "LOG: \${replicate}"
-    unzip ${rep}
-    echo "LOG: replicate bdbag unzipped"
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
-    echo "LOG: replicate bdbag fetched"
+    echo "LOG: \${replicate}" >>${rep.baseName}.getData.err
+    unzip ${rep} 2>>${rep.baseName}.getData.err
+    echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err
+    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err
+    echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err
     """
- }
+}
+
+/*
+ * trimData: adapter-trims each replicate's paired-end reads with Trim Galore; publishes the trimmed fastqs and the run's stdout/stderr logs
+ */
+process trimData {
+  tag "trim-${repID}"
+  publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz"
+  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "*.trimData.{log,err}" // glob: the script's bash variable \${rep} is not visible to publishDir
+
+  input:
+    set repID, reads from trimming // repID = replicate bag basename, reads = the R1/R2 fastq files emitted by getData
+
+  output:
+    path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0')
+    file ("*.trimData.{log,err}") // declared so publishDir can publish the logs (only declared outputs are published)
+  script:
+    """
+    rep=`echo ${repID} | cut -f2- -d '_'`;
+    trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err;
+    """
+}
-- 
GitLab