From 3b4d0255e7f2cb51c8e4a984f20127003fbd9bf4 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Tue, 24 Mar 2020 15:39:49 -0500
Subject: [PATCH] Add downsampling of trimmed fastqs

---
 workflow/conf/biohpc.config |  3 +++
 workflow/nextflow.config    |  3 +++
 workflow/rna-seq.nf         | 47 +++++++++++++++++++++++++++++++++++--
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 83fac1f..5bcbc27 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -18,6 +18,9 @@ process {
   withName:trimData {
     queue = 'super'
   }
+  withName:downsampleData {
+    executor = 'local'
+  }
   withName:alignData {
     queue = '256GB,256GBv1'
   }
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index f0be347..eb9ca4d 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -26,6 +26,9 @@ process {
   withName: trimData {
     container = 'bicf/trimgalore:1.1'
   }
+  withName: downsampleData {
+    container = 'bicf/seqtk:2.0.0'
+  }
   withName: alignData {
     container = 'bicf/gudmaprbkaligner:2.0.0'
   }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 2285223..625e3a0 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -117,6 +117,7 @@ process getData {
 
 // Replicate raw fastqs for multiple process inputs
 fastqs.into {
+  fastqs_downsampleData
   fastqs_trimData
   fastqs_fastqc
 }
@@ -190,6 +191,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
 )
 // Replicate metadata for multiple process inputs
 endsManual.into {
+  endsManual_downsampleData
   endsManual_trimData
   endsManual_alignData
   endsManual_featureCounts
@@ -284,7 +286,7 @@ process trimData {
     path (fastq) from fastqs_trimData
 
   output:
-    path ("*.fq.gz") into fastqs_trimmed
+    path ("*.fq.gz") into fastqsTrim
     path ("*_trimming_report.txt") into trimQC
     path ("${repRID}.trimData.{out,err}")
 
@@ -306,6 +308,47 @@ process trimData {
     """
 }
 
+// Replicate trimmed fastqs for multiple process inputs (downsampleData and alignData)
+fastqsTrim.into {
+  fastqsTrim_downsampleData
+  fastqsTrim_alignData
+}
+
+/*
+ * downsampleData: subsample trimmed fastqs with seqtk (seed 100; 10000 reads for se, 1000000 per mate for pe) for metadata inference
+ */
+process downsampleData {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.downsampleData.{out,err}"
+
+  input:
+    val endsManual_downsampleData // "se" or "pe"; selects the branch taken in the script below
+    path fastq from fastqsTrim_downsampleData
+
+  output:
+    path ("sampled.{1,2}.fq") into fastqsSample // sampled.2.fq is only written in the "pe" branch
+    path ("${repRID}.downsampleData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.downsampleData.err
+    ulimit -a >> ${repRID}.downsampleData.err
+    export https_proxy=\${http_proxy}
+
+    if [ "${endsManual_downsampleData}" == "se" ]
+    then
+      echo "LOG: downsampling single-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *trimmed.fq.gz 10000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
+    elif [ "${endsManual_downsampleData}" == "pe" ]
+    then
+      echo "LOG: downsampling read 1 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
+      echo "LOG: downsampling read 2 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq 2>> ${repRID}.downsampleData.err
+    fi
+    """
+}
+
 /*
  * alignData: aligns the reads to a reference database
 */
@@ -316,7 +359,7 @@ process alignData {
   input:
     val endsManual_alignData
     val stranded_alignData
-    path fastq from fastqs_trimmed
+    path fastq from fastqsTrim_alignData
     path reference_alignData
 
   output:
-- 
GitLab