From 7cf5d610ec3ae37ee8f55257d2b6be796e407243 Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Thu, 12 Nov 2020 22:24:00 -0600
Subject: [PATCH] Add back parallel trim step. Moved fastqc step after trim.

---
 .gitlab-ci.yml      | 4 ++--
 CHANGELOG.md        | 1 +
 workflow/rna-seq.nf | 7 +++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3a60c64..5ada019 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -113,8 +113,8 @@ trimData:
     - merge_requests
   script:
   - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt
-  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
-  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se -j 4 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe -j 4 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
   - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
   - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
   - pytest -m trimData
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b47ef9..02c9891 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
 * Make pull references from BioHPC default (including in biohpc.config)
 * Start using new gudmaprbk dockerhub (images autobuilt)
 * Moved consistency checks to be fully python
+* Added back the parallel form of trim_galore; fastqc now runs after the trim step
 
 *Known Bugs*
 * Datahub reference pull uses dev.gudmap.org as source until referencencs are placed on production
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 551f18d..8829006 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -229,12 +229,10 @@ if (fastqsForce != "") {
     .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" }
     .collect().into {
       fastqs_trimData
-      fastqs_fastqc
     }
 } else {
   fastqs.into {
     fastqs_trimData
-    fastqs_fastqc
   }
 }
 
@@ -343,6 +341,7 @@ process trimData {
 
   output:
     path ("*.fq.gz") into fastqsTrim
+    path ("*.R{1,2}.fastq.gz", includeInputs:true) into fastqs_fastqc // raw FASTQs are staged inputs; glob outputs skip inputs unless includeInputs is set
     path ("*_trimming_report.txt") into trimQC
     path ("readLength.csv") into inferMetadata_readLength
 
@@ -355,11 +354,11 @@ process trimData {
     echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
     if [ "${ends}" == "se" ]
     then
-      trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
+      trim_galore --gzip -q 25 --length 35 --basename ${repRID} -j 4 ${fastq[0]}
       readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     elif [ "${ends}" == "pe" ]
     then
-      trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
+      trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} -j 4 ${fastq[0]} ${fastq[1]}
       readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     fi
     echo -e "LOG: trimmed" >> ${repRID}.trimData.log
-- 
GitLab