Commit 7cf5d610 authored by Venkat Malladi

Add back parallel trim step. Moved fastqc step after trim.

parent 3ccdc626
2 merge requests: !58 Develop, !50 Add back parallel trim step. Moved fastqc step after trim.
Pipeline #8357 canceled with stages in 10 minutes and 34 seconds
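The added parallelism comes from Trim Galore's -j/--cores option, which sets the number of cores used for trimming; the hunks below pass -j 4 to both the single-end and paired-end calls. For reference, this is the single-end form exercised by the CI job, copied standalone from the first hunk (the test fastq path comes from the repository's test data):

    # Single-end trim with 4 cores, run through the bicf/trimgalore container
    singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 \
      --basename Q-Y5F6_1M.se -j 4 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz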
@@ -113,8 +113,8 @@ trimData:
     - merge_requests
   script:
     - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt
-    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
-    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se -j 4 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe -j 4 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
     - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     - pytest -m trimData
...
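The readLengthSE/readLengthPE values above are median read lengths: the first awk keeps only the sequence line of each 4-line FASTQ record and prints its length, sort -n orders the lengths, and the second awk picks the middle element (or the mean of the two middle elements). The same computation shown standalone with the CI job's glob, as a sketch that assumes the trimmed single-end output *_trimmed.fq.gz from the step above is in the working directory:

    # Median read length of the trimmed single-end fastq(s)
    # NR%4==2 selects the sequence line of each FASTQ record
    zcat *_trimmed.fq.gz \
      | awk '{if(NR%4==2) print length($1)}' \
      | sort -n \
      | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}'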
@@ -9,6 +9,7 @@
 * Make pull references from BioHPC default (including in biohpc.config)
 * Start using new gudmaprbk dockerhub (images autobuilt)
 * Moved consistency checks to be fully python
+* Added back parallel form of trim_galore and now use fastqc after trim step
 
 *Known Bugs*
 * Datahub reference pull uses dev.gudmap.org as source until referencencs are placed on production
...
@@ -229,12 +229,10 @@ if (fastqsForce != "") {
     .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" }
     .collect().into {
       fastqs_trimData
-      fastqs_fastqc
     }
 } else {
   fastqs.into {
     fastqs_trimData
-    fastqs_fastqc
   }
 }
@@ -343,6 +341,7 @@ process trimData {
   output:
     path ("*.fq.gz") into fastqsTrim
+    path ("*.R{1,2}.fastq.gz") into fastqs_fastqc
    path ("*_trimming_report.txt") into trimQC
     path ("readLength.csv") into inferMetadata_readLength
@@ -355,11 +354,11 @@ process trimData {
     echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
     if [ "${ends}" == "se" ]
     then
-      trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
+      trim_galore --gzip -q 25 --length 35 --basename ${repRID} -j 4 ${fastq[0]}
       readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     elif [ "${ends}" == "pe" ]
     then
-      trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
+      trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} -j 4 ${fastq[0]} ${fastq[1]}
       readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     fi
     echo -e "LOG: trimmed" >> ${repRID}.trimData.log
...
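With fastqs_fastqc now emitted by trimData (the *.R{1,2}.fastq.gz glob, apparently the staged raw inputs rather than the trimmed .fq.gz outputs), FastQC runs downstream of the trim step instead of consuming its own copy of the raw-fastq channel. The FastQC process itself is outside this diff; a minimal sketch of the kind of command such a step would run, assuming the re-emitted files are staged into its work directory (the container and exact flags in the real pipeline may differ):

    # Hypothetical downstream FastQC call on the re-emitted raw fastqs
    fastqc -o . *.fastq.gz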