From 3d8cc5f784200482eb4d8aefeadb32e7e4eb1263 Mon Sep 17 00:00:00 2001
From: s181706 <jonathan.gesell@utsouthwestern.edu>
Date: Mon, 11 Nov 2019 14:31:54 -0600
Subject: [PATCH] Added trimming step using trimGalore and logging.

Add a trimData process (Trim Galore, paired-end) that consumes the
fastq pairs emitted by getData, and write per-process log files that
are published under ${outDir}/Logs.

Log files are declared as process outputs because publishDir only
publishes files that match an output declaration. The trimData log
pattern is a plain glob because the log names are built from a shell
variable that Nextflow cannot interpolate.
---
Review notes (ignored by git am):
 - clusterOptions = '--hold' makes sbatch submit every job in a held
   state; confirm this is intended for this cluster.
 - The post-image blob ids on the index lines predate the review changes.

 workflow/conf/biohpc.config |  5 +++
 workflow/rna-seq.nf         | 78 +++++++++++++++++++++++++------------
 2 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index d221fee..5842b58 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -1,6 +1,7 @@
 process {
   executor = 'slurm'
   queue = 'super'
+  clusterOptions = '--hold'
 
   // Process specific configuration
   withName:splitData {
@@ -9,6 +10,10 @@ process {
   withName:getData {
     container = 'docker://bicf/bdbag:1.0'
   }
+  withName:trimData {
+    container = 'docker://bicf/trimgalore:1.0'
+    queue = '256GB,256GBv1,384GB'
+  }
 }
 
 
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 4f1fd5f..035faa8 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -13,12 +13,15 @@ bdbag = Channel
   .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
 
 outDir = params.outDir
+logsDir = "${outDir}/Logs"
 
 /*
  * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
  */
 process splitData {
   tag "${bdbag.baseName}"
+  executor 'local'
+  publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err"
 
   input:
     file bdbag
@@ -29,23 +32,24 @@ process splitData {
     file("${bdbag.baseName}/data/File.csv") into fileMeta
     file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
     file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
+    file ("${bdbag.baseName}.splitData.err")
 
   script:
     """
-    hostname
-    ulimit -a
-    ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt
-    echo "LOG: deriva cookie linked"
-    study=`echo "${bdbag}" | cut -d '.' -f1`
-    echo "LOG: \${study}"
-    unzip ${bdbag}
-    echo "LOG: bdgag unzipped"
-    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
-    echo "LOG: fetch file filtered for only .fastq.gz"
-    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
-    echo "LOG: fetch file split by replicates"
-    sh ${baseDir}/scripts/splitBag.sh \${study}
-    echo "LOG: bag recreated with replicate split fetch file"
+    hostname >> ${bdbag.baseName}.splitData.err
+    ulimit -a >> ${bdbag.baseName}.splitData.err
+    ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err
+    study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err
+    unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err
+    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err
+    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err
+    sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err
+    echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
     """
 }
 
@@ -54,24 +58,50 @@ process splitData {
  */
 process getData {
   tag "${rep.baseName}"
-  publishDir "${outDir}/tempOut/fastqs", mode: "symlink"
+  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err"
 
   input:
     each rep from bdbagSplit
 
   output:
-    path ("*.R*.fastq.gz", type: 'file', maxDepth: '0') into fastq
+    set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming
+    // Declared so publishDir can publish the log; undeclared files are never published.
+    file ("${rep.baseName}.getData.err")
 
   script:
     """
-    hostname
-    ulimit -a
+    hostname >>${rep.baseName}.getData.err
+    ulimit -a >>${rep.baseName}.getData.err
     export https_proxy=\${http_proxy}
     replicate=\$(basename "${rep}" | cut -d '.' -f1)
-    echo "LOG: \${replicate}"
-    unzip ${rep}
-    echo "LOG: replicate bdbag unzipped"
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
-    echo "LOG: replicate bdbag fetched"
+    echo "LOG: \${replicate}" >>${rep.baseName}.getData.err
+    unzip ${rep} 2>>${rep.baseName}.getData.err
+    echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err
+    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err
+    echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err
     """
- }
+}
+
+/*
+ * trimData: trims any adapter or non-host sequences from the data
+*/
+process trimData {
+  tag "trim-${repID}"
+  publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz"
+  // Log names come from the shell variable rep, which Nextflow cannot see, so match them with a glob.
+  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "*.trimData.{log,err}"
+
+  input:
+    set repID, reads from trimming
+
+  output:
+    path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0')
+    // publishDir only publishes declared outputs, so the log files must be listed here.
+    path ("*.trimData.{log,err}")
+
+  script:
+    """
+    rep=`echo ${repID} | cut -f2- -d '_'`;
+    trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err;
+    """
+}
-- 
GitLab