diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 83fac1f1092e641a25214c2af2b61745ed352a83..5bcbc27e1fc72ae7cb15b2f20a7f903b8be4d5d4 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -18,6 +18,9 @@ process {
   withName:trimData {
     queue = 'super'
   }
+  withName:downsampleData {
+    executor = 'local'
+  }
   withName:alignData {
     queue = '256GB,256GBv1'
   }
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index f0be347ef28c9306abe43974d48464484f48b5c4..eb9ca4d9332b6efddca753663ad9f59855044dab 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -26,6 +26,9 @@ process {
   withName: trimData {
     container = 'bicf/trimgalore:1.1'
   }
+  withName: downsampleData {
+    container = 'bicf/seqtk:2.0.0'
+  }
   withName: alignData {
     container = 'bicf/gudmaprbkaligner:2.0.0'
   }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 22852231d9d29bfda93b65fecf43cf77df98c51d..625e3a0ec6d0d36ff542103bf849c2286923c079 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -117,6 +117,7 @@ process getData {
 
 // Replicate raw fastqs for multiple process inputs
 fastqs.into {
+  fastqs_downsampleData
   fastqs_trimData
   fastqs_fastqc
 }
@@ -190,6 +191,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
 )
 // Replicate metadata for multiple process inputs
 endsManual.into {
+  endsManual_downsampleData
   endsManual_trimData
   endsManual_alignData
   endsManual_featureCounts
@@ -284,7 +286,7 @@ process trimData {
     path (fastq) from fastqs_trimData
 
   output:
-    path ("*.fq.gz") into fastqs_trimmed
+    path ("*.fq.gz") into fastqsTrim
     path ("*_trimming_report.txt") into trimQC
     path ("${repRID}.trimData.{out,err}")
@@ -306,6 +308,47 @@ process trimData {
     """
 }
 
+// Replicate trimmed fastqs
+fastqsTrim.into {
+  fastqsTrim_downsampleData
+  fastqsTrim_alignData
+}
+
+/*
+ * downsampleData: downsample fastq's for metadata inference
+ */
+process downsampleData {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.downsampleData.{out,err}"
+
+  input:
+    val endsManual_downsampleData
+    path fastq from fastqsTrim_downsampleData
+
+  output:
+    path ("sampled.{1,2}.fq") into fastqsSample
+    path ("${repRID}.downsampleData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.downsampleData.err
+    ulimit -a >> ${repRID}.downsampleData.err
+    export https_proxy=\${http_proxy}
+
+    if [ "${endsManual_downsampleData}" == "se" ]
+    then
+      echo "LOG: downsampling single-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *trimmed.fq.gz 10000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
+    elif [ "${endsManual_downsampleData}" == "pe" ]
+    then
+      echo "LOG: downsampling read 1 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
+      echo "LOG: downsampling read 2 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq 2>> ${repRID}.downsampleData.err
+    fi
+    """
+}
+
 /*
  * alignData: aligns the reads to a reference database
  */
@@ -316,7 +359,7 @@ process alignData {
   input:
     val endsManual_alignData
     val stranded_alignData
-    path fastq from fastqs_trimmed
+    path fastq from fastqsTrim_alignData
     path reference_alignData
 
   output: