Add the alignSampleData process and wire up getRefInfer in the configs.

Summary of what this patch does:
  * workflow/conf/biohpc.config: run getRefInfer with the local executor and
    send alignSampleData to the 'super' queue.
  * workflow/nextflow.config: assign containers to getRefInfer and
    alignSampleData.
  * workflow/rna-seq.nf: split the downsampled fastq output into two channels,
    build the inferInput channel, and add the alignSampleData process (HISAT2
    alignment followed by samtools view/sort/index).

Review notes:
  * The two "LOG: running Hisat2 ..." messages in alignSampleData now append to
    ${repRID}.alignSampleData.err (previously ${repRID}.align.err, a file that
    is neither declared as an output nor matched by the publishDir pattern).
  * NOTE(review): leading indentation of context/added lines was reconstructed
    with 2-space steps; confirm against the working tree before applying
    (or apply with whitespace-insensitive matching).
  * NOTE(review): fastqs2Sample is declared "optional true" and then consumed
    through .collect(); if nothing is emitted for single-end data, inferInput
    may be empty and alignSampleData would not run. Confirm that
    downsampleData always produces sampled.2.fq, or guard this case.

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 5bcbc27e1fc72ae7cb15b2f20a7f903b8be4d5d4..bd377933d123799cfe6cb55452a951af59788548 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -12,6 +12,9 @@ process {
   withName:parseMetadata {
     executor = 'local'
   }
+  withName:getRefInfer {
+    executor = 'local'
+  }
   withName:getRef {
     executor = 'local'
   }
@@ -21,6 +24,9 @@ process {
   withName:downsampleData {
     executor = 'local'
   }
+  withName:alignSampleData {
+    queue = 'super'
+  }
   withName:alignData {
     queue = '256GB,256GBv1'
   }
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index eb9ca4d9332b6efddca753663ad9f59855044dab..d79f47cfe6df99ea801caab598565d5a7a626210 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -20,6 +20,9 @@ process {
   withName: parseMetadata {
     container = 'bicf/python3:1.3'
   }
+  withName: getRefInfer {
+    container = 'bicf/awscli:1.1'
+  }
   withName: getRef {
     container = 'bicf/awscli:1.1'
   }
@@ -29,6 +32,9 @@ process {
   withName: downsampleData {
     container = 'bicf/seqtk:2.0.0'
   }
+  withName: alignSampleData {
+    container = 'bicf/gudmaprbkaligner:2.0.0'
+  }
   withName: alignData {
     container = 'bicf/gudmaprbkaligner:2.0.0'
   }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 3abc8b561ea083b70e4ac89d182e319e3d7fbc9d..9ee3df735f602f97595e74d64cae5ab1cfc174c6 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -118,7 +118,7 @@ process getData {
     """
 }
 
-// Replicate raw fastqs for multiple process inputs
+// Replicate raw fastq's for multiple process inputs
 fastqs.into {
   fastqs_downsampleData
   fastqs_trimData
@@ -195,6 +195,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
 // Replicate metadata for multiple process inputs
 endsManual.into {
   endsManual_downsampleData
+  endsManual_alignSampleData
   endsManual_trimData
   endsManual_alignData
   endsManual_featureCounts
@@ -221,7 +222,7 @@ process getRefInfer {
     val referenceInfer
 
   output:
-    tuple val ("${referenceInfer}"), path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
+    tuple val (referenceInfer), path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
     path ("${repRID}.getRefInfer.{out,err}")
 
   script:
@@ -359,7 +360,7 @@ process trimData {
     hostname > ${repRID}.trimData.err
     ulimit -a >> ${repRID}.trimData.err
 
-    #Trim fastqs using trim_galore
+    #Trim fastq's using trim_galore
     if [ "${endsManual_trimData}" == "se" ]
     then
       echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err
@@ -372,7 +373,7 @@ process trimData {
     """
 }
 
-// Replicate trimmed fastqs
+// Replicate trimmed fastq's
 fastqsTrim.into {
   fastqsTrim_downsampleData
   fastqsTrim_alignData
@@ -390,7 +391,8 @@ process downsampleData {
     path fastq from fastqsTrim_downsampleData
 
   output:
-    path ("sampled.{1,2}.fq") into fastqsSample
+    path ("sampled.1.fq") into fastqs1Sample
+    path ("sampled.2.fq") optional true into fastqs2Sample
     path ("${repRID}.downsampleData.{out,err}")
 
   script:
@@ -413,6 +415,54 @@ process downsampleData {
     """
 }
 
+// Replicate the downsampled fastq's and attach them to the references
+inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
+
+/*
+ * alignSampleData: aligns the downsampled reads to a reference database
+ */
+process alignSampleData {
+  tag "${ref}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.alignSampleData.{out,err}"
+
+  input:
+    tuple val (ends), val (ref), path (hisat2), path (bed), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
+
+  output:
+    tuple val (ref), path ("sampled.sorted.bam"), path ("sampled.sorted.bam.bai"), path (bed) into sampleBam
+    path ("*.alignSampleSummary.txt") into alignSampleQC
+    path ("${repRID}.alignSampleData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.alignSampleData.err
+    ulimit -a >> ${repRID}.alignSampleData.err
+
+    #Align the reads with Hisat 2
+    if [ "${ends}" == "se" ]
+    then
+      echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.alignSampleData.err
+      hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${repRID}.alignSampleSummary.txt --new-summary 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err
+    elif [ "${ends}" == "pe" ]
+    then
+      echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.alignSampleData.err
+      hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${repRID}.alignSampleSummary.txt --new-summary 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err
+    fi
+
+    #Convert the output sam file to a sorted bam file using Samtools
+    echo "LOG: converting from sam to bam" >> ${repRID}.alignSampleData.err
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o sampled.bam sampled.sam 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err;
+
+    #Sort the bam file using Samtools
+    echo "LOG: sorting the bam file" >> ${repRID}.alignSampleData.err
+    samtools sort -@ `nproc` -O BAM -o sampled.sorted.bam sampled.bam 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err;
+
+    #Index the sorted bam using Samtools
+    echo "LOG: indexing sorted bam file" >> ${repRID}.alignSampleData.err
+    samtools index -@ `nproc` -b sampled.sorted.bam sampled.sorted.bam.bai 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err;
+    """
+}
+
 /*
  * alignData: aligns the reads to a reference database
  */