diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5c9d184ccff41358ca43353aa909d8fda6e788fd..37083c154c27c879833b85c8f8c256c2aecb46dc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -61,6 +61,12 @@ alignData: - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai - pytest -m alignData +dedupData: + stage: unit + script: + - singularity exec 'docker://bicf/picard2.21.7:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true + - pytest -m dedupData + integration_se: stage: integration script: diff --git a/workflow/conf/aws_ondemand.config b/workflow/conf/aws_ondemand.config index 52043ab4da33c04a110605866acb3bc5eba750c2..ca23b6c49cd80b98c3120010609d46e7131757f8 100755 --- a/workflow/conf/aws_ondemand.config +++ b/workflow/conf/aws_ondemand.config @@ -13,13 +13,22 @@ process { cpus = 1 memory = '1 GB' + withName:parseMetadata { + cpus = 5 + } withName:getRef { cpus = 8 } withName:trimData { - cpus = 15 + cpus = 8 + memory = '2 GB' } withName:alignData { cpus = 50 + memory = '10 GB' + } + withName:dedupData { + cpus = 2 + memory = '20 GB' } } \ No newline at end of file diff --git a/workflow/conf/aws_spot.config b/workflow/conf/aws_spot.config index 189ab7e02f21440583feb13606964ab67e7a4f78..e0447565495e09c3ccde2968eb887aa14f7db251 100755 --- a/workflow/conf/aws_spot.config +++ b/workflow/conf/aws_spot.config @@ -13,13 +13,22 @@ process { cpus = 1 memory = '1 GB' + withName:parseMetadata { + cpus = 5 + } withName:getRef { cpus = 8 } withName:trimData { - cpus = 15 + cpus = 8 + memory = '2 GB' } withName:alignData { cpus = 50 + memory = '10 GB' + } + withName:dedupData { + cpus = 2 + memory = '20 GB' } } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 7d58dcc685fab609ac4c8ec71ec449871bc6abe5..a17f91e1a3397c2c28fe84d0e70b2342f32cbb0a 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -16,10 +16,13 @@ process { executor = 'local' } withName:trimData { - queue = '256GB,256GBv1,384GB' + queue = 'super' } withName:alignData { - queue = '256GB,256GBv1,384GB' + queue = '256GB,256GBv1' + } + withName: dedupData { + queue = 'super' } } diff --git a/workflow/nextflow.config b/workflow/nextflow.config index b78a6e4f3f2675f1ba585d91bed02d9b9fb63e37..0b6f27a5c078db72a5ae025a5d0cde2118163047 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -29,6 +29,9 @@ process { withName: alignData { container = 'bicf/gudmaprbkaligner:2.0.0' } + withName: dedupData { + container = 'bicf/picard2.21.7:2.0.0' + } } trace { @@ -58,4 +61,4 @@ manifest { mainScript = 'rna-seq.nf' version = 'v0.0.1_indev' nextflowVersion = '>=19.09.0' -} +} \ No newline at end of file diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 21c77a09555cabf75011ceab149d3bed31f88e34..740db3bc6973c950339127ba8d81934766ad085f 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -209,6 +209,7 @@ process getRef { ulimit -a >>${repRID}.getRef.err export https_proxy=\${http_proxy} + # retreive appropriate reference from S3 bucket if [ "${species_getRef}" == "Mus musculus" ] then references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion}) @@ -275,14 +276,17 @@ process alignData { path reference output: - path ("${repRID}.unal.gz") - path ("${repRID}.sorted.bam") - path ("${repRID}.sorted.bai") + path ("${repRID}.sorted.bam") into rawBam + path ("${repRID}.sorted.bai") into rawBai path ("${repRID}.align.out") path ("${repRID}.align.err") script: """ + hostname >${repRID}.align.err + ulimit -a >>${repRID}.align.err + + # align reads if [ "${endsManual_alignData}" == "se" ] then hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err @@ -290,8 +294,36 @@ process alignData { then hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err fi - samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err - samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err - samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err + + # convert sam to bam and sort and index + samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err; + samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err; + samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err; + """ +} + +/* + *dedupReads: mark the duplicate reads, specifically focused on PCR or optical duplicates +*/ +process dedupData { + tag "${repRID}" + publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam" + publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}" + + input: + path rawBam + + output: + path ("${repRID}.deduped.bam") into dedupBam + path ("${repRID}.dedup.out") + path ("${repRID}.dedup.err") + + script: + """ + hostname >${repRID}.dedup.err + ulimit -a >>${repRID}.dedup.err + + #Remove duplicated reads + java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err """ } \ No newline at end of file diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py index 7c20dd3f269e202e824285daa00a85d4301b3278..d95e63a6426e5d1cbec874b0c4e86b17388ba975 100644 --- a/workflow/tests/test_alignReads.py +++ b/workflow/tests/test_alignReads.py @@ -20,20 +20,4 @@ def test_alignData_se(): def test_alignData_pe(): assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz')) assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai')) - - -@pytest.mark.alignLogs -def test_alignLogs_se(): - assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.align.err')) - assert '34497376 reads; of these:' in open(os.path.join(data_output_path, '16-1ZX4.align.err')).readlines()[0] - assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.align.out')) - - -@pytest.mark.alignLogs -def test_alignLogs_pe(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.align.err')) - assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.align.err')) == 7 - assert '15824858 reads; of these:' in open(os.path.join(data_output_path, 'Q-Y5JA.align.err')).readlines()[0] - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.align.out')) - assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.align.out')) == 0 + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai')) \ No newline at end of file diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py new file mode 100644 index 0000000000000000000000000000000000000000..0c0cd7aa67ab0e47168f8bf438191f6a9c217b72 --- /dev/null +++ b/workflow/tests/test_dedupReads.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +import os +import utils + +data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + + +@pytest.mark.dedupData +def test_dedupData(): + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam')) \ No newline at end of file