diff --git a/.gitignore b/.gitignore index 9b75a201f875574a221bd0e4bf073e5a0d0db406..f86b65e13780bc14b987372cb35d5dd8e310adcc 100644 --- a/.gitignore +++ b/.gitignore @@ -298,4 +298,4 @@ timeline*.html* *.out run*.sh -!.gitkeep +!.gitkeep \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index edff72458a74626000a40e0802fe01872fbe5aad..37083c154c27c879833b85c8f8c256c2aecb46dc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -32,33 +32,38 @@ parseMetadata: - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p ends - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual - - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded - - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p specie + - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded -e se + - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species + +getRef: + stage: unit + script: + - singularity run 'docker://bicf/awscli:1.1' aws s3 ls s3://bicf-references/mouse/0.0.1/GRCm38.P6 + - singularity run 'docker://bicf/awscli:1.1' aws s3 ls s3://bicf-references/human/0.0.1/GRCh38.p12 trimData: stage: unit script: - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename 16-1ZX4 -j `nproc` ./test_data/fastq/16-1ZX4.R1.fastq.gz - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA -j `nproc` ./test_data/fastq/Q-Y5JA.R1.fastq.gz ./test_data/fastq/Q-Y5JA.R2.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5JA_1M.se -j `nproc` ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA_1M.pe -j `nproc` ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5JA_1M.R2.fastq.gz - pytest -m trimData -alignReads: +alignData: stage: unit script: - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA.unal.gz -S Q-Y5JA.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/Q-Y5JA_R1_val_1.fq.gz -2 ./test_data/fastq/Q-Y5JA_R2_val_2.fq.gz - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA.bam Q-Y5JA.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA.sorted.bam Q-Y5JA.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA.sorted.bam Q-Y5JA.sorted.bai - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz 16-1ZX4.unal.gz -S 16-1ZX4.sam -x 
/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness FR -U ./test_data/fastq/16-1ZX4_trimmed.fq.gz - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o 16-1ZX4.bam 16-1ZX4.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o 16-1ZX4.sorted.bam 16-1ZX4.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b 16-1ZX4.sorted.bam 16-1ZX4.sorted.bai + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bai + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai - pytest -m alignData -dedupReads: +dedupData: stage: unit script: - - singularity exec 'docker://bicf/picard2.21.7:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.pe.sorted.bam O=Q-Y5JA_1M.pe.deduped.bam M=Q-Y5JA_1M.pe.deduped.Metrics.txt REMOVE_DUPLICATES=true - singularity exec 'docker://bicf/picard2.21.7:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - pytest -m dedupData @@ -70,4 +75,4 @@ integration_se: integration_pe: stage: integration script: - - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA + - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 9345688f2a0ae3ee68fb31d2947a772882c51926..8ff0911406a181f08e86e495c7530f87d6f43dae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,6 @@ # v0.0.1 (in development) **User Facing** -* Raw alignment files **Background** -* Implementation of CI *Known Bugs* diff --git a/workflow/conf/aws_ondemand.config b/workflow/conf/aws_ondemand.config index 84fcb275e131e30ea4a21ad829d67c368dca1811..ca23b6c49cd80b98c3120010609d46e7131757f8 100755 --- a/workflow/conf/aws_ondemand.config +++ b/workflow/conf/aws_ondemand.config @@ -13,7 +13,22 @@ process { cpus = 1 
memory = '1 GB' + withName:parseMetadata { + cpus = 5 + } + withName:getRef { + cpus = 8 + } withName:trimData { - cpus = 15 + cpus = 8 + memory = '2 GB' + } + withName:alignData { + cpus = 50 + memory = '10 GB' + } + withName:dedupData { + cpus = 2 + memory = '20 GB' } } \ No newline at end of file diff --git a/workflow/conf/aws_spot.config b/workflow/conf/aws_spot.config index fbccb3cba394d2a1a7751bee7206483c869eac23..e0447565495e09c3ccde2968eb887aa14f7db251 100755 --- a/workflow/conf/aws_spot.config +++ b/workflow/conf/aws_spot.config @@ -13,7 +13,22 @@ process { cpus = 1 memory = '1 GB' + withName:parseMetadata { + cpus = 5 + } + withName:getRef { + cpus = 8 + } withName:trimData { - cpus = 15 + cpus = 8 + memory = '2 GB' + } + withName:alignData { + cpus = 50 + memory = '10 GB' + } + withName:dedupData { + cpus = 2 + memory = '20 GB' } } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index dbc4e53bdc004d1bc61e52806bc9a94d7f7634cd..a17f91e1a3397c2c28fe84d0e70b2342f32cbb0a 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -12,14 +12,17 @@ process { withName:parseMetadata { executor = 'local' } + withName:getRef { + executor = 'local' + } withName:trimData { - queue = '256GB,256GBv1,384GB' + queue = 'super' } - withName:alignReads { - queue = '256GB,256GBv1,384GB' + withName:alignData { + queue = '256GB,256GBv1' } - withName: dedupReads { - queue = '128GB,256GB,256GBv1,384GB' + withName: dedupData { + queue = 'super' } } diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 67cb95ebd958c861697dad2eede6b906cc728669..0b6f27a5c078db72a5ae025a5d0cde2118163047 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -20,13 +20,16 @@ process { withName:parseMetadata { container = 'bicf/python3:1.3' } + withName:getRef { + container = 'bicf/awscli:1.1' + } withName:trimData { container = 'bicf/trimgalore:1.1' } - withName: alignReads { + withName: alignData { container = 'bicf/gudmaprbkaligner:2.0.0' } - withName: dedupReads { + withName: dedupData { container = 'bicf/picard2.21.7:2.0.0' } } @@ -58,4 +61,4 @@ manifest { mainScript = 'rna-seq.nf' version = 'v0.0.1_indev' nextflowVersion = '>=19.09.0' -} +} \ No newline at end of file diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index a15e439d5eb8ab5bd3a53c941733360c159c0470..740db3bc6973c950339127ba8d81934766ad085f 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -5,8 +5,10 @@ params.deriva = "${baseDir}/../test_data/auth/credential.json" params.bdbag = "${baseDir}/../test_data/auth/cookies.txt" //params.repRID = "16-1ZX4" params.repRID = "Q-Y5JA" +params.refVersion = "0.0.1" +params.refMuVersion = "38.P6" +params.refHuVersion = "38.p12" params.outDir = "${baseDir}/../output" -params.reference = "/project/BICF/BICF_Core/shared/gudmap/references" // Parse input variables deriva = Channel @@ -16,12 +18,15 @@ bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } repRID = params.repRID +refVersion = params.refVersion +refMuVersion = params.refMuVersion +refHuVersion = params.refHuVersion outDir = params.outDir -referenceBase = file ("${params.reference}") logsDir = "${outDir}/Logs" // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +referenceBase = "s3://bicf-references" // Define script files script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") @@ -32,15 +37,15 @@ script_parseMeta = 
Channel.fromPath("${baseDir}/scripts/parseMeta.py") */ process getBag { tag "${repRID}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err" + publishDir "${logsDir}", mode: "copy", pattern: "*.getBag.err" input: - path credential, stageAs: 'credential.json' from deriva + path credential, stageAs: "credential.json" from deriva path derivaConfig output: path ("Replicate_*.zip") into bagit - file ("${repRID}.getBag.err") + path ("${repRID}.getBag.err") script: """ @@ -62,19 +67,19 @@ process getBag { */ process getData { tag "${repRID}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err" + publishDir "${logsDir}", mode: "copy", pattern: "*.getData.err" input: path script_bdbagFetch - path cookies, stageAs: 'deriva-cookies.txt' from bdbag + path cookies, stageAs: "deriva-cookies.txt" from bdbag path bagit output: path ("*.R{1,2}.fastq.gz") into fastqs - file("**/File.csv") into fileMeta - file("**/Experiment Settings.csv") into experimentSettingsMeta - file("**/Experiment.csv") into experimentMeta - file ("${repRID}.getData.err") + path ("**/File.csv") into fileMeta + path ("**/Experiment Settings.csv") into experimentSettingsMeta + path ("**/Experiment.csv") into experimentMeta + path ("${repRID}.getData.err") script: """ @@ -87,14 +92,14 @@ process getData { echo "LOG: deriva cookie linked" >>${repRID}.getData.err # get bagit basename - replicate=\$(basename "${bagit}" | cut -d '.' -f1) + replicate=\$(basename "${bagit}" | cut -d "." -f1) echo "LOG: \${replicate}" >>${repRID}.getData.err # unzip bagit unzip ${bagit} 2>>${repRID}.getData.err echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err - # bagit fetch fastq's only and rename by repRID + # bagit fetch fastqs only and rename by repRID sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err """ @@ -105,7 +110,7 @@ process getData { */ process parseMetadata { tag "${repRID}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err" + publishDir "${logsDir}", mode: "copy", pattern: "*.parseMetadata.err" input: path script_parseMeta @@ -115,7 +120,7 @@ process parseMetadata { path experimentMeta output: - path 'design.csv' into metadata + path "design.csv" into metadata script: """ @@ -135,7 +140,7 @@ process parseMetadata { echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err # Get strandedness metadata - stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded) + stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded -e \${endsManual}) echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err # Get spike-in metadata @@ -146,27 +151,8 @@ process parseMetadata { species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species) echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err - #Set the correct Reference Directory - if [ "\${spike}" == 'yes' ]; then - if [ "\${species}" == 'Homo sapiens' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12-S/hisat2/genome"; - echo "LOG: Referece set to Homo sapiens with spike-in." >>${repRID}.parseMetadata.err; - elif [ "\${species}" == 'Mus musculus' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCm38.P6-S/hisat2/genome"; - echo "LOG: Referece set to Mus musculus with spike-in." 
>>${repRID}.parseMetadata.err; - fi; - else - if [ "\${species}" == 'Homo sapiens' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome"; - echo "LOG: Referece set to Homo sapiens without spike-in." >>${repRID}.parseMetadata.err; - elif [ "\${species}" == 'Mus musculus' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCm38.P6/hisat2/genome"; - echo "LOG: Referece set to Mus musculus without spike-in." >>${repRID}.parseMetadata.err; - fi; - fi; - # Save design file - echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${referenceDir}" > design.csv + echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv """ } @@ -177,25 +163,71 @@ endsManual = Channel.create() stranded = Channel.create() spike = Channel.create() species = Channel.create() -referenceDir = Channel.create() -metadata.splitCsv(sep: ',', header: false).separate( +metadata.splitCsv(sep: ",", header: false).separate( rep, endsMeta, endsManual, stranded, spike, - species, - referenceDir + species ) endsManual.into { endsManual_trimData - endsManual_alignReads + endsManual_alignData } stranded.into { - stranded_alignReads + stranded_alignData +} +spike.into { + spike_getRef +} +species.into { + species_getRef } -referenceDir.into { - referenceDir_alignReads + +/* + * getRef: Downloads appropriate reference +*/ +process getRef { + tag "${species_getRef}" + publishDir "${logsDir}", mode: "copy", pattern: "*.getRef.err" + + input: + val referenceBase + val refVersion + val refMuVersion + val refHuVersion + val spike_getRef + val species_getRef + + output: + path ("*") into reference + + script: + """ + hostname >>${repRID}.getRef.err + ulimit -a >>${repRID}.getRef.err + export https_proxy=\${http_proxy} + + # retrieve appropriate reference from S3 bucket + if [ "${species_getRef}" == "Mus musculus" ] + then + references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion}) + elif [ "${species_getRef}" == "Homo sapiens" ] + then + references=\$(echo ${referenceBase}/human/${refVersion}/GRCh${refHuVersion}) + else + exit 1 + fi + if [ "${spike_getRef}" == "yes" ] + then + references=\$(echo \${references}-S/) + elif [ "${spike_getRef}" == "no" ] + then + references=\$(echo \${references}/) + fi + aws s3 cp "\${references}" ./ --recursive >>${repRID}.getRef.err + """ } /* @@ -203,16 +235,16 @@ referenceDir.into { */ process trimData { tag "${repRID}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.trimData.*" + publishDir "${logsDir}", mode: "copy", pattern: "*.trimData.*" input: val endsManual_trimData - file(fastq) from fastqs + path (fastq) from fastqs output: path ("*.fq.gz") into fastqs_trimmed - file ("${repRID}.trimData.log") - file ("${repRID}.trimData.err") + path ("${repRID}.trimData.log") + path ("${repRID}.trimData.err") script: """ @@ -220,32 +252,34 @@ process trimData { ulimit -a >>${repRID}.trimData.err # trim fastqs - if [ '${endsManual_trimData}' == 'se' ] + if [ "${endsManual_trimData}" == "se" ] then - trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; - else - trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; + trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err + elif [ 
"${endsManual_trimData}" == "pe" ] + then + trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err fi """ } /* - * alignReads: aligns the reads to a reference database + * alignData: aligns the reads to a reference database */ -process alignReads { +process alignData { tag "${repRID}" - publishDir "${outDir}/aligned", mode: "copy", pattern: "${repRID}.{unal,sorted}.{gz,bam,bai}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.align.{out,err}" + publishDir "${logsDir}", mode: "copy", pattern: "*.align.{out,err}" input: - val endsManual_alignReads - val stranded_alignReads - path fqs from fastqs_trimmed - val referenceDir_alignReads + val endsManual_alignData + val stranded_alignData + path fastq from fastqs_trimmed + path reference output: - tuple repRID, file ("${repRID}.unal.gz"), file ("${repRID}.sorted.bam"), file ("${repRID}.sorted.bai") into dedupReads - tuple repRID, file ("${repRID}.align.out"), file ("${repRID}.align.err") + path ("${repRID}.sorted.bam") into rawBam + path ("${repRID}.sorted.bai") into rawBai + path ("${repRID}.align.out") + path ("${repRID}.align.err") script: """ @@ -253,10 +287,15 @@ process alignReads { ulimit -a >>${repRID}.align.err # align reads - if [ "${endsManual_alignReads}" == 'pe' ]; then - hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} --no-mixed --no-discordant -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2>>${repRID}.align.err; - else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} -U ${fqs[0]} 1>${repRID}.align.out 2>>${repRID}.align.err; - fi; + if [ "${endsManual_alignData}" == "se" ] + then + hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err + elif [ "${endsManual_alignData}" == "pe" ] + then + hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err + fi + + # convert sam to bam and sort and index samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err; samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err; samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err; @@ -266,16 +305,18 @@ process alignReads { /* *dedupReads: mark the duplicate reads, specifically focused on PCR or optical duplicates */ -process dedupReads { +process dedupData { tag "${repRID}" - publishDir "${outDir}/deduped", mode: 'copy', pattern: "${repRID}.deduped.{bam,Metrics.txt}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}" + publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam" + publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}" input: - tuple repRID, file (unal), file (sortedBam), file (sortedBai) from dedupReads + path rawBam output: - tuple repRID, file ("${repRID}.deduped.bam"), file ("${repRID}.deduped.Metrics.txt") + path ("${repRID}.deduped.bam") into dedupBam + path ("${repRID}.dedup.out") + path ("${repRID}.dedup.err") script: """ @@ -283,6 +324,6 @@ process 
dedupReads { ulimit -a >>${repRID}.dedup.err #Remove duplicated reads - java -jar /picard/build/libs/picard.jar MarkDuplicates I=${sortedBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err + java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err """ -} +} \ No newline at end of file diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py deleted file mode 100644 index e6accfbbaee5ac303253294f69f8f345c7ba1dc5..0000000000000000000000000000000000000000 --- a/workflow/scripts/modifyFetch.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pandas as pd -import re - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True) - args = parser.parse_args() - return args - -def main(): - args = get_args() - fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None) - fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0) - fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"] - fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] - fetchFile_filtered_renamed = fetchFile_filtered - for i in fileFile_filtered["File_Name"]: - fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0]) - fetchFile_filtered_renamed.to_csv(args.files+"/fetch.txt",sep="\t",header=False,index=False) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py index 4797610f870f8e8f22024e0198dc0255c0829073..eaa2656f9266a5c3de07fb0fc80ad5aeb2ea8422 100644 --- a/workflow/scripts/parseMeta.py +++ b/workflow/scripts/parseMeta.py @@ -10,6 +10,7 @@ def get_args(): parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True) parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True) parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True) + parser.add_argument('-e', '--endsManual',help="The endedness.",required=False) args = parser.parse_args() return args @@ -55,9 +56,9 @@ def main(): # Get strandedness metadata from 'Experiment Settings.csv' if (args.parameter == "stranded"): if (metaFile.Has_Strand_Specific_Information.unique() == "yes"): - if endsManual == "se": + if (args.endsManual=="se"): stranded = "--rna-strandness F" - else: + elif (args.endsManual=="pe"): stranded = "--rna-strandness FR" elif (metaFile.Has_Strand_Specific_Information.unique() == "no"): stranded = "" diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py deleted file mode 100644 index 63385c184ff7bd6ddb4cb047c81a693cf7a8ecfb..0000000000000000000000000000000000000000 --- a/workflow/scripts/splitFetch.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pandas as pd -import os - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True) - args = parser.parse_args() - return args - -def main(): - args = get_args() - 
fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None) - fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0) - replicateRID = fileFile.Replicate_RID.unique() - fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID} - for i in replicateRID: - if not os.path.exists(i): - os.mkdir("Replicate_"+i) - fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv("Replicate_"+i+"/fetch.txt",sep="\t",header=False,index=False) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py index 79770bb42245df6da0ce7fa1081d78ec5aed736f..d95e63a6426e5d1cbec874b0c4e86b17388ba975 100644 --- a/workflow/tests/test_alignReads.py +++ b/workflow/tests/test_alignReads.py @@ -7,39 +7,17 @@ import utils data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../../' -logs_output_path = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../' @pytest.mark.alignData def test_alignData_se(): - assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.unal.gz')) - assert utils.count_lines(os.path.join(data_output_path, '16-1ZX4.unal.gz')) == 3070528 - assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.sorted.bam')) - assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.sorted.bai')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.unal.gz')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bai')) @pytest.mark.alignData def test_alignData_pe(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.unal.gz')) - assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.unal.gz')) == 0 - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.sorted.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.sorted.bai')) - - -@pytest.mark.alignLogs -def test_alignLogs_se(): - assert os.path.exists(os.path.join(logs_output_path, '16-1ZX4.align.err')) - assert utils.count_lines(os.path.join(logs_output_path, '16-1ZX4.align.err')) == 7 - assert '34497376 reads; of these:' in open(os.path.join(logs_output_path, '16-1ZX4.align.err')).readlines()[0] - assert os.path.exists(os.path.join(logs_output_path, '16-1ZX4.align.out')) - assert utils.count_lines(os.path.join(logs_output_path, '16-1ZX4.align.out')) == 0 - - -@pytest.mark.alignLogs -def test_alignLogs_pe(): - assert os.path.exists(os.path.join(logs_output_path, 'Q-Y5JA.align.err')) - assert utils.count_lines(os.path.join(logs_output_path, 'Q-Y5JA.align.err')) == 7 - assert '15824858 reads; of these:' in open(os.path.join(logs_output_path, 'Q-Y5JA.align.err')).readlines()[0] - assert os.path.exists(os.path.join(logs_output_path, 'Q-Y5JA.align.out')) - assert utils.count_lines(os.path.join(logs_output_path, 'Q-Y5JA.align.out')) == 0 + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai')) \ No newline at end of file diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py index e204317e7a30bc05206d64535472963f35da0b5f..0c0cd7aa67ab0e47168f8bf438191f6a9c217b72 100644 --- a/workflow/tests/test_dedupReads.py +++ b/workflow/tests/test_dedupReads.py @@ -7,27 +7,8 @@ import utils 
data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../../' -logs_output_path = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../' - -@pytest.mark.dedupData -def test_dedupData_se(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.Metrics.txt')) @pytest.mark.dedupData -def test_dedupData_pe(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.deduped.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.deduped.Metrics.txt')) - - -@pytest.mark.dedupLogs -def test_dedupLogs_se(): - assert os.path.exists(os.path.join(logs_output_path, '16-1ZX4.dedup.err')) - assert os.path.exists(os.path.join(logs_output_path, '16-1ZX4.dedup.out')) - -@pytest.mark.dedupLogs -def test_dedupLogs_pe(): - assert os.path.exists(os.path.join(logs_output_path, 'Q-Y5JA.dedup.err')) - assert os.path.exists(os.path.join(logs_output_path, 'Q-Y5JA.dedup.out')) +def test_dedupData(): + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam')) \ No newline at end of file diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py index 6538a0081aaea69c5814d479751455abd879f4bc..a8f479afec7b0c97847179f5b7817987ee872a16 100644 --- a/workflow/tests/test_trimData.py +++ b/workflow/tests/test_trimData.py @@ -10,9 +10,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.trimData def test_trimData_se(): - assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz')) + assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se_trimmed.fq.gz')) + @pytest.mark.trimData def test_trimData_pe(): - assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz')) - assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz')) \ No newline at end of file + assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.pe_R1_val_1.fq.gz')) + assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.pe_R2_val_2.fq.gz')) \ No newline at end of file
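
Note on the new getRef process: its reference path is built from species, reference version, and spike-in status, with spike-in genomes carrying an "-S" suffix in the s3://bicf-references bucket. That resolution logic can be exercised outside Nextflow with a small standalone script. The following is a minimal sketch only — the bucket layout and version values are taken from the diff, while the script name and positional arguments are hypothetical and not part of the workflow.

#!/bin/bash
# getRefPath.sh (hypothetical) -- standalone sketch of the getRef
# reference-path resolution, for testing outside the pipeline.
set -e

referenceBase="s3://bicf-references"
refVersion="0.0.1"
refMuVersion="38.P6"
refHuVersion="38.p12"
species="${1:?usage: getRefPath.sh <species> <spike yes|no>}"
spike="${2:?usage: getRefPath.sh <species> <spike yes|no>}"

# Species selects the assembly prefix within the bucket.
if [ "${species}" == "Mus musculus" ]
then
  references="${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion}"
elif [ "${species}" == "Homo sapiens" ]
then
  references="${referenceBase}/human/${refVersion}/GRCh${refHuVersion}"
else
  echo "unknown species: ${species}" >&2
  exit 1
fi

# Spike-in references carry an "-S" suffix; both arms operate on the
# same "references" variable that is later handed to aws s3 cp.
if [ "${spike}" == "yes" ]
then
  references="${references}-S/"
else
  references="${references}/"
fi

echo "${references}"
# Example: ./getRefPath.sh "Homo sapiens" no
#   -> s3://bicf-references/human/0.0.1/GRCh38.p12/
# The process then pulls it with: aws s3 cp "${references}" ./ --recursive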