diff --git a/.gitignore b/.gitignore
index 9b75a201f875574a221bd0e4bf073e5a0d0db406..f86b65e13780bc14b987372cb35d5dd8e310adcc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -298,4 +298,4 @@
 timeline*.html*
 *.out
 run*.sh
-!.gitkeep
+!.gitkeep
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6ea53a7f0dc2a8265a0a93ba8cfd4ef0dd9afc1b..5c9d184ccff41358ca43353aa909d8fda6e788fd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -32,27 +32,33 @@ parseMetadata:
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p ends
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual
-  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded
-  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p specie
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded -e se
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species
+
+getRef:
+  stage: unit
+  script:
+  - singularity run 'docker://bicf/awscli:1.1' aws s3 ls s3://bicf-references/mouse/0.0.1/GRCm38.P6
+  - singularity run 'docker://bicf/awscli:1.1' aws s3 ls s3://bicf-references/human/0.0.1/GRCh38.p12

 trimData:
   stage: unit
   script:
-  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename 16-1ZX4 -j `nproc` ./test_data/fastq/16-1ZX4.R1.fastq.gz
-  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA -j `nproc` ./test_data/fastq/Q-Y5JA.R1.fastq.gz ./test_data/fastq/Q-Y5JA.R2.fastq.gz
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5JA_1M.se -j `nproc` ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA_1M.pe -j `nproc` ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5JA_1M.R2.fastq.gz
   - pytest -m trimData

-alignReads:
+alignData:
   stage: unit
   script:
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA.unal.gz -S Q-Y5JA.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/Q-Y5JA_R1_val_1.fq.gz -2 ./test_data/fastq/Q-Y5JA_R2_val_2.fq.gz
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA.bam Q-Y5JA.sam
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA.sorted.bam Q-Y5JA.bam
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA.sorted.bam Q-Y5JA.sorted.bai
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz 16-1ZX4.unal.gz -S 16-1ZX4.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness FR -U ./test_data/fastq/16-1ZX4_trimmed.fq.gz
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o 16-1ZX4.bam 16-1ZX4.sam
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o 16-1ZX4.sorted.bam 16-1ZX4.bam
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b 16-1ZX4.sorted.bam 16-1ZX4.sorted.bai
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bai
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai
   - pytest -m alignData

 integration_se:
@@ -63,4 +69,4 @@ integration_se:
 integration_pe:
   stage: integration
   script:
-  - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA
+  - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9345688f2a0ae3ee68fb31d2947a772882c51926..8ff0911406a181f08e86e495c7530f87d6f43dae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,6 @@
 # v0.0.1 (in development)
 **User Facing**
-* Raw alignment files

 **Background**
-* Implementation of CI

 *Known Bugs*
diff --git a/workflow/conf/aws_ondemand.config b/workflow/conf/aws_ondemand.config
index 84fcb275e131e30ea4a21ad829d67c368dca1811..52043ab4da33c04a110605866acb3bc5eba750c2 100755
--- a/workflow/conf/aws_ondemand.config
+++ b/workflow/conf/aws_ondemand.config
@@ -13,7 +13,13 @@ process {
   cpus = 1
   memory = '1 GB'

+  withName:getRef {
+    cpus = 8
+  }
   withName:trimData {
     cpus = 15
   }
+  withName:alignData {
+    cpus = 50
+  }
 }
\ No newline at end of file
diff --git a/workflow/conf/aws_spot.config b/workflow/conf/aws_spot.config
index fbccb3cba394d2a1a7751bee7206483c869eac23..189ab7e02f21440583feb13606964ab67e7a4f78 100755
--- a/workflow/conf/aws_spot.config
+++ b/workflow/conf/aws_spot.config
@@ -13,7 +13,13 @@ process {
   cpus = 1
   memory = '1 GB'

+  withName:getRef {
+    cpus = 8
+  }
   withName:trimData {
     cpus = 15
   }
+  withName:alignData {
+    cpus = 50
+  }
 }
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index e2b1335e2a2f9a923931ad3f293cf364f689e69f..7d58dcc685fab609ac4c8ec71ec449871bc6abe5 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -12,10 +12,13 @@ process {
   withName:parseMetadata {
     executor = 'local'
   }
+  withName:getRef {
+    executor = 'local'
+  }
   withName:trimData {
     queue = '256GB,256GBv1,384GB'
   }
-  withName:alignReads {
+  withName:alignData {
     queue = '256GB,256GBv1,384GB'
   }
 }
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index acb710d45a2036fbfb83a0b5abb7a80ce64ce3ca..b78a6e4f3f2675f1ba585d91bed02d9b9fb63e37 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -20,10 +20,13 @@ process {
   withName:parseMetadata {
     container = 'bicf/python3:1.3'
   }
+  withName:getRef {
+    container = 'bicf/awscli:1.1'
+  }
   withName:trimData {
     container = 'bicf/trimgalore:1.1'
   }
-  withName: alignReads {
+  withName: alignData {
     container = 'bicf/gudmaprbkaligner:2.0.0'
   }
 }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index ff348297ea88cefa54d494cc1097f090f896fb41..21c77a09555cabf75011ceab149d3bed31f88e34 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -5,8 +5,10 @@ params.deriva = "${baseDir}/../test_data/auth/credential.json"
 params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
 //params.repRID = "16-1ZX4"
 params.repRID = "Q-Y5JA"
+params.refVersion = "0.0.1"
+params.refMuVersion = "38.P6"
+params.refHuVersion = "38.p12"
 params.outDir = "${baseDir}/../output"
-params.reference = "/project/BICF/BICF_Core/shared/gudmap/references"

 // Parse input variables
 deriva = Channel
@@ -16,12 +18,15 @@ bdbag = Channel
   .fromPath(params.bdbag)
   .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
 repRID = params.repRID
+refVersion = params.refVersion
+refMuVersion = params.refMuVersion
+refHuVersion = params.refHuVersion
 outDir = params.outDir
-referenceBase = file ("${params.reference}")
 logsDir = "${outDir}/Logs"

 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
+referenceBase = "s3://bicf-references"

 // Define script files
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
@@ -32,15 +37,15 @@ script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
 */
 process getBag {
   tag "${repRID}"
-  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err"
+  publishDir "${logsDir}", mode: "copy", pattern: "*.getBag.err"

   input:
-    path credential, stageAs: 'credential.json' from deriva
+    path credential, stageAs: "credential.json" from deriva
     path derivaConfig

   output:
     path ("Replicate_*.zip") into bagit
-    file ("${repRID}.getBag.err")
+    path ("${repRID}.getBag.err")

   script:
     """
@@ -62,19 +67,19 @@
 */
 process getData {
   tag "${repRID}"
-  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err"
+  publishDir "${logsDir}", mode: "copy", pattern: "*.getData.err"

   input:
     path script_bdbagFetch
-    path cookies, stageAs: 'deriva-cookies.txt' from bdbag
+    path cookies, stageAs: "deriva-cookies.txt" from bdbag
     path bagit

   output:
     path ("*.R{1,2}.fastq.gz") into fastqs
-    file("**/File.csv") into fileMeta
-    file("**/Experiment Settings.csv") into experimentSettingsMeta
-    file("**/Experiment.csv") into experimentMeta
-    file ("${repRID}.getData.err")
+    path ("**/File.csv") into fileMeta
+    path ("**/Experiment Settings.csv") into experimentSettingsMeta
+    path ("**/Experiment.csv") into experimentMeta
("${repRID}.getData.err") script: """ @@ -87,14 +92,14 @@ process getData { echo "LOG: deriva cookie linked" >>${repRID}.getData.err # get bagit basename - replicate=\$(basename "${bagit}" | cut -d '.' -f1) + replicate=\$(basename "${bagit}" | cut -d "." -f1) echo "LOG: \${replicate}" >>${repRID}.getData.err # unzip bagit unzip ${bagit} 2>>${repRID}.getData.err echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err - # bagit fetch fastq's only and rename by repRID + # bagit fetch fastq"s only and rename by repRID sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err """ @@ -105,7 +110,7 @@ process getData { */ process parseMetadata { tag "${repRID}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err" + publishDir "${logsDir}", mode: "copy", pattern: "*.parseMetadata.err" input: path script_parseMeta @@ -115,7 +120,7 @@ process parseMetadata { path experimentMeta output: - path 'design.csv' into metadata + path "design.csv" into metadata script: """ @@ -135,7 +140,7 @@ process parseMetadata { echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err # Get strandedness metadata - stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded) + stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded -e \${endsManual}) echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err # Get spike-in metadata @@ -146,27 +151,8 @@ process parseMetadata { species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species) echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err - #Set the correct Reference Directory - if [ "\${spike}" == 'yes' ]; then - if [ "\${species}" == 'Homo sapiens' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12-S/hisat2/genome"; - echo "LOG: Referece set to Homo sapiens with spike-in." >>${repRID}.parseMetadata.err; - elif [ "\${species}" == 'Mus musculus' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCm38.P6-S/hisat2/genome"; - echo "LOG: Referece set to Mus musculus with spike-in." >>${repRID}.parseMetadata.err; - fi; - else - if [ "\${species}" == 'Homo sapiens' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12/hisat2/genome"; - echo "LOG: Referece set to Homo sapiens without spike-in." >>${repRID}.parseMetadata.err; - elif [ "\${species}" == 'Mus musculus' ]; then - referenceDir="/project/BICF/BICF_Core/shared/gudmap/references/GRCm38.P6/hisat2/genome"; - echo "LOG: Referece set to Mus musculus without spike-in." 
-        echo "LOG: Referece set to Mus musculus without spike-in." >>${repRID}.parseMetadata.err;
-      fi;
-    fi;
-
     # Save design file
-    echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${referenceDir}" > design.csv
+    echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
     """
 }

@@ -177,25 +163,70 @@ endsManual = Channel.create()
 stranded = Channel.create()
 spike = Channel.create()
 species = Channel.create()
-referenceDir = Channel.create()
-metadata.splitCsv(sep: ',', header: false).separate(
+metadata.splitCsv(sep: ",", header: false).separate(
   rep,
   endsMeta,
   endsManual,
   stranded,
   spike,
-  species,
-  referenceDir
+  species
 )
 endsManual.into {
   endsManual_trimData
-  endsManual_alignReads
+  endsManual_alignData
 }
 stranded.into {
-  stranded_alignReads
+  stranded_alignData
 }
-referenceDir.into {
-  referenceDir_alignReads
+spike.into{
+  spike_getRef
+}
+species.into {
+  species_getRef
 }
+
+/*
+ * getRef: Downloads appropriate reference
+*/
+process getRef {
+  tag "${species_getRef}"
+  publishDir "${logsDir}", mode: "copy", pattern: "*.getRef.err"
+
+  input:
+    val referenceBase
+    val refVersion
+    val refMuVersion
+    val refHuVersion
+    val spike_getRef
+    val species_getRef
+
+  output:
+    path ("*") into reference
+
+  script:
+    """
+    hostname >>${repRID}.getRef.err
+    ulimit -a >>${repRID}.getRef.err
+    export https_proxy=\${http_proxy}
+
+    if [ "${species_getRef}" == "Mus musculus" ]
+    then
+      references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion})
+    elif [ "${species_getRef}" == "Homo sapiens" ]
+    then
+      references=\$(echo ${referenceBase}/human/${refVersion}/GRCh${refHuVersion})
+    else
+      exit 1
+    fi
+    if [ "${spike_getRef}" == "yes" ]
+    then
+      references=\$(echo \${references}-S/)
+    elif [ "${spike_getRef}" == "no" ]
+    then
+      references=\$(echo \${references}/)
+    fi
+    aws s3 cp "\${references}" ./ --recursive >>${repRID}.getRef.err
+    """
+}

 /*
@@ -203,16 +234,16 @@ referenceDir.into {
 */
 process trimData {
   tag "${repRID}"
-  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.trimData.*"
+  publishDir "${logsDir}", mode: "copy", pattern: "*.trimData.*"

   input:
     val endsManual_trimData
-    file(fastq) from fastqs
+    path (fastq) from fastqs

   output:
     path ("*.fq.gz") into fastqs_trimmed
-    file ("${repRID}.trimData.log")
-    file ("${repRID}.trimData.err")
+    path ("${repRID}.trimData.log")
+    path ("${repRID}.trimData.err")

   script:
     """
@@ -220,41 +251,47 @@
     ulimit -a >>${repRID}.trimData.err

     # trim fastqs
-    if [ '${endsManual_trimData}' == 'se' ]
+    if [ "${endsManual_trimData}" == "se" ]
     then
-      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
-    else
-      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err
+    elif [ "${endsManual_trimData}" == "pe" ]
+    then
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err
     fi
     """
 }

 /*
- * alignReads: aligns the reads to a reference database
+ * alignData: aligns the reads to a reference database
 */
-process alignReads {
+process alignData {
   tag "${repRID}"
-  publishDir "${outDir}/aligned", mode: "copy", pattern: "${repRID}.{unal,sorted}.{gz,bam,bai}"
-  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.align.{out,err}"
+  publishDir "${logsDir}", mode: "copy", pattern: "*.align.{out,err}"

   input:
-    val endsManual_alignReads
-    val stranded_alignReads
-    path fqs from fastqs_trimmed
-    val referenceDir_alignReads
+    val endsManual_alignData
+    val stranded_alignData
+    path fastq from fastqs_trimmed
+    path reference

   output:
-    set repRID, file ("${repRID}.unal.gz"), file ("${repRID}.sorted.bam"), file ("${repRID}.sorted.bai")
-    set repRID, file ("${repRID}.align.out"), file ("${repRID}.align.err")
+    path ("${repRID}.unal.gz")
+    path ("${repRID}.sorted.bam")
+    path ("${repRID}.sorted.bai")
+    path ("${repRID}.align.out")
+    path ("${repRID}.align.err")

   script:
     """
-    if [ "${endsManual_alignReads}" == 'pe' ]; then
-      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} --no-mixed --no-discordant -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2>${repRID}.align.err;
-    else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${referenceDir_alignReads} ${stranded_alignReads} -U ${fqs[0]} 1>${repRID}.align.out 2>${repRID}.align.err;
-    fi;
-    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
-    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
-    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    if [ "${endsManual_alignData}" == "se" ]
+    then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err
+    elif [ "${endsManual_alignData}" == "pe" ]
+    then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err
+    fi
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err
+    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err
     """
-}
+}
\ No newline at end of file
diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py
deleted file mode 100644
index e6accfbbaee5ac303253294f69f8f345c7ba1dc5..0000000000000000000000000000000000000000
--- a/workflow/scripts/modifyFetch.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import pandas as pd
-import re
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True)
-    args = parser.parse_args()
-    return args
-
-def main():
-    args = get_args()
-    fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
-    fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
-    fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"]
-    fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"]
-    fetchFile_filtered_renamed = fetchFile_filtered
-    for i in fileFile_filtered["File_Name"]:
-        fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0])
-    fetchFile_filtered_renamed.to_csv(args.files+"/fetch.txt",sep="\t",header=False,index=False)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py
index 4797610f870f8e8f22024e0198dc0255c0829073..eaa2656f9266a5c3de07fb0fc80ad5aeb2ea8422 100644
--- a/workflow/scripts/parseMeta.py
+++ b/workflow/scripts/parseMeta.py
@@ -10,6 +10,7 @@ def get_args():
     parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
     parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
     parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
+    parser.add_argument('-e', '--endsManual',help="The endedness.",required=False)
     args = parser.parse_args()
     return args

@@ -55,9 +56,9 @@ def main():
     # Get strandedness metadata from 'Experiment Settings.csv'
     if (args.parameter == "stranded"):
         if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
-            if endsManual == "se":
+            if (args.endsManual=="se"):
                 stranded = "--rna-strandness F"
-            else:
+            elif (args.endsManual=="pe"):
                 stranded = "--rna-strandness FR"
         elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
             stranded = ""
diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py
deleted file mode 100644
index 63385c184ff7bd6ddb4cb047c81a693cf7a8ecfb..0000000000000000000000000000000000000000
--- a/workflow/scripts/splitFetch.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import pandas as pd
-import os
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True)
-    args = parser.parse_args()
-    return args
-
-def main():
-    args = get_args()
-    fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
-    fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
-    replicateRID = fileFile.Replicate_RID.unique()
-    fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID}
-    for i in replicateRID:
-        if not os.path.exists(i):
-            os.mkdir("Replicate_"+i)
-        fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv("Replicate_"+i+"/fetch.txt",sep="\t",header=False,index=False)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py
index 79770bb42245df6da0ce7fa1081d78ec5aed736f..7c20dd3f269e202e824285daa00a85d4301b3278 100644
--- a/workflow/tests/test_alignReads.py
+++ b/workflow/tests/test_alignReads.py
@@ -7,39 +7,33 @@ import utils

 data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'
-logs_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'


 @pytest.mark.alignData
 def test_alignData_se():
-    assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.unal.gz'))
-    assert utils.count_lines(os.path.join(data_output_path, '16-1ZX4.unal.gz')) == 3070528
-    assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.sorted.bam'))
-    assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.sorted.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.unal.gz'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bai'))


 @pytest.mark.alignData
 def test_alignData_pe():
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.unal.gz'))
-    assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.unal.gz')) == 0
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.sorted.bam'))
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.sorted.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))


 @pytest.mark.alignLogs
 def test_alignLogs_se():
-    assert os.path.exists(os.path.join(logs_output_path, '16-1ZX4.align.err'))
-    assert utils.count_lines(os.path.join(logs_output_path, '16-1ZX4.align.err')) == 7
-    assert '34497376 reads; of these:' in open(os.path.join(logs_output_path, '16-1ZX4.align.err')).readlines()[0]
-    assert os.path.exists(os.path.join(logs_output_path, '16-1ZX4.align.out'))
-    assert utils.count_lines(os.path.join(logs_output_path, '16-1ZX4.align.out')) == 0
+    assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.align.err'))
+    assert '34497376 reads; of these:' in open(os.path.join(data_output_path, '16-1ZX4.align.err')).readlines()[0]
+    assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.align.out'))


 @pytest.mark.alignLogs
 def test_alignLogs_pe():
-    assert os.path.exists(os.path.join(logs_output_path, 'Q-Y5JA.align.err'))
-    assert utils.count_lines(os.path.join(logs_output_path, 'Q-Y5JA.align.err')) == 7
-    assert '15824858 reads; of these:' in open(os.path.join(logs_output_path, 'Q-Y5JA.align.err')).readlines()[0]
-    assert os.path.exists(os.path.join(logs_output_path, 'Q-Y5JA.align.out'))
-    assert utils.count_lines(os.path.join(logs_output_path, 'Q-Y5JA.align.out')) == 0
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.align.err'))
+    assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.align.err')) == 7
+    assert '15824858 reads; of these:' in open(os.path.join(data_output_path, 'Q-Y5JA.align.err')).readlines()[0]
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.align.out'))
+    assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.align.out')) == 0
diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py
index 6538a0081aaea69c5814d479751455abd879f4bc..a8f479afec7b0c97847179f5b7817987ee872a16 100644
--- a/workflow/tests/test_trimData.py
+++ b/workflow/tests/test_trimData.py
@@ -10,9 +10,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \

 @pytest.mark.trimData
 def test_trimData_se():
-    assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se_trimmed.fq.gz'))
+

 @pytest.mark.trimData
 def test_trimData_pe():
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz'))
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.pe_R1_val_1.fq.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.pe_R2_val_2.fq.gz'))
\ No newline at end of file
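
For orientation, the reference-path logic encoded by the new getRef process can be exercised outside Nextflow. The following is a minimal standalone sketch, assuming only the s3://bicf-references layout used above (mouse/0.0.1/GRCm38.P6 and human/0.0.1/GRCh38.p12, with a -S suffix for spike-in builds) and an installed AWS CLI; the species and spike values are illustrative stand-ins for the parsed metadata:

#!/bin/bash
# Sketch of getRef's S3 reference-path construction (illustrative values).
referenceBase="s3://bicf-references"
refVersion="0.0.1"
species="Mus musculus"   # parsed from Experiment.csv; or "Homo sapiens"
spike="no"               # parsed from Experiment Settings.csv; "yes" selects the -S build

if [ "${species}" == "Mus musculus" ]
then
  references="${referenceBase}/mouse/${refVersion}/GRCm38.P6"
elif [ "${species}" == "Homo sapiens" ]
then
  references="${referenceBase}/human/${refVersion}/GRCh38.p12"
else
  echo "unknown species: ${species}" >&2
  exit 1
fi
if [ "${spike}" == "yes" ]
then
  references="${references}-S/"
else
  references="${references}/"
fi

# List the selected reference tree; the pipeline itself fetches it with
# `aws s3 cp "${references}" ./ --recursive`.
aws s3 ls "${references}"

Note that both spike branches must assign the same variable (references) before the copy; the hisat2 index is then expected at hisat2/genome inside the downloaded tree, which is the path alignData passes to hisat2 -x.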