diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f35caee3a7ce07870b96d60475903727a785d2f6..cd792d6d1065cdaef00a8785cbb78a30f48409cc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,15 @@ getData:
     - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
     - pytest -m getData
 
+parseMetadata:
+  stage: unit
+  script:
+    - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID
+    - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta
+    - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual
+    - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded
+    - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species
+
 trimData:
   stage: unit
   script:
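
Note: the `metaTest.csv` fixture referenced above is not part of this diff. Because the same file stands in for `File.csv`, `Experiment Settings.csv`, and `Experiment.csv`, it has to carry every column that `parseMeta.py` (added below under `workflow/scripts/`) reads. A hypothetical sketch of a minimal fixture — the column names come from the script, the values are invented:

```python
# Hypothetical sketch of the combined CI fixture; the real test_data/meta/metaTest.csv
# is not shown in this diff. Column names are the ones parseMeta.py reads.
import os
import pandas as pd

meta = pd.DataFrame({
    "Replicate_RID": ["Replicate_RID"] * 2,          # matches the -r argument used in CI
    "RID": ["1-AAAA", "1-AAAB"],                     # made-up file RIDs
    "File_Type": ["FastQ"] * 2,                      # two FastQ rows -> endsManual == "pe"
    "Paired_End": ["Paired End"] * 2,                # -> endsMeta == "pe"
    "Has_Strand_Specific_Information": ["no"] * 2,   # -> stranded == "unstranded"
    "Used_Spike_Ins": ["no"] * 2,                    # -> spike == "no"
    "Species": ["Mus musculus"] * 2,                 # -> species == "Mus musculus"
})
os.makedirs("test_data/meta", exist_ok=True)
meta.to_csv("test_data/meta/metaTest.csv", index=False)
```

With two `FastQ` rows, `-p endsManual` prints `pe`, and an `-r` value equal to the `Replicate_RID` column lets `-p repRID` pass its consistency checks.
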
"${baseDir}/../test_data/auth/cookies.txt" //params.repRID = "16-1ZX4" params.repRID = "Q-Y5JA" -params.spikein = "false" -params.reference = "mouse" params.outDir = "${baseDir}/../output" @@ -17,60 +15,23 @@ deriva = Channel bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } - repRID = params.repRID outDir = params.outDir logsDir = "${outDir}/Logs" -reference = params.reference - -if (params.spikein) { - if (params.reference == "human") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2") - } else if (params.reference == "mouse") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2") - } -} else if (params.reference == "mouse") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2") -} else if (params.reference == "human") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2") -} else { - print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in") - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2") -} // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") -/* - * Pass in programs from the scripts directory, and any default files for later - */ - - -/* - *Checking the species and spike-in status - */ -if (params.spikein) { - if (params.species == "human") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2") - } else if (params.species == "mouse") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2") - } -} else if (params.species == "mouse") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2") -} else if (params.species == "human") { - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2") -} else { - print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in") - reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2") -} +// Define script files +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") +script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") /* * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid */ process getBag { - executor 'local' tag "${repRID}" - publishDir "${logsDir}/getBag", mode: 'copy', pattern: "${repRID}.getBag.err" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err" input: path credential, stageAs: 'credential.json' from deriva @@ -85,8 +46,12 @@ process getBag { hostname >>${repRID}.getBag.err ulimit -a >>${repRID}.getBag.err export https_proxy=\${http_proxy} + + # link credential file for authentication ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err echo "LOG: deriva credentials linked" >>${repRID}.getBag.err + + # deriva-download replicate RID deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . 
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 8ac3b27bb3c860abaffdc535f8fb1cee92eef254..e2b8d3c33d91a9ac8bad99a8502ad0ac09fefc35 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -1,12 +1,10 @@
 #!/usr/bin/env nextflow
 
 // Define input variables
-params.deriva = "${baseDir}/../test_data/credential.json"
-params.bdbag = "${baseDir}/../test_data/cookies.txt"
+params.deriva = "${baseDir}/../test_data/auth/credential.json"
+params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
 //params.repRID = "16-1ZX4"
 params.repRID = "Q-Y5JA"
-params.spikein = "false"
-params.reference = "mouse"
 
 params.outDir = "${baseDir}/../output"
 
@@ -17,60 +15,23 @@ deriva = Channel
 bdbag = Channel
   .fromPath(params.bdbag)
   .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
-
 repRID = params.repRID
 
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
-reference = params.reference
-
-if (params.spikein) {
-  if (params.reference == "human") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
-  } else if (params.reference == "mouse") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
-  }
-} else if (params.reference == "mouse") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-} else if (params.reference == "human") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
-} else {
-  print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-}
 
 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
 
-/*
- * Pass in programs from the scripts directory, and any default files for later
- */
-
-
-/*
- *Checking the species and spike-in status
- */
-if (params.spikein) {
-  if (params.species == "human") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
-  } else if (params.species == "mouse") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
-  }
-} else if (params.species == "mouse") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-} else if (params.species == "human") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
-} else {
-  print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-}
+// Define script files
+script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
+script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
 
 /*
  * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
 */
 process getBag {
-  executor 'local'
   tag "${repRID}"
-  publishDir "${logsDir}/getBag", mode: 'copy', pattern: "${repRID}.getBag.err"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err"
 
   input:
     path credential, stageAs: 'credential.json' from deriva
@@ -85,8 +46,12 @@ process getBag {
     hostname >>${repRID}.getBag.err
     ulimit -a >>${repRID}.getBag.err
     export https_proxy=\${http_proxy}
+
+    # link credential file for authentication
     ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err
     echo "LOG: deriva credentials linked" >>${repRID}.getBag.err
+
+    # deriva-download replicate RID
     deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err
     """
 }
 
@@ -96,10 +61,10 @@
  */
 process getData {
   tag "${repRID}"
-  publishDir "${logsDir}/getData", mode: 'copy', pattern: "${repRID}.getData.err"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err"
 
   input:
-    executor 'local'
+    path script_bdbagFetch
     path cookies, stageAs: 'deriva-cookies.txt' from bdbag
     path bagit
@@ -110,49 +75,149 @@ process getData {
     file("**/Experiment.csv") into experimentMeta
     file ("${repRID}.getData.err")
-
   script:
     """
     hostname >>${repRID}.getData.err
     ulimit -a >>${repRID}.getData.err
     export https_proxy=\${http_proxy}
+
+    # link deriva cookie for authentication
     ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err
     echo "LOG: deriva cookie linked" >>${repRID}.getData.err
+
+    # get bagit basename
     replicate=\$(basename "${bagit}" | cut -d '.' -f1)
     echo "LOG: \${replicate}" >>${repRID}.getData.err
+
+    # unzip bagit
     unzip ${bagit} 2>>${repRID}.getData.err
     echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err
+
+    # bagit fetch fastqs only and rename by repRID
+    sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err
     echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
     """
 }
 
+/*
+ * parseMetadata: parses metadata to extract experiment parameters
+*/
+process parseMetadata {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err"
+
+  input:
+    path script_parseMeta
+    val repRID
+    path fileMeta
+    path experimentSettingsMeta
+    path experimentMeta
+
+  output:
+    path 'design.csv' into metadata
+
+  script:
+    """
+    hostname >>${repRID}.parseMetadata.err
+    ulimit -a >>${repRID}.parseMetadata.err
+
+    # Check replicate RID metadata
+    rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID)
+    echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID}.parseMetadata.err
+
+    # Get endedness metadata
+    endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta)
+    echo "LOG: endedness metadata parsed: \${endsMeta}" >>${repRID}.parseMetadata.err
+
+    # Manually get endedness
+    endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual)
+    echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err
+
+    # Get strandedness metadata
+    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded)
+    echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err
+
+    # Get spike-in metadata
+    spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike)
+    echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID}.parseMetadata.err
+
+    # Get species metadata
+    species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species)
+    echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err
+
+    # Save design file
+    echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
+    """
+}
+
+// Split metadata into separate channels
+rep = Channel.create()
+endsMeta = Channel.create()
+endsManual = Channel.create()
+stranded = Channel.create()
+spike = Channel.create()
+species = Channel.create()
+metadata.splitCsv(sep: ',', header: false).separate(
+  rep,
+  endsMeta,
+  endsManual,
+  stranded,
+  spike,
+  species
+)
+
+// Exit with no species
+if (species == "") {
+  print ("ERROR: Species metadata not found, reference genome cannot be set")
+  exit 1
+}
+
+// Setting references
+if (spike) {
+  if (species == "Homo sapiens") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
+  } else if (species == "Mus musculus") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
+  }
+} else {
+  if (species == "Homo sapiens") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
+  } else if (species == "Mus musculus") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
+  }
+}
+
 /*
  * trimData: trims any adapter or non-host sequences from the data
 */
 process trimData {
   tag "${repRID}"
-  publishDir "${logsDir}/trimData", mode: 'copy', pattern: "\${repRID}.trimData.*"
+  publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*"
 
   input:
     file(fastq) from fastqs
 
   output:
-    tuple file ("params.csv"), file ("*.fq.gz") into fastqs_trimmed
+    path ("*.fq.gz") into fastqs_trimmed
     file ("${repRID}.trimData.log")
     file ("${repRID}.trimData.err")
 
   script:
     """
-    if [ '${fastq[1]}' == 'null' ]
+    hostname >>${repRID}.trimData.err
+    ulimit -a >>${repRID}.trimData.err
+
+    # trim fastqs
+    if [ '${endsManual}' == 'se' ]
     then
-      ends='se'
      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
     else
-      ends='pe'
      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
     fi
-    echo \${ends} > params.csv;
     """
 }
 
@@ -161,23 +226,21 @@
 */
 process alignReads {
   tag "align-${repRID}"
-  publishDir "${outDir}/tempOut/aligned", mode: "copy"
+  publishDir "${outDir}/aligned", mode: "copy"
 
   input:
-    set ends, fqs from fastqs_trimmed
-    file reference
+    path fqs from fastqs_trimmed
 
   output:
     set repRID, file ("${repRID}.unal.gz"), file ("${repRID}.bam"), file ("${repRID}.bai")
 
   script:
     """
-    ends=`cat ${ends}`;
-    if [ "\${ends}" == 'pe' ]; then
+    if [ "${endsManual}" == 'pe' ]; then
      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2> ${repRID}.align.err;
     else
      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -U ${fqs[0]} 1>${repRID}.align.out 2> ${repRID}.align.err;
     fi;
     samtools view -1 --threads `nproc` -o ${repRID}.bam ${repRID}.sam 1>${repRID}.align.out 2> ${repRID}.align.err;
     samtools sort -@ `nproc` -O BAM ${repRID}.bam 1>${repRID}.align.out 2> ${repRID}.align.err;
     """
-}
+}
\ No newline at end of file
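
Note on the new channel plumbing: `parseMetadata` writes its six values as one CSV row, and `splitCsv(...).separate(...)` fans that row out into six channels in the order written (`rep,endsMeta,endsManual,stranded,spike,species`). Two things worth double-checking: after `Channel.create()`/`separate()`, names like `species` appear to be dataflow channel objects, so the top-level `if (species == "")` guard and the reference-selection conditionals may compare the channel itself rather than the parsed string; and since `spike` is parsed as the string `"yes"`/`"no"`, `if (spike)` is truthy either way. For debugging a run, the row can also be inspected directly; a minimal illustrative parser (not part of the pipeline, field order taken from the `echo` above):

```python
# Illustrative only: reads back the single design row written by parseMetadata,
# in the field order the process echoes: rep,endsMeta,endsManual,stranded,spike,species
import csv

with open("design.csv") as handle:  # produced by a pipeline run
    rep, endsMeta, endsManual, stranded, spike, species = next(csv.reader(handle))

print(rep, endsMeta, endsManual, stranded, spike, species)
```
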
Reference genome not specified") + exit 1 +} + +// Setting references +if (spike) { + if (species == "Homo sapiens") { + reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2") + } else if (species == "Mus musculus") { + reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2") + } +} else { + if (species == "Homo sapiens") { + reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2") + } else if (species == "Mus musculus") { + reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2") + } +} + + + + + /* * trimData: trims any adapter or non-host sequences from the data */ process trimData { tag "${repRID}" - publishDir "${logsDir}/trimData", mode: 'copy', pattern: "\${repRID}.trimData.*" + publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*" input: file(fastq) from fastqs output: - tuple file ("params.csv"), file ("*.fq.gz") into fastqs_trimmed + path ("*.fq.gz") into fastqs_trimmed file ("${repRID}.trimData.log") file ("${repRID}.trimData.err") script: """ - if [ '${fastq[1]}' == 'null' ] + hostname >>${repRID}.trimData.err + ulimit -a >>${repRID}.trimData.err + + # trim fastqs + if [ '${endsManual}' == 'se' ] then - ends='se' trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; else - ends='pe' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; fi - echo \${ends} > params.csv; """ } @@ -161,23 +226,21 @@ process trimData { */ process alignReads { tag "align-${repRID}" - publishDir "${outDir}/tempOut/aligned", mode: "copy" + publishDir "${outDir}/aligned", mode: "copy" input: - set ends, fqs from fastqs_trimmed - file reference + path fqs from fastqs_trimmed output: set repRID, file ("${repRID}.unal.gz"), file ("${repRID}.bam"), file ("${repRID}.bai") script: """ - ends=`cat ${ends}`; - if [ "\${ends}" == 'pe' ]; then + if [ "${endsManual}" == 'pe' ]; then hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2> ${repRID}.align.err; else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -U ${fqs[0]} 1>${repRID}.align.out 2> ${repRID}.align.err; fi; samtools view -1 --threads `nproc` -o ${repRID}.bam ${repRID}.sam 1>${repRID}.align.out 2> ${repRID}.align.err; samtools sort -@ `nproc` -O BAM ${repRID}.bam 1>${repRID}.align.out 2> ${repRID}.align.err; """ -} +} \ No newline at end of file diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py index 82b1d4c50a17cf34d9410ef0889dc7ca0d112d84..e6accfbbaee5ac303253294f69f8f345c7ba1dc5 100644 --- a/workflow/scripts/modifyFetch.py +++ b/workflow/scripts/modifyFetch.py @@ -6,20 +6,20 @@ import re def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True) + parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True) args = parser.parse_args() return args def main(): args = get_args() - fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) - fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + fetchFile = 
diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py
new file mode 100644
index 0000000000000000000000000000000000000000..43ca2392078171ec9a1f42f7f9a83d13d0f0383b
--- /dev/null
+++ b/workflow/scripts/parseMeta.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
+    parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
+    parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
+
+    # Check replicate RID metadata from 'File.csv'
+    if (args.parameter == "repRID"):
+        if (len(metaFile.Replicate_RID.unique()) > 1):
+            print("There are multiple replicate RIDs in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
+            exit(1)
+        if not (metaFile.Replicate_RID.unique() == args.repRID):
+            print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique()[0] + " vs " + args.repRID)
+            exit(1)
+        else:
+            rep = metaFile["Replicate_RID"].unique()[0]
+            print(rep)
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
+            print("There are more than 2 fastqs in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
+            exit(1)
+
+    # Get endedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "endsMeta"):
+        if (metaFile.Paired_End.unique() == "Single End"):
+            endsMeta = "se"
+        elif (metaFile.Paired_End.unique() == "Paired End"):
+            endsMeta = "pe"
+        else:
+            endsMeta = "uk"
+        print(endsMeta)
+
+    # Manually get endedness from the fastq count in 'File.csv'
+    if (args.parameter == "endsManual"):
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
+            endsManual = "se"
+        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
+            endsManual = "pe"
+        print(endsManual)
+
+    # Get strandedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "stranded"):
+        if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
+            stranded = "stranded"
+        elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
+            stranded = "unstranded"
+        else:
+            print("Strandedness metadata does not match expected options: " + str(metaFile.Has_Strand_Specific_Information.unique()))
+            exit(1)
+        print(stranded)
+
+    # Get spike-in metadata from 'Experiment Settings.csv'
+    if (args.parameter == "spike"):
+        if (metaFile.Used_Spike_Ins.unique() == "yes"):
+            spike = "yes"
+        elif (metaFile.Used_Spike_Ins.unique() == "no"):
+            spike = "no"
+        else:
+            print("Spike-in metadata does not match expected options: " + str(metaFile.Used_Spike_Ins.unique()))
+            exit(1)
+        print(spike)
+
+    # Get species metadata from 'Experiment.csv'
+    if (args.parameter == "species"):
+        if (metaFile.Species.unique() == "Mus musculus"):
+            species = "Mus musculus"
+        elif (metaFile.Species.unique() == "Homo sapiens"):
+            species = "Homo sapiens"
+        else:
+            print("Species metadata does not match expected options: " + str(metaFile.Species.unique()))
+            exit(1)
+        print(species)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
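
`parseMeta.py` extracts exactly one parameter per invocation and prints it to stdout; the `parseMetadata` process captures it with `\$(...)` command substitution, and a failed validation exits 1 with the reason printed instead. A sketch of driving it the same way from Python — the paths mirror the repo layout, the call itself is illustrative:

```python
# Sketch of calling parseMeta.py the way the parseMetadata process does:
# one invocation per parameter, value captured from stdout.
import subprocess

def parse_meta(rep_rid, meta_file, parameter):
    result = subprocess.run(
        ["python3", "workflow/scripts/parseMeta.py",
         "-r", rep_rid, "-m", meta_file, "-p", parameter],
        capture_output=True, text=True, check=True)  # check=True surfaces exit(1) failures
    return result.stdout.strip()

# Hypothetical call mirroring the CI job
print(parse_meta("Replicate_RID", "test_data/meta/metaTest.csv", "endsManual"))
```
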
diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py
index c8f60043be43a70ae570800f6edb117923d91810..63385c184ff7bd6ddb4cb047c81a693cf7a8ecfb 100644
--- a/workflow/scripts/splitFetch.py
+++ b/workflow/scripts/splitFetch.py
@@ -6,14 +6,14 @@ import os
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
+    parser.add_argument('-f', '--files',help="The fetch file from the bdbag zip.",required=True)
     args = parser.parse_args()
     return args
 
 
 def main():
     args = get_args()
-    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
-    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
+    fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
+    fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
     replicateRID = fileFile.Replicate_RID.unique()
     fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID}
     for i in replicateRID:
diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py
index ea75252f558adc95e5feb564551afbe3f8858cb0..6538a0081aaea69c5814d479751455abd879f4bc 100644
--- a/workflow/tests/test_trimData.py
+++ b/workflow/tests/test_trimData.py
@@ -9,7 +9,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'
 
 @pytest.mark.trimData
-def test_trimData():
+def test_trimData_se():
     assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz'))
+
+@pytest.mark.trimData
+def test_trimData_pe():
     assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz'))
     assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz'))
\ No newline at end of file
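
The split into `test_trimData_se`/`test_trimData_pe` encodes Trim Galore's two output-naming schemes, which is what the assertions check: the single-end run yields `<repRID>_trimmed.fq.gz`, while the paired-end run yields `*_R1_val_1.fq.gz`/`*_R2_val_2.fq.gz`. An illustrative helper capturing the convention the tests assert (not part of the repo):

```python
# Illustrative helper: predicts trimmed-output names per endedness,
# matching the file names asserted in test_trimData.py.
def trimmed_names(basename: str, ends: str) -> list:
    if ends == "se":
        return [f"{basename}_trimmed.fq.gz"]
    if ends == "pe":
        return [f"{basename}_R1_val_1.fq.gz", f"{basename}_R2_val_2.fq.gz"]
    raise ValueError(f"unknown endedness: {ends}")

assert trimmed_names("16-1ZX4", "se") == ["16-1ZX4_trimmed.fq.gz"]
assert trimmed_names("Q-Y5JA", "pe") == ["Q-Y5JA_R1_val_1.fq.gz", "Q-Y5JA_R2_val_2.fq.gz"]
```
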