diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f35caee3a7ce07870b96d60475903727a785d2f6..cd792d6d1065cdaef00a8785cbb78a30f48409cc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,16 @@ getData:
   - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
   - pytest -m getData
 
+parseMetadata:
+  stage: unit
+  script:
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species
+
 trimData:
   stage: unit
   script:
diff --git a/workflow/conf/aws_ondemand.config b/workflow/conf/aws_ondemand.config
index 1a14ebf3dc44d33198c8472a231796f980e312da..84fcb275e131e30ea4a21ad829d67c368dca1811 100755
--- a/workflow/conf/aws_ondemand.config
+++ b/workflow/conf/aws_ondemand.config
@@ -13,14 +13,7 @@ process {
   cpus = 1
   memory = '1 GB'
 
-  withName:getBag {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
-  withName:getData {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
   withName:trimData {
-    container = 'bicf/trimgalore:1.1'
     cpus = 15
   }
 }
\ No newline at end of file
diff --git a/workflow/conf/aws_spot.config b/workflow/conf/aws_spot.config
index b5239a2388616beb2936e41020e5c387f87118a6..fbccb3cba394d2a1a7751bee7206483c869eac23 100755
--- a/workflow/conf/aws_spot.config
+++ b/workflow/conf/aws_spot.config
@@ -13,14 +13,7 @@ process {
   cpus = 1
   memory = '1 GB'
 
-  withName:getBag {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
-  withName:getData {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
   withName:trimData {
-    container = 'bicf/trimgalore:1.1'
     cpus = 15
   }
 }
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 20da91a7f7a241e610708d7186d299d397958c41..36d5b332611f5d234a692b545819ad44db8c381e 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -5,16 +5,16 @@
 process {
   withName:getBag {
     executor = 'local'
-    container = 'docker://bicf/gudmaprbkfilexfer:1.3'
   }
   withName:getData {
     executor = 'local'
-    container = 'docker://bicf/gudmaprbkfilexfer:1.3'
   }
   withName:trimData {
-    container = 'docker://bicf/trimgalore:1.1'
     queue = '256GB,256GBv1,384GB'
   }
+  withName:parseMetadata {
+    executor = 'local'
+  }
 }
 
 singularity {
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index 37584999cf8152c9776b676d0b013f8aeb5e8709..7782046e10ab0c49d186141867e5964e87ea52cb 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -10,6 +10,21 @@ profiles {
   }
 }
 
+process {
+  withName:getBag {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:getData {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:trimData {
+    container = 'bicf/trimgalore:1.1'
+  }
+  withName:parseMetadata {
+    container = 'bicf/python3:1.3'
+  }
+}
+
 trace {
   enabled = true
   file = 'pipeline_trace.txt'
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 68a65161a2dd082fad1e164b1b871ff35da17acf..5a8b614af62d339d32fa6b0bd92f8f00df77d673 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -1,8 +1,8 @@
 #!/usr/bin/env nextflow
 
 // Define input variables
-params.deriva = "${baseDir}/../test_data/credential.json"
-params.bdbag = "${baseDir}/../test_data/cookies.txt"
+params.deriva = "${baseDir}/../test_data/auth/credential.json"
+params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
 
 //params.repRID = "16-1ZX4"
 params.repRID = "Q-Y5JA"
@@ -25,6 +25,7 @@ derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
 
 // Define script files
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
+script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
 
 /*
  * getData: get bagit file from consortium
@@ -75,7 +76,6 @@ process getData {
     file("**/Experiment.csv") into experimentMeta
     file ("${repRID}.getData.err")
 
-
   script:
     """
     hostname >>${repRID}.getData.err
@@ -95,11 +95,67 @@ process getData {
     echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
 
     # bagit fetch fastq's only and rename by repRID
-    sh bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err
+    sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err
     echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
     """
 }
 
+/*
+ * parseMetadata: parses metadata to extract experiment parameters
+*/
+process parseMetadata {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err"
+
+  input:
+    path script_parseMeta
+    val repRID
+    path fileMeta
+    path experimentSettingsMeta
+    path experimentMeta
+
+  output:
+    path 'design.csv' into metadata
+
+  script:
+    """
+    hostname >>${repRID}.parseMetadata.err
+    ulimit -a >>${repRID}.parseMetadata.err
+
+    # Check replicate RID metadata
+    rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID)
+    echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID}.parseMetadata.err
+
+    # Get endedness metadata
+    endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta)
+    echo "LOG: endedness metadata parsed: \${endsMeta}" >>${repRID}.parseMetadata.err
+
+    # Manually detect endedness from the fastq count
+    endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual)
+    echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err
+
+    # Get strandedness metadata
+    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded)
+    echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err
+
+    # Get spike-in metadata
+    spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike)
+    echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID}.parseMetadata.err
+
+    # Get species metadata
+    species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species)
+    echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err
+
+    # Save design file
+    echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
+    """
+}
+
+metadata.splitCsv(sep: ',', header: false).into {
+  metadata_trimData
+  metadata_qc
+}
+
 /*
  * trimData: trims any adapter or non-host sequences from the data
 */
@@ -109,10 +165,10 @@ process trimData {
 
   input:
     file(fastq) from fastqs
+    tuple val(rep), val(endsMeta), val(endsManual), val(stranded), val(spike), val(species) from metadata_trimData
 
   output:
     path ("*.fq.gz") into fastqs_trimmed
-    val ends
     file ("${repRID}.trimData.log")
("${repRID}.trimData.log") file ("${repRID}.trimData.err") @@ -122,12 +178,10 @@ process trimData { ulimit -a >>${repRID}.trimData.err # trim fastqs - if [ '${fastq[1]}' == 'null' ] + if [ '${endsManual}' == 'se' ] then - ends='se' trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; else - ends='pe' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; fi """ diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py new file mode 100644 index 0000000000000000000000000000000000000000..e6accfbbaee5ac303253294f69f8f345c7ba1dc5 --- /dev/null +++ b/workflow/scripts/modifyFetch.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import re + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True) + args = parser.parse_args() + return args + +def main(): + args = get_args() + fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0) + fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"] + fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] + fetchFile_filtered_renamed = fetchFile_filtered + for i in fileFile_filtered["File_Name"]: + fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0]) + fetchFile_filtered_renamed.to_csv(args.files+"/fetch.txt",sep="\t",header=False,index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py new file mode 100644 index 0000000000000000000000000000000000000000..43ca2392078171ec9a1f42f7f9a83d13d0f0383b --- /dev/null +++ b/workflow/scripts/parseMeta.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True) + parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True) + parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True) + args = parser.parse_args() + return args + + +def main(): + args = get_args() + metaFile = pd.read_csv(args.metaFile,sep=",",header=0) + + # Check replicate RID metadata from 'File.csv' + if (args.parameter == "repRID"): + if (len(metaFile.Replicate_RID.unique()) > 1): + print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique())) + exit(1) + if not (metaFile.Replicate_RID.unique() == args.repRID): + print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID) + exit(1) + else: + rep=metaFile["Replicate_RID"].unique()[0] + print(rep) + if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2): + print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID)) + exit(1) + + # Get endedness metadata from 
+    # Get endedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "endsMeta"):
+        if (metaFile.Paired_End.unique() == "Single End"):
+            endsMeta = "se"
+        elif (metaFile.Paired_End.unique() == "Paired End"):
+            endsMeta = "pe"
+        else:
+            endsMeta = "uk"
+        print(endsMeta)
+
+    # Manually detect endedness from the fastq count in 'File.csv'
+    if (args.parameter == "endsManual"):
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
+            endsManual = "se"
+        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
+            endsManual = "pe"
+        else:
+            print("Unexpected number of fastqs in the metadata: " + str(len(metaFile[metaFile["File_Type"] == "FastQ"])))
+            exit(1)
+        print(endsManual)
+
+    # Get strandedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "stranded"):
+        if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
+            stranded = "stranded"
+        elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
+            stranded = "unstranded"
+        else:
+            print("Strandedness metadata does not match expected options: " + metaFile.Has_Strand_Specific_Information.unique()[0])
+            exit(1)
+        print(stranded)
+
+    # Get spike-in metadata from 'Experiment Settings.csv'
+    if (args.parameter == "spike"):
+        if (metaFile.Used_Spike_Ins.unique() == "yes"):
+            spike = "yes"
+        elif (metaFile.Used_Spike_Ins.unique() == "no"):
+            spike = "no"
+        else:
+            print("Spike-in metadata does not match expected options: " + metaFile.Used_Spike_Ins.unique()[0])
+            exit(1)
+        print(spike)
+
+    # Get species metadata from 'Experiment.csv'
+    if (args.parameter == "species"):
+        if (metaFile.Species.unique() == "Mus musculus"):
+            species = "Mus musculus"
+        elif (metaFile.Species.unique() == "Homo sapiens"):
+            species = "Homo sapiens"
+        else:
+            print("Species metadata does not match expected options: " + metaFile.Species.unique()[0])
+            exit(1)
+        print(species)
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py
new file mode 100644
index 0000000000000000000000000000000000000000..63385c184ff7bd6ddb4cb047c81a693cf7a8ecfb
--- /dev/null
+++ b/workflow/scripts/splitFetch.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import os
+import re
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--files', help="The directory unzipped from the bdbag, containing fetch.txt.", required=True)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    # Read the bdbag fetch file and the file-level metadata
+    fetchFile = pd.read_csv(args.files+"/fetch.txt", sep="\t", header=None)
+    fileFile = pd.read_csv(args.files+"/data/File.csv", sep=",", header=0)
+    # Group fastq URIs by their replicate RID
+    replicateRID = fileFile.Replicate_RID.unique()
+    fetchArray = {i: fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID}
+    # Write a per-replicate fetch.txt into a Replicate_<RID> directory
+    for i in replicateRID:
+        if not os.path.exists("Replicate_"+i):
+            os.mkdir("Replicate_"+i)
+        fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i].map(re.escape)))].to_csv("Replicate_"+i+"/fetch.txt", sep="\t", header=False, index=False)
+
+if __name__ == '__main__':
+    main()