diff --git a/.gitignore b/.gitignore index 2bc34493af5ad8d30a9ef477283aa7c4b32700b8..8b4b1eadf6253fc94cefe75b485a051ed8f3d71e 100644 --- a/.gitignore +++ b/.gitignore @@ -297,7 +297,6 @@ $RECYCLE.BIN/ # nextflow analysis folders/files /test_data/* -/workflow/docker/images/* /workflow/.nextflow/* /workflow/work/* /workflow/output/* diff --git a/docs/GUDMAP.RBK Pipeline.docx b/docs/GUDMAP.RBK Pipeline.docx index deae8a8fbfb7adc32ba2fba03a25eca6af57b4d7..7230d3575b627acad8496bb24aabfc9366507a77 100755 Binary files a/docs/GUDMAP.RBK Pipeline.docx and b/docs/GUDMAP.RBK Pipeline.docx differ diff --git a/nextflow.config b/nextflow.config deleted file mode 100644 index 28777047bfa85b13d08a0df02a22e6eac6d66540..0000000000000000000000000000000000000000 --- a/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -profiles { - standard { - includeConfig 'workflow/conf/biohpc.config' - } -} diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 10bb43a6cc67fdf1942b582e908ace5ed4877c36..5203ec8faf03460601a3cfa4340c840ada365431 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -3,6 +3,9 @@ process { queue = 'super' // Process specific configuration + withName:splitData { + container = 'docker://bicf/bdbag:1.0' + } withName:getData { container = 'docker://bicf/bdbag:1.0' } diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/getData b/workflow/docker/getData deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/temp b/workflow/docker/temp deleted file mode 100644 index 
f7dcb3af08981d465bf0838d09de1b38e9e0c5aa..0000000000000000000000000000000000000000 --- a/workflow/docker/temp +++ /dev/null @@ -1,14 +0,0 @@ - - -RUN apt-get install -y python3.7 python3-pip - -RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ - rm Miniconda3-latest-Linux-x86_64.sh -ENV PATH=/miniconda/bin:${PATH} -RUN conda config --add channels defaults && \ - conda config --add channels bioconda && \ - conda config --add channels conda-forge && \ - conda update -n base -c defaults -y conda - -RUN pip install --upgrade pip diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 86e537b648edd8691f1e0c5bff59066b751a2187..d55fb81ac18df0bc4b6ca3d2819cb83f929a075b 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,34 +1,36 @@ #!/usr/bin/env nextflow // Define input variables +params.deriva = "${baseDir}/../test_data/deriva-cookies.txt" params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" params.outDir = "${baseDir}/../output" // Parse input variables +deriva = Channel + .fromPath(params.deriva) + .ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" } bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } outDir = params.outDir - /* - * getData: fetch study files from consortium with downloaded bdbag.zip - * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file) + * splitData: split bdbag files by replicate so fetch can occur in parallel */ - process getData { - publishDir "${outDir}/temp/getData", mode: "symlink" -// conda "${baseDir}/conf/conda.env.bdbag.yml" +process splitData { + tag "${bdbag.baseName}" + publishDir "${outDir}/temp/${task.process}", mode: "symlink" - input: + input: file bdbag output: - file("**/*.R*.fastq.gz") into fastqPaths - file("**/File.csv") into 
filePaths - file("**/Experiment Settings.csv") into experimentSettingsPaths - file("**/Experiment.csv") into experimentPaths + file("Replicate_*.zip") into bdbagSplit mode flatten + file("${bdbag.baseName}/data/File.csv") into fileMeta + file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta + file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta script: """ @@ -40,10 +42,40 @@ outDir = params.outDir echo LOG: bdgag unzipped python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} echo LOG: fetch file filtered for only .fastq.gz - #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)" - sh ${baseDir}/scripts/bdbagFetch.sh \${study} - echo LOG: bdbag fetched - sh ${baseDir}/scripts/renameFastq.sh \${study} + python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} + echo LOG: fetch file split by replicates + sh ${baseDir}/scripts/splitBag.sh \${study} + echo LOG: bag recreated with replicate split fetch file + """ +} + +/* + * getData: fetch study files from consortium with downloaded bdbag.zip + */ +process getData { + tag "${rep.baseName}" + publishDir "${outDir}/temp/${task.process}", mode: "symlink" + + input: + file deriva + each rep from bdbagSplit + + output: + file("**/*.R*.fastq.gz") into fastq + + script: + """ + hostname + ulimit -a + replicate=\$(echo "${rep}" | cut -d'.' 
-f1) + echo LOG: \${replicate} + cp "${deriva}" ~/.bdbag/deriva-cookies.txt + echo LOG: deriva cookie loaded + unzip ${rep} + echo LOG: replicate bdbag unzipped + sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} + echo LOG: replicate bdbag fetched + sh ${baseDir}/scripts/renameFastq.sh \${replicate} echo LOG: fastq.gz files renamed to replicate RID """ - } + } \ No newline at end of file diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py index 8a330e539054c8592363bd84bb4e6a0871b750f4..bae8c2286ebe3353fa7db487753bd2de8381706b 100644 --- a/workflow/scripts/modifyFetch.py +++ b/workflow/scripts/modifyFetch.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import argparse import pandas as pd @@ -9,9 +11,9 @@ def get_args(): def main(): args = get_args() - fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) - fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"] - fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) + fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] + fetchFile_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) if __name__ == '__main__': main() \ No newline at end of file diff --git a/workflow/scripts/splitBag.sh b/workflow/scripts/splitBag.sh new file mode 100644 index 0000000000000000000000000000000000000000..3f6f6cdb610c684bdb57f666822dc0deb864fb04 --- /dev/null +++ b/workflow/scripts/splitBag.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +for i in $(ls -d Replicate_*) +do +rsync -r $1/ ${i} --exclude=fetch.txt +zip -r ${i}.zip ${i} +done \ No newline at end of file diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f60043be43a70ae570800f6edb117923d91810 --- /dev/null +++ b/workflow/scripts/splitFetch.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import argparse +import pandas 
as pd +import os + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fetchFile',help="The fetch file from bdbag.zip.",required=True) + args = parser.parse_args() + return args + +def main(): + args = get_args() + fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + replicateRID = fileFile.Replicate_RID.unique() + fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID} + for i in replicateRID: + if not os.path.exists("Replicate_"+i): + os.mkdir("Replicate_"+i) + fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv("Replicate_"+i+"/fetch.txt",sep="\t",header=False,index=False) + +if __name__ == '__main__': + main() \ No newline at end of file