diff --git a/.gitignore b/.gitignore index 2bc34493af5ad8d30a9ef477283aa7c4b32700b8..8b4b1eadf6253fc94cefe75b485a051ed8f3d71e 100644 --- a/.gitignore +++ b/.gitignore @@ -297,7 +297,6 @@ $RECYCLE.BIN/ # nextflow analysis folders/files /test_data/* -/workflow/docker/images/* /workflow/.nextflow/* /workflow/work/* /workflow/output/* diff --git a/.gitlab/issue_templates/Bug.md b/.gitlab/issue_templates/Bug.md new file mode 100644 index 0000000000000000000000000000000000000000..9263e44e3e98f4ba84c6605277179cb6f7968a6c --- /dev/null +++ b/.gitlab/issue_templates/Bug.md @@ -0,0 +1,21 @@ +# Summary + + +# Steps to reproduce + + +# Observed bug behavior + + +# Expected behavior + + +# Relevant logs and/or screenshots + + +# Potential fixes + + + +/label ~bug ~"To Do" +/cc @ghenry @venkat.malladi @s181706 @s189701 \ No newline at end of file diff --git a/README.md b/README.md index 9d078c41b77e4f38832167b1ee910287db1825da..b2a3fecf8142b15790274609eb38a35a318af72e 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ This worklow is was developed by [Bioinformatic Core Facility (BICF), Department PI -- Venkat S. Malladi\ -*Director*\ +*Faculty Associate & Director*\ Bioinformatics Core Facility\ UT Southwestern Medical Center\ <a href="https://orcid.org/0000-0002-0144-0564" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-0144-0564</a>\ @@ -57,5 +57,11 @@ UT Southwestern Medical Center\ <a href="https://orcid.org/0000-0001-5902-3299" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-5902-3299</a>\ [johnathan.gesell@utsouthwestern.edu](mailto:jonathn.gesell@utsouthwestern.edu) +Jeremy A.
Mathews\ +*Computational Intern*\ +Bioinformatics Core Facility\ +UT Southwestern Medical Center\ +<a href="https://orcid.org/0000-0002-2931-1430" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-2931-1430</a>\ +[jeremy.mathews@utsouthwestern.edu](mailto:jeremy.mathews@utsouthwestern.edu) Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**. diff --git a/nextflow.config b/nextflow.config deleted file mode 100644 index 28777047bfa85b13d08a0df02a22e6eac6d66540..0000000000000000000000000000000000000000 --- a/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -profiles { - standard { - includeConfig 'workflow/conf/biohpc.config' - } -} diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 0ea74405dd6faeb0342698df1ef8736804dda5c6..d221fee521fa74fbf98b2e4760650e0d94e6026e 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -1,10 +1,13 @@ process { executor = 'slurm' - queue='super' + queue = 'super' // Process specific configuration - withLabel:getData { - executor = 'super' + withName:splitData { + container = 'docker://bicf/bdbag:1.0' + } + withName:getData { + container = 'docker://bicf/bdbag:1.0' } } @@ -28,4 +31,15 @@ report { tower { accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f' enabled = true -} \ No newline at end of file +} + +singularity { + enabled = true + cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/' +} + +env { + http_proxy = 'http://proxy.swmed.edu:3128' + https_proxy = 'http://proxy.swmed.edu:3128' + all_proxy = 'http://proxy.swmed.edu:3128' +} diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/getData b/workflow/docker/getData deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/temp b/workflow/docker/temp deleted file mode 100644 index f7dcb3af08981d465bf0838d09de1b38e9e0c5aa..0000000000000000000000000000000000000000 --- a/workflow/docker/temp +++ /dev/null @@ -1,14 +0,0 @@ - - -RUN apt-get install -y python3.7 python3-pip - -RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ - rm Miniconda3-latest-Linux-x86_64.sh -ENV PATH=/miniconda/bin:${PATH} -RUN conda config --add channels defaults && \ - conda config --add channels bioconda && \ - conda config --add channels conda-forge && \ - conda update -n base -c defaults -y conda - -RUN pip install --upgrade pip diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index d839044791bc3aaccc701c9fb62099105c97205b..4f1fd5f249b3bf7e25388a7389675fd100c9b18c 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,49 +1,77 @@ #!/usr/bin/env nextflow // Define input variables +params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" params.outDir = "${baseDir}/../output" // Parse input variables +deriva = file(params.deriva, checkIfExists: true) bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } outDir = params.outDir +/* + * splitData: split bdbag files by replicate so fetch can occur in parallel, and rename files to replicate rid + */ +process splitData { + tag "${bdbag.baseName}" +
input: + file bdbag + path cookies, stageAs: 'cookies.txt' from deriva + + output: + file("Replicate_*.zip") into bdbagSplit mode flatten + file("${bdbag.baseName}/data/File.csv") into fileMeta + file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta + file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta + + script: + """ + hostname + ulimit -a + mkdir -p ~/.bdbag && ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt + echo "LOG: deriva cookie linked" + study=`echo "${bdbag}" | cut -d '.' -f1` + echo "LOG: \${study}" + unzip ${bdbag} + echo "LOG: bdbag unzipped" + python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} + echo "LOG: fetch file filtered for only .fastq.gz" + python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} + echo "LOG: fetch file split by replicates" + sh ${baseDir}/scripts/splitBag.sh \${study} + echo "LOG: bag recreated with replicate split fetch file" + """ +} /* * getData: fetch study files from consortium with downloaded bdbag.zip - * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file) */ - process getData { - publishDir "${outDir}/temp/getData", mode: "symlink" - conda "${baseDir}/conf/conda.env.bdbag.yml" - - input: - file bdbag - - output: - file("**/*.R*.fastq.gz") into fastqPaths - file("**/File.csv") into filePaths - file("**/Experiment Settings.csv") into experimentSettingsPaths - file("**/Experiment.csv") into experimentPaths - - script: - """ - hostname - ulimit -a - study=\$(echo "${bdbag}" | cut -d'.' -f1) - echo LOG: \${study} - unzip ${bdbag} - echo LOG: bdgag unzipped - python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} - echo LOG: fetch file filtered for only .fastq.gz - #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.'
-f1)" - sh ${baseDir}/scripts/bdbagFetch.sh \${study} - echo LOG: bdbag fetched - sh ${baseDir}/scripts/renameFastq.sh \${study} - echo LOG: fastq.gz files renamed to replicate RID - """ - } \ No newline at end of file +process getData { + tag "${rep.baseName}" + publishDir "${outDir}/tempOut/fastqs", mode: "symlink" + + input: + each rep from bdbagSplit + + output: + path ("*.R*.fastq.gz", type: 'file', maxDepth: '0') into fastq + + script: + """ + hostname + ulimit -a + export https_proxy=\${http_proxy} + replicate=\$(basename "${rep}" | cut -d '.' -f1) + echo "LOG: \${replicate}" + unzip ${rep} + echo "LOG: replicate bdbag unzipped" + sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} + echo "LOG: replicate bdbag fetched" + """ + } diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh index 28dab3f5338b3b6371b2b8f4ee7ac6bf2e715fa6..9af4eb46c0e716e0e1db7cb66e9f027f63611218 100644 --- a/workflow/scripts/bdbagFetch.sh +++ b/workflow/scripts/bdbagFetch.sh @@ -1,3 +1,6 @@ -#!/bin +#!/bin/bash -bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 \ No newline at end of file +bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz "$1" && +for i in $(find */ -name "*.R*.fastq.gz"); do + mv "${i}" .; +done; diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py index 8a330e539054c8592363bd84bb4e6a0871b750f4..82b1d4c50a17cf34d9410ef0889dc7ca0d112d84 100644 --- a/workflow/scripts/modifyFetch.py +++ b/workflow/scripts/modifyFetch.py @@ -1,5 +1,8 @@ +#!/usr/bin/env python3 + import argparse import pandas as pd +import re def get_args(): parser = argparse.ArgumentParser() @@ -9,9 +12,14 @@ def get_args(): def main(): args = get_args() - fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) - fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"] - fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) + fetchFile =
pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"] + fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] + fetchFile_filtered_renamed = fetchFile_filtered.copy() # independent copy: rows are rewritten in place below + for i in fileFile_filtered["File_Name"]: + fetchFile_filtered_renamed.loc[fetchFile_filtered_renamed[2].str.contains(i,regex=False),2] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub(r"\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0]) # single .loc assignment instead of chained df[2][mask] = ...; swaps the file's base name for its Replicate_RID + fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) if __name__ == '__main__': main() \ No newline at end of file diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh deleted file mode 100644 index f5593766b3a3bd645c3f2c8758d3a20fd354c9be..0000000000000000000000000000000000000000 --- a/workflow/scripts/renameFastq.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin - -while read loc checksum fileLocation -do - file=$(echo ${fileLocation##*/}) - fileName=$(echo ${file%.R*.fastq.gz}) - fileExt=$(echo ${file##${fileName}.}) - while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID - do - if [ ${file} == ${File_Name} ] - then - find .
-type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';' - fi - done < $1/data/File.csv -done < $1/fetch.txt \ No newline at end of file diff --git a/workflow/scripts/splitBag.sh b/workflow/scripts/splitBag.sh new file mode 100644 index 0000000000000000000000000000000000000000..3f6f6cdb610c684bdb57f666822dc0deb864fb04 --- /dev/null +++ b/workflow/scripts/splitBag.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +for i in $(ls -d Replicate_*) +do +rsync -r "$1"/ "${i}" --exclude=fetch.txt +zip -r "${i}.zip" "${i}" +done \ No newline at end of file diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f60043be43a70ae570800f6edb117923d91810 --- /dev/null +++ b/workflow/scripts/splitFetch.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import os + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fetchFile',help="The fetch file from bdbag.zip.",required=True) + args = parser.parse_args() + return args + +def main(): + args = get_args() + fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + replicateRID = fileFile.Replicate_RID.unique() + fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID} + for i in replicateRID: + if not os.path.exists("Replicate_"+i): # test the directory that is actually created, not the bare RID + os.mkdir("Replicate_"+i) + fetchFile[fetchFile[0].apply(lambda url: any(uri in url for uri in fetchArray[i]))].to_csv("Replicate_"+i+"/fetch.txt",sep="\t",header=False,index=False) # literal substring match: no regex metacharacters, and no FastQ URIs selects no rows + +if __name__ == '__main__': + main() \ No newline at end of file