diff --git a/.gitignore b/.gitignore index 2bc34493af5ad8d30a9ef477283aa7c4b32700b8..8b4b1eadf6253fc94cefe75b485a051ed8f3d71e 100644 --- a/.gitignore +++ b/.gitignore @@ -297,7 +297,6 @@ $RECYCLE.BIN/ # nextflow analysis folders/files /test_data/* -/workflow/docker/images/* /workflow/.nextflow/* /workflow/work/* /workflow/output/* diff --git a/.gitlab/issue_templates/Bug.md b/.gitlab/issue_templates/Bug.md new file mode 100644 index 0000000000000000000000000000000000000000..9263e44e3e98f4ba84c6605277179cb6f7968a6c --- /dev/null +++ b/.gitlab/issue_templates/Bug.md @@ -0,0 +1,21 @@ +# Summary + + +# Steps to reproduce + + +# Observed bug behavior + + +# Expected behavior + + +# Relevant logs and/or screenshots + + +# Potential fixes + + + +/label ~bug ~"To Do" +/cc @ghenry @venkat.malladi @s181706 @s189701 \ No newline at end of file diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md new file mode 100644 index 0000000000000000000000000000000000000000..cf2214fd8f235e57315ec19996ee2c5512555f51 --- /dev/null +++ b/.gitlab/merge_request_templates/Merge_Request.md @@ -0,0 +1,17 @@ +Please fill in the appropriate checklist below (delete whatever is not relevant). +These are the most common things requested on pull requests (PRs). + +## PR checklist + - [ ] This comment contains a description of changes (with reason) + - [ ] If you've fixed a bug or added code that should be tested, add tests! 
+ - [ ] Documentation in `docs` is updated + - [ ] `CHANGELOG.md` is updated + - [ ] `README.md` is updated + - [ ] `LICENSE.md` is updated with new contributors + + +* [ ] **Close issue**\ +Closes # + +/cc @ghenry @venkat.malladi +/assign @ghenry \ No newline at end of file diff --git a/LICENSE b/LICENSE index e4cf85dd516f931238ad4ff60f25ea8fa31a5256..af5e9c54a9121cc97781032e2ddbc58d59947ec0 100644 --- a/LICENSE +++ b/LICENSE @@ -1 +1,25 @@ -NOT YET LICENSED +MIT License + +Copyright (c) 2019 University of Texas Southwestern Medical Center. + +Contributors: Gervaise H. Henry, Jon Gesell, Jeremy Mathews, and Venkat Malladi + +Department: Bioinformatic Core Facility, Department of Bioinformatics + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md index abbd229ea18b8093a3e35ccaac9f1426ad09930e..b2a3fecf8142b15790274609eb38a35a318af72e 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ +<!-- |*master*|*develop*| |:-:|:-:| |[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)| [![DOI]()]() - +--> GUDMAP/RBK RNA-Seq Pipeline =========================== @@ -13,11 +14,10 @@ Introduction To Run: ------- - * Available parameters: * FULL EXAMPLE: ``` - nextflow run workflow/main.nf + nextflow run workflow/rna-seq.nf ``` * Design example: @@ -25,9 +25,43 @@ To Run: [**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md) +--- + Credits -------- +======= This worklow is was developed by [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/) +PI +-- +Venkat S. Malladi\ +*Faculty Associate & Director*\ +Bioinformatics Core Facility\ +UT Southwestern Medical Center\ +<a href="https://orcid.org/0000-0002-0144-0564" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-0144-0564</a>\ +[venkat.malladi@utsouthwestern.edu](mailto:venkat.malladi@utsouthwestern.edu) + + +Developers +---------- +Gervaise H. 
Henry\ +*Computational Biologist*\ +Department of Urology\ +UT Southwestern Medical Center\ +<a href="https://orcid.org/0000-0001-7772-9578" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-7772-9578</a>\ +[gervaise.henry@utsouthwestern.edu](mailto:gervaise.henry@utsouthwestern.edu) + +Jonathan Gesell\ +*Computational Biologist*\ +Bioinformatics Core Facility\ +UT Southwestern Medical Center\ +<a href="https://orcid.org/0000-0001-5902-3299" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-5902-3299</a>\ +[johnathan.gesell@utsouthwestern.edu](mailto:jonathn.gesell@utsouthwestern.edu) + +Jeremy A. Mathews\ +*Computational Intern*\ +Bioinformatics Core Facility\ +UT Southwestern Medical Center\ +<a href="https://orcid.org/0000-0002-2931-1430" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-2931-1430</a>\ +[jeremy.mathews@utsouthwestern.edu](mailto:jeremy.mathews@utsouthwestern.edu) Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**. 
diff --git a/docs/GUDMAP.RBK Pipeline.docx b/docs/GUDMAP.RBK Pipeline.docx index deae8a8fbfb7adc32ba2fba03a25eca6af57b4d7..b22d55b4d7185a1d9c30f45c889b13757483f306 100755 Binary files a/docs/GUDMAP.RBK Pipeline.docx and b/docs/GUDMAP.RBK Pipeline.docx differ diff --git a/docs/gudmap-rbk_logo.png b/docs/gudmap-rbk_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..840030e2dac96bff2d41805d6697abab29c3e5bb Binary files /dev/null and b/docs/gudmap-rbk_logo.png differ diff --git a/nextflow.config b/nextflow.config deleted file mode 100644 index 28777047bfa85b13d08a0df02a22e6eac6d66540..0000000000000000000000000000000000000000 --- a/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -profiles { - standard { - includeConfig 'workflow/conf/biohpc.config' - } -} diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 0ea74405dd6faeb0342698df1ef8736804dda5c6..2e90c4f157e816cfccb05ec5eb68b4417c651fd0 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -1,10 +1,18 @@ process { executor = 'slurm' - queue='super' + queue = 'super' + clusterOptions = '--hold' // Process specific configuration - withLabel:getData { - executor = 'super' + withName:splitData { + container = 'docker://bicf/gudmaprbkfilexfer:1.0' + } + withName:getData { + container = 'docker://bicf/gudmaprbkfilexfer:1.0' + } + withName:trimData { + container = 'docker://bicf/trimgalore:1.0' + queue = '256GB,256GBv1,384GB' } } @@ -28,4 +36,15 @@ report { tower { accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f' enabled = true -} \ No newline at end of file +} + +singularity { + enabled = true + cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/' +} + +env { + http_proxy = 'http://proxy.swmed.edu:3128' + https_proxy = 'http://proxy.swmed.edu:3128' + all_proxy = 'http://proxy.swmed.edu:3128' +} diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/getData b/workflow/docker/getData deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/temp b/workflow/docker/temp deleted file mode 100644 index f7dcb3af08981d465bf0838d09de1b38e9e0c5aa..0000000000000000000000000000000000000000 --- a/workflow/docker/temp +++ /dev/null @@ -1,14 +0,0 @@ - - -RUN apt-get install -y python3.7 python3-pip - -RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ - rm Miniconda3-latest-Linux-x86_64.sh -ENV PATH=/miniconda/bin:${PATH} -RUN conda config --add channels defaults && \ - conda config --add channels bioconda && \ - conda config --add channels conda-forge && \ - conda update -n base -c defaults -y conda - -RUN pip install --upgrade pip diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index d839044791bc3aaccc701c9fb62099105c97205b..035faa8ee8b3f2b95f298e4edc8f074a0b695587 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,49 +1,102 @@ #!/usr/bin/env nextflow // Define input variables +params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" params.outDir = "${baseDir}/../output" // Parse input variables +deriva = file(params.deriva, checkIfExists: 'true') bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } outDir = params.outDir +logsDir = "${outDir}/Logs" +/* + * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid + */ +process splitData { + 
tag "${bdbag.baseName}" + executor 'local' + publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err" + + input: + file bdbag + path cookies, stageAs: 'cookies.txt' from deriva + + output: + file("Replicate_*.zip") into bdbagSplit mode flatten + file("${bdbag.baseName}/data/File.csv") into fileMeta + file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta + file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta + file ("${bdbag.baseName}.splitData.err") + + script: + """ + hostname >> ${bdbag.baseName}.splitData.err + ulimit -a >> ${bdbag.baseName}.splitData.err + ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err + echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err + study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err + echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err + unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err + echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err + python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err + echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err + python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err + echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err + sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err + echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err + """ +} /* * getData: fetch study files from consortium with downloaded bdbag.zip - * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file) */ - process getData { - publishDir "${outDir}/temp/getData", mode: "symlink" - conda "${baseDir}/conf/conda.env.bdbag.yml" - - input: - 
file bdbag - - output: - file("**/*.R*.fastq.gz") into fastqPaths - file("**/File.csv") into filePaths - file("**/Experiment Settings.csv") into experimentSettingsPaths - file("**/Experiment.csv") into experimentPaths - - script: - """ - hostname - ulimit -a - study=\$(echo "${bdbag}" | cut -d'.' -f1) - echo LOG: \${study} - unzip ${bdbag} - echo LOG: bdgag unzipped - python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} - echo LOG: fetch file filtered for only .fastq.gz - #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)" - sh ${baseDir}/scripts/bdbagFetch.sh \${study} - echo LOG: bdbag fetched - sh ${baseDir}/scripts/renameFastq.sh \${study} - echo LOG: fastq.gz files renamed to replicate RID - """ - } \ No newline at end of file +process getData { + tag "${rep.baseName}" + publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err" + + input: + each rep from bdbagSplit + + output: + set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming + + script: + """ + hostname >>${rep.baseName}.getData.err + ulimit -a >>${rep.baseName}.getData.err + export https_proxy=\${http_proxy} + replicate=\$(basename "${rep}" | cut -d '.' 
-f1) + echo "LOG: \${replicate}" >>${rep.baseName}.getData.err + unzip ${rep} 2>>${rep.baseName}.getData.err + echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err + sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err + echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err + """ +} + +/* + * trimData: trims any adapter or non-host sequences from the data +*/ +process trimData { + tag "trim-${repID}" + publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz" + publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*" + + input: + set repID, reads from trimming + + output: + path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0') + + script: + """ + rep=`echo ${repID} | cut -f2- -d '_'`; + trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err; + """ +} diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh index 28dab3f5338b3b6371b2b8f4ee7ac6bf2e715fa6..9af4eb46c0e716e0e1db7cb66e9f027f63611218 100644 --- a/workflow/scripts/bdbagFetch.sh +++ b/workflow/scripts/bdbagFetch.sh @@ -1,3 +1,6 @@ -#!/bin +#!/bin/bash -bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 \ No newline at end of file +bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 && +for i in $(find */ -name "*.R*.fastq.gz"); do + mv ${i} .; +done; diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py index 8a330e539054c8592363bd84bb4e6a0871b750f4..82b1d4c50a17cf34d9410ef0889dc7ca0d112d84 100644 --- a/workflow/scripts/modifyFetch.py +++ b/workflow/scripts/modifyFetch.py @@ -1,5 +1,8 @@ +#!/usr/bin/env python3 + import argparse import pandas as pd +import re def get_args(): parser = argparse.ArgumentParser() @@ -9,9 +12,14 @@ def get_args(): def main(): args = get_args() - fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) - 
fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"] - fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) + fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"] + fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] + fetchFile_filtered_renamed = fetchFile_filtered + for i in fileFile_filtered["File_Name"]: + fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0]) + fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) if __name__ == '__main__': main() \ No newline at end of file diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh deleted file mode 100644 index f5593766b3a3bd645c3f2c8758d3a20fd354c9be..0000000000000000000000000000000000000000 --- a/workflow/scripts/renameFastq.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin - -while read loc checksum fileLocation -do - file=$(echo ${fileLocation##*/}) - fileName=$(echo ${file%.R*.fastq.gz}) - fileExt=$(echo ${file##${fileName}.}) - while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID - do - if [ ${file} == ${File_Name} ] - then - find . 
-type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';' - fi - done < $1/data/File.csv -done < $1/fetch.txt \ No newline at end of file diff --git a/workflow/scripts/splitBag.sh b/workflow/scripts/splitBag.sh new file mode 100644 index 0000000000000000000000000000000000000000..3f6f6cdb610c684bdb57f666822dc0deb864fb04 --- /dev/null +++ b/workflow/scripts/splitBag.sh @@ -0,0 +1,7 @@ +#!/bin + +for i in $(ls -d Replicate_*) +do +rsync -r $1/ ${i} --exclude=fetch.txt +zip -r ${i}.zip ${i} +done \ No newline at end of file diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f60043be43a70ae570800f6edb117923d91810 --- /dev/null +++ b/workflow/scripts/splitFetch.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import os + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True) + args = parser.parse_args() + return args + +def main(): + args = get_args() + fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + replicateRID = fileFile.Replicate_RID.unique() + fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID} + for i in replicateRID: + if not os.path.exists(i): + os.mkdir("Replicate_"+i) + fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv("Replicate_"+i+"/fetch.txt",sep="\t",header=False,index=False) + +if __name__ == '__main__': + main() \ No newline at end of file