diff --git a/.gitignore b/.gitignore
index 8b4b1eadf6253fc94cefe75b485a051ed8f3d71e..2bc34493af5ad8d30a9ef477283aa7c4b32700b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -297,6 +297,8 @@ $RECYCLE.BIN/
 
 # nextflow analysis folders/files
 /test_data/*
+/workflow/docker/images/*
+!/workflow/docker/images/.gitkeep
 /workflow/.nextflow/*
 /workflow/work/*
 /workflow/output/*
diff --git a/cleanup.sh b/cleanup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9569ff54fd71cd94bddde415af03a101820ab514
--- /dev/null
+++ b/cleanup.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Remove the artifacts a nextflow run leaves in the current directory.
+# -f keeps rm quiet when a given artifact does not exist.
+rm -f *.out
+rm -f pipeline_trace*.txt*
+rm -f report*.html*
+rm -f timeline*.html*
+rm -f .nextflow*.log*
+rm -rf .nextflow/
+rm -rf work/
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000000000000000000000000000000000000..28777047bfa85b13d08a0df02a22e6eac6d66540
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,5 @@
+profiles {
+  standard {
+    includeConfig 'workflow/conf/biohpc.config'
+  }
+}
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
new file mode 100755
index 0000000000000000000000000000000000000000..0ea74405dd6faeb0342698df1ef8736804dda5c6
--- /dev/null
+++ b/workflow/conf/biohpc.config
@@ -0,0 +1,34 @@
+process {
+  executor = 'slurm'
+  queue = 'super'
+
+  // Process specific configuration
+  // getData is selected by name because the process declares no label;
+  // 'super' is a SLURM partition (a queue), not a nextflow executor
+  withName:getData {
+    queue = 'super'
+  }
+}
+
+
+trace {
+  enabled = true
+  file = 'pipeline_trace.txt'
+  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
+}
+
+timeline {
+  enabled = true
+  file = 'timeline.html'
+}
+
+report {
+  enabled = true
+  file = 'report.html'
+}
+
+tower {
+  // Keep the token out of version control: export TOWER_ACCESS_TOKEN before running nextflow
+  accessToken = System.getenv('TOWER_ACCESS_TOKEN')
+  enabled = true
+}
\ No newline at end of file
diff --git a/workflow/conf/conda.env.bdbag.yml b/workflow/conf/conda.env.bdbag.yml
new file mode 100644
index 0000000000000000000000000000000000000000..33361d301b3fac561fa39807e3c740583e57d28b
--- /dev/null
+++ b/workflow/conf/conda.env.bdbag.yml
@@ -0,0 +1,5 @@
+name: bdbag
+dependencies:
+  - pandas=0.23.3=py36_0
+  - pip:
+    - bdbag==1.5.5
diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/workflow/docker/getData b/workflow/docker/getData
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/workflow/docker/temp b/workflow/docker/temp
new file mode 100644
index 0000000000000000000000000000000000000000..f7dcb3af08981d465bf0838d09de1b38e9e0c5aa
--- /dev/null
+++ b/workflow/docker/temp
@@ -0,0 +1,14 @@
+
+
+RUN apt-get install -y python3.7 python3-pip
+
+RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
+    rm Miniconda3-latest-Linux-x86_64.sh
+ENV PATH=/miniconda/bin:${PATH}
+RUN conda config --add channels defaults && \
+    conda config --add channels bioconda && \
+    conda config --add channels conda-forge && \
+    conda update -n base -c defaults -y conda
+
+RUN pip install --upgrade pip
diff --git a/workflow/main.nf b/workflow/main.nf
deleted file mode 100755
index 5d9292b275bdd31916663dbf1e683e0ac3f0374b..0000000000000000000000000000000000000000
--- a/workflow/main.nf
+++ /dev/null
@@ -1 +0,0 @@
-#!/usr/bin/env nextflow
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
new file mode 100644
index 0000000000000000000000000000000000000000..30e47ea1aea37ed6550cc2944d69d26e69887489
--- /dev/null
+++ b/workflow/nextflow.config
@@ -0,0 +1,5 @@
+profiles {
+  standard {
+    includeConfig 'conf/biohpc.config'
+  }
+}
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
new file mode 100755
index 0000000000000000000000000000000000000000..d839044791bc3aaccc701c9fb62099105c97205b
--- /dev/null
+++ b/workflow/rna-seq.nf
@@ -0,0 +1,49 @@
+#!/usr/bin/env nextflow
+
+// Define input variables
+params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
+
+params.outDir = "${baseDir}/../output"
+
+// Parse input variables
+bdbag = Channel
+  .fromPath(params.bdbag)
+  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
+
+outDir = params.outDir
+
+
+/*
+ * getData: fetch study files from consortium with downloaded bdbag.zip
+ * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file)
+ */
+process getData {
+  publishDir "${outDir}/temp/getData", mode: "symlink"
+  conda "${baseDir}/conf/conda.env.bdbag.yml"
+
+  input:
+    file bdbag
+
+  output:
+    file("**/*.R*.fastq.gz") into fastqPaths
+    file("**/File.csv") into filePaths
+    file("**/Experiment Settings.csv") into experimentSettingsPaths
+    file("**/Experiment.csv") into experimentPaths
+
+  script:
+    """
+    hostname
+    ulimit -a
+    study=\$(echo "${bdbag}" | cut -d'.' -f1)
+    echo LOG: \${study}
+    unzip ${bdbag}
+    echo LOG: bdgag unzipped
+    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
+    echo LOG: fetch file filtered for only .fastq.gz
+    #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)"
+    sh ${baseDir}/scripts/bdbagFetch.sh \${study}
+    echo LOG: bdbag fetched
+    sh ${baseDir}/scripts/renameFastq.sh \${study}
+    echo LOG: fastq.gz files renamed to replicate RID
+    """
+}
\ No newline at end of file
diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..28dab3f5338b3b6371b2b8f4ee7ac6bf2e715fa6
--- /dev/null
+++ b/workflow/scripts/bdbagFetch.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# Resolve the fetch.txt entries of the bag directory given as $1,
+# downloading only files whose name ends in fastq.gz.
+bdbag --resolve-fetch all --fetch-filter 'filename$*fastq.gz' "$1"
\ No newline at end of file
diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a330e539054c8592363bd84bb4e6a0871b750f4
--- /dev/null
+++ b/workflow/scripts/modifyFetch.py
@@ -0,0 +1,26 @@
+import argparse
+import os
+
+import pandas as pd
+
+
+def get_args():
+    """Parse the command line; --fetchFile names the directory that holds fetch.txt."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--fetchFile', help="The fetch file from bdgap.zip.", required=True)
+    return parser.parse_args()
+
+
+def main():
+    """Rewrite fetch.txt in place, keeping only rows whose third column ends in .fastq.gz."""
+    args = get_args()
+    fetch_path = os.path.join(args.fetchFile, "fetch.txt")
+    # dtype=str writes every kept field back exactly as read (no numeric re-formatting)
+    fetch = pd.read_csv(fetch_path, sep="\t", header=None, dtype=str)
+    # na=False treats a missing third column as a non-match
+    fetch_filtered = fetch[fetch[2].str.endswith(".fastq.gz", na=False)]
+    fetch_filtered.to_csv(fetch_path, sep="\t", header=False, index=False)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f5593766b3a3bd645c3f2c8758d3a20fd354c9be
--- /dev/null
+++ b/workflow/scripts/renameFastq.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Rename each fetched fastq.gz to <Replicate_RID>.<read suffix>, e.g. <Replicate_RID>.R1.fastq.gz.
+# $1: bag directory containing fetch.txt and data/File.csv
+while read -r loc checksum fileLocation
+do
+    file=${fileLocation##*/}
+    fileName=${file%.R*.fastq.gz}
+    fileExt=${file##"${fileName}".}
+    while IFS="," read -r RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
+    do
+        if [ "${file}" = "${File_Name}" ]
+        then
+            find . -type f -name "${file}" -execdir mv {} "${Replicate_RID}.${fileExt}" ';'
+        fi
+    done < "$1/data/File.csv"
+done < "$1/fetch.txt"
\ No newline at end of file