From e8e5d14014af7af92aea2c52f1705c9c6ea59264 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Tue, 8 Oct 2019 23:33:35 -0500 Subject: [PATCH] getData setup --- .gitignore | 1 + cleanup.sh | 7 ++++++ nextflow.config | 5 ++++ workflow/conf/biohpc.config | 31 +++++++++++++++++++++++++ workflow/conf/conda.env.bdbag.yml | 5 ++++ workflow/docker/.gitkeep | 0 workflow/docker/getData | 0 workflow/docker/images/.gitkeep | 0 workflow/docker/temp | 14 ++++++++++++ workflow/main.nf | 1 - workflow/nextflow.config | 5 ++++ workflow/rna-seq.nf | 38 +++++++++++++++++++++++++++++++ workflow/scripts/modifyFetch.py | 17 ++++++++++++++ workflow/scripts/modifyFetch.sh | 3 +++ 14 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 cleanup.sh create mode 100644 nextflow.config create mode 100755 workflow/conf/biohpc.config create mode 100644 workflow/conf/conda.env.bdbag.yml create mode 100644 workflow/docker/.gitkeep create mode 100644 workflow/docker/getData create mode 100644 workflow/docker/images/.gitkeep create mode 100644 workflow/docker/temp delete mode 100755 workflow/main.nf create mode 100644 workflow/nextflow.config create mode 100755 workflow/rna-seq.nf create mode 100644 workflow/scripts/modifyFetch.py create mode 100644 workflow/scripts/modifyFetch.sh diff --git a/.gitignore b/.gitignore index 8b4b1ea..2bc3449 100644 --- a/.gitignore +++ b/.gitignore @@ -297,6 +297,7 @@ $RECYCLE.BIN/ # nextflow analysis folders/files /test_data/* +/workflow/docker/images/* /workflow/.nextflow/* /workflow/work/* /workflow/output/* diff --git a/cleanup.sh b/cleanup.sh new file mode 100644 index 0000000..9569ff5 --- /dev/null +++ b/cleanup.sh @@ -0,0 +1,7 @@ +rm -f *.out +rm -f pipeline_trace*.txt* +rm -f report*.html* +rm -f timeline*.html* +rm -f .nextflow*.log* +rm -rf .nextflow/ +rm -rf work/ diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..2877704 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,5 @@ +profiles { + 
standard { + includeConfig 'workflow/conf/biohpc.config' + } +} diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config new file mode 100755 index 0000000..0ea7440 --- /dev/null +++ b/workflow/conf/biohpc.config @@ -0,0 +1,31 @@ +process { + executor = 'slurm' + queue='super' + + // Process specific configuration + withLabel:getData { + queue = 'super' + } +} + + +trace { + enabled = true + file = 'pipeline_trace.txt' + fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' +} + +timeline { + enabled = true + file = 'timeline.html' +} + +report { + enabled = true + file = 'report.html' +} + +tower { + accessToken = System.getenv('TOWER_ACCESS_TOKEN') + enabled = true +} \ No newline at end of file diff --git a/workflow/conf/conda.env.bdbag.yml b/workflow/conf/conda.env.bdbag.yml new file mode 100644 index 0000000..33361d3 --- /dev/null +++ b/workflow/conf/conda.env.bdbag.yml @@ -0,0 +1,5 @@ +name: bdbag +dependencies: + - pandas=0.23.3=py36_0 + - pip: + - bdbag==1.5.5 diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workflow/docker/getData b/workflow/docker/getData new file mode 100644 index 0000000..e69de29 diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workflow/docker/temp b/workflow/docker/temp new file mode 100644 index 0000000..f7dcb3a --- /dev/null +++ b/workflow/docker/temp @@ -0,0 +1,14 @@ + + +RUN apt-get install -y python3.7 python3-pip + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ + rm Miniconda3-latest-Linux-x86_64.sh +ENV PATH=/miniconda/bin:${PATH} +RUN conda config --add channels defaults && \ + conda config --add channels bioconda && \ + conda config --add channels conda-forge && \ + conda update -n base -c 
defaults -y conda + +RUN pip install --upgrade pip diff --git a/workflow/main.nf b/workflow/main.nf deleted file mode 100755 index 5d9292b..0000000 --- a/workflow/main.nf +++ /dev/null @@ -1 +0,0 @@ -#!/usr/bin/env nextflow diff --git a/workflow/nextflow.config b/workflow/nextflow.config new file mode 100644 index 0000000..30e47ea --- /dev/null +++ b/workflow/nextflow.config @@ -0,0 +1,5 @@ +profiles { + standard { + includeConfig 'conf/biohpc.config' + } +} diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf new file mode 100755 index 0000000..b272392 --- /dev/null +++ b/workflow/rna-seq.nf @@ -0,0 +1,38 @@ +#!/usr/bin/env nextflow + +// Define input variables +params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" + +params.outDir = "${baseDir}/../output" + +// Parse input variables +bdbag = Channel + .fromPath(params.bdbag) + .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } + +outDir = params.outDir + + +/* + * getData: fetch study files from consortium with downloaded bdbag.zip + * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file) + */ + process getData { + publishDir "${outDir}/temp/getData", mode: "symlink" + conda "${baseDir}/conf/conda.env.bdbag.yml" + + input: + file bdbag + + output: + file("*") into dataPaths + + script: + """ + hostname + ulimit -a + unzip "${bdbag}" + python3 ${baseDir}/scripts/modifyFetch.py -f "\$(echo "${bdbag}" | cut -d'.' -f1)" + bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' 
-f1)" + """ + } \ No newline at end of file diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py new file mode 100644 index 0000000..8a330e5 --- /dev/null +++ b/workflow/scripts/modifyFetch.py @@ -0,0 +1,17 @@ +import argparse +import pandas as pd + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fetchFile',help="Path to the unzipped bdbag directory containing fetch.txt.",required=True) + args = parser.parse_args() + return args + +def main(): + args = get_args() + fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fetch_filtered = fetch[fetch[2].str.endswith(".fastq.gz", na=False)] + fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/workflow/scripts/modifyFetch.sh b/workflow/scripts/modifyFetch.sh new file mode 100644 index 0000000..f243f5c --- /dev/null +++ b/workflow/scripts/modifyFetch.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +unzip "$1" \ No newline at end of file -- GitLab