From e8e5d14014af7af92aea2c52f1705c9c6ea59264 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Tue, 8 Oct 2019 23:33:35 -0500
Subject: [PATCH] getData setup

---
 .gitignore                        |  1 +
 cleanup.sh                        |  7 ++++++
 nextflow.config                   |  5 ++++
 workflow/conf/biohpc.config       | 31 +++++++++++++++++++++++++
 workflow/conf/conda.env.bdbag.yml |  5 ++++
 workflow/docker/.gitkeep          |  0
 workflow/docker/getData           |  0
 workflow/docker/images/.gitkeep   |  0
 workflow/docker/temp              | 14 ++++++++++++
 workflow/main.nf                  |  1 -
 workflow/nextflow.config          |  5 ++++
 workflow/rna-seq.nf               | 38 +++++++++++++++++++++++++++++++
 workflow/scripts/modifyFetch.py   | 17 ++++++++++++++
 workflow/scripts/modifyFetch.sh   |  3 +++
 14 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 cleanup.sh
 create mode 100644 nextflow.config
 create mode 100755 workflow/conf/biohpc.config
 create mode 100644 workflow/conf/conda.env.bdbag.yml
 create mode 100644 workflow/docker/.gitkeep
 create mode 100644 workflow/docker/getData
 create mode 100644 workflow/docker/images/.gitkeep
 create mode 100644 workflow/docker/temp
 delete mode 100755 workflow/main.nf
 create mode 100644 workflow/nextflow.config
 create mode 100755 workflow/rna-seq.nf
 create mode 100644 workflow/scripts/modifyFetch.py
 create mode 100644 workflow/scripts/modifyFetch.sh

diff --git a/.gitignore b/.gitignore
index 8b4b1ea..2bc3449 100644
--- a/.gitignore
+++ b/.gitignore
@@ -297,6 +297,7 @@ $RECYCLE.BIN/
 
 # nextflow analysis folders/files
 /test_data/*
+/workflow/docker/images/*
 /workflow/.nextflow/*
 /workflow/work/*
 /workflow/output/*
diff --git a/cleanup.sh b/cleanup.sh
new file mode 100644
index 0000000..9569ff5
--- /dev/null
+++ b/cleanup.sh
@@ -0,0 +1,7 @@
+rm -f *.out  # -f: stay quiet and exit 0 when no matching file exists
+rm -f pipeline_trace*.txt*
+rm -f report*.html*
+rm -f timeline*.html*
+rm -f .nextflow*.log*
+rm -rf .nextflow/
+rm -rf work/
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..2877704
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,5 @@
+profiles {
+  standard {  // profile Nextflow applies when no -profile option is given
+    includeConfig 'workflow/conf/biohpc.config'  // pulls in the cluster, trace, report and tower settings
+  }
+}
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
new file mode 100755
index 0000000..0ea7440
--- /dev/null
+++ b/workflow/conf/biohpc.config
@@ -0,0 +1,31 @@
+process {
+  executor = 'slurm'  // default executor for every process
+  queue = 'super'  // default queue for every process
+
+  // Process specific configuration
+  withName:getData {  // select the getData process by name; it declares no label
+    queue = 'super'  // 'super' is a queue name, not a valid executor
+  }
+}
+
+
+trace {  // per-task execution trace
+  enabled = true
+  file = 'pipeline_trace.txt'  // removed again by cleanup.sh (pipeline_trace*.txt*)
+  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'  // columns written to the trace file
+}
+
+timeline {  // HTML timeline of the run
+  enabled = true
+  file = 'timeline.html'  // removed again by cleanup.sh (timeline*.html*)
+}
+
+report {  // HTML execution report of the run
+  enabled = true
+  file = 'report.html'  // removed again by cleanup.sh (report*.html*)
+}
+
+tower {
+  accessToken = System.getenv('TOWER_ACCESS_TOKEN')  // never commit the token; export TOWER_ACCESS_TOKEN before running
+  enabled = true
+}
\ No newline at end of file
diff --git a/workflow/conf/conda.env.bdbag.yml b/workflow/conf/conda.env.bdbag.yml
new file mode 100644
index 0000000..33361d3
--- /dev/null
+++ b/workflow/conf/conda.env.bdbag.yml
@@ -0,0 +1,5 @@
+name: bdbag  # environment used by the getData process in rna-seq.nf
+dependencies:
+  - pandas=0.23.3=py36_0  # exact version and build string pinned; imported by scripts/modifyFetch.py
+  - pip:  # entries below are installed with pip inside the environment
+    - bdbag==1.5.5
diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/workflow/docker/getData b/workflow/docker/getData
new file mode 100644
index 0000000..e69de29
diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/workflow/docker/temp b/workflow/docker/temp
new file mode 100644
index 0000000..f7dcb3a
--- /dev/null
+++ b/workflow/docker/temp
@@ -0,0 +1,14 @@
+# Dockerfile fragment: Python 3.7, pip and Miniconda3 with the bioconda and conda-forge channels.
+# apt-get update shares a layer with install; the Miniconda installer is fetched over HTTPS.
+RUN apt-get update && apt-get install -y python3.7 python3-pip
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+  bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
+  rm Miniconda3-latest-Linux-x86_64.sh
+ENV PATH=/miniconda/bin:${PATH}
+RUN conda config --add channels defaults && \
+  conda config --add channels bioconda && \
+  conda config --add channels conda-forge && \
+  conda update -n base -c defaults -y conda
+
+RUN pip install --upgrade pip
diff --git a/workflow/main.nf b/workflow/main.nf
deleted file mode 100755
index 5d9292b..0000000
--- a/workflow/main.nf
+++ /dev/null
@@ -1 +0,0 @@
-#!/usr/bin/env nextflow
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
new file mode 100644
index 0000000..30e47ea
--- /dev/null
+++ b/workflow/nextflow.config
@@ -0,0 +1,5 @@
+profiles {
+  standard {  // profile Nextflow applies when no -profile option is given
+    includeConfig 'conf/biohpc.config'  // pulls in the cluster, trace, report and tower settings
+  }
+}
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
new file mode 100755
index 0000000..b272392
--- /dev/null
+++ b/workflow/rna-seq.nf
@@ -0,0 +1,38 @@
+#!/usr/bin/env nextflow
+
+// Define input variables
+params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"  // default input bag; override with --bdbag
+
+params.outDir = "${baseDir}/../output"  // default output root; override with --outDir
+
+// Parse input variables
+bdbag = Channel
+  .fromPath(params.bdbag)
+  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }  // abort the run when the channel is empty
+
+outDir = params.outDir
+
+
+/*
+ * getData: unzip the downloaded bdbag.zip, filter its fetch.txt with modifyFetch.py, then materialize the bag; the bag directory is taken to be the zip name minus its extension (baseName)
+ * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file)
+ */
+process getData {
+  publishDir "${outDir}/temp/getData", mode: "symlink"
+  conda "${baseDir}/conf/conda.env.bdbag.yml"
+
+  input:
+    file bdbag
+
+  output:
+    file("*") into dataPaths
+
+  script:
+    """
+    hostname
+    ulimit -a
+    unzip "${bdbag}"
+    python3 "${baseDir}/scripts/modifyFetch.py" -f "${bdbag.baseName}"
+    bdbag --materialize "${bdbag.baseName}"
+    """
+}
\ No newline at end of file
diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py
new file mode 100644
index 0000000..8a330e5
--- /dev/null
+++ b/workflow/scripts/modifyFetch.py
@@ -0,0 +1,17 @@
+import argparse
+import pandas as pd
+
+def get_args():  # Parse the command line; the returned namespace carries the required fetchFile value.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--fetchFile', help="Directory of the unzipped bdbag.zip that contains fetch.txt.", required=True)
+    args = parser.parse_args()
+    return args
+
+def main():  # Rewrite <fetchFile>/fetch.txt in place, keeping only rows whose column 2 ends in ".fastq.gz".
+    fetch_path = get_args().fetchFile + "/fetch.txt"  # build the path once; it is both read and overwritten
+    fetch = pd.read_csv(fetch_path, sep="\t", header=None)
+    fetch_filtered = fetch[fetch[2].str.endswith(".fastq.gz", na=False)]  # na=False drops rows with a missing value, as the old slice comparison did
+    fetch_filtered.to_csv(fetch_path, sep="\t", header=False, index=False)
+
+if __name__ == '__main__':  # run the filter only when executed as a script, not on import
+    main()
\ No newline at end of file
diff --git a/workflow/scripts/modifyFetch.sh b/workflow/scripts/modifyFetch.sh
new file mode 100644
index 0000000..f243f5c
--- /dev/null
+++ b/workflow/scripts/modifyFetch.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+unzip "$1"  # quoted so an archive path containing spaces stays a single argument
\ No newline at end of file
-- 
GitLab