Add new process to split bdbag by replicates for parallel getData

347d8e1a · Gervaise Henry · 2b4b2e9b · 347d8e1a · 347d8e1a · 2b4b2e9b
Commit 347d8e1a authored 5 years ago by Gervaise Henry
--- a/.gitignore
+++ b/.gitignore
@@ -297,7 +297,6 @@ $RECYCLE.BIN/

 # nextflow analysis folders/files
 /test_data/*
-/workflow/docker/images/*
 /workflow/.nextflow/*
 /workflow/work/*
 /workflow/output/*

--- a/docs/GUDMAP.RBK Pipeline.docx
+++ b/docs/GUDMAP.RBK Pipeline.docx
--- a/nextflow.config
+++ b/nextflow.config
-profiles {
-  standard {
-    includeConfig 'workflow/conf/biohpc.config'
-  }
-}
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -3,6 +3,9 @@ process {
  queue = 'super'

  // Process specific configuration
+  withName:splitData {
+    container = 'docker://bicf/bdbag:1.0'
+  }
  withName:getData {
    container = 'docker://bicf/bdbag:1.0'
  }

--- a/workflow/docker/.gitkeep
+++ b/workflow/docker/.gitkeep
--- a/workflow/docker/getData
+++ b/workflow/docker/getData
--- a/workflow/docker/images/.gitkeep
+++ b/workflow/docker/images/.gitkeep
--- a/workflow/docker/temp
+++ b/workflow/docker/temp
-
-
-RUN apt-get install -y python3.7 python3-pip
-
-RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-  bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
-  rm Miniconda3-latest-Linux-x86_64.sh
-ENV PATH=/miniconda/bin:${PATH}                                                                                      
-RUN conda config --add channels defaults && \
-  conda config --add channels bioconda && \
-  conda config --add channels conda-forge && \
-  conda update -n base -c defaults -y conda
-
-RUN pip install --upgrade pip
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
 #!/usr/bin/env nextflow

 // Define input variables
+params.deriva = "${baseDir}/../test_data/deriva-cookies.txt"
 params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"

 params.outDir = "${baseDir}/../output"

 // Parse input variables
+deriva = Channel
+  .fromPath(params.deriva)
+  .ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" }
 bdbag = Channel
  .fromPath(params.bdbag)
  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }

 outDir = params.outDir

-
 /*
- * getData: fetch study files from consortium with downloaded bdbag.zip
- * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file)
+ * splitData: split bdbag files by replicate so fetch can occur in parallel
 */
- process getData {
-     publishDir "${outDir}/temp/getData", mode: "symlink"
-//     conda "${baseDir}/conf/conda.env.bdbag.yml"
+process splitData {
+    tag "${bdbag.baseName}"
+    publishDir "${outDir}/temp/${task.process}", mode: "symlink"

-     input:
+    input:
        file bdbag

    output:
-        file("**/*.R*.fastq.gz") into fastqPaths
-        file("**/File.csv") into filePaths
-        file("**/Experiment Settings.csv") into experimentSettingsPaths
-        file("**/Experiment.csv") into experimentPaths
+        file("Replicate_*.zip") into bdbagSplit mode flatten
+        file("${bdbag.baseName}/data/File.csv") into fileMeta
+        file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
+        file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta

    script:
        """
@@ -40,10 +42,40 @@ outDir = params.outDir
        echo LOG: bdgag unzipped
        python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
        echo LOG: fetch file filtered for only .fastq.gz
-        #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)"
-        sh ${baseDir}/scripts/bdbagFetch.sh \${study}
-        echo LOG: bdbag fetched
-        sh ${baseDir}/scripts/renameFastq.sh \${study}
+        python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
+        echo LOG: fetch file split by replicates
+        sh ${baseDir}/scripts/splitBag.sh \${study}
+        echo LOG: bag recreated with replicate split fetch file
+        """
+}
+
+/*
+ * getData: fetch the fastq files of one replicate bdbag
+ *   input:  deriva - cookie file copied to ~/.bdbag/deriva-cookies.txt
+ *           rep    - one replicate bdbag zip per task, from bdbagSplit
+ *   output: the fetched and renamed fastq.gz files, into channel fastq
+ */
+process getData {
+    tag "${rep.baseName}"
+    publishDir "${outDir}/temp/${task.process}", mode: "symlink"
+
+    input:
+        file deriva
+        each rep from bdbagSplit
+
+    output:
+        file("**/*.R*.fastq.gz") into fastq
+
+    script:
+        """
+        hostname
+        ulimit -a
+        replicate=\$(echo "${rep}" | cut -d'.' -f1)
+        echo LOG: \${replicate}
+        # cp does not create the target directory; make sure it exists first
+        mkdir -p ~/.bdbag
+        cp "${deriva}" ~/.bdbag/deriva-cookies.txt
+        echo LOG: deriva cookie loaded
+        unzip "${rep}"
+        echo LOG: replicate bdbag unzipped
+        sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
+        echo LOG: replicate bdbag fetched
+        sh ${baseDir}/scripts/renameFastq.sh \${replicate}
        echo LOG: fastq.gz files renamed to replicate RID
        """
- }
+ }
\ No newline at end of file
--- a/workflow/scripts/modifyFetch.py
+++ b/workflow/scripts/modifyFetch.py
+#!/usr/bin/env python3
+
 import argparse
 import pandas as pd

@@ -9,9 +11,9 @@ def get_args():

 def main():
    args = get_args()
-    fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
-    fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"]
-    fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
+    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
+    fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"]
+    fetchFile_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)

 if __name__ == '__main__':
    main()
\ No newline at end of file
--- a/workflow/scripts/splitBag.sh
+++ b/workflow/scripts/splitBag.sh
+#!/bin/sh
+#
+# Rebuild one bag per replicate.
+#
+# For every Replicate_* directory in the current working directory, copy the
+# contents of the unzipped study bag ($1) into it, leaving out any file named
+# fetch.txt so the fetch.txt already in the replicate directory is not
+# overwritten, then zip the directory as <directory>.zip.
+#
+# Usage: splitBag.sh <unzipped study bag directory>
+
+for i in Replicate_*/
+do
+  # An unmatched glob is left as the literal pattern; skip it.
+  [ -d "${i}" ] || continue
+  # Drop the trailing slash so the zip is named Replicate_<RID>.zip
+  i=${i%/}
+  rsync -r --exclude=fetch.txt "$1/" "${i}"
+  zip -r "${i}.zip" "${i}"
+done
\ No newline at end of file
--- a/workflow/scripts/splitFetch.py
+++ b/workflow/scripts/splitFetch.py
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import os
+
+def get_args():
+    """Parse the command line.
+
+    Returns:
+        argparse.Namespace whose ``fetchFile`` attribute holds the value of
+        the required ``-f``/``--fetchFile`` option.
+    """
+    parser = argparse.ArgumentParser()
+    # The value is used as a directory: main() appends "/fetch.txt" and
+    # "/data/File.csv" to it.
+    parser.add_argument('-f', '--fetchFile',help="Path to the unzipped bdbag directory containing fetch.txt and data/File.csv.",required=True)
+    return parser.parse_args()
+
+def main():
+    """Split the study fetch.txt into one fetch.txt per replicate.
+
+    Reads ``<fetchFile>/fetch.txt`` (tab-separated, no header) and
+    ``<fetchFile>/data/File.csv``. For every distinct ``Replicate_RID`` in
+    File.csv, the fetch rows whose first column contains the URI of one of
+    that replicate's ``FastQ`` files are written to
+    ``Replicate_<RID>/fetch.txt`` in the current working directory.
+    """
+    args = get_args()
+    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
+    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
+    # Rows without a replicate parse as NaN, which cannot name a directory.
+    replicateRID = fileFile.Replicate_RID.dropna().unique()
+    isFastq = fileFile.File_Type == "FastQ"
+    for i in replicateRID:
+        repDir = "Replicate_"+str(i)
+        # Check/create the directory that is actually written to, so a re-run
+        # in the same working directory does not fail.
+        os.makedirs(repDir, exist_ok=True)
+        uris = tuple(fileFile.URI[(fileFile.Replicate_RID == i) & isFastq].dropna())
+        # Plain substring match: URIs are not regular expressions, and a
+        # replicate without FastQ files must select no rows rather than all.
+        keep = fetchFile[0].map(lambda url: any(uri in url for uri in uris)).astype(bool)
+        fetchFile[keep].to_csv(repDir+"/fetch.txt",sep="\t",header=False,index=False)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file