Skip to content
Snippets Groups Projects
Commit b29bf6fd authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Resolve merge conflict with pipeline diagram

parents c8794f5d c2b05eaa
Branches
Tags
3 merge requests!37v0.0.1,!4Develop,!2Resolve "process_getData"
......@@ -297,7 +297,6 @@ $RECYCLE.BIN/
# nextflow analysis folders/files
/test_data/*
/workflow/docker/images/*
/workflow/.nextflow/*
/workflow/work/*
/workflow/output/*
......
// Pipeline profiles: the default ("standard") profile loads the BioHPC
// cluster configuration from workflow/conf/biohpc.config.
profiles {
standard {
includeConfig 'workflow/conf/biohpc.config'
}
}
// Cluster execution defaults for all processes, plus per-process overrides.
// NOTE(review): the original contained a duplicated `queue` assignment and a
// `withLabel:getData { executor = 'super' }` wrapper left over from a merge
// conflict ('super' is the queue name used above, not a valid executor) with
// an unbalanced brace; resolved here to plain withName selectors.
process {
  executor = 'slurm'
  queue = 'super'
  // Process specific configuration
  withName:splitData {
    container = 'docker://bicf/bdbag:1.0'
  }
  withName:getData {
    container = 'docker://bicf/bdbag:1.0'
  }
}
......@@ -28,4 +31,15 @@ report {
// Nextflow Tower monitoring.
// SECURITY(review): this access token is committed to the repository; it
// should be revoked and supplied via the TOWER_ACCESS_TOKEN environment
// variable instead of being hard-coded here.
tower {
  accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
  enabled = true
}

// Run containers through Singularity, caching pulled images in the shared
// project reference directory.
singularity {
  enabled = true
  cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/'
}

// Proxy settings required for outbound HTTP(S) access from cluster nodes.
env {
  http_proxy = 'http://proxy.swmed.edu:3128'
  https_proxy = 'http://proxy.swmed.edu:3128'
  all_proxy = 'http://proxy.swmed.edu:3128'
}
# Install system Python and pip. `apt-get update` must run in the same layer
# (the package lists are not present in a fresh image), and the lists are
# removed afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3.7 python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Install Miniconda into /miniconda and put it on PATH.
RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
    rm Miniconda3-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}

# Configure conda channels (defaults < bioconda < conda-forge priority) and
# update conda itself.
RUN conda config --add channels defaults && \
    conda config --add channels bioconda && \
    conda config --add channels conda-forge && \
    conda update -n base -c defaults -y conda
RUN pip install --upgrade pip
#!/usr/bin/env nextflow

// Define input variables (overridable on the command line with
// --deriva / --bdbag / --outDir).
params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt"
params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
params.outDir = "${baseDir}/../output"

// Parse input variables.
// FIX: checkIfExists takes a boolean; the original passed the string 'true'
// (truthy in Groovy, so it happened to work, but the boolean is the
// documented form).
deriva = file(params.deriva, checkIfExists: true)
bdbag = Channel
  .fromPath(params.bdbag)
  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
outDir = params.outDir
/*
 * splitData: split bdbag files by replicate so fetches can occur in parallel, and rename files to their replicate RID
 */
// splitData: unzip the study bdbag, filter its fetch manifest to fastq.gz
// entries (modifyFetch.py), split the manifest by replicate (splitFetch.py),
// and repackage one zipped bag per replicate (splitBag.sh).
process splitData {
tag "${bdbag.baseName}"
input:
file bdbag
// deriva auth cookies, staged under a fixed name so the script below can
// link them into ~/.bdbag
path cookies, stageAs: 'cookies.txt' from deriva
output:
// one zip per replicate, flattened so each zip is a separate channel item
file("Replicate_*.zip") into bdbagSplit mode flatten
// study-level metadata tables extracted from the unpacked bag
file("${bdbag.baseName}/data/File.csv") into fileMeta
file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
script:
"""
hostname
ulimit -a
ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt
echo "LOG: deriva cookie linked"
study=`echo "${bdbag}" | cut -d '.' -f1`
echo "LOG: \${study}"
unzip ${bdbag}
echo "LOG: bdgag unzipped"
python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
echo "LOG: fetch file filtered for only .fastq.gz"
python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
echo "LOG: fetch file split by replicates"
sh ${baseDir}/scripts/splitBag.sh \${study}
echo "LOG: bag recreated with replicate split fetch file"
"""
}
/*
 * getData: fetch study files from the consortium using the downloaded bdbag.zip
 * python must be loaded before the nextflow run, because creating a conda env from a .yml does not work when nextflow is a loaded module (whether the env is declared in-line in the process or in a config file)
 */
// getData: materialize the study files from the consortium using the bdbag
// zip — unzips the bag, filters the fetch manifest to fastq.gz
// (modifyFetch.py), fetches with bdbagFetch.sh, then renames the fastqs to
// their replicate RID (renameFastq.sh).
// NOTE(review): a second `process getData` appears later in this source —
// this looks like residue of the merge being resolved; confirm only one
// definition survives in the final main.nf.
process getData {
publishDir "${outDir}/temp/getData", mode: "symlink"
// conda env created from a yml file (see the header comment about module
// interaction)
conda "${baseDir}/conf/conda.env.bdbag.yml"
input:
file bdbag
output:
// fetched fastqs plus the metadata csv tables pulled out of the bag
file("**/*.R*.fastq.gz") into fastqPaths
file("**/File.csv") into filePaths
file("**/Experiment Settings.csv") into experimentSettingsPaths
file("**/Experiment.csv") into experimentPaths
script:
"""
hostname
ulimit -a
study=\$(echo "${bdbag}" | cut -d'.' -f1)
echo LOG: \${study}
unzip ${bdbag}
echo LOG: bdgag unzipped
python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
echo LOG: fetch file filtered for only .fastq.gz
#bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)"
sh ${baseDir}/scripts/bdbagFetch.sh \${study}
echo LOG: bdbag fetched
sh ${baseDir}/scripts/renameFastq.sh \${study}
echo LOG: fastq.gz files renamed to replicate RID
"""
}
\ No newline at end of file
// getData: fetch the fastq files for a single replicate bdbag produced by
// splitData; runs once per item of the bdbagSplit channel.
process getData {
tag "${rep.baseName}"
publishDir "${outDir}/tempOut/fastqs", mode: "symlink"
input:
each rep from bdbagSplit
output:
// fetched fastqs from the task directory only (maxDepth 0 — no recursion)
path ("*.R*.fastq.gz", type: 'file', maxDepth: '0') into fastq
script:
"""
hostname
ulimit -a
export https_proxy=\${http_proxy}
replicate=\$(basename "${rep}" | cut -d '.' -f1)
echo "LOG: \${replicate}"
unzip ${rep}
echo "LOG: replicate bdbag unzipped"
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
echo "LOG: replicate bdbag fetched"
"""
}
#!/bin/bash
# bdbagFetch.sh — resolve and download all remote entries of the bdbag given
# as $1, then move the fetched fastqs up into the current directory.
# FIX: the original shebang was the invalid "#!/bin"; this span also merged
# two diff-side versions of the same script into one.
# The filter "filename$*fastq.gz" ($ escaped from the shell) uses bdbag's
# "ends with" filter operator to fetch only fastq.gz files.

bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz "$1" &&
for i in $(find */ -name "*.R*.fastq.gz"); do
    mv "${i}" .
done
#!/usr/bin/env python3
import argparse
import pandas as pd
import re
def get_args():
parser = argparse.ArgumentParser()
......@@ -9,9 +12,14 @@ def get_args():
def main(fetchDir=None):
    """Filter a bdbag fetch.txt to fastq.gz entries and rename them by RID.

    The fetch manifest (column 2 holds the local target path) is reduced to
    rows ending in ".fastq.gz"; for each FastQ row of data/File.csv, the
    study-derived name stem in the matching manifest path is replaced with
    that file's Replicate_RID, and the manifest is rewritten in place.

    fetchDir -- bag directory containing fetch.txt and data/File.csv;
                defaults to the --fetchFile command-line argument.
    """
    if fetchDir is None:
        fetchDir = get_args().fetchFile
    fetchFile = pd.read_csv(fetchDir + "/fetch.txt", sep="\t", header=None)
    fileFile = pd.read_csv(fetchDir + "/data/File.csv", sep=",", header=0)
    fileFile_filtered = fileFile[fileFile["File_Type"] == "FastQ"]
    # .copy() so the .loc writes below modify an independent frame instead of
    # a view of fetchFile (the original used chained indexing on a slice,
    # which triggers pandas SettingWithCopy behavior).
    fetchFile_filtered = fetchFile[fetchFile[2].str[-9:] == ".fastq.gz"].copy()
    for fileName in fileFile_filtered["File_Name"]:
        mask = fetchFile_filtered[2].str.contains(fileName, regex=False)
        if not mask.any():
            # metadata row with no matching fetch entry; the original raised
            # IndexError on .values[0] here
            continue
        rid = fileFile_filtered.loc[
            fileFile_filtered["File_Name"] == fileName, "Replicate_RID"
        ].values[0]
        # Strip the ".R<read>.fastq.gz" suffix to get the name stem, then
        # swap that stem for the replicate RID in the target path.
        stem = re.sub(r"\.R.\.fastq\.gz", "", fileName)
        renamed = fetchFile_filtered.loc[mask, 2].values[0].replace(stem, rid)
        fetchFile_filtered.loc[mask, 2] = renamed
    fetchFile_filtered.to_csv(
        fetchDir + "/fetch.txt", sep="\t", header=False, index=False
    )


if __name__ == '__main__':
    main()
\ No newline at end of file
#!/bin/bash
# renameFastq.sh — rename fetched fastq files to <Replicate_RID>.<R#.fastq.gz>.
# $1 is the bag directory holding fetch.txt and data/File.csv. For every
# entry in fetch.txt, find the matching row of File.csv (matched on file
# name) and rename the file on disk to its replicate RID, keeping the
# trailing "R<read>.fastq.gz" extension.
# FIX: the original shebang was the invalid "#!/bin"; test operands are now
# quoted so empty fields don't break the comparison.

while read loc checksum fileLocation
do
    file=$(echo ${fileLocation##*/})           # basename of the fetched file
    fileName=$(echo ${file%.R*.fastq.gz})      # name stem before .R*.fastq.gz
    fileExt=$(echo ${file##${fileName}.})      # trailing R<read>.fastq.gz part
    while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
    do
        if [ "${file}" == "${File_Name}" ]
        then
            find . -type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';'
        fi
    done < $1/data/File.csv
done < $1/fetch.txt
#!/bin/bash
# splitBag.sh — rebuild one zipped bdbag per replicate.
# $1 is the original (unzipped) bag directory; each Replicate_* directory
# (created by splitFetch.py, already holding its per-replicate fetch.txt)
# receives a copy of the bag contents minus the original fetch.txt and is
# then zipped as Replicate_<RID>.zip.
# FIX: the original shebang was the invalid "#!/bin".

for i in $(ls -d Replicate_*)
do
    rsync -r "$1"/ "${i}" --exclude=fetch.txt
    zip -r "${i}.zip" "${i}"
done
#!/usr/bin/env python3
import argparse
import pandas as pd
import os
def get_args():
    """Parse the command line; returns a namespace with .fetchFile set."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-f', '--fetchFile',
        help="The fetch file from bdgap.zip.",
        required=True,
    )
    return arg_parser.parse_args()
def main(fetchDir=None):
    """Split a bdbag fetch.txt into one fetch.txt per replicate.

    Reads fetch.txt and data/File.csv from the bag directory, groups the
    FastQ entries by Replicate_RID, and writes each group to
    Replicate_<RID>/fetch.txt under the current working directory, creating
    the directory if needed.

    fetchDir -- bag directory; defaults to the --fetchFile CLI argument.
    """
    if fetchDir is None:
        fetchDir = get_args().fetchFile
    fetchFile = pd.read_csv(fetchDir + "/fetch.txt", sep="\t", header=None)
    fileFile = pd.read_csv(fetchDir + "/data/File.csv", sep=",", header=0)
    replicateRID = fileFile.Replicate_RID.unique()
    # per-replicate series of FastQ URIs, used to select fetch.txt rows
    fetchArray = {
        i: fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")]
        for i in replicateRID
    }
    for i in replicateRID:
        # BUG FIX: the original tested os.path.exists(i) (the bare RID) but
        # created "Replicate_"+i, so a pre-existing Replicate_<RID> directory
        # made os.mkdir raise FileExistsError.
        if not os.path.exists("Replicate_" + i):
            os.mkdir("Replicate_" + i)
        fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv(
            "Replicate_" + i + "/fetch.txt", sep="\t", header=False, index=False
        )


if __name__ == '__main__':
    main()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment