Skip to content
Snippets Groups Projects
Commit 347d8e1a authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Add new process to split bdbag by replicates for parallel getData

parent 2b4b2e9b
Branches
Tags
3 merge requests!37v0.0.1,!4Develop,!2Resolve "process_getData"
...@@ -297,7 +297,6 @@ $RECYCLE.BIN/ ...@@ -297,7 +297,6 @@ $RECYCLE.BIN/
# nextflow analysis folders/files # nextflow analysis folders/files
/test_data/* /test_data/*
/workflow/docker/images/*
/workflow/.nextflow/* /workflow/.nextflow/*
/workflow/work/* /workflow/work/*
/workflow/output/* /workflow/output/*
......
No preview for this file type
// Configuration profiles for the pipeline.
profiles {
// The 'standard' profile pulls its settings from workflow/conf/biohpc.config.
standard {
includeConfig 'workflow/conf/biohpc.config'
}
}
...@@ -3,6 +3,9 @@ process { ...@@ -3,6 +3,9 @@ process {
queue = 'super' queue = 'super'
// Process specific configuration // Process specific configuration
withName:splitData {
container = 'docker://bicf/bdbag:1.0'
}
withName:getData { withName:getData {
container = 'docker://bicf/bdbag:1.0' container = 'docker://bicf/bdbag:1.0'
} }
......
# Install Python 3.7 and pip from the distribution packages.
# NOTE(review): no `apt-get update` is visible in this excerpt -- confirm an
# earlier layer refreshes the package index before this install runs.
RUN apt-get install -y python3.7 python3-pip

# Fetch the Miniconda installer over HTTPS (the script is executed as-is, so it
# must not travel over plain HTTP), install it non-interactively (-b) into
# /miniconda (-p), then delete the installer to keep the layer small.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
    rm Miniconda3-latest-Linux-x86_64.sh

# Put the Miniconda binaries ahead of everything else on PATH.
ENV PATH=/miniconda/bin:${PATH}

# `conda config --add` prepends, so the channel added last (conda-forge) ends
# up with the highest priority, followed by bioconda, then defaults.
RUN conda config --add channels defaults && \
    conda config --add channels bioconda && \
    conda config --add channels conda-forge && \
    conda update -n base -c defaults -y conda

# Upgrade pip itself (whichever `pip` is first on PATH at this point).
RUN pip install --upgrade pip
#!/usr/bin/env nextflow #!/usr/bin/env nextflow
// Define input variables // Define input variables
params.deriva = "${baseDir}/../test_data/deriva-cookies.txt"
params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
params.outDir = "${baseDir}/../output" params.outDir = "${baseDir}/../output"
// Parse input variables // Parse input variables
deriva = Channel
.fromPath(params.deriva)
.ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" }
bdbag = Channel bdbag = Channel
.fromPath(params.bdbag) .fromPath(params.bdbag)
.ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
outDir = params.outDir outDir = params.outDir
/* /*
* getData: fetch study files from consortium with downloaded bdbag.zip * splitData: split bdbag files by replicate so fetch can occur in parallel
* python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file)
*/ */
process getData { process splitData {
publishDir "${outDir}/temp/getData", mode: "symlink" tag "${bdbag.baseName}"
// conda "${baseDir}/conf/conda.env.bdbag.yml" publishDir "${outDir}/temp/${task.process}", mode: "symlink"
input: input:
file bdbag file bdbag
output: output:
file("**/*.R*.fastq.gz") into fastqPaths file("Replicate_*.zip") into bdbagSplit mode flatten
file("**/File.csv") into filePaths file("${bdbag.baseName}/data/File.csv") into fileMeta
file("**/Experiment Settings.csv") into experimentSettingsPaths file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
file("**/Experiment.csv") into experimentPaths file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
script: script:
""" """
...@@ -40,10 +42,40 @@ outDir = params.outDir ...@@ -40,10 +42,40 @@ outDir = params.outDir
echo LOG: bdgag unzipped echo LOG: bdgag unzipped
python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
echo LOG: fetch file filtered for only .fastq.gz echo LOG: fetch file filtered for only .fastq.gz
#bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)" python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
sh ${baseDir}/scripts/bdbagFetch.sh \${study} echo LOG: fetch file split by replicates
echo LOG: bdbag fetched sh ${baseDir}/scripts/splitBag.sh \${study}
sh ${baseDir}/scripts/renameFastq.sh \${study} echo LOG: bag recreated with replicate split fetch file
"""
}
/*
* getData: fetch study files from consortium with downloaded bdbag.zip
*/
// Runs once per replicate bag emitted on the bdbagSplit channel: installs the
// deriva cookie file, unzips the replicate bag, fetches its files with
// bdbagFetch.sh and renames the fastq.gz files with renameFastq.sh.
// The ~/.bdbag directory is created first so that copying the cookie file
// does not fail when the directory is not already present.
process getData {
  tag "${rep.baseName}"
  publishDir "${outDir}/temp/${task.process}", mode: "symlink"

  input:
    file deriva
    each rep from bdbagSplit

  output:
    file("**/*.R*.fastq.gz") into fastq

  script:
    """
    hostname
    ulimit -a
    replicate=\$(echo "${rep}" | cut -d'.' -f1)
    echo LOG: \${replicate}
    mkdir -p ~/.bdbag
    cp "${deriva}" ~/.bdbag/deriva-cookies.txt
    echo LOG: deriva cookie loaded
    unzip ${rep}
    echo LOG: replicate bdbag unzipped
    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
    echo LOG: replicate bdbag fetched
    sh ${baseDir}/scripts/renameFastq.sh \${replicate}
    echo LOG: fastq.gz files renamed to replicate RID
    """
}
\ No newline at end of file
#!/usr/bin/env python3
import argparse import argparse
import pandas as pd import pandas as pd
...@@ -9,9 +11,9 @@ def get_args(): ...@@ -9,9 +11,9 @@ def get_args():
def main(): def main():
args = get_args() args = get_args()
fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"] fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"]
fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) fetchFile_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
\ No newline at end of file
#!/bin/sh
# Rebuild one bag per replicate.
#
# Usage: splitBag.sh <bag_dir>
#
# For every Replicate_* directory in the current working directory, copy the
# contents of <bag_dir> into it -- except fetch.txt, so the replicate's own
# fetch.txt is kept -- and then zip the directory to <replicate>.zip.
#
# The trailing slash in the glob restricts the loop to directories, so
# Replicate_*.zip archives left over from a previous run are not picked up.
for dir in Replicate_*/
do
    # When nothing matches, the pattern is left unexpanded; skip it.
    [ -d "${dir}" ] || continue
    rep="${dir%/}"
    rsync -r --exclude=fetch.txt "$1/" "${rep}"
    zip -r "${rep}.zip" "${rep}"
done
\ No newline at end of file
#!/usr/bin/env python3
import argparse
import pandas as pd
import os
def get_args():
    """Parse the command-line arguments of the script.

    Returns:
        argparse.Namespace: namespace with one attribute, ``fetchFile``,
        holding the value of the required ``-f``/``--fetchFile`` option.
        ``main`` uses it as a directory path: it reads ``fetch.txt`` and
        ``data/File.csv`` beneath it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--fetchFile',
        help="Path to the unzipped bdbag directory containing fetch.txt and data/File.csv.",
        required=True,
    )
    return parser.parse_args()
def main():
    """Split the bag's fetch.txt into one fetch.txt per replicate.

    Reads ``<bag>/fetch.txt`` (tab-separated, no header) and
    ``<bag>/data/File.csv`` where ``<bag>`` is the ``--fetchFile`` argument.
    For every non-missing ``Replicate_RID`` in File.csv, a directory
    ``Replicate_<RID>`` is created in the current working directory and the
    fetch.txt rows whose first column contains one of that replicate's
    ``FastQ`` URIs are written to ``Replicate_<RID>/fetch.txt``.

    A replicate without any FastQ URI gets an empty fetch.txt rather than a
    copy of every row.
    """
    args = get_args()
    fetchFile = pd.read_csv(args.fetchFile + "/fetch.txt", sep="\t", header=None)
    fileFile = pd.read_csv(args.fetchFile + "/data/File.csv", sep=",", header=0)

    # Only FastQ entries are distributed to the replicate bags.
    fastqFiles = fileFile[fileFile.File_Type == "FastQ"]
    # First fetch.txt column, as text, so substring tests never see NaN.
    fetchUrls = fetchFile[0].astype(str)

    # dropna(): a missing Replicate_RID cannot name a directory.
    for rid in fileFile.Replicate_RID.dropna().unique():
        uris = (
            fastqFiles.URI[fastqFiles.Replicate_RID == rid]
            .dropna()
            .astype(str)
            .tolist()
        )
        # Literal substring match: URIs may contain regex metacharacters, and
        # an empty URI list must select no rows (an empty regex matches all).
        selected = fetchUrls.map(
            lambda url: any(uri in url for uri in uris)
        ).astype(bool)

        repDir = f"Replicate_{rid}"
        # exist_ok makes a re-run in the same directory safe.
        os.makedirs(repDir, exist_ok=True)
        fetchFile[selected].to_csv(
            repDir + "/fetch.txt", sep="\t", header=False, index=False
        )
if __name__ == '__main__':
main()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment