Skip to content
Snippets Groups Projects
Commit e552d2e6 authored by Gervaise Henry 🤠
Browse files

Add getData deriva-download of bagit

parent cbba6901
3 merge requests: !37 v0.0.1, !11 Develop, !8 Resolve "Add automated download of bagit through deriva"
Pipeline #5634 passed with stage
in 7 seconds
This commit is part of merge request !8. Comments created here will be created in the context of that merge request.
...@@ -48,3 +48,11 @@ env { ...@@ -48,3 +48,11 @@ env {
https_proxy = 'http://proxy.swmed.edu:3128' https_proxy = 'http://proxy.swmed.edu:3128'
all_proxy = 'http://proxy.swmed.edu:3128' all_proxy = 'http://proxy.swmed.edu:3128'
} }
manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to analize RNA-sequencing data from GUDMAP/RBK consortiums.'
mainScript = 'rna-seq.nf'
version = 'v0.0.1_indev'
nextflowVersion = '>=19.09.0'
}
#!/usr/bin/env nextflow #!/usr/bin/env nextflow
// Define input variables // Define input variables
params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/credential.json" params.deriva = "${baseDir}/../test_data/credential.json"
params.bdbag = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" params.bdbag = "${baseDir}/../test_data/cookies.txt"
params.repRID = "16-1ZX4" //params.repRID = "16-1ZX4"
params.repRID = "Q-Y5JA"
//params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" //params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
params.outDir = "${baseDir}/../output" params.outDir = "${baseDir}/../output"
// Parse input variables // Parse input variables
deriva = file(params.deriva, checkIfExists: 'true') deriva = Channel
bdbag = file(params.bdbag, checkIfExists: 'true') .fromPath(params.deriva)
.ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
bdbag = Channel
.fromPath(params.bdbag)
.ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
//bdbag = Channel //bdbag = Channel
// .fromPath(params.bdbag) // .fromPath(params.bdbag)
// .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } // .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
repRID = params.repRID Channel.from(params.repRID)
.into {
repRID_getBag
repRID_getData
repRID_trimData
}
outDir = params.outDir outDir = params.outDir
logsDir = "${outDir}/Logs" logsDir = "${outDir}/Logs"
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
/* /*
* splitData: split bdbag files by replicate so fetch can occur in parallel, and rename files to replicate rid * splitData: split bdbag files by replicate so fetch can occur in parallel, and rename files to replicate rid
*/ */
...@@ -65,48 +77,56 @@ process splitData { ...@@ -65,48 +77,56 @@ process splitData {
* getBag: get bagit file from consortium * getBag: get bagit file from consortium
*/ */
process getBag { process getBag {
tag "${repRID}" executor 'local'
publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${rep.baseName}.getBag.err" tag "${repRID_getBag}"
publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err"
input: input:
path deriva val repRID_getBag
val repRID path credential, stageAs: 'credential.json' from deriva
path derivaConfig
output: output:
file path ("Replicate_*.zip") into bagit
script: script:
""" """
hostname >>${rep.baseName}.getData.err hostname >>${repRID_getBag}.getBag.err
ulimit -a >>${rep.baseName}.getData.err ulimit -a >>${repRID_getBag}.getBag.err
ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err
echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err
deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err
""" """
} }
/* /*
* getData: fetch study files from consortium with downloaded bdbag.zip * getData: fetch study files from consortium with downloaded bdbag.zip
*/ */
process getData { process getData {
tag "${rep.baseName}" tag "${repRID_getData}"
publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err" publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err"
input: input:
each rep from bdbagSplit val repRID_getData
path cookies, stageAs: 'deriva-cookies.txt' from bdbag
path bagit
output: output:
set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming path ("**/*.R{1,2}.fastq.gz") into fastqs
script: script:
""" """
hostname >>${rep.baseName}.getData.err hostname >>${repRID_getData}.getData.err
ulimit -a >>${rep.baseName}.getData.err ulimit -a >>${repRID_getData}.getData.err
export https_proxy=\${http_proxy} export https_proxy=\${http_proxy}
replicate=\$(basename "${rep}" | cut -d '.' -f1) ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID_getData}.getData.err
echo "LOG: \${replicate}" >>${rep.baseName}.getData.err echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err
unzip ${rep} 2>>${rep.baseName}.getData.err replicate=\$(basename "${bagit}" | cut -d '.' -f1)
echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err echo "LOG: \${replicate}" >>${repRID_getData}.getData.err
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err unzip ${bagit} 2>>${repRID_getData}.getData.err
echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${repRID_getData}.getData.err
echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err
""" """
} }
...@@ -114,19 +134,30 @@ process getData { ...@@ -114,19 +134,30 @@ process getData {
* trimData: trims any adapter or non-host sequences from the data * trimData: trims any adapter or non-host sequences from the data
*/ */
process trimData { process trimData {
tag "trim-${repID}" tag "${repRID_trimData}"
publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz" publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz"
publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*" publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${repRID_trimData}.trimData.*"
input: input:
set repID, reads from trimming val repRID_trimData
file(fastq) from fastqs
output: output:
path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0') path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0')
script: script:
""" """
rep=`echo ${repID} | cut -f2- -d '_'`; if [ `nproc` -gt 8 ]
trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err; then
ncore=8
else
ncore=`nproc`
fi
if [ -z ${fastq[1]} ]
then
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
else
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
fi
""" """
} }
\ No newline at end of file
#!/bin/bash #!/bin/bash
bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 && bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1
for i in $(find */ -name "*.R*.fastq.gz"); do \ No newline at end of file
mv ${i} .;
done;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment