From e552d2e6d3eb24df2fc2920c27e7d4cb9838d9f0 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 13 Jan 2020 18:58:53 -0600 Subject: [PATCH] Add getData deriva-download of bagit --- workflow/conf/biohpc.config | 8 +++ workflow/rna-seq.nf | 95 ++++++++++++++++++++++------------ workflow/scripts/bdbagFetch.sh | 5 +- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 13368ad..adcd15e 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -48,3 +48,11 @@ env { https_proxy = 'http://proxy.swmed.edu:3128' all_proxy = 'http://proxy.swmed.edu:3128' } + +manifest { + homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' + description = 'This pipeline was created to analyze RNA-sequencing data from GUDMAP/RBK consortiums.' + mainScript = 'rna-seq.nf' + version = 'v0.0.1_indev' + nextflowVersion = '>=19.09.0' +} diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 5470fe1..0a43a13 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,25 +1,37 @@ #!/usr/bin/env nextflow // Define input variables -params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/credential.json" -params.bdbag = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" -params.repRID = "16-1ZX4" +params.deriva = "${baseDir}/../test_data/credential.json" +params.bdbag = "${baseDir}/../test_data/cookies.txt" +//params.repRID = "16-1ZX4" +params.repRID = "Q-Y5JA" //params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" params.outDir = "${baseDir}/../output" // Parse input variables -deriva = file(params.deriva, checkIfExists: 'true') -bdbag = file(params.bdbag, checkIfExists: 'true') +deriva = Channel + .fromPath(params.deriva) + .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } +bdbag = Channel + .fromPath(params.bdbag) + .ifEmpty { exit 1, "deriva cookie file for bdbag not found: 
${params.bdbag}" } //bdbag = Channel // .fromPath(params.bdbag) // .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } -repRID = params.repRID +Channel.from(params.repRID) + .into { + repRID_getBag + repRID_getData + repRID_trimData + } outDir = params.outDir logsDir = "${outDir}/Logs" +derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") + /* * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid */ @@ -65,48 +77,56 @@ process splitData { * getData: get bagit file from consortium */ process getBag { - tag "${repRID}" - publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${rep.baseName}.getBag.err" + executor 'local' + tag "${repRID_getBag}" + publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err" input: - path deriva - val repRID + val repRID_getBag + path credential, stageAs: 'credential.json' from deriva + path derivaConfig output: - file + path ("Replicate_*.zip") into bagit script: """ - hostname >>${rep.baseName}.getData.err - ulimit -a >>${rep.baseName}.getData.err + hostname >>${repRID_getBag}.getBag.err + ulimit -a >>${repRID_getBag}.getBag.err + ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err + echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err + deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . 
rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err """ } - /* * getData: fetch study files from consortium with downloaded bdbag.zip */ process getData { - tag "${rep.baseName}" - publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err" + tag "${repRID_getData}" + publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err" input: - each rep from bdbagSplit + val repRID_getData + path cookies, stageAs: 'deriva-cookies.txt' from bdbag + path bagit output: - set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming + path ("**/*.R{1,2}.fastq.gz") into fastqs script: """ - hostname >>${rep.baseName}.getData.err - ulimit -a >>${rep.baseName}.getData.err + hostname >>${repRID_getData}.getData.err + ulimit -a >>${repRID_getData}.getData.err export https_proxy=\${http_proxy} - replicate=\$(basename "${rep}" | cut -d '.' -f1) - echo "LOG: \${replicate}" >>${rep.baseName}.getData.err - unzip ${rep} 2>>${rep.baseName}.getData.err - echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err - sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err - echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err + ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${repRID_getData}.getData.err + echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err + replicate=\$(basename "${bagit}" | cut -d '.' 
-f1) + echo "LOG: \${replicate}" >>${repRID_getData}.getData.err + unzip ${bagit} 2>>${repRID_getData}.getData.err + echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err + sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${repRID_getData}.getData.err + echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err """ } @@ -114,19 +134,30 @@ process getData { * trimData: trims any adapter or non-host sequences from the data */ process trimData { - tag "trim-${repID}" + tag "${repRID_trimData}" publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz" - publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*" + publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "${repRID_trimData}.trimData.*" input: - set repID, reads from trimming + val repRID_trimData + file(fastq) from fastqs output: path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0') script: """ - rep=`echo ${repID} | cut -f2- -d '_'`; - trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err; + if [ `nproc` -gt 8 ] + then + ncore=8 + else + ncore=`nproc` + fi + if [ -z ${fastq[1]} ] + then + trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + else + trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + fi """ -} +} \ No newline at end of file diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh index 9af4eb4..65d309d 100644 --- a/workflow/scripts/bdbagFetch.sh +++ b/workflow/scripts/bdbagFetch.sh @@ -1,6 +1,3 @@ #!/bin/bash -bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 && -for i in $(find */ -name "*.R*.fastq.gz"); do - mv ${i} .; 
-done; +bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 \ No newline at end of file -- GitLab