Patch summary (text before the first "diff --git" header is ignored by git apply / patch):
  * workflow/conf/biohpc.config  - keep only the lower-case proxy variables in the env scope.
  * workflow/rna-seq.nf          - load the deriva cookie as a plain file, link it into ~/.bdbag
                                   inside each task, run fixFetch.sh, log the proxy settings.
  * workflow/scripts/renameFastq.sh - compact the rename loop, quote operands, stay POSIX-sh compatible.

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 5203ec8faf03460601a3cfa4340c840ada365431..d221fee521fa74fbf98b2e4760650e0d94e6026e 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -39,11 +39,7 @@ singularity {
 }
 
 env {
-  http_proxy = "http://proxy.swmed.edu:3128"
-  https_proxy = "http://proxy.swmed.edu:3128"
-  HTTP_PROXY = "http://proxy.swmed.edu:3128"
-  HTTPS_PROXY = "http://proxy.swmed.edu:3128"
-  all_proxy = "http://proxy.swmed.edu:3128"
-  ALL_PROXY = "http://proxy.swmed.edu:3128"
+  http_proxy = 'http://proxy.swmed.edu:3128'
+  https_proxy = 'http://proxy.swmed.edu:3128'
+  all_proxy = 'http://proxy.swmed.edu:3128'
 }
-
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index d55fb81ac18df0bc4b6ca3d2819cb83f929a075b..32d40777bf03087aa65a0f21e5897156b4e56a6f 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -7,9 +7,11 @@ params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
 params.outDir = "${baseDir}/../output"
 
 // Parse input variables
-deriva = Channel
-  .fromPath(params.deriva)
-  .ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" }
+deriva = file(params.deriva)
+deriva.copyTo("${System.getProperty('user.home')}/.bdbag/deriva-cookies.txt")  // '~' is not expanded in Groovy/Java paths
+//deriva = Channel
+//  .fromPath(params.deriva)
+//  .ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" }
 bdbag = Channel
   .fromPath(params.bdbag)
   .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
@@ -20,62 +22,72 @@ outDir = params.outDir
  * splitData: split bdbag files by replicate so fetch can occure in parallel
  */
 process splitData {
-  tag "${bdbag.baseName}"
-  publishDir "${outDir}/temp/${task.process}", mode: "symlink"
+    tag "${bdbag.baseName}"
+    publishDir "${outDir}/temp/${task.process}", mode: "symlink"
 
-  input:
-    file bdbag
+    input:
+        file bdbag
+        file deriva
 
-  output:
-    file("Replicate_*.zip") into bdbagSplit mode flatten
-    file("${bdbag.baseName}/data/File.csv") into fileMeta
-    file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
-    file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
+    output:
+        file("Replicate_*.zip") into bdbagSplit mode flatten
+        file("${bdbag.baseName}/data/File.csv") into fileMeta
+        file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
+        file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
 
-  script:
-    """
-    hostname
-    ulimit -a
-    study=\$(echo "${bdbag}" | cut -d'.' -f1)
-    echo LOG: \${study}
-    unzip ${bdbag}
-    echo LOG: bdgag unzipped
-    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
-    echo LOG: fetch file filtered for only .fastq.gz
-    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
-    echo LOG: fetch file split by replicates
-    sh ${baseDir}/scripts/splitBag.sh \${study}
-    echo LOG: bag recreated with replicate split fetch file
-    """
+    script:
+        """
+        hostname
+        ulimit -a
+        ln -sf `readlink -e ${deriva}` ~/.bdbag/deriva-cookies.txt
+        study=`echo "${bdbag}" | cut -d'.' -f1`
+        echo LOG: \${study}
+        unzip ${bdbag}
+        echo LOG: bdgag unzipped
+        python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
+        cd \${study}
+        bash ${baseDir}/scripts/fixFetch.sh
+        cd ..
+        echo LOG: fetch file filtered for only .fastq.gz
+        python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
+        echo LOG: fetch file split by replicates
+        sh ${baseDir}/scripts/splitBag.sh \${study}
+        echo LOG: bag recreated with replicate split fetch file
+        """
 }
 
+println "http_proxy: ${System.getenv('http_proxy')}"    // debug: proxy seen by the Nextflow launcher
+println "https_proxy: ${System.getenv('https_proxy')}"  // (a bare closure argument would print the closure object)
+
 /*
  * getData: fetch study files from consortium with downloaded bdbag.zip
  */
 process getData {
-  tag "${rep.baseName}"
-  publishDir "${outDir}/temp/${task.process}", mode: "symlink"
+    tag "${rep.baseName}"
+    publishDir "${outDir}/temp/${task.process}", mode: "symlink"
 
-  input:
-    file deriva
-    each rep from bdbagSplit
+    input:
+        file deriva
+        each rep from bdbagSplit
 
-  output:
-    file("**/*.R*.fastq.gz") into fastq
+    output:
+        file("**/*.R*.fastq.gz") into fastq
 
-  script:
-    """
-    hostname
-    ulimit -a
-    replicate=\$(echo "${rep}" | cut -d'.' -f1)
-    echo LOG: \${replicate}
-    cp "${deriva}" ~/.bdbag/deriva-cookies.txt
-    echo LOG: deriva cookie loaded
-    unzip ${rep}
-    echo LOG: replicate bdbag unzipped
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
-    echo LOG: replicate bdbag fetched
-    sh ${baseDir}/scripts/renameFastq.sh \${replicate}
-    echo LOG: fastq.gz files renamed to replicate RID
-    """
- }
\ No newline at end of file
+    script:
+        """
+        hostname
+        ulimit -a
+        echo LOG:\${http_proxy}
+        export https_proxy=\${http_proxy}
+        ln -sf `readlink -e ${deriva}` ~/.bdbag/deriva-cookies.txt
+        replicate=\$(echo "${rep}" | cut -d'.' -f1 | rev | cut -f1 -d '/' | rev)
+        echo LOG: \${replicate}
+        echo LOG: deriva cookie loaded
+        unzip ${rep}
+        echo LOG: replicate bdbag unzipped
+        sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
+        echo LOG: replicate bdbag fetched
+        sh ${baseDir}/scripts/renameFastq.sh \${replicate}
+        echo LOG: fastq.gz files renamed to replicate RID
+        """
+ }
diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh
index f5593766b3a3bd645c3f2c8758d3a20fd354c9be..c57eccb206053a4041932652395e95a8b5bfa75c 100644
--- a/workflow/scripts/renameFastq.sh
+++ b/workflow/scripts/renameFastq.sh
@@ -1,15 +1,12 @@
-#!/bin
+#!/bin/sh
 
-while read loc checksum fileLocation
-do
-  file=$(echo ${fileLocation##*/})
-  fileName=$(echo ${file%.R*.fastq.gz})
-  fileExt=$(echo ${file##${fileName}.})
-  while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
-  do
-    if [ ${file} == ${File_Name} ]
-    then
-      find . -type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';'
-    fi
-  done < $1/data/File.csv
-done < $1/fetch.txt
\ No newline at end of file
+while read loc checksum fileLocation; do
+    file=$(echo ${fileLocation##*/});
+    fileName=$(echo ${file%.R*.fastq.gz});
+    fileExt=$(echo ${file##${fileName}.});
+    while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID; do
+        if [ "${file}" = "${File_Name}" ]; then  # POSIX '=': the pipeline runs this script with sh
+            find . -type f -name "${file}" -execdir mv {} "${Replicate_RID}.${fileExt}" ';';
+        fi;
+    done < $1/data/File.csv;
+done < $1/fetch.txt;