Add the alignReads step (hisat2 + samtools) to the RNA-seq workflow and move
the container settings from conf/biohpc.config into nextflow.config.

Review fixes folded into this revision. Only added lines differ from the patch
as submitted; removed and context lines are untouched and every hunk keeps its
original line counts.

  * params.spikein now defaults to the boolean false. The string "false" is
    truthy in Groovy, so the spike-in reference was always selected.
  * Reference selection sends every unrecognised reference name through the
    warn-and-default branch. With spike-in enabled, such a name used to leave
    "reference" as a plain string.
  * trimData publishDir pattern no longer escapes the dollar sign. The escaped
    form produced the literal glob ${repRID}.trimData.* which matched nothing.
  * alignReads declares its inputs and outputs with tuple plus file()/val()
    qualifiers, so the upstream files are staged into the task directory.
  * alignReads appends to its log files instead of truncating them on every
    command, sorts the SAM straight into ${repRID}.bam with "samtools sort -o"
    (the separate "samtools view" conversion is no longer needed), and creates
    the ${repRID}.bai index that the output block declares.
  * withName:alignReads is spelled like its sibling selectors.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Indentation and the position of
whitespace-only context lines were reconstructed from the hunk line counts and
should be confirmed against the tree (the blank line next to params.outDir and
the blank lines before "script:" in getData are the uncertain ones). The blob
ids on the index lines still describe the patch as submitted. Apply with
"git apply --ignore-whitespace" or regenerate the patch from a checkout.

diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 20da91a7f7a241e610708d7186d299d397958c41..11d83b277b06efc7920c5afbe2f09240492f27e6 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -5,14 +5,14 @@ process {
 
   withName:getBag {
     executor = 'local'
-    container = 'docker://bicf/gudmaprbkfilexfer:1.3'
   }
   withName:getData {
     executor = 'local'
-    container = 'docker://bicf/gudmaprbkfilexfer:1.3'
   }
   withName:trimData {
-    container = 'docker://bicf/trimgalore:1.1'
+    queue = '256GB,256GBv1,384GB'
+  }
+  withName:alignReads {
     queue = '256GB,256GBv1,384GB'
   }
 }
@@ -26,4 +26,4 @@ env {
   http_proxy = 'http://proxy.swmed.edu:3128'
   https_proxy = 'http://proxy.swmed.edu:3128'
   all_proxy = 'http://proxy.swmed.edu:3128'
-}
\ No newline at end of file
+}
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index 37584999cf8152c9776b676d0b013f8aeb5e8709..4f4f34d051a54568aabee33754ba02fd8f30a67d 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -10,6 +10,21 @@ profiles {
   }
 }
 
+process {
+  withName:getBag {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:getData {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:trimData {
+    container = 'bicf/trimgalore:1.1'
+  }
+  withName:alignReads {
+    container = 'bicf/gudmaprbkaligner:2.0.0'
+  }
+}
+
 trace {
   enabled = true
   file = 'pipeline_trace.txt'
@@ -37,4 +52,4 @@ manifest {
   mainScript = 'rna-seq.nf'
   version = 'v0.0.1_indev'
   nextflowVersion = '>=19.09.0'
-}
\ No newline at end of file
+}
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index c1d72efac168ca372471a35e4baa9ccfd96aae18..8b7823a0565d860725c60c6a134f179a6561ffe9 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -5,6 +5,8 @@ params.deriva = "${baseDir}/../test_data/credential.json"
 params.bdbag = "${baseDir}/../test_data/cookies.txt"
 //params.repRID = "16-1ZX4"
 params.repRID = "Q-Y5JA"
+params.spikein = false
+params.reference = "mouse"
 
 params.outDir = "${baseDir}/../output"
 
@@ -16,15 +18,25 @@ bdbag = Channel
   .fromPath(params.bdbag)
   .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
 
-Channel.from(params.repRID)
-  .into {
-    repRID_getBag
-    repRID_getData
-    repRID_trimData
-  }
-
+repRID = params.repRID
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
+reference = params.reference
+
+// Map the reference name and the spike-in flag onto a hisat2 index directory.
+// Any unrecognised reference name falls back to mouse without spike-in.
+if (params.reference != "human" && params.reference != "mouse") {
+  print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
+  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
+} else if (params.spikein && params.reference == "human") {
+  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
+} else if (params.spikein) {
+  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
+} else if (params.reference == "human") {
+  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
+} else {
+  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
+}
 
 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
@@ -34,26 +46,25 @@ derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
  */
 process getBag {
   executor 'local'
-  tag "${repRID_getBag}"
-  publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err"
+  tag "${repRID}"
+  publishDir "${logsDir}/getBag", mode: 'copy', pattern: "${repRID}.getBag.err"
 
   input:
-    val repRID_getBag
     path credential, stageAs: 'credential.json' from deriva
     path derivaConfig
 
   output:
     path ("Replicate_*.zip") into bagit
-    file ("${repRID_getBag}.getBag.err")
+    file ("${repRID}.getBag.err")
 
   script:
     """
-    hostname >>${repRID_getBag}.getBag.err
-    ulimit -a >>${repRID_getBag}.getBag.err
+    hostname >>${repRID}.getBag.err
+    ulimit -a >>${repRID}.getBag.err
     export https_proxy=\${http_proxy}
-    ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err
-    echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err
-    deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err
+    ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err
+    echo "LOG: deriva credentials linked" >>${repRID}.getBag.err
+    deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err
     """
 }
 
@@ -61,11 +72,10 @@ process getBag {
  * getData: fetch study files from consortium with downloaded bdbag.zip
  */
 process getData {
-  tag "${repRID_getData}"
-  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err"
+  tag "${repRID}"
+  publishDir "${logsDir}/getData", mode: 'copy', pattern: "${repRID}.getData.err"
 
   input:
-    val repRID_getData
     executor 'local'
     path cookies, stageAs: 'deriva-cookies.txt' from bdbag
     path bagit
@@ -75,22 +85,22 @@ process getData {
     file("**/File.csv") into fileMeta
     file("**/Experiment Settings.csv") into experimentSettingsMeta
     file("**/Experiment.csv") into experimentMeta
-    file ("${repRID_getData}.getData.err")
+    file ("${repRID}.getData.err")
 
 
   script:
     """
-    hostname >>${repRID_getData}.getData.err
-    ulimit -a >>${repRID_getData}.getData.err
+    hostname >>${repRID}.getData.err
+    ulimit -a >>${repRID}.getData.err
     export https_proxy=\${http_proxy}
-    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID_getData}.getData.err
-    echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err
+    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err
+    echo "LOG: deriva cookie linked" >>${repRID}.getData.err
     replicate=\$(basename "${bagit}" | cut -d '.' -f1)
-    echo "LOG: \${replicate}" >>${repRID_getData}.getData.err
-    unzip ${bagit} 2>>${repRID_getData}.getData.err
-    echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err
-    echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err
+    echo "LOG: \${replicate}" >>${repRID}.getData.err
+    unzip ${bagit} 2>>${repRID}.getData.err
+    echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
+    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err
+    echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
     """
 }
 
@@ -98,34 +108,54 @@ process getData {
  * trimData: trims any adapter or non-host sequences from the data
  */
 process trimData {
-  tag "${repRID_trimData}"
-  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${repRID_trimData}.trimData.*"
+  tag "${repRID}"
+  publishDir "${logsDir}/trimData", mode: 'copy', pattern: "${repRID}.trimData.*"
 
   input:
-    val repRID_trimData
     file(fastq) from fastqs
 
   output:
-    path ("*.fq.gz") into fastqs_trimmed
-    val ends
-    file ("${repRID_trimData}.trimData.log")
-    file ("${repRID_trimData}.trimData.err")
+    tuple file ("params.csv"), file ("*.fq.gz") into fastqs_trimmed
+    file ("${repRID}.trimData.log")
+    file ("${repRID}.trimData.err")
 
   script:
     """
-    if [ `nproc` -gt 8 ]
-    then
-      ncore=8
-    else
-      ncore=`nproc`
-    fi
     if [ '${fastq[1]}' == 'null' ]
     then
       ends='se'
-      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
     else
       ends='pe'
-      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
     fi
+    echo \${ends} > params.csv;
     """
-}
\ No newline at end of file
+}
+
+/*
+ * alignReads: aligns the trimmed reads with hisat2, then sorts and indexes the resulting BAM
+*/
+process alignReads {
+  tag "align-${repRID}"
+  publishDir "${outDir}/tempOut/aligned", mode: "copy"
+
+  input:
+    tuple file (ends), file (fqs) from fastqs_trimmed
+    file reference
+
+  output:
+    tuple val (repRID), file ("${repRID}.unal.gz"), file ("${repRID}.bam"), file ("${repRID}.bai")
+
+  script:
+    """
+    ends=`cat ${ends}`;
+    if [ "\${ends}" == 'pe' ]; then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -1 ${fqs[0]} -2 ${fqs[1]} 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -U ${fqs[0]} 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    fi;
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    samtools index ${repRID}.bam ${repRID}.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    """
+}
+