diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 531447b75249eca61cf5661e515a586fcc65913f..0a03aabea1f2e74432f7dedb191efab76e11b38f 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -15,14 +15,7 @@ deriva = Channel bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } - -Channel.from(params.repRID) - .into { - repRID_getBag - repRID_getData - repRID_parseMetadata - repRID_trimData - } +repRID = params.repRID outDir = params.outDir logsDir = "${outDir}/Logs" @@ -30,30 +23,36 @@ logsDir = "${outDir}/Logs" // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +// Define script files +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") + /* * getData: get bagit file from consortium */ process getBag { - tag "${repRID_getBag}" - publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err" input: - val repRID_getBag path credential, stageAs: 'credential.json' from deriva path derivaConfig output: path ("Replicate_*.zip") into bagit - file ("${repRID_getBag}.getBag.err") + file ("${repRID}.getBag.err") script: """ - hostname >>${repRID_getBag}.getBag.err - ulimit -a >>${repRID_getBag}.getBag.err + hostname >>${repRID}.getBag.err + ulimit -a >>${repRID}.getBag.err export https_proxy=\${http_proxy} - ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err - echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err - deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err + + # link credential file for authentication + ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err + echo "LOG: deriva credentials linked" >>${repRID}.getBag.err + + # deriva-download replicate RID + deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err """ } @@ -61,11 +60,11 @@ process getBag { * getData: fetch study files from consortium with downloaded bdbag.zip */ process getData { - tag "${repRID_getData}" - publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err" input: - val repRID_getData + path script_bdbagFetch path cookies, stageAs: 'deriva-cookies.txt' from bdbag path bagit @@ -74,21 +73,29 @@ process getData { file("**/File.csv") into fileMeta file("**/Experiment Settings.csv") into experimentSettingsMeta file("**/Experiment.csv") into experimentMeta - file ("${repRID_getData}.getData.err") + file ("${repRID}.getData.err") script: """ - hostname >>${repRID_getData}.getData.err - ulimit -a >>${repRID_getData}.getData.err + hostname >>${repRID}.getData.err + ulimit -a >>${repRID}.getData.err export https_proxy=\${http_proxy} - ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID_getData}.getData.err - echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err + + # link deriva cookie for authentication + ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err + echo "LOG: deriva cookie linked" >>${repRID}.getData.err + + # get bagit basename replicate=\$(basename "${bagit}" | cut -d '.' -f1) - echo "LOG: \${replicate}" >>${repRID_getData}.getData.err - unzip ${bagit} 2>>${repRID_getData}.getData.err - echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err - sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err - echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err + echo "LOG: \${replicate}" >>${repRID}.getData.err + + # unzip bagit + unzip ${bagit} 2>>${repRID}.getData.err + echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err + + # bagit fetch fastq's only and rename by repRID + sh bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err + echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err """ } @@ -96,11 +103,11 @@ process getData { * parseMetadata: parses metadata to extract experiment parameters */ process parseMetadata { - tag "${repRID_parseMetadata}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID_parseMetadata}.parseMetadata.err" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err" input: - val repRID_parseMetadata + val repRID path fileMeta path experimentSettingsMeta path experimentMeta @@ -113,28 +120,28 @@ process parseMetadata { script: """ - hostname >>${repRID_parseMetadata}.parseMetadata.err - ulimit -a >>${repRID_parseMetadata}.parseMetadata.err + hostname >>${repRID}.parseMetadata.err + ulimit -a >>${repRID}.parseMetadata.err # Check replicate RID metadata - rep=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID_parseMetadata} -m "${fileMeta}" -p repRID) - echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID_parseMetadata}.parseMetadata.err + rep=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID} -m "${fileMeta}" -p repRID) + echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID}.parseMetadata.err # Get endedness metadata - ends=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID_parseMetadata} -m "${experimentSettingsMeta}" -p ends) - echo "LOG: endedness metadata parsed: \${ends}" >>${repRID_parseMetadata}.parseMetadata.err + ends=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID} -m "${experimentSettingsMeta}" -p ends) + echo "LOG: endedness metadata parsed: \${ends}" >>${repRID}.parseMetadata.err # Get strandedness metadata - stranded=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID_parseMetadata} -m "${experimentSettingsMeta}" -p stranded) - echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID_parseMetadata}.parseMetadata.err + stranded=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID} -m "${experimentSettingsMeta}" -p stranded) + echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err # Get spike-in metadata - spike=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID_parseMetadata} -m "${experimentSettingsMeta}" -p spike) - echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID_parseMetadata}.parseMetadata.err + spike=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID} -m "${experimentSettingsMeta}" -p spike) + echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID}.parseMetadata.err # Get species metadata - specie=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID_parseMetadata} -m "${experimentMeta}" -p specie) - echo "LOG: species metadata parsed: \${specie}" >>${repRID_parseMetadata}.parseMetadata.err + specie=\$(python3 ${baseDir}/scripts/parseMeta.py -r ${repRID} -m "${experimentMeta}" -p specie) + echo "LOG: species metadata parsed: \${specie}" >>${repRID}.parseMetadata.err """ } @@ -146,32 +153,32 @@ ends.set { * trimData: trims any adapter or non-host sequences from the data */ process trimData { - tag "${repRID_trimData}" - publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${repRID_trimData}.trimData.*" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*" input: - val repRID_trimData file(fastq) from fastqs val ends_trimData output: path ("*.fq.gz") into fastqs_trimmed - file ("${repRID_trimData}.trimData.log") - file ("${repRID_trimData}.trimData.err") + val ends + file ("${repRID}.trimData.log") + file ("${repRID}.trimData.err") script: """ - if [ `nproc` -gt 8 ] - then - ncore=8 - else - ncore=`nproc` - fi + hostname >>${repRID}.trimData.err + ulimit -a >>${repRID}.trimData.err + + # trim fastqs if [ '${ends_trimData}' == 'se' ] then - trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + ends='se' + trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; else - trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + ends='pe' + trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; fi """ } \ No newline at end of file diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py index ea75252f558adc95e5feb564551afbe3f8858cb0..6538a0081aaea69c5814d479751455abd879f4bc 100644 --- a/workflow/tests/test_trimData.py +++ b/workflow/tests/test_trimData.py @@ -9,7 +9,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../../' @pytest.mark.trimData -def test_trimData(): +def test_trimData_se(): assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz')) + +@pytest.mark.trimData +def test_trimData_pe(): assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz')) assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz')) \ No newline at end of file