diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index c1d72efac168ca372471a35e4baa9ccfd96aae18..68a65161a2dd082fad1e164b1b871ff35da17acf 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -15,13 +15,7 @@ deriva = Channel bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } - -Channel.from(params.repRID) - .into { - repRID_getBag - repRID_getData - repRID_trimData - } +repRID = params.repRID outDir = params.outDir logsDir = "${outDir}/Logs" @@ -29,31 +23,36 @@ logsDir = "${outDir}/Logs" // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +// Define script files +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") + /* * getData: get bagit file from consortium */ process getBag { - executor 'local' - tag "${repRID_getBag}" - publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err" input: - val repRID_getBag path credential, stageAs: 'credential.json' from deriva path derivaConfig output: path ("Replicate_*.zip") into bagit - file ("${repRID_getBag}.getBag.err") + file ("${repRID}.getBag.err") script: """ - hostname >>${repRID_getBag}.getBag.err - ulimit -a >>${repRID_getBag}.getBag.err + hostname >>${repRID}.getBag.err + ulimit -a >>${repRID}.getBag.err export https_proxy=\${http_proxy} - ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err - echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err - deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err + + # link credential file for authentication + ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err + echo "LOG: deriva credentials linked" >>${repRID}.getBag.err + + # deriva-download replicate RID + deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err """ } @@ -61,12 +60,11 @@ process getBag { * getData: fetch study files from consortium with downloaded bdbag.zip */ process getData { - tag "${repRID_getData}" - publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err" input: - val repRID_getData - executor 'local' + path script_bdbagFetch path cookies, stageAs: 'deriva-cookies.txt' from bdbag path bagit @@ -75,22 +73,30 @@ process getData { file("**/File.csv") into fileMeta file("**/Experiment Settings.csv") into experimentSettingsMeta file("**/Experiment.csv") into experimentMeta - file ("${repRID_getData}.getData.err") + file ("${repRID}.getData.err") script: """ - hostname >>${repRID_getData}.getData.err - ulimit -a >>${repRID_getData}.getData.err + hostname >>${repRID}.getData.err + ulimit -a >>${repRID}.getData.err export https_proxy=\${http_proxy} - ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID_getData}.getData.err - echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err + + # link deriva cookie for authentication + ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err + echo "LOG: deriva cookie linked" >>${repRID}.getData.err + + # get bagit basename replicate=\$(basename "${bagit}" | cut -d '.' -f1) - echo "LOG: \${replicate}" >>${repRID_getData}.getData.err - unzip ${bagit} 2>>${repRID_getData}.getData.err - echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err - sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err - echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err + echo "LOG: \${replicate}" >>${repRID}.getData.err + + # unzip bagit + unzip ${bagit} 2>>${repRID}.getData.err + echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err + + # bagit fetch fastq's only and rename by repRID + sh bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err + echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err """ } @@ -98,34 +104,31 @@ process getData { * trimData: trims any adapter or non-host sequences from the data */ process trimData { - tag "${repRID_trimData}" - publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${repRID_trimData}.trimData.*" + tag "${repRID}" + publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*" input: - val repRID_trimData file(fastq) from fastqs output: path ("*.fq.gz") into fastqs_trimmed val ends - file ("${repRID_trimData}.trimData.log") - file ("${repRID_trimData}.trimData.err") + file ("${repRID}.trimData.log") + file ("${repRID}.trimData.err") script: """ - if [ `nproc` -gt 8 ] - then - ncore=8 - else - ncore=`nproc` - fi + hostname >>${repRID}.trimData.err + ulimit -a >>${repRID}.trimData.err + + # trim fastqs if [ '${fastq[1]}' == 'null' ] then ends='se' - trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; else ends='pe' - trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err; fi """ } \ No newline at end of file diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py deleted file mode 100644 index 82b1d4c50a17cf34d9410ef0889dc7ca0d112d84..0000000000000000000000000000000000000000 --- a/workflow/scripts/modifyFetch.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pandas as pd -import re - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True) - args = parser.parse_args() - return args - -def main(): - args = get_args() - fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) - fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) - fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"] - fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] - fetchFile_filtered_renamed = fetchFile_filtered - for i in fileFile_filtered["File_Name"]: - fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0]) - fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/workflow/scripts/splitFetch.py b/workflow/scripts/splitFetch.py deleted file mode 100644 index c8f60043be43a70ae570800f6edb117923d91810..0000000000000000000000000000000000000000 --- a/workflow/scripts/splitFetch.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pandas as pd -import os - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True) - args = parser.parse_args() - return args - -def main(): - args = get_args() - fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) - fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) - replicateRID = fileFile.Replicate_RID.unique() - fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID} - for i in replicateRID: - if not os.path.exists(i): - os.mkdir("Replicate_"+i) - fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv("Replicate_"+i+"/fetch.txt",sep="\t",header=False,index=False) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py index ea75252f558adc95e5feb564551afbe3f8858cb0..6538a0081aaea69c5814d479751455abd879f4bc 100644 --- a/workflow/tests/test_trimData.py +++ b/workflow/tests/test_trimData.py @@ -9,7 +9,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../../' @pytest.mark.trimData -def test_trimData(): +def test_trimData_se(): assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz')) + +@pytest.mark.trimData +def test_trimData_pe(): assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz')) assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz')) \ No newline at end of file