Commit d499ea11 authored by Gervaise Henry

Merge branch '20-make.scripts.inputs' into 'develop'

Resolve "Make scripts input files for processes"

Closes #20

See merge request !12
parents 947e7543 81cb8e68
3 merge requests: !37 v0.0.1, !13 Develop, !12 Resolve "Make scripts input files for processes"
Pipeline #5725 passed with stages in 26 minutes and 18 seconds.
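The change behind this merge request: helper scripts are no longer invoked through hard-coded ${baseDir} paths inside process script blocks; each script is instead wrapped in a Channel.fromPath(...) and declared as a path input, so Nextflow stages it into the task work directory and it can be called by basename. A minimal DSL1 sketch of that pattern; the script name myScript.sh, the process name, and the output channel are invented purely for illustration:

// sketch only: names below are placeholders, not taken from this repository
script_myScript = Channel.fromPath("${baseDir}/scripts/myScript.sh")

process useScript {
  input:
    path script_myScript

  output:
    stdout into result

  script:
    """
    # the script was staged into the work directory, so call it by basename
    sh myScript.sh
    """
}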
@@ -15,13 +15,7 @@ deriva = Channel
 bdbag = Channel
   .fromPath(params.bdbag)
   .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
-
-Channel.from(params.repRID)
-  .into {
-    repRID_getBag
-    repRID_getData
-    repRID_trimData
-  }
+repRID = params.repRID

 outDir = params.outDir
 logsDir = "${outDir}/Logs"
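For the hunk above: in this DSL1 pipeline a channel can be consumed by only one process, which is why params.repRID previously had to be fanned out with .into{} into one copy per consuming process. Storing the RID in a plain Groovy variable instead lets every process reference ${repRID} directly in its tag, publishDir, and script blocks, which is what the remaining hunks do. In short (sketch, restating the hunk):

// before: one value channel per consuming process
Channel.from(params.repRID)
  .into {
    repRID_getBag
    repRID_getData
    repRID_trimData
  }

// after: a single variable usable anywhere in the script
repRID = params.repRID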
@@ -29,31 +23,36 @@ logsDir = "${outDir}/Logs"
 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")

+// Define script files
+script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
+
 /*
  * getData: get bagit file from consortium
  */
 process getBag {
-  executor 'local'
-  tag "${repRID_getBag}"
-  publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err"
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err"

   input:
-    val repRID_getBag
     path credential, stageAs: 'credential.json' from deriva
     path derivaConfig

   output:
     path ("Replicate_*.zip") into bagit
-    file ("${repRID_getBag}.getBag.err")
+    file ("${repRID}.getBag.err")

   script:
     """
-    hostname >>${repRID_getBag}.getBag.err
-    ulimit -a >>${repRID_getBag}.getBag.err
+    hostname >>${repRID}.getBag.err
+    ulimit -a >>${repRID}.getBag.err
     export https_proxy=\${http_proxy}
-    ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err
-    echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err
-    deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err
+
+    # link credential file for authentication
+    ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err
+    echo "LOG: deriva credentials linked" >>${repRID}.getBag.err
+
+    # deriva-download replicate RID
+    deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err
     """
 }
@@ -61,12 +60,11 @@ process getBag {
  * getData: fetch study files from consortium with downloaded bdbag.zip
  */
 process getData {
-  tag "${repRID_getData}"
-  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err"
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err"

   input:
-    val repRID_getData
-    executor 'local'
+    path script_bdbagFetch
     path cookies, stageAs: 'deriva-cookies.txt' from bdbag
     path bagit

@@ -75,22 +73,30 @@ process getData {
     file("**/File.csv") into fileMeta
     file("**/Experiment Settings.csv") into experimentSettingsMeta
     file("**/Experiment.csv") into experimentMeta
-    file ("${repRID_getData}.getData.err")
+    file ("${repRID}.getData.err")

   script:
     """
-    hostname >>${repRID_getData}.getData.err
-    ulimit -a >>${repRID_getData}.getData.err
+    hostname >>${repRID}.getData.err
+    ulimit -a >>${repRID}.getData.err
     export https_proxy=\${http_proxy}
-    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID_getData}.getData.err
-    echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err
+
+    # link deriva cookie for authentication
+    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err
+    echo "LOG: deriva cookie linked" >>${repRID}.getData.err
+
+    # get bagit basename
     replicate=\$(basename "${bagit}" | cut -d '.' -f1)
-    echo "LOG: \${replicate}" >>${repRID_getData}.getData.err
-    unzip ${bagit} 2>>${repRID_getData}.getData.err
-    echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err
-    echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err
+    echo "LOG: \${replicate}" >>${repRID}.getData.err
+
+    # unzip bagit
+    unzip ${bagit} 2>>${repRID}.getData.err
+    echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
+
+    # bagit fetch fastq's only and rename by repRID
+    sh bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err
+    echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
     """
 }
@@ -98,34 +104,31 @@ process getData {
  * trimData: trims any adapter or non-host sequences from the data
  */
 process trimData {
-  tag "${repRID_trimData}"
-  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${repRID_trimData}.trimData.*"
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*"

   input:
-    val repRID_trimData
     file(fastq) from fastqs

   output:
     path ("*.fq.gz") into fastqs_trimmed
     val ends
-    file ("${repRID_trimData}.trimData.log")
-    file ("${repRID_trimData}.trimData.err")
+    file ("${repRID}.trimData.log")
+    file ("${repRID}.trimData.err")

   script:
     """
-    if [ `nproc` -gt 8 ]
-    then
-      ncore=8
-    else
-      ncore=`nproc`
-    fi
+    hostname >>${repRID}.trimData.err
+    ulimit -a >>${repRID}.trimData.err
+
+    # trim fastqs
     if [ '${fastq[1]}' == 'null' ]
     then
       ends='se'
-      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
     else
       ends='pe'
-      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
     fi
     """
 }
\ No newline at end of file
#!/usr/bin/env python3

import argparse
import pandas as pd
import re


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fetchFile', help="The fetch file from bdbag.zip.", required=True)
    args = parser.parse_args()
    return args


def main():
    args = get_args()
    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt", sep="\t", header=None)
    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv", sep=",", header=0)

    # keep only fastq entries in the file metadata and the fetch manifest
    fileFile_filtered = fileFile[fileFile["File_Type"] == "FastQ"]
    fetchFile_filtered = fetchFile[fetchFile[2].str[-9:] == ".fastq.gz"]

    # rename each fetch target so the fastq basename becomes its Replicate RID
    fetchFile_filtered_renamed = fetchFile_filtered
    for i in fileFile_filtered["File_Name"]:
        fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i, regex=False)] = \
            fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i, regex=False)].values[0].replace(
                re.sub(r"\.R.\.fastq\.gz", "", i),
                fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"] == i].values[0])

    # overwrite fetch.txt with the renamed, fastq-only manifest
    fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt", sep="\t", header=False, index=False)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3

import argparse
import pandas as pd
import os


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fetchFile', help="The fetch file from bdbag.zip.", required=True)
    args = parser.parse_args()
    return args


def main():
    args = get_args()
    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt", sep="\t", header=None)
    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv", sep=",", header=0)

    # collect the fastq URIs that belong to each replicate
    replicateRID = fileFile.Replicate_RID.unique()
    fetchArray = {i: fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID}

    # write one fetch.txt per replicate into its own Replicate_<RID> directory
    for i in replicateRID:
        if not os.path.exists("Replicate_"+i):
            os.mkdir("Replicate_"+i)
        fetchFile[fetchFile[0].str.contains('|'.join(fetchArray[i]))].to_csv(
            "Replicate_"+i+"/fetch.txt", sep="\t", header=False, index=False)


if __name__ == '__main__':
    main()
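The two helpers above both take the unzipped bagit directory via -f and rewrite its fetch.txt: the first renames fastq entries to their Replicate RID, the second splits the manifest into one fetch.txt per replicate. If they were wired into the workflow the same way bdbagFetch.sh now is, it could look roughly like this; the channel names, process name, and script filename below are placeholders for illustration, not names taken from the repository:

// sketch only: placeholder names, same "script as staged input" pattern as above
script_modifyFetch = Channel.fromPath("${baseDir}/scripts/modifyFetch.py")

process modifyFetch {
  input:
    path script_modifyFetch
    path bagitDir from bagit_unzipped

  script:
    """
    # run the staged helper against the unzipped bagit directory
    python3 modifyFetch.py -f ${bagitDir}
    """
}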
@@ -9,7 +9,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'

 @pytest.mark.trimData
-def test_trimData():
+def test_trimData_se():
     assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz'))
+
+@pytest.mark.trimData
+def test_trimData_pe():
     assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz'))
     assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz'))
\ No newline at end of file