#!/usr/bin/env nextflow

// Define input variables
params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt"
params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"

params.outDir = "${baseDir}/../output"
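
// Example launch overriding the defaults above (script name and paths are illustrative):
//   nextflow run rna-seq.nf --deriva ~/.deriva/deriva-cookies.txt \
//     --bdbag ./Study_Q-Y4H0.zip --outDir ./output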

// Parse input variables
deriva = file(params.deriva, checkIfExists: true)
bdbag = Channel
  .fromPath(params.bdbag)
  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }

outDir = params.outDir
logsDir = "${outDir}/Logs"
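
// Published locations (per the publishDir directives below): trimmed fastqs go to
// ${outDir}/tempOut/trimmed; per-process log/err files go to ${logsDir}/<processName>.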

/*
 * splitData: split the bdbag by replicate so fetches can occur in parallel, and rename the split bags to their replicate RIDs
 */
process splitData {
  tag "${bdbag.baseName}"
  executor 'local'
  publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err"

  input:
    file bdbag
    path cookies, stageAs: 'cookies.txt' from deriva

  output:
    file("Replicate_*.zip") into bdbagSplit mode flatten
    file("${bdbag.baseName}/data/File.csv") into fileMeta
    file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
    file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
    file ("${bdbag.baseName}.splitData.err")

  script:
    """
    hostname >> ${bdbag.baseName}.splitData.err
    ulimit -a >> ${bdbag.baseName}.splitData.err
    ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err
    echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err 
    study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err
    echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err
    unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err
    echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err
    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
    echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err
    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
    echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err
    sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err
    echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
    """
}
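
/*
 * With `mode flatten`, bdbagSplit emits one item per matching file, so getData below
 * runs once per replicate bag, e.g. (RIDs are hypothetical):
 *   Replicate_Q-XXXX.zip
 *   Replicate_Q-YYYY.zip
 */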

/*
 * getData: fetch each replicate's study files from the consortium using the split bdbag zips
 */
process getData {
  tag "${rep.baseName}"
  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err"

  input:
    each rep from bdbagSplit

  output:
    set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming

  script:
    """
    hostname >>${rep.baseName}.getData.err
    ulimit -a >>${rep.baseName}.getData.err
    export https_proxy=\${http_proxy}
    replicate=\$(basename "${rep}" | cut -d '.' -f1)
    echo "LOG: \${replicate}" >>${rep.baseName}.getData.err
    unzip ${rep} 2>>${rep.baseName}.getData.err
    echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err
    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err
    echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err
    """
}
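
/*
 * bdbagFetch.sh is expected to leave the fetched paired fastq files in the task
 * directory named to match the *.R{1,2}.fastq.gz output glob above; the replicate
 * baseName is carried alongside them into the trimming channel.
 */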

/*
 * trimData: trims adapter and low-quality sequences from the raw paired-end reads
 */
process trimData {
  tag "trim-${repID}"
  publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz"
  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*"

  input:
    set val(repID), file(reads) from trimming

  output:
    path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0')

  script:
    """
    # strip the "Replicate_" prefix to recover the replicate RID
    rep=`echo ${repID} | cut -f2- -d '_'`;
    # adapter/quality trim the paired reads; capture stdout and stderr per replicate
    trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err;
    """
}
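
/*
 * Note: with --paired, --gzip, and --basename, trim_galore writes the trimmed pairs as
 * <rep>_val_1.fq.gz and <rep>_val_2.fq.gz (plus trimming reports), which is what the
 * output glob and the tempOut/trimmed publishDir pattern above collect.
 */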