From acebeb68fcbaeb6f7c107e225972edcbab875101 Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Wed, 31 Oct 2018 22:05:50 -0500
Subject: [PATCH] Revert main

---
 workflow/main.nf | 598 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 462 insertions(+), 136 deletions(-)

diff --git a/workflow/main.nf b/workflow/main.nf
index bb90254..e770f8f 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -1,139 +1,465 @@
 #!/usr/bin/env nextflow
-   params.design="$baseDir/../test_data/samplesheet.csv"
-   params.bams = "$baseDir/../test_data/*.bam"
-   params.peaks = "$baseDir/../test_data/*.broadPeak"
-   params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
-   toppeakcount = -1
-   design_file = file(params.design)
-   deeptools_design = Channel.fromPath(params.design)
-   diffbind_design = Channel.fromPath(params.design)
-   chipseeker_design = Channel.fromPath(params.design)
-   meme_design = Channel.fromPath(params.design)
-   index_bams = Channel.fromPath(params.bams)
-   deeptools_bams = Channel.fromPath(params.bams) 
-   deeptools_peaks = Channel.fromPath(params.peaks) 
-   chipseeker_peaks = Channel.fromPath(params.peaks) 
-   diffbind_bams = Channel.fromPath(params.bams) 
-   diffbind_peaks = Channel.fromPath(params.peaks) 
-   meme_peaks = Channel.fromPath(params.peaks)
-
-process bamindex {
-   publishDir "$baseDir/output/", mode: 'copy'
-   input:
-     file index_bam_files from index_bams
-   output:
-     file "*bai" into deeptools_bamindex
-     file "*bai" into diffbind_bamindex
-
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module load samtools/intel/1.3
-     samtools index $index_bam_files
-     """
-}
-
-process run_deeptools {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file deeptools_design_file from deeptools_design
-     file deeptools_bam_files from deeptools_bams.toList()
-     file deeptools_peak_files from deeptools_peaks.toList()
-     file deeptools_bam_indexes from deeptools_bamindex.toList()
-   output:
-     file "*deeptools*" into deeptools_output
-     script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module load deeptools/2.3.5
-     python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
-"""
-}
-
-
-process run_diffbind {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file diffbind_design_file from diffbind_design
-     file diffbind_bam_files from diffbind_bams.toList()
-     file diffbind_peak_files from diffbind_peaks.toList()
-     file diffbind_bam_indexes from diffbind_bamindex.toList()
-   output:
-     file "diffpeak.design" into diffpeaksdesign_chipseeker
-     file "diffpeak.design" into diffpeaksdesign_meme
-     file "*_diffbind.bed" into diffpeaks_meme
-     file "*_diffbind.bed" into diffpeaks_chipseeker
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
-"""
-}
-
-process run_chipseeker_diffpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file diffpeak_design_file from diffpeaksdesign_chipseeker
-     file diffpeaks from diffpeaks_chipseeker
-   output:
-     file "*chipseeker*" into chipseeker_diffpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
-"""
-}
-
-process run_chipseeker_originalpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file design_file from chipseeker_design
-     file chipseeker_peak_files from chipseeker_peaks.toList()
-   output:
-     file "*chipseeker*" into chipseeker_originalpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
-"""
-}
-
-process run_meme_original {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file design_meme from meme_design
-     file meme_peak_files from meme_peaks.toList()
-   output:
-     file "*meme*" into meme_original_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module add deeptools/2.3.5
-     module load meme/4.11.1-gcc-openmpi
-     python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
-"""
-}
-
-process run_meme_diffpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file peaks_meme from diffpeaks_meme
-     file diffpeak_design from diffpeaksdesign_meme
-   output:
-     file "*meme*" into meme_diffpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module add deeptools/2.3.5
-     module load meme/4.11.1-gcc-openmpi
-     python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
-"""
+
+// Path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file main.nf
+
+// Define Input variables
+params.reads = "$baseDir/../test_data/*.fastq.gz"
+params.pairedEnd = false
+params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
+params.genome = 'GRCm38'
+params.genomes = []
+params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
+params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
+params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
+params.cutoffRatio = 1.2
+
+// Check inputs
+if( params.bwaIndex ){
+  bwaIndex = Channel
+    .fromPath(params.bwaIndex)
+    .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
+} else {
+  exit 1, "No reference genome specified."
+}
+
+// Define List of Files
+readsList = Channel
+  .fromPath( params.reads )
+  .flatten()
+  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")}
+  .collectFile( name: 'fileList.tsv', newLine: true )
+
+// Define regular variables
+pairedEnd = params.pairedEnd
+designFile = params.designFile
+genomeSize = params.genomeSize
+genome = params.genome
+chromSizes = params.chromSizes
+fasta = params.fasta
+cutoffRatio = params.cutoffRatio
+
+process checkDesignFile {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  input:
+
+  designFile
+  file readsList
+
+  output:
+
+  file("design.tsv") into designFilePaths
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
+    """
+  }
+  else {
+    """
+    python $baseDir/scripts/check_design.py -d $designFile -f $readsList
+    """
+  }
+
+}
+
+// Define channel for raw reads
+if (pairedEnd) {
+  rawReads = designFilePaths
+    .splitCsv(sep: '\t', header: true)
+    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+} else {
+rawReads = designFilePaths
+  .splitCsv(sep: '\t', header: true)
+  .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+}
+
+// Trim raw reads using trimgalore
+process trimReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads
+
+  output:
+
+  set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
+  file('*trimming_report.txt') into trimgalore_results
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
+    """
+  }
+
+}
+
+// Align trimmed reads using bwa
+process alignReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads
+  file index from bwaIndex.first()
+
+  output:
+
+  set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
+  file '*.srt.bam.flagstat.qc' into mappedReadsStats
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
+    """
+  }
+
+}
+
+// Dedup reads using sambamba
+process filterReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads
+
+  output:
+
+  set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
+  set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
+  file '*flagstat.qc' into dedupReadsStats
+  file '*pbc.qc' into dedupReadsComplexity
+  file '*dup.qc' into dupReads
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/map_qc.py -b $mapped -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/map_qc.py -b $mapped
+    """
+  }
+
+}
+
+// Define channel collecting dedup reads into new design file
+dedupReads
+.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
+"$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+.into { dedupDesign; preDiffDesign }
+
+// Quality Metrics using deeptools
+process experimentQC {
+
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  file dedupDesign
+
+  output:
+
+  file '*.{png,npz}' into deepToolsStats
+
+  script:
+
+  """
+  python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign
+  """
+
 }
 
+// Convert reads to bam
+process convertReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads
+
+  output:
+
+  set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/convert_reads.py -b $deduped -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/convert_reads.py -b $deduped
+    """
+  }
+
+}
+
+// Calculate Cross-correlation using phantompeaktools
+process crossReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads
+
+  output:
+
+  set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
+  set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/xcor.py -t $seTagAlign
+    """
+  }
+
+}
+
+// Define channel collecting tagAlign and xcor into design file
+xcorDesign = xcorReads
+              .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId ->
+              "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+              .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+
+// Make Experiment design files to be read in for downstream analysis
+process defineExpDesignFiles {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  input:
+
+  file xcorDesign
+
+  output:
+
+  file '*.tsv' into experimentObjs mode flatten
+
+  script:
+
+  """
+  python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
+  """
+
+}
+
+
+// Make Experiment design files to be read in for downstream analysis
+process poolAndPsuedoReads {
+
+
+  tag "${experimentObjs.baseName}"
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  input:
+
+  file experimentObjs
+
+  output:
+
+  file '*.tsv' into experimentPoolObjs
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
+    """
+  }
+
+}
+
+// Collect list of experiment design files into a single channel
+experimentRows = experimentPoolObjs
+                .splitCsv(sep:'\t', header:true)
+                .map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] }
+
+// Call Peaks using MACS
+process callPeaksMACS {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+  set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
+
+  output:
+
+  set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
+    """
+  }
+
+}
+
+// Define channel collecting peaks into design file
+peaksDesign = experimentPeaks
+              .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId ->
+              "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+              .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+
+// Calculate Consensus Peaks
+process consensusPeaks {
+
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  file peaksDesign
+  file preDiffDesign
+
+  output:
+
+  file '*.replicated.*' into consensusPeaks
+  file '*.rejected.*' into rejectedPeaks
+  file 'design_diffPeaks.csv'  into designDiffPeaks
+  file 'design_annotatePeaks.tsv'  into designAnnotatePeaks, designMotifSearch
+  file 'unique_experiments.csv' into uniqueExperiments
+
+  script:
+
+  """
+  python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
+  """
+
+}
+
+// Define channel to find number of unique experiments
+uniqueExperiments.splitCsv(sep: '\t', header: true).toList().map{ it.size().toInteger() }.set { noUniqueExperiments }
+
+// Annotate Peaks
+process peakAnnotation {
+
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  file designAnnotatePeaks
+
+  output:
+
+  file "*chipseeker*" into peakAnnotation
+
+  script:
+
+  """
+  Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome
+  """
+
+}
+
+
+// Calculate Differential Binding Activity
+process diffPeaks {
+
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  file designDiffPeaks
+
+  output:
+
+  file 'design_diffpeak_annotatePeaks.tsv' into diffPeaksDesignAnnotatePeaks, diffPeaksDesignMeme
+  file '*_diffbind.bed' into diffPeaks
+  file '*_diffbind.csv' into diffPeaksCounts
+  file '*.pdf' into diffPeaksStats
+  file 'normcount_peaksets.txt' into normCountPeaks
+
+  script:
+  if (noUniqueExperiments <= 1) {
+    """
+    touch design_diffpeak_annotatePeaks.tsv
+    touch no_diffbind.bed
+    touch no_diffbind.csv
+    touch heatmap.pdf
+    touch pca.pdf
+    touch normcount_peaksets.txt
+    """
+  }
+  else {
+    """
+    Rscript $baseDir/scripts/dba.R $designDiffPeaks
+    """
+  }
+
+}
+
+// Motif Search  Peaks
+process motifSearch {
+
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  file designMotifSearch
+
+  output:
+
+  file "*meme*" into motifSearch
+
+  script:
+
+  """
+  python2.7 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta
+  """
+}
-- 
GitLab