Commit 7b165eab authored by Venkat Malladi

Revert changes.

parent 2cdebf21
@@ -9,7 +9,7 @@
 # A unique identifier for the workflow package, text/underscores only
 name: 'chipseq_analysis_bicf'
 # Who wrote this?
-author: 'Beibei Chen'
+author: 'Beibei Chen and Venkat Malladi'
 # A contact email address for questions
 email: 'biohpc-help@utsouthwestern.edu'
 # A more informative title for the workflow package
@@ -17,7 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
 # A summary of the workflow package in plain text
 description: |
   This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
-  It implements a simple ChIP-seq analysis workflow using deepTools, Diffbind, ChipSeeker and MEME-ChIP, visualization application.
+  It implements ChIP-seq analysis workflow and visualization application.
 # -----------------------------------------------------------------------------
 # DOCUMENTATION
@@ -33,14 +33,24 @@ documentation_files:
 # NEXTFLOW WORKFLOW CONFIGURATION
 # -----------------------------------------------------------------------------
-# Remember - The workflow file is always named 'workflow/main.f'
+# Remember - The workflow file is always named 'workflow/main.nf'
 # The workflow must publish all final output into $baseDir
-# A list of clueter environment modules that this workflow requires to run.
+# A list of cluster environment modules that this workflow requires to run.
 # Specify versioned module names to ensure reproducability.
 workflow_modules:
-  - 'deeptools/2.3.5'
-  - 'meme/4.11.1-gcc-openmpi'
+  - 'python/3.6.1-2-anaconda'
+  - 'trimgalore/0.4.1'
+  - 'cutadapt/1.9.1'
+  - 'fastqc/0.11.2'
+  - 'bwa/intel/0.7.12'
+  - 'samtools/1.4.1'
+  - 'sambamba/0.6.6'
+  - 'bedtools/2.26.0'
+  - 'deeptools/2.5.0.1'
+  - 'phantompeakqualtools/1.2'
+  - 'macs/2.1.0-20151222'
+  - 'UCSC_userApps/v317'
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
@@ -76,45 +86,44 @@ workflow_modules:
 workflow_parameters:
-  - id: bams
+  - id: reads
     type: files
     required: true
     description: |
-      Bam files of all samples
-    regex: ".*(bam|BAM)"
+      One or more input FASTQ files from a ChIP-seq expereiment and a design
+      file with the link bewetwen the same file name and sample id
+    regex: ".*(fastq|fq)*"
-  - id: peaks
-    type: files
+  - id: pairedEnd
+    type: select
     required: true
+    choices:
+      - [ 'true', 'True']
+      - [ 'false', 'False']
     description: |
-      Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format.
-    regex: ".*(narrowPeak|broadPeak|bed|BED)"
+      In single-end sequencing, the sequencer reads a fragment from only one
+      end to the other, generating the sequence of base pairs. In paired-end
+      reading it starts at one read, finishes this direction at the specified
+      read length, and then starts another round of reading from the opposite
+      end of the fragment.
   - id: design
-    type: files
+    type: file
     required: true
+    regex: ".*(csv)"
     description: |
-      A design file listing pairs of sample name and sample group. Must be in csv format
-      Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller
+      A design file listing sample id, fastq files, corresponding control id
+      and additional information about the sample.
-  - id: genomepath
+  - id: genome
     type: select
     choices:
-      - [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38']
-      - [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37']
-      - [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38']
+      - [ 'GRCh38', 'Human GRCh38']
+      - [ 'GRCh37', 'Human GRCh37']
+      - [ 'GRCh38', 'Mouse GRCh38']
     required: true
     description: |
-      Reference genome for annotation
+      Reference species and genome used for alignment and subsequent analysis.
-  - id: toppeakcount
-    type: integer
-    required: true
-    default: -1
-    description: |
-      The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1.
 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
@@ -132,8 +141,8 @@ vizapp_cran_packages:
   - shiny
   - shinyFiles
-# # List of any Bioconductor packages, not provided by the modules, that must be made
-# available to the vizapp
+# List of any Bioconductor packages, not provided by the modules,
+# that must be made available to the vizapp
 vizapp_bioc_packages:
   - qusage
 #  - ballgown
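For orientation only: after checkDesignFile in the reworked workflow/main.nf (diffed below) validates the user-supplied design file, the resulting design.tsv is read back with splitCsv using the column names shown here. The values are illustrative placeholders (tab-separated), not part of this commit; paired-end runs additionally expect a fastq_read2 column.

sample_id   fastq_read1        experiment_id   biosample   factor    treatment   replicate   control_id
sample1     sample1.fastq.gz   experiment1     tissueA     factorX   none        1           control1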
 #!/usr/bin/env nextflow
-params.design="$baseDir/../test_data/samplesheet.csv"
-params.bams = "$baseDir/../test_data/*.bam"
-params.peaks = "$baseDir/../test_data/*.broadPeak"
-params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
-toppeakcount = -1
-design_file = file(params.design)
-deeptools_design = Channel.fromPath(params.design)
-diffbind_design = Channel.fromPath(params.design)
-chipseeker_design = Channel.fromPath(params.design)
-meme_design = Channel.fromPath(params.design)
-index_bams = Channel.fromPath(params.bams)
-deeptools_bams = Channel.fromPath(params.bams)
-deeptools_peaks = Channel.fromPath(params.peaks)
-chipseeker_peaks = Channel.fromPath(params.peaks)
-diffbind_bams = Channel.fromPath(params.bams)
-diffbind_peaks = Channel.fromPath(params.peaks)
-meme_peaks = Channel.fromPath(params.peaks)
-process bamindex {
-  publishDir "$baseDir/output/", mode: 'copy'
-  input:
-    file index_bam_files from index_bams
-  output:
-    file "*bai" into deeptools_bamindex
-    file "*bai" into diffbind_bamindex
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    module load samtools/intel/1.3
-    samtools index $index_bam_files
-    """
+// Path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file main.nf
+// Define Input variables
+params.reads = "$baseDir/../test_data/*.fastq.gz"
+params.pairedEnd = false
+params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
+params.genome = 'GRCm38'
+params.genomes = []
+params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
+params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
+params.cutoffRatio = 1.2
+// Check inputs
+if( params.bwaIndex ){
+  bwaIndex = Channel
+    .fromPath(params.bwaIndex)
+    .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
+} else {
+  exit 1, "No reference genome specified."
+}
+// Define List of Files
+readsList = Channel
+  .fromPath( params.reads )
+  .flatten()
+  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")}
+  .collectFile( name: 'fileList.tsv', newLine: true )
+// Define regular variables
+pairedEnd = params.pairedEnd
+designFile = params.designFile
+genomeSize = params.genomeSize
+chromSizes = params.chromSizes
+cutoffRatio = params.cutoffRatio
+process checkDesignFile {
+  publishDir "$baseDir/output/design", mode: 'copy'
+  input:
+    designFile
+    file readsList
+  output:
+    file("design.tsv") into designFilePaths
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
+      """
+    }
+    else {
+      """
+      python $baseDir/scripts/check_design.py -d $designFile -f $readsList
+      """
+    }
 }
-process run_deeptools {
-  publishDir "$baseDir/output", mode: 'copy'
-  input:
-    file deeptools_design_file from deeptools_design
-    file deeptools_bam_files from deeptools_bams.toList()
-    file deeptools_peak_files from deeptools_peaks.toList()
-    file deeptools_bam_indexes from deeptools_bamindex.toList()
-  output:
-    file "*deeptools*" into deeptools_output
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    module load deeptools/2.3.5
-    python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
-    """
+// Define channel for raw reads
+if (pairedEnd) {
+  rawReads = designFilePaths
+    .splitCsv(sep: '\t', header: true)
+    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+} else {
+  rawReads = designFilePaths
+    .splitCsv(sep: '\t', header: true)
+    .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
 }
-process run_diffbind {
-  publishDir "$baseDir/output", mode: 'copy'
-  input:
-    file diffbind_design_file from diffbind_design
-    file diffbind_bam_files from diffbind_bams.toList()
-    file diffbind_peak_files from diffbind_peaks.toList()
-    file diffbind_bam_indexes from diffbind_bamindex.toList()
-  output:
-    file "diffpeak.design" into diffpeaksdesign_chipseeker
-    file "diffpeak.design" into diffpeaksdesign_meme
-    file "*_diffbind.bed" into diffpeaks_meme
-    file "*_diffbind.bed" into diffpeaks_chipseeker
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
-    """
+// Trim raw reads using trimgalore
+process trimReads {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads
+  output:
+    set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
+    file('*trimming_report.txt') into trimgalore_results
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
+      """
+    }
 }
-process run_chipseeker_diffpeak {
-  publishDir "$baseDir/output", mode: 'copy'
-  input:
-    file diffpeak_design_file from diffpeaksdesign_chipseeker
-    file diffpeaks from diffpeaks_chipseeker
-  output:
-    file "*chipseeker*" into chipseeker_diffpeak_output
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
-    """
+// Align trimmed reads using bwa
+process alignReads {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads
+    file index from bwaIndex.first()
+  output:
+    set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
+    file '*.srt.bam.flagstat.qc' into mappedReadsStats
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
      """
+    }
 }
-process run_chipseeker_originalpeak {
-  publishDir "$baseDir/output", mode: 'copy'
-  input:
-    file design_file from chipseeker_design
-    file chipseeker_peak_files from chipseeker_peaks.toList()
-  output:
-    file "*chipseeker*" into chipseeker_originalpeak_output
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
-    """
+// Dedup reads using sambamba
+process filterReads {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads
+  output:
+    set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
+    set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
+    file '*flagstat.qc' into dedupReadsStats
+    file '*pbc.qc' into dedupReadsComplexity
+    file '*dup.qc' into dupReads
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/map_qc.py -b $mapped -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/map_qc.py -b $mapped
+      """
+    }
 }
-process run_meme_original {
-  publishDir "$baseDir/output", mode: 'copy'
-  input:
-    file design_meme from meme_design
-    file meme_peak_files from meme_peaks.toList()
-  output:
-    file "*meme*" into meme_original_output
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    module add deeptools/2.3.5
-    module load meme/4.11.1-gcc-openmpi
-    python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
-    """
+// Define channel collecting dedup reads into new design file
+dedupReads
+  .map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
+    "$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+  .into { dedupDesign; preDiffDesign }
+// Quality Metrics using deeptools
+process experimentQC {
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    file dedupDesign
+  output:
+    file '*.{png,npz}' into deepToolsStats
+  script:
+    """
+    python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign
+    """
 }
-process run_meme_diffpeak {
-  publishDir "$baseDir/output", mode: 'copy'
-  input:
-    file peaks_meme from diffpeaks_meme
-    file diffpeak_design from diffpeaksdesign_meme
-  output:
-    file "*meme*" into meme_diffpeak_output
-  script:
-    """
-    module load python/2.7.x-anaconda
-    module load R/3.3.2-gccmkl
-    module add deeptools/2.3.5
-    module load meme/4.11.1-gcc-openmpi
-    python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
-    """
+// Convert reads to bam
+process convertReads {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads
+  output:
+    set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/convert_reads.py -b $deduped -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/convert_reads.py -b $deduped
+      """
+    }
+}
+// Calculate Cross-correlation using phantompeaktools
+process crossReads {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads
+  output:
+    set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
+    set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/xcor.py -t $seTagAlign
+      """
+    }
+}
+// Define channel collecting tagAlign and xcor into design file
+xcorDesign = xcorReads
+  .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId ->
+    "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+// Make Experiment design files to be read in for downstream analysis
+process defineExpDesignFiles {
+  publishDir "$baseDir/output/design", mode: 'copy'
+  input:
+    file xcorDesign
+  output:
+    file '*.tsv' into experimentObjs mode flatten
+  script:
+    """
+    python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
+    """
+}
+// Make Experiment design files to be read in for downstream analysis
+process poolAndPsuedoReads {
+  tag "${experimentObjs.baseName}"
+  publishDir "$baseDir/output/design", mode: 'copy'
+  input:
+    file experimentObjs
+  output:
+    file '*.tsv' into experimentPoolObjs
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
+      """
+    }
+}
+// Collect list of experiment design files into a single channel
+experimentRows = experimentPoolObjs
+  .splitCsv(sep:'\t', header:true)
+  .map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] }
+// Call Peaks using MACS
+process callPeaksMACS {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
+  output:
+    set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
+  script:
+    if (pairedEnd) {
+      """
+      python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
+      """
+    }
+    else {
+      """
+      python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
+      """
+    }
+}
+// Define channel collecting peaks into design file
+peaksDesign = experimentPeaks
+  .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId ->
+    "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+// Calculate Consensus Peaks
+process consensusPeaks {
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+    file peaksDesign
+    file preDiffDesign
+  output:
+    file '*.replicated.*' into consensusPeaks
+    file '*.rejected.*' into rejectedPeaks
+    file("design_diffPeaks.tsv") into designDiffPeaks
+  script:
+    """
+    python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
+    """
 }
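As a usage note, not part of the commit: a minimal sketch of launching the reworked workflow directly with Nextflow, assuming the nextflow executable is available and that a configuration profile supplies the params.genomes map (BWA index, genome size, chromosome sizes) for the selected genome, since the in-file default params.genomes = [] defines no genome entries. Parameter names match the params declared at the top of workflow/main.nf; the paths and genome choice below are placeholders.

nextflow run workflow/main.nf \
  --reads '/path/to/*.fastq.gz' \
  --designFile '/path/to/design.txt' \
  --genome 'GRCh38' \
  --pairedEnd false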