Skip to content
Snippets Groups Projects
Commit c011b6a1 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Fix Beibei's version of pipeline.

parent df14067f
Branches
Tags
No related merge requests found
......@@ -9,7 +9,7 @@
# A unique identifier for the workflow package, text/underscores only
name: 'chipseq_analysis_bicf'
# Who wrote this?
author: 'Beibei Chen and Venkat Malladi'
author: 'Beibei Chen'
# A contact email address for questions
email: 'biohpc-help@utsouthwestern.edu'
# A more informative title for the workflow package
......@@ -17,7 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
# A summary of the workflow package in plain text
description: |
This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
It implements ChIP-seq analysis workflow and visualization application.
It implements a simple ChIP-seq analysis workflow using deepTools, Diffbind, ChipSeeker and MEME-ChIP, visualization application.
# -----------------------------------------------------------------------------
# DOCUMENTATION
......@@ -33,24 +33,14 @@ documentation_files:
# NEXTFLOW WORKFLOW CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The workflow file is always named 'workflow/main.nf'
# Remember - The workflow file is always named 'workflow/main.f'
# The workflow must publish all final output into $baseDir
# A list of cluster environment modules that this workflow requires to run.
# A list of clueter environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducability.
workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1'
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.12'
- 'samtools/1.4.1'
- 'sambamba/0.6.6'
- 'bedtools/2.26.0'
- 'deeptools/2.5.0.1'
- 'phantompeakqualtools/1.2'
- 'macs/2.1.0-20151222'
- 'UCSC_userApps/v317'
- 'deeptools/2.3.5'
- 'meme/4.11.1-gcc-openmpi'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......@@ -86,44 +76,45 @@ workflow_modules:
workflow_parameters:
- id: reads
- id: bams
type: files
required: true
description: |
One or more input FASTQ files from a ChIP-seq expereiment and a design
file with the link bewetwen the same file name and sample id
regex: ".*(fastq|fq)*"
Bam files of all samples
regex: ".*(bam|BAM)"
- id: pairedEnd
type: select
- id: peaks
type: files
required: true
choices:
- [ 'true', 'True']
- [ 'false', 'False']
description: |
In single-end sequencing, the sequencer reads a fragment from only one
end to the other, generating the sequence of base pairs. In paired-end
reading it starts at one read, finishes this direction at the specified
read length, and then starts another round of reading from the opposite
end of the fragment.
Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format.
regex: ".*(narrowPeak|broadPeak|bed|BED)"
- id: design
type: file
type: files
required: true
regex: ".*(csv)"
description: |
A design file listing sample id, fastq files, corresponding control id
and additional information about the sample.
A design file listing pairs of sample name and sample group. Must be in csv format
Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller
- id: genome
- id: genomepath
type: select
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37']
- [ 'GRCh38', 'Mouse GRCh38']
- [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38']
- [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37']
- [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
Reference genome for annotation
- id: toppeakcount
type: integer
required: true
default: -1
description: |
The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
......@@ -141,8 +132,8 @@ vizapp_cran_packages:
- shiny
- shinyFiles
# List of any Bioconductor packages, not provided by the modules,
# that must be made available to the vizapp
# # List of any Bioconductor packages, not provided by the modules, that must be made
# available to the vizapp
vizapp_bioc_packages:
- qusage
# - ballgown
......
#!/usr/bin/env nextflow
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
// Define Input variables
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
params.genome = 'GRCm38'
params.genomes = []
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.cutoffRatio = 1.2
// Check inputs
if( params.bwaIndex ){
bwaIndex = Channel
.fromPath(params.bwaIndex)
.ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
} else {
exit 1, "No reference genome specified."
}
// Define List of Files
readsList = Channel
.fromPath( params.reads )
.flatten()
.map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")}
.collectFile( name: 'fileList.tsv', newLine: true )
// Define regular variables
pairedEnd = params.pairedEnd
designFile = params.designFile
genomeSize = params.genomeSize
chromSizes = params.chromSizes
cutoffRatio = params.cutoffRatio
process checkDesignFile {
publishDir "$baseDir/output/design", mode: 'copy'
input:
designFile
file readsList
output:
file("design.tsv") into designFilePaths
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
"""
}
else {
"""
python $baseDir/scripts/check_design.py -d $designFile -f $readsList
"""
}
params.design="$baseDir/../test_data/samplesheet.csv"
params.bams = "$baseDir/../test_data/*.bam"
params.peaks = "$baseDir/../test_data/*.broadPeak"
params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
toppeakcount = -1
design_file = file(params.design)
deeptools_design = Channel.fromPath(params.design)
diffbind_design = Channel.fromPath(params.design)
chipseeker_design = Channel.fromPath(params.design)
meme_design = Channel.fromPath(params.design)
index_bams = Channel.fromPath(params.bams)
deeptools_bams = Channel.fromPath(params.bams)
deeptools_peaks = Channel.fromPath(params.peaks)
chipseeker_peaks = Channel.fromPath(params.peaks)
diffbind_bams = Channel.fromPath(params.bams)
diffbind_peaks = Channel.fromPath(params.peaks)
meme_peaks = Channel.fromPath(params.peaks)
process bamindex {
publishDir "$baseDir/output/", mode: 'copy'
input:
file index_bam_files from index_bams
output:
file "*bai" into deeptools_bamindex
file "*bai" into diffbind_bamindex
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module load samtools/intel/1.3
samtools index $index_bam_files
"""
}
// Define channel for raw reads
if (pairedEnd) {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
} else {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
process run_deeptools {
publishDir "$baseDir/output", mode: 'copy'
input:
file deeptools_design_file from deeptools_design
file deeptools_bam_files from deeptools_bams.toList()
file deeptools_peak_files from deeptools_peaks.toList()
file deeptools_bam_indexes from deeptools_bamindex.toList()
output:
file "*deeptools*" into deeptools_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module load deeptools/2.3.5
python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
"""
}
// Trim raw reads using trimgalore
process trimReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads
output:
set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
file('*trimming_report.txt') into trimgalore_results
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
"""
}
process run_diffbind {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffbind_design_file from diffbind_design
file diffbind_bam_files from diffbind_bams.toList()
file diffbind_peak_files from diffbind_peaks.toList()
file diffbind_bam_indexes from diffbind_bamindex.toList()
output:
file "diffpeak.design" into diffpeaksdesign_chipseeker
file "diffpeak.design" into diffpeaksdesign_meme
file "*_diffbind.bed" into diffpeaks_meme
file "*_diffbind.bed" into diffpeaks_chipseeker
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
"""
}
// Align trimmed reads using bwa
process alignReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads
file index from bwaIndex.first()
output:
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
file '*.srt.bam.flagstat.qc' into mappedReadsStats
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
"""
}
else {
"""
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
"""
}
process run_chipseeker_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffpeak_design_file from diffpeaksdesign_chipseeker
file diffpeaks from diffpeaks_chipseeker
output:
file "*chipseeker*" into chipseeker_diffpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
"""
}
// Dedup reads using sambamba
process filterReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads
output:
set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
file '*flagstat.qc' into dedupReadsStats
file '*pbc.qc' into dedupReadsComplexity
file '*dup.qc' into dupReads
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/map_qc.py -b $mapped -p
"""
}
else {
"""
python3 $baseDir/scripts/map_qc.py -b $mapped
"""
}
process run_chipseeker_originalpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_file from chipseeker_design
file chipseeker_peak_files from chipseeker_peaks.toList()
output:
file "*chipseeker*" into chipseeker_originalpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
"""
}
// Define channel collecting dedup reads into new design file
dedupReads
.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
.into { dedupDesign; preDiffDesign }
// Quality Metrics using deeptools
process experimentQC {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
file dedupDesign
output:
file '*.{png,npz}' into deepToolsStats
script:
"""
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign
"""
process run_meme_original {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_meme from meme_design
file meme_peak_files from meme_peaks.toList()
output:
file "*meme*" into meme_original_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module add deeptools/2.3.5
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
"""
}
// Convert reads to bam
process convertReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads
output:
set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -p
"""
}
else {
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped
"""
}
}
// Calculate Cross-correlation using phantompeaktools
process crossReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads
output:
set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
"""
}
else {
"""
python3 $baseDir/scripts/xcor.py -t $seTagAlign
"""
}
}
// Define channel collecting tagAlign and xcor into design file
xcorDesign = xcorReads
.map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
// Make Experiment design files to be read in for downstream analysis
process defineExpDesignFiles {
publishDir "$baseDir/output/design", mode: 'copy'
input:
file xcorDesign
output:
file '*.tsv' into experimentObjs mode flatten
script:
"""
python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
"""
}
// Make Experiment design files to be read in for downstream analysis
process poolAndPsuedoReads {
tag "${experimentObjs.baseName}"
publishDir "$baseDir/output/design", mode: 'copy'
input:
file experimentObjs
output:
file '*.tsv' into experimentPoolObjs
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
"""
}
else {
"""
python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
"""
}
}
// Collect list of experiment design files into a single channel
experimentRows = experimentPoolObjs
.splitCsv(sep:'\t', header:true)
.map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] }
// Call Peaks using MACS
process callPeaksMACS {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
output:
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
"""
}
else {
"""
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
"""
}
}
// Define channel collecting peaks into design file
peaksDesign = experimentPeaks
.map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
// Calculate Consensus Peaks
process consensusPeaks {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
file peaksDesign
file preDiffDesign
output:
file '*.replicated.*' into consensusPeaks
file '*.rejected.*' into rejectedPeaks
file("design_diffPeaks.tsv") into designDiffPeaks
script:
"""
python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
"""
process run_meme_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file peaks_meme from diffpeaks_meme
file diffpeak_design from diffpeaksdesign_meme
output:
file "*meme*" into meme_diffpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module add deeptools/2.3.5
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
"""
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment