diff --git a/workflow/main.nf b/workflow/main.nf index 4fa5d9f62bc75b3c283f39a07e1f8e849a294bef..bb90254ace37a6ba2ac9dc34be79cba7de371d94 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,379 +1,139 @@ #!/usr/bin/env nextflow - -// Path to an input file, or a pattern for multiple inputs -// Note - $baseDir is the location of this workflow file main.nf - -// Define Input variables -params.reads = "$baseDir/../test_data/*.fastq.gz" -params.pairedEnd = false -params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" -params.genome = 'GRCm38' -params.genomes = [] -params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false -params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false -params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false -params.cutoffRatio = 1.2 - -// Check inputs -if( params.bwaIndex ){ - bwaIndex = Channel - .fromPath(params.bwaIndex) - .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" } -} else { - exit 1, "No reference genome specified." -} - -// Define List of Files -readsList = Channel - .fromPath( params.reads ) - .flatten() - .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} - .collectFile( name: 'fileList.tsv', newLine: true ) - -// Define regular variables -pairedEnd = params.pairedEnd -designFile = params.designFile -genomeSize = params.genomeSize -chromSizes = params.chromSizes -cutoffRatio = params.cutoffRatio - -process checkDesignFile { - - publishDir "$baseDir/output/design", mode: 'copy' - - input: - - designFile - file readsList - - output: - - file("design.tsv") into designFilePaths - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p - """ - } - else { - """ - python $baseDir/scripts/check_design.py -d $designFile -f $readsList - """ - } - -} - -// Define channel for raw reads -if (pairedEnd) { - rawReads = designFilePaths - .splitCsv(sep: '\t', header: true) - .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } -} else { -rawReads = designFilePaths - .splitCsv(sep: '\t', header: true) - .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } -} - -// Trim raw reads using trimgalore -process trimReads { - - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads - - output: - - set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads - file('*trimming_report.txt') into trimgalore_results - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p - """ - } - else { - """ - python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} - """ - } - + params.design="$baseDir/../test_data/samplesheet.csv" + params.bams = "$baseDir/../test_data/*.bam" + params.peaks = "$baseDir/../test_data/*.broadPeak" + params.genomepath="/project/shared/bicf_workflow_ref/GRCh37" + toppeakcount = -1 + design_file = file(params.design) + deeptools_design = Channel.fromPath(params.design) + diffbind_design = Channel.fromPath(params.design) + chipseeker_design = Channel.fromPath(params.design) + meme_design = Channel.fromPath(params.design) + index_bams = Channel.fromPath(params.bams) + deeptools_bams = Channel.fromPath(params.bams) + deeptools_peaks = Channel.fromPath(params.peaks) + chipseeker_peaks = Channel.fromPath(params.peaks) + diffbind_bams = Channel.fromPath(params.bams) + diffbind_peaks = Channel.fromPath(params.peaks) + meme_peaks = Channel.fromPath(params.peaks) + +process bamindex { + publishDir "$baseDir/output/", mode: 'copy' + input: + file index_bam_files from index_bams + output: + file "*bai" into deeptools_bamindex + file "*bai" into diffbind_bamindex + + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + module load samtools/intel/1.3 + samtools index $index_bam_files + """ } -// Align trimmed reads using bwa -process alignReads { - - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads - file index from bwaIndex.first() - - output: - - set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads - file '*.srt.bam.flagstat.qc' into mappedReadsStats - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p - """ - } - else { - """ - python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa - """ - } - -} - -// Dedup reads using sambamba -process filterReads { - - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads - - output: - - set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads - set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads - file '*flagstat.qc' into dedupReadsStats - file '*pbc.qc' into dedupReadsComplexity - file '*dup.qc' into dupReads - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/map_qc.py -b $mapped -p - """ - } - else { - """ - python3 $baseDir/scripts/map_qc.py -b $mapped - """ - } - -} - -// Define channel collecting dedup reads into new design file -dedupReads -.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId -> -"$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} -.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") -.into { dedupDesign; preDiffDesign } - -// Quality Metrics using deeptools -process experimentQC { - - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - file dedupDesign - - output: - - file '*.{png,npz}' into deepToolsStats - - script: - - """ - python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign - """ - +process run_deeptools { + publishDir "$baseDir/output", mode: 'copy' + input: + file deeptools_design_file from deeptools_design + file deeptools_bam_files from deeptools_bams.toList() + file deeptools_peak_files from deeptools_peaks.toList() + file deeptools_bam_indexes from deeptools_bamindex.toList() + output: + file "*deeptools*" into deeptools_output + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + module load deeptools/2.3.5 + python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}} +""" } -// Convert reads to bam -process convertReads { - - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads - - output: - - set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p - """ - } - else { - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped - """ - } +process run_diffbind { + publishDir "$baseDir/output", mode: 'copy' + input: + file diffbind_design_file from diffbind_design + file diffbind_bam_files from diffbind_bams.toList() + file diffbind_peak_files from diffbind_peaks.toList() + file diffbind_bam_indexes from diffbind_bamindex.toList() + output: + file "diffpeak.design" into diffpeaksdesign_chipseeker + file "diffpeak.design" into diffpeaksdesign_meme + file "*_diffbind.bed" into diffpeaks_meme + file "*_diffbind.bed" into diffpeaks_chipseeker + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file +""" } -// Calculate Cross-correlation using phantompeaktools -process crossReads { - - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads - - output: - - set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads - set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/xcor.py -t $seTagAlign -p - """ - } - else { - """ - python3 $baseDir/scripts/xcor.py -t $seTagAlign - """ - } - +process run_chipseeker_diffpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file diffpeak_design_file from diffpeaksdesign_chipseeker + file diffpeaks from diffpeaks_chipseeker + output: + file "*chipseeker*" into chipseeker_diffpeak_output + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath} +""" } -// Define channel collecting tagAlign and xcor into design file -xcorDesign = xcorReads - .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> - "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} - .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") - -// Make Experiment design files to be read in for downstream analysis -process defineExpDesignFiles { - - publishDir "$baseDir/output/design", mode: 'copy' - - input: - - file xcorDesign - - output: - - file '*.tsv' into experimentObjs mode flatten - - script: - - """ - python3 $baseDir/scripts/experiment_design.py -d $xcorDesign - """ - +process run_chipseeker_originalpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file design_file from chipseeker_design + file chipseeker_peak_files from chipseeker_peaks.toList() + output: + file "*chipseeker*" into chipseeker_originalpeak_output + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} +""" } - -// Make Experiment design files to be read in for downstream analysis -process poolAndPsuedoReads { - - - tag "${experimentObjs.baseName}" - publishDir "$baseDir/output/design", mode: 'copy' - - input: - - file experimentObjs - - output: - - file '*.tsv' into experimentPoolObjs - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p - """ - } - else { - """ - python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio - """ - } - +process run_meme_original { + publishDir "$baseDir/output", mode: 'copy' + input: + file design_meme from meme_design + file meme_peak_files from meme_peaks.toList() + output: + file "*meme*" into meme_original_output + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + module add deeptools/2.3.5 + module load meme/4.11.1-gcc-openmpi + python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} +""" } -// Collect list of experiment design files into a single channel -experimentRows = experimentPoolObjs - .splitCsv(sep:'\t', header:true) - .map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] } - -// Call Peaks using MACS -process callPeaksMACS { - - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows - - output: - - set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks - - script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p - """ - } - else { - """ - python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes - """ - } - +process run_meme_diffpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file peaks_meme from diffpeaks_meme + file diffpeak_design from diffpeaksdesign_meme + output: + file "*meme*" into meme_diffpeak_output + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + module add deeptools/2.3.5 + module load meme/4.11.1-gcc-openmpi + python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} +""" } -// Define channel collecting peaks into design file -peaksDesign = experimentPeaks - .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId -> - "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} - .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") - -// Calculate Consensus Peaks -process consensusPeaks { - - publishDir "$baseDir/output/${task.process}", mode: 'copy' - - input: - - file peaksDesign - file preDiffDesign - - output: - - file '*.replicated.*' into consensusPeaks - file '*.rejected.*' into rejectedPeaks - file("design_diffPeaks.tsv") into designDiffPeaks - - script: - - """ - python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign - """ - -}