From acebeb68fcbaeb6f7c107e225972edcbab875101 Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Wed, 31 Oct 2018 22:05:50 -0500 Subject: [PATCH] Revert main --- workflow/main.nf | 598 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 462 insertions(+), 136 deletions(-) diff --git a/workflow/main.nf b/workflow/main.nf index bb90254..e770f8f 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,139 +1,465 @@ #!/usr/bin/env nextflow - params.design="$baseDir/../test_data/samplesheet.csv" - params.bams = "$baseDir/../test_data/*.bam" - params.peaks = "$baseDir/../test_data/*.broadPeak" - params.genomepath="/project/shared/bicf_workflow_ref/GRCh37" - toppeakcount = -1 - design_file = file(params.design) - deeptools_design = Channel.fromPath(params.design) - diffbind_design = Channel.fromPath(params.design) - chipseeker_design = Channel.fromPath(params.design) - meme_design = Channel.fromPath(params.design) - index_bams = Channel.fromPath(params.bams) - deeptools_bams = Channel.fromPath(params.bams) - deeptools_peaks = Channel.fromPath(params.peaks) - chipseeker_peaks = Channel.fromPath(params.peaks) - diffbind_bams = Channel.fromPath(params.bams) - diffbind_peaks = Channel.fromPath(params.peaks) - meme_peaks = Channel.fromPath(params.peaks) - -process bamindex { - publishDir "$baseDir/output/", mode: 'copy' - input: - file index_bam_files from index_bams - output: - file "*bai" into deeptools_bamindex - file "*bai" into diffbind_bamindex - - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load samtools/intel/1.3 - samtools index $index_bam_files - """ -} - -process run_deeptools { - publishDir "$baseDir/output", mode: 'copy' - input: - file deeptools_design_file from deeptools_design - file deeptools_bam_files from deeptools_bams.toList() - file deeptools_peak_files from deeptools_peaks.toList() - file deeptools_bam_indexes from deeptools_bamindex.toList() - output: - file "*deeptools*" into deeptools_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load deeptools/2.3.5 - python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}} -""" -} - - -process run_diffbind { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffbind_design_file from diffbind_design - file diffbind_bam_files from diffbind_bams.toList() - file diffbind_peak_files from diffbind_peaks.toList() - file diffbind_bam_indexes from diffbind_bamindex.toList() - output: - file "diffpeak.design" into diffpeaksdesign_chipseeker - file "diffpeak.design" into diffpeaksdesign_meme - file "*_diffbind.bed" into diffpeaks_meme - file "*_diffbind.bed" into diffpeaks_chipseeker - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file -""" -} - -process run_chipseeker_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffpeak_design_file from diffpeaksdesign_chipseeker - file diffpeaks from diffpeaks_chipseeker - output: - file "*chipseeker*" into chipseeker_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath} -""" -} - -process run_chipseeker_originalpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_file from chipseeker_design - file chipseeker_peak_files from chipseeker_peaks.toList() - output: - file "*chipseeker*" into chipseeker_originalpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} -""" -} - -process run_meme_original { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_meme from meme_design - file meme_peak_files from meme_peaks.toList() - output: - file "*meme*" into meme_original_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} -""" -} - -process run_meme_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file peaks_meme from diffpeaks_meme - file diffpeak_design from diffpeaksdesign_meme - output: - file "*meme*" into meme_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} -""" + +// Path to an input file, or a pattern for multiple inputs +// Note - $baseDir is the location of this workflow file main.nf + +// Define Input variables +params.reads = "$baseDir/../test_data/*.fastq.gz" +params.pairedEnd = false +params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" +params.genome = 'GRCm38' +params.genomes = [] +params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false +params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false +params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false +params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false +params.cutoffRatio = 1.2 + +// Check inputs +if( params.bwaIndex ){ + bwaIndex = Channel + .fromPath(params.bwaIndex) + .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" } +} else { + exit 1, "No reference genome specified." +} + +// Define List of Files +readsList = Channel + .fromPath( params.reads ) + .flatten() + .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} + .collectFile( name: 'fileList.tsv', newLine: true ) + +// Define regular variables +pairedEnd = params.pairedEnd +designFile = params.designFile +genomeSize = params.genomeSize +genome = params.genome +chromSizes = params.chromSizes +fasta = params.fasta +cutoffRatio = params.cutoffRatio + +process checkDesignFile { + + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + designFile + file readsList + + output: + + file("design.tsv") into designFilePaths + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p + """ + } + else { + """ + python $baseDir/scripts/check_design.py -d $designFile -f $readsList + """ + } + +} + +// Define channel for raw reads +if (pairedEnd) { + rawReads = designFilePaths + .splitCsv(sep: '\t', header: true) + .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } +} else { +rawReads = designFilePaths + .splitCsv(sep: '\t', header: true) + .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } +} + +// Trim raw reads using trimgalore +process trimReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads + + output: + + set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads + file('*trimming_report.txt') into trimgalore_results + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p + """ + } + else { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} + """ + } + +} + +// Align trimmed reads using bwa +process alignReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads + file index from bwaIndex.first() + + output: + + set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads + file '*.srt.bam.flagstat.qc' into mappedReadsStats + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p + """ + } + else { + """ + python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa + """ + } + +} + +// Dedup reads using sambamba +process filterReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads + + output: + + set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads + set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads + file '*flagstat.qc' into dedupReadsStats + file '*pbc.qc' into dedupReadsComplexity + file '*dup.qc' into dupReads + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/map_qc.py -b $mapped -p + """ + } + else { + """ + python3 $baseDir/scripts/map_qc.py -b $mapped + """ + } + +} + +// Define channel collecting dedup reads into new design file +dedupReads +.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId -> +"$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} +.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") +.into { dedupDesign; preDiffDesign } + +// Quality Metrics using deeptools +process experimentQC { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file dedupDesign + + output: + + file '*.{png,npz}' into deepToolsStats + + script: + + """ + python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign + """ + } +// Convert reads to bam +process convertReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads + + output: + + set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/convert_reads.py -b $deduped -p + """ + } + else { + """ + python3 $baseDir/scripts/convert_reads.py -b $deduped + """ + } + +} + +// Calculate Cross-correlation using phantompeaktools +process crossReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads + + output: + + set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads + set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/xcor.py -t $seTagAlign -p + """ + } + else { + """ + python3 $baseDir/scripts/xcor.py -t $seTagAlign + """ + } + +} + +// Define channel collecting tagAlign and xcor into design file +xcorDesign = xcorReads + .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + +// Make Experiment design files to be read in for downstream analysis +process defineExpDesignFiles { + + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + file xcorDesign + + output: + + file '*.tsv' into experimentObjs mode flatten + + script: + + """ + python3 $baseDir/scripts/experiment_design.py -d $xcorDesign + """ + +} + + +// Make Experiment design files to be read in for downstream analysis +process poolAndPsuedoReads { + + + tag "${experimentObjs.baseName}" + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + file experimentObjs + + output: + + file '*.tsv' into experimentPoolObjs + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p + """ + } + else { + """ + python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio + """ + } + +} + +// Collect list of experiment design files into a single channel +experimentRows = experimentPoolObjs + .splitCsv(sep:'\t', header:true) + .map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] } + +// Call Peaks using MACS +process callPeaksMACS { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows + + output: + + set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p + """ + } + else { + """ + python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes + """ + } + +} + +// Define channel collecting peaks into design file +peaksDesign = experimentPeaks + .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + +// Calculate Consensus Peaks +process consensusPeaks { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file peaksDesign + file preDiffDesign + + output: + + file '*.replicated.*' into consensusPeaks + file '*.rejected.*' into rejectedPeaks + file 'design_diffPeaks.csv' into designDiffPeaks + file 'design_annotatePeaks.tsv' into designAnnotatePeaks, designMotifSearch + file 'unique_experiments.csv' into uniqueExperiments + + script: + + """ + python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign + """ + +} + +// Define channel to find number of unique experiments +uniqueExperiments.splitCsv(sep: '\t', header: true).toList().map{ it.size().toInteger() }.set { noUniqueExperiments } + +// Annotate Peaks +process peakAnnotation { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designAnnotatePeaks + + output: + + file "*chipseeker*" into peakAnnotation + + script: + + """ + Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome + """ + +} + + +// Calculate Differential Binding Activity +process diffPeaks { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designDiffPeaks + + output: + + file 'design_diffpeak_annotatePeaks.tsv' into diffPeaksDesignAnnotatePeaks, diffPeaksDesignMeme + file '*_diffbind.bed' into diffPeaks + file '*_diffbind.csv' into diffPeaksCounts + file '*.pdf' into diffPeaksStats + file 'normcount_peaksets.txt' into normCountPeaks + + script: + if (noUniqueExperiments <= 1) { + """ + touch design_diffpeak_annotatePeaks.tsv + touch no_diffbind.bed + touch no_diffbind.csv + touch heatmap.pdf + touch pca.pdf + touch normcount_peaksets.txt + """ + } + else { + """ + Rscript $baseDir/scripts/dba.R $designDiffPeaks + """ + } + +} + +// Motif Search Peaks +process motifSearch { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designMotifSearch + + output: + + file "*meme*" into motifSearch + + script: + + """ + python2.7 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta + """ +} -- GitLab