From 7b165eab3daf4c5547595333249b97e4b43f88ca Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Fri, 2 Nov 2018 15:35:42 -0500 Subject: [PATCH] Rever changes. --- astrocyte_pkg.yml | 71 ++++--- workflow/main.nf | 487 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 404 insertions(+), 154 deletions(-) diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 4fffa5c..7db5da7 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -9,7 +9,7 @@ # A unique identifier for the workflow package, text/underscores only name: 'chipseq_analysis_bicf' # Who wrote this? -author: 'Beibei Chen' +author: 'Beibei Chen and Venkat Malladi' # A contact email address for questions email: 'biohpc-help@utsouthwestern.edu' # A more informative title for the workflow package @@ -17,7 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow' # A summary of the workflow package in plain text description: | This is a workflow package for the BioHPC/BICF ChIP-seq workflow system. - It implements a simple ChIP-seq analysis workflow using deepTools, Diffbind, ChipSeeker and MEME-ChIP, visualization application. + It implements ChIP-seq analysis workflow and visualization application. # ----------------------------------------------------------------------------- # DOCUMENTATION @@ -33,14 +33,24 @@ documentation_files: # NEXTFLOW WORKFLOW CONFIGURATION # ----------------------------------------------------------------------------- -# Remember - The workflow file is always named 'workflow/main.f' +# Remember - The workflow file is always named 'workflow/main.nf' # The workflow must publish all final output into $baseDir -# A list of clueter environment modules that this workflow requires to run. +# A list of cluster environment modules that this workflow requires to run. # Specify versioned module names to ensure reproducability. workflow_modules: - - 'deeptools/2.3.5' - - 'meme/4.11.1-gcc-openmpi' + - 'python/3.6.1-2-anaconda' + - 'trimgalore/0.4.1' + - 'cutadapt/1.9.1' + - 'fastqc/0.11.2' + - 'bwa/intel/0.7.12' + - 'samtools/1.4.1' + - 'sambamba/0.6.6' + - 'bedtools/2.26.0' + - 'deeptools/2.5.0.1' + - 'phantompeakqualtools/1.2' + - 'macs/2.1.0-20151222' + - 'UCSC_userApps/v317' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: @@ -76,45 +86,44 @@ workflow_modules: workflow_parameters: - - id: bams + - id: reads type: files required: true description: | - Bam files of all samples - regex: ".*(bam|BAM)" + One or more input FASTQ files from a ChIP-seq expereiment and a design + file with the link bewetwen the same file name and sample id + regex: ".*(fastq|fq)*" - - id: peaks - type: files + - id: pairedEnd + type: select required: true + choices: + - [ 'true', 'True'] + - [ 'false', 'False'] description: | - Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format. - regex: ".*(narrowPeak|broadPeak|bed|BED)" - + In single-end sequencing, the sequencer reads a fragment from only one + end to the other, generating the sequence of base pairs. In paired-end + reading it starts at one read, finishes this direction at the specified + read length, and then starts another round of reading from the opposite + end of the fragment. - id: design - type: files + type: file required: true - regex: ".*(csv)" description: | - A design file listing pairs of sample name and sample group. Must be in csv format - Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller + A design file listing sample id, fastq files, corresponding control id + and additional information about the sample. - - id: genomepath + - id: genome type: select choices: - - [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38'] - - [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37'] - - [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38'] + - [ 'GRCh38', 'Human GRCh38'] + - [ 'GRCh37', 'Human GRCh37'] + - [ 'GRCh38', 'Mouse GRCh38'] required: true description: | - Reference genome for annotation + Reference species and genome used for alignment and subsequent analysis. - - id: toppeakcount - type: integer - required: true - default: -1 - description: | - The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1. # ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION @@ -132,8 +141,8 @@ vizapp_cran_packages: - shiny - shinyFiles -# # List of any Bioconductor packages, not provided by the modules, that must be made -# available to the vizapp +# List of any Bioconductor packages, not provided by the modules, +# that must be made available to the vizapp vizapp_bioc_packages: - qusage # - ballgown diff --git a/workflow/main.nf b/workflow/main.nf index 26ae3dd..4fa5d9f 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,138 +1,379 @@ #!/usr/bin/env nextflow - params.design="$baseDir/../test_data/samplesheet.csv" - params.bams = "$baseDir/../test_data/*.bam" - params.peaks = "$baseDir/../test_data/*.broadPeak" - params.genomepath="/project/shared/bicf_workflow_ref/GRCh37" - toppeakcount = -1 - design_file = file(params.design) - deeptools_design = Channel.fromPath(params.design) - diffbind_design = Channel.fromPath(params.design) - chipseeker_design = Channel.fromPath(params.design) - meme_design = Channel.fromPath(params.design) - index_bams = Channel.fromPath(params.bams) - deeptools_bams = Channel.fromPath(params.bams) - deeptools_peaks = Channel.fromPath(params.peaks) - chipseeker_peaks = Channel.fromPath(params.peaks) - diffbind_bams = Channel.fromPath(params.bams) - diffbind_peaks = Channel.fromPath(params.peaks) - meme_peaks = Channel.fromPath(params.peaks) - -process bamindex { - publishDir "$baseDir/output/", mode: 'copy' - input: - file index_bam_files from index_bams - output: - file "*bai" into deeptools_bamindex - file "*bai" into diffbind_bamindex - - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load samtools/intel/1.3 - samtools index $index_bam_files - """ + +// Path to an input file, or a pattern for multiple inputs +// Note - $baseDir is the location of this workflow file main.nf + +// Define Input variables +params.reads = "$baseDir/../test_data/*.fastq.gz" +params.pairedEnd = false +params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" +params.genome = 'GRCm38' +params.genomes = [] +params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false +params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false +params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false +params.cutoffRatio = 1.2 + +// Check inputs +if( params.bwaIndex ){ + bwaIndex = Channel + .fromPath(params.bwaIndex) + .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" } +} else { + exit 1, "No reference genome specified." +} + +// Define List of Files +readsList = Channel + .fromPath( params.reads ) + .flatten() + .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} + .collectFile( name: 'fileList.tsv', newLine: true ) + +// Define regular variables +pairedEnd = params.pairedEnd +designFile = params.designFile +genomeSize = params.genomeSize +chromSizes = params.chromSizes +cutoffRatio = params.cutoffRatio + +process checkDesignFile { + + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + designFile + file readsList + + output: + + file("design.tsv") into designFilePaths + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p + """ + } + else { + """ + python $baseDir/scripts/check_design.py -d $designFile -f $readsList + """ + } + } -process run_deeptools { - publishDir "$baseDir/output", mode: 'copy' - input: - file deeptools_design_file from deeptools_design - file deeptools_bam_files from deeptools_bams.toList() - file deeptools_peak_files from deeptools_peaks.toList() - file deeptools_bam_indexes from deeptools_bamindex.toList() - output: - file "*deeptools*" into deeptools_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load deeptools/2.3.5 - python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}} -""" +// Define channel for raw reads +if (pairedEnd) { + rawReads = designFilePaths + .splitCsv(sep: '\t', header: true) + .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } +} else { +rawReads = designFilePaths + .splitCsv(sep: '\t', header: true) + .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } } +// Trim raw reads using trimgalore +process trimReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads + + output: + + set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads + file('*trimming_report.txt') into trimgalore_results + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p + """ + } + else { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} + """ + } -process run_diffbind { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffbind_design_file from diffbind_design - file diffbind_bam_files from diffbind_bams.toList() - file diffbind_peak_files from diffbind_peaks.toList() - file diffbind_bam_indexes from diffbind_bamindex.toList() - output: - file "diffpeak.design" into diffpeaksdesign_chipseeker - file "diffpeak.design" into diffpeaksdesign_meme - file "*_diffbind.bed" into diffpeaks_meme - file "*_diffbind.bed" into diffpeaks_chipseeker - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file -""" } -process run_chipseeker_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffpeak_design_file from diffpeaksdesign_chipseeker - file diffpeaks from diffpeaks_chipseeker - output: - file "*chipseeker*" into chipseeker_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath} -""" +// Align trimmed reads using bwa +process alignReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads + file index from bwaIndex.first() + + output: + + set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads + file '*.srt.bam.flagstat.qc' into mappedReadsStats + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p + """ + } + else { + """ + python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa + """ + } + } -process run_chipseeker_originalpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_file from chipseeker_design - file chipseeker_peak_files from chipseeker_peaks.toList() - output: - file "*chipseeker*" into chipseeker_originalpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} -""" +// Dedup reads using sambamba +process filterReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads + + output: + + set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads + set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads + file '*flagstat.qc' into dedupReadsStats + file '*pbc.qc' into dedupReadsComplexity + file '*dup.qc' into dupReads + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/map_qc.py -b $mapped -p + """ + } + else { + """ + python3 $baseDir/scripts/map_qc.py -b $mapped + """ + } + } -process run_meme_original { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_meme from meme_design - file meme_peak_files from meme_peaks.toList() - output: - file "*meme*" into meme_original_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} -""" +// Define channel collecting dedup reads into new design file +dedupReads +.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId -> +"$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} +.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") +.into { dedupDesign; preDiffDesign } + +// Quality Metrics using deeptools +process experimentQC { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file dedupDesign + + output: + + file '*.{png,npz}' into deepToolsStats + + script: + + """ + python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign + """ + } -process run_meme_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file peaks_meme from diffpeaks_meme - file diffpeak_design from diffpeaksdesign_meme - output: - file "*meme*" into meme_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} -""" +// Convert reads to bam +process convertReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads + + output: + + set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/convert_reads.py -b $deduped -p + """ + } + else { + """ + python3 $baseDir/scripts/convert_reads.py -b $deduped + """ + } + +} + +// Calculate Cross-correlation using phantompeaktools +process crossReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads + + output: + + set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads + set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/xcor.py -t $seTagAlign -p + """ + } + else { + """ + python3 $baseDir/scripts/xcor.py -t $seTagAlign + """ + } + +} + +// Define channel collecting tagAlign and xcor into design file +xcorDesign = xcorReads + .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + +// Make Experiment design files to be read in for downstream analysis +process defineExpDesignFiles { + + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + file xcorDesign + + output: + + file '*.tsv' into experimentObjs mode flatten + + script: + + """ + python3 $baseDir/scripts/experiment_design.py -d $xcorDesign + """ + +} + + +// Make Experiment design files to be read in for downstream analysis +process poolAndPsuedoReads { + + + tag "${experimentObjs.baseName}" + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + file experimentObjs + + output: + + file '*.tsv' into experimentPoolObjs + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p + """ + } + else { + """ + python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio + """ + } + +} + +// Collect list of experiment design files into a single channel +experimentRows = experimentPoolObjs + .splitCsv(sep:'\t', header:true) + .map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] } + +// Call Peaks using MACS +process callPeaksMACS { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows + + output: + + set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p + """ + } + else { + """ + python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes + """ + } + +} + +// Define channel collecting peaks into design file +peaksDesign = experimentPeaks + .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + +// Calculate Consensus Peaks +process consensusPeaks { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file peaksDesign + file preDiffDesign + + output: + + file '*.replicated.*' into consensusPeaks + file '*.rejected.*' into rejectedPeaks + file("design_diffPeaks.tsv") into designDiffPeaks + + script: + + """ + python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign + """ + } -- GitLab