Commit 7b7dc0cb authored by Holly Ruess's avatar Holly Ruess

added peak annotation

parent 8e447c31
Pipeline #7082 failed with stages
in 263 minutes and 38 seconds
......@@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file.
- Added TSS enrichment
- Added Fragment/Insert Length size
- Removal of blacklist regions
- Annotate peaks after removing non-chromosomes
## [publish_1.0.0 ] - 2019-12-03
Initial release of pipeline
......
......@@ -43,7 +43,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
## Pipeline
+ There are 9 steps to the pipeline
+ There are 10 steps to the pipeline
1. Check input files
2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
......@@ -51,8 +51,9 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
5. Calculate cross-correlation using PhantomPeakQualTools
6. Call peaks with MACS2 from overlaps of pooled replicates
7. Call consensus peaks and optional removal of blacklist peaks
8. QC metrics
9. MultiQC report
8. Annotate peaks (chr only; either blacklist removed or replicated)
9. QC metrics
10. MultiQC report
## Output Files
......@@ -80,6 +81,9 @@ callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://geno
consensusPeaks | *.rejected.narrowPeak | peaks not supported by multiple testing (replicates and pseudo-replicates)
consensusPeaks | *.replicated.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates)
consensusPeaks | *.replicated_noblacklist.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates) with blacklist regions removed
peakAnnotation | *.chipseeker_annotation.tsv | annotation of chromosomal peaks of either blacklist removed or replicated peaks
peakAnnotation | *.chipseeker_upsetplot.pdf | upsetplot showing the count of overlaps of the genes with different annotated location
peakAnnotation | *.chipseeker_pie.pdf | pie graph of where narrow annotated peaks occur
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
......
......@@ -43,7 +43,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
## Pipeline
+ There are 9 steps to the pipeline
+ There are 10 steps to the pipeline
1. Check input files
2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
......@@ -51,8 +51,9 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
5. Calculate cross-correlation using PhantomPeakQualTools
6. Call peaks with MACS2 from overlaps of pooled replicates
7. Call consensus peaks and optional removal of blacklist peaks
8. QC metrics
9. MultiQC report
8. Annotate peaks (chr only; either blacklist removed or replicated)
9. QC metrics
10. MultiQC report
## Output Files
......@@ -80,6 +81,9 @@ callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://geno
consensusPeaks | *.rejected.narrowPeak | peaks not supported by multiple testing (replicates and pseudo-replicates)
consensusPeaks | *.replicated.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates)
consensusPeaks | *.replicated_noblacklist.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates) with blacklist regions removed
peakAnnotation | *.chipseeker_annotation.tsv | annotation of chromosomal peaks of either blacklist removed or replicated peaks
peakAnnotation | *.chipseeker_upsetplot.pdf | upsetplot showing the count of overlaps of the genes with different annotated location
peakAnnotation | *.chipseeker_pie.pdf | pie graph of where narrow annotated peaks occur
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
......
......@@ -45,6 +45,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.25.0']
executor = 'local'
}
withName: peakAnnotation {
module = ['R/3.3.2-gccmkl']
executor = 'local'
}
withName: experimentQC {
module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1', 'samtools/1.4.1', 'bedtools/2.25.0', 'singularity/2.6.1']
queue = '128GB,256GB,256GBv1'
......@@ -65,6 +69,7 @@ params {
chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
gtffile = '/project/shared/bicf_workflow_ref/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf'
blacklistfile = '/project/shared/bicf_workflow_ref/GRCh38/ENCFF356LFX.bed'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt'
}
'GRCm38' {
bwa = '/project/shared/bicf_workflow_ref/GRCm38'
......@@ -72,6 +77,7 @@ params {
chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
gtffile = '/project/shared/bicf_workflow_ref/GRCm38/gencode.vM20.annotation.gtf'
blacklistfile = '/project/shared/bicf_workflow_ref/GRCm38/ENCFF999QPV.bed'
geneNames = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genenames.txt'
}
}
}
......
......@@ -15,6 +15,7 @@ params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.gtfFile = params.genome ? params.genomes[ params.genome ].gtffile ?: false : false
params.blacklistFile = params.genome ? params.genomes[ params.genome ].blacklistfile ?: false : false
params.geneNames = params.genome ? params.genomes[ params.genome ].geneNames ?: false : false
params.astrocyte = false
params.outDir= "${baseDir}/output"
params.references = "${baseDir}/../docs/references.md"
......@@ -47,6 +48,7 @@ references = params.references
multiqc = params.multiqc
gtfFile = params.gtfFile
blacklistFile = params.blacklistFile
geneNames = params.geneNames
process checkDesignFile {
......@@ -509,6 +511,7 @@ process consensusPeaks {
file '*.rejected.*' into rejectedPeaks
file("design_exQC.tsv") into designExperimentQC
file('version_*.txt') into consensusPeaksVersions
file("design_annotatePeaks.tsv") into designAnnotatePeaks
script:
if (params.astrocyte == true) {
......@@ -543,13 +546,34 @@ process consensusPeaks {
}
// Define channel collecting SampleID and fastq length into new design file
// designFilePaths
// .splitCsv(sep: '\t', header: true)
// .map{ row -> [ row.sample_id, fqLength ->
// "${sampleId}\t${fqLength}\n"}
// .collectFile(name: 'design_fqlength.tsv', seed: "sample_id\tfastq_Length\n", storeDir: "${outDir}/design")
// .into { fqlengthDesign }
// Annotate Peaks
process peakAnnotation {
publishDir "${outDir}/${task.process}", mode: 'copy'
input:
file designAnnotatePeaks
output:
file "*chipseeker*" into peakAnnotation
file('version_*.txt') into peakAnnotationVersions
script:
if (params.astrocyte == true) {
"""
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R ${designAnnotatePeaks} ${gtfFile} ${geneNames}
"""
}
else {
"""
Rscript $baseDir/scripts/annotate_peaks.R ${designAnnotatePeaks} ${gtfFile} ${geneNames}
"""
}
}
// Quality Metrics using deeptools
......
#!/bin/Rscript
# Load libraries
library("ChIPseeker")
library(GenomicFeatures)
# Create parser object
args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 3) {
stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE)
}
design_file <- args[1]
gtf <- args[2]
geneNames <- args[3]
# Load UCSC Known Genes
txdb <- makeTxDbFromGFF(gtf)
sym <- read.table(geneNames, header=T, sep='\t') [,4:5]
# Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker')
write.table(paste("Version", chipseeker_version), file = "version_ChIPseeker.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Load design file
design <- read.csv(design_file, sep ='\t')
files <- as.list(as.character(design$Peaks))
names(files) <- design$Condition
# Granges of files
peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
column_names <- c("geneId", "chr", "start", "end", "width", "strand_1", "name", "score",
"strand", "signalValue", "pValue", "qValue", "peak", "annotation",
"geneChr", "geneStart", "geneEnd", "geneLength" ,"geneStrand",
"transcriptId", "distanceToTSS", "symbol")
for(index in c(1:length(peakAnnoList))) {
filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="")
df <- as.data.frame(peakAnnoList[[index]])
df$geneId <- gsub('[[:punct:]][0-9]+', '', df$geneId)
df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T)
colnames(df_final) <- column_names
write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
# Draw individual plots
# Define names of Plots
pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
# Pie Plots
pdf(pie_name)
plotAnnoPie(peakAnnoList[[index]])
dev.off()
# Upset Plot
pdf(upsetplot_name, onefile=F)
upsetplot(peakAnnoList[[index]])
dev.off()
}
......@@ -204,7 +204,7 @@ def FRiP(sample, design):
f.close()
def FLD(sample, design):
def fld(sample, design):
''' Calculate and plot Fragment Length Distribution'''
logger.info("Determining Fragment Length Distribution for PE")
......@@ -281,7 +281,7 @@ def main():
# Fragment Length Distribution - PE only
if paired:
for sample, df_sample in design_df.groupby('SampleID'):
FLD_plots = FLD(sample, df_sample)
fld_plots = fld(sample, df_sample)
if __name__ == '__main__':
main()
......@@ -40,7 +40,8 @@ def get_args():
parser.add_argument('-b', '--blacklist',
help="Bed file of blacklisted regions to remove",
required=False)
required=False,
default="None")
args = parser.parse_args()
return args
......@@ -94,6 +95,7 @@ def overlap(experiment, design):
peak_type = 'narrowPeak'
overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
chr_rm_fn = '%s.replicated_onlyChr.%s' % (experiment, peak_type)
# Intermediate File names
overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
......@@ -170,11 +172,18 @@ def overlap(experiment, design):
], rejected_peaks_fn)
print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))
# Filter for chr, XYM for annotation
steps = [
'grep "^chr" %s' % (overlapping_peaks_fn),
r""" awk '{OFS = "\t"} $1 !~ /_/ {print $0}'""",
]
out, err = utils.run_pipe(steps, chr_rm_fn)
# Remove temporary files
os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn)
return os.path.abspath(overlapping_peaks_fn)
return os.path.abspath(overlapping_peaks_fn), os.path.abspath(chr_rm_fn)
def blacklist_peaks(experiment, blacklist):
......@@ -182,23 +191,35 @@ def blacklist_peaks(experiment, blacklist):
logger.info("Removing blacklist regions from replicated peaks")
# Assign input
narrowpead_input = "%s.replicated.narrowPeak" % (experiment)
# Assign output
blpo_filename = "%s.replicated_noblacklist.narrowPeak" % (experiment)
chr_rm_fn = "%s.replicated_noblacklist_onlyChr.narrowPeak" % (experiment)
# Remove
# Remove blacklist peaks
steps = [
"bedtools intersect -v -a %s -b %s" % (narrowpead_input, blacklist),
r"""awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}'""",
]
out, err = utils.run_pipe(steps, blpo_filename)
# Filter for chr, XYM for annotation
steps = [
"grep '^chr' %s" % (blpo_filename),
r""" awk '{OFS = "\t"} $1 !~ /_/ {print $0}'""",
]
out, err = utils.run_pipe(steps, chr_rm_fn)
return os.path.abspath(blpo_filename), os.path.abspath(chr_rm_fn)
def main():
args = get_args()
design = args.design
files = args.files
blacklist = args.files
blacklist = args.blacklist
# Create a file handler
handler = logging.FileHandler('consensus_peaks.log')
......@@ -211,13 +232,25 @@ def main():
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
# Make a design file for
# Make a design file for QC
design_diff = update_design(design_files_df)
# Make a design file for annotating Peaks
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns=anno_cols)
# Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment)
replicated_peak, chr_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
design_anno.loc[experiment] = [experiment, chr_peak]
# Remove blacklist regions; if blacklist = True
if os.path.exists(blacklist):
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
bl_peaks, bl_chr_peak = blacklist_peaks(experiment, blacklist)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = bl_peaks
design_anno.loc[design_anno.Condition == experiment, "Peaks"] = bl_chr_peak
# Write out file
design_diff.columns = ['SampleID',
......@@ -229,10 +262,7 @@ def main():
design_diff.to_csv("design_exQC.tsv", header=True, sep='\t', index=False)
# Remove blacklist regions; if blacklist = True
if blacklist:
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
bl_peaks = blacklist_peaks(experiment, blacklist)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
import pytest
import pandas as pd
from io import StringIO
import os
import utils
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/peakAnnotation/'
@pytest.mark.singleend_human
def test_annotation_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX.chipseeker_pie.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX.chipseeker_upsetplot.pdf'))
annotation_file = test_output_path + 'ENCSR265ZXX.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) >= 116265
@pytest.mark.pairedend_mouse
def test_annotation_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.chipseeker_pie.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.chipseeker_upsetplot.pdf'))
annotation_file = test_output_path + 'ENCSR451NAE.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) >= 77424
......@@ -38,12 +38,21 @@ def test_overlap_peaks_singleend_human():
peak_file = test_output_path + 'ENCSR265ZXX.replicated.narrowPeak'
assert utils.count_lines(peak_file) >= 116680
onlychr_file = test_output_path + 'ENCSR265ZXX.replicated_onlyChr.narrowPeak'
assert utils.count_lines(peak_file) >= 116264
@pytest.mark.pairedend_mouse
def test_overlap_peaks_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.rejected.narrowPeak'))
peak_file = test_output_path + 'ENCSR451NAE.replicated.narrowPeak'
assert utils.count_lines(peak_file) >= 78039
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.replicated_noblacklist.narrowPeak'))
bl_peak_file = test_output_path + 'ENCSR451NAE.replicated_noblacklist.narrowPeak'
assert utils.count_lines(bl_peak_file) >= 77532
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.replicated_onlyChr.narrowPeak'))
bl_chr_file = test_output_path + 'ENCSR451NAE.replicated_noblacklist_onlyChr.narrowPeak'
assert utils.count_lines(bl_peak_file) >= 77423
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment