added peak annotation

7b7dc0cb · Holly Ruess · 8e447c31 · 7b7dc0cb · 7b7dc0cb · 7b7dc0cb
Commit 7b7dc0cb authored 4 years ago by Holly Ruess
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file.
 - Added TSS enrichment
 - Added Fragment/Insert Length size
 - Removal of blacklist regions
+ - Annotate peaks after removing non-chromosomes

 ## [publish_1.0.0 ] - 2019-12-03
 Initial release of pipeline

--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git


 ## Pipeline
-  + There are 9 steps to the pipeline
+  + There are 10 steps to the pipeline
    1. Check input files
    2. Trim adaptors with TrimGalore!
    3. Map reads with BWA, filter with SamTools, and sort with Sambamba
@@ -51,8 +51,9 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
    5. Calculate cross-correlation using PhantomPeakQualTools
    6. Call peaks with MACS2 from overlaps of pooled replicates
    7. Call consensus peaks and optional removal of blacklist peaks
-    8. QC metrics
-    9. MultiQC report
+    8. Annotate peaks (chr only; either blacklist removed or replicated)
+    9. QC metrics
+    10. MultiQC report


 ## Output Files
@@ -80,6 +81,9 @@ callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://geno
 consensusPeaks | *.rejected.narrowPeak | peaks not supported by multiple testing (replicates and pseudo-replicates)
 consensusPeaks | *.replicated.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates)
 consensusPeaks | *.replicated_noblacklist.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates) with blacklist regions removed
+peakAnnotation | *.chipseeker_annotation.tsv | annotation of chromosomal peaks of either blacklist removed or replicated peaks
+peakAnnotation | *.chipseeker_upsetplot.pdf | upsetplot showing the count of overlaps of the genes with different annotated location
+peakAnnotation | *.chipseeker_pie.pdf | pie graph of where narrow annotated peaks occur
 experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
 experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
 experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples

--- a/docs/index.md
+++ b/docs/index.md
@@ -43,7 +43,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git


 ## Pipeline
-  + There are 9 steps to the pipeline
+  + There are 10 steps to the pipeline
    1. Check input files
    2. Trim adaptors with TrimGalore!
    3. Map reads with BWA, filter with SamTools, and sort with Sambamba
@@ -51,8 +51,9 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
    5. Calculate cross-correlation using PhantomPeakQualTools
    6. Call peaks with MACS2 from overlaps of pooled replicates
    7. Call consensus peaks and optional removal of blacklist peaks
-    8. QC metrics
-    9. MultiQC report
+    8. Annotate peaks (chr only; either blacklist removed or replicated)
+    9. QC metrics
+    10. MultiQC report


 ## Output Files
@@ -80,6 +81,9 @@ callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://geno
 consensusPeaks | *.rejected.narrowPeak | peaks not supported by multiple testing (replicates and pseudo-replicates)
 consensusPeaks | *.replicated.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates)
 consensusPeaks | *.replicated_noblacklist.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates) with blacklist regions removed
+peakAnnotation | *.chipseeker_annotation.tsv | annotation of chromosomal peaks of either blacklist removed or replicated peaks
+peakAnnotation | *.chipseeker_upsetplot.pdf | upsetplot showing the count of overlaps of the genes with different annotated location
+peakAnnotation | *.chipseeker_pie.pdf | pie graph of where narrow annotated peaks occur
 experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
 experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
 experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples

--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -45,6 +45,10 @@ process {
    module = ['python/3.6.1-2-anaconda', 'bedtools/2.25.0']
    executor = 'local'
  }
+  withName: peakAnnotation {
+    module = ['R/3.3.2-gccmkl']
+    executor = 'local'
+  }
  withName: experimentQC {
    module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1', 'samtools/1.4.1', 'bedtools/2.25.0', 'singularity/2.6.1']
    queue = '128GB,256GB,256GBv1'
@@ -65,6 +69,7 @@ params {
      chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
      gtffile = '/project/shared/bicf_workflow_ref/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf'
      blacklistfile = '/project/shared/bicf_workflow_ref/GRCh38/ENCFF356LFX.bed'
+      geneNames = '/project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt'
    }
    'GRCm38' {
      bwa = '/project/shared/bicf_workflow_ref/GRCm38'
@@ -72,6 +77,7 @@ params {
      chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
      gtffile = '/project/shared/bicf_workflow_ref/GRCm38/gencode.vM20.annotation.gtf'
      blacklistfile = '/project/shared/bicf_workflow_ref/GRCm38/ENCFF999QPV.bed'
+      geneNames = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genenames.txt'
    }
  }
 }

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -15,6 +15,7 @@ params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?
 params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
 params.gtfFile = params.genome ? params.genomes[ params.genome ].gtffile ?: false : false
 params.blacklistFile = params.genome ? params.genomes[ params.genome ].blacklistfile ?: false : false
+params.geneNames = params.genome ? params.genomes[ params.genome ].geneNames ?: false : false
 params.astrocyte = false
 params.outDir= "${baseDir}/output"
 params.references = "${baseDir}/../docs/references.md"
@@ -47,6 +48,7 @@ references = params.references
 multiqc = params.multiqc
 gtfFile = params.gtfFile
 blacklistFile = params.blacklistFile
+geneNames = params.geneNames

 process checkDesignFile {

@@ -509,6 +511,7 @@ process consensusPeaks {
    file '*.rejected.*' into rejectedPeaks
    file("design_exQC.tsv") into designExperimentQC
    file('version_*.txt') into consensusPeaksVersions
+    file("design_annotatePeaks.tsv") into designAnnotatePeaks

  script:
    if (params.astrocyte == true) {
@@ -543,13 +546,34 @@ process consensusPeaks {
 }


-// Define channel collecting SampleID and fastq length into new design file
-// designFilePaths
-//    .splitCsv(sep: '\t', header: true)
-//    .map{ row -> [ row.sample_id, fqLength ->
-//    "${sampleId}\t${fqLength}\n"}
-//    .collectFile(name: 'design_fqlength.tsv', seed: "sample_id\tfastq_Length\n", storeDir: "${outDir}/design")
-//    .into { fqlengthDesign }
+// Annotate Peaks
+process peakAnnotation {
+
+  publishDir "${outDir}/${task.process}", mode: 'copy'
+
+  input:
+
+  file designAnnotatePeaks
+
+  output:
+
+  file "*chipseeker*" into peakAnnotation
+  file('version_*.txt') into peakAnnotationVersions
+
+  script:
+    if (params.astrocyte == true) {
+      """
+      module load R/3.3.2-gccmkl
+      Rscript $baseDir/scripts/annotate_peaks.R ${designAnnotatePeaks} ${gtfFile} ${geneNames}
+      """
+    }
+    else {
+      """
+      Rscript $baseDir/scripts/annotate_peaks.R ${designAnnotatePeaks} ${gtfFile} ${geneNames}
+      """
+    }
+
+}


 // Quality Metrics using deeptools

--- a/workflow/scripts/annotate_peaks.R
+++ b/workflow/scripts/annotate_peaks.R
+#!/bin/Rscript
+
+# Load libraries
+library("ChIPseeker")
+library(GenomicFeatures)
+
+# Create parser object
+args <- commandArgs(trailingOnly=TRUE)
+
+# Check input args
+if (length(args) != 3) {
+  stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE)
+}
+
+design_file <- args[1]
+gtf <- args[2]
+geneNames <- args[3]
+
+# Load UCSC Known Genes
+txdb <- makeTxDbFromGFF(gtf)
+sym <- read.table(geneNames, header=T, sep='\t') [,4:5]
+
+# Output version of ChIPseeker
+chipseeker_version = packageVersion('ChIPseeker')
+write.table(paste("Version", chipseeker_version), file = "version_ChIPseeker.txt", sep = "\t",
+            row.names = FALSE, col.names = FALSE)
+
+# Load design file
+design <- read.csv(design_file, sep ='\t')
+files <- as.list(as.character(design$Peaks))
+names(files) <- design$Condition
+
+# Granges of files
+peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE)
+peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
+
+column_names <- c("geneId", "chr", "start", "end", "width", "strand_1", "name", "score", 
+                  "strand", "signalValue", "pValue", "qValue", "peak", "annotation", 
+                  "geneChr", "geneStart", "geneEnd", "geneLength" ,"geneStrand", 
+                   "transcriptId", "distanceToTSS", "symbol")
+
+for(index in c(1:length(peakAnnoList))) {
+  filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="")
+  df <- as.data.frame(peakAnnoList[[index]])
+  df$geneId <- gsub('[[:punct:]][0-9]+', '', df$geneId)
+  df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T)
+  colnames(df_final) <- column_names
+  write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
+
+  # Draw individual plots
+
+  # Define names of Plots
+  pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
+  upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
+
+  # Pie Plots
+  pdf(pie_name)
+  plotAnnoPie(peakAnnoList[[index]])
+  dev.off()
+
+  # Upset Plot
+  pdf(upsetplot_name, onefile=F)
+  upsetplot(peakAnnoList[[index]])
+  dev.off()
+}
--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -204,7 +204,7 @@ def FRiP(sample, design):
    f.close()


-def FLD(sample, design):
+def fld(sample, design):
    ''' Calculate and plot Fragment Length Distribution'''

    logger.info("Determining Fragment Length Distribution for PE")
@@ -281,7 +281,7 @@ def main():
    # Fragment Length Distribution - PE only
    if paired:
        for sample, df_sample in design_df.groupby('SampleID'):
-            FLD_plots = FLD(sample, df_sample)
+            fld_plots = fld(sample, df_sample)

 if __name__ == '__main__':
    main()
--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -40,7 +40,8 @@ def get_args():

    parser.add_argument('-b', '--blacklist',
                        help="Bed file of blacklisted regions to remove",
-                        required=False)
+                        required=False,
+                        default="None")

    args = parser.parse_args()
    return args
@@ -94,6 +95,7 @@ def overlap(experiment, design):
    peak_type = 'narrowPeak'
    overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
+    chr_rm_fn = '%s.replicated_onlyChr.%s' % (experiment, peak_type)

    # Intermediate File names
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
@@ -170,11 +172,18 @@ def overlap(experiment, design):
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))

+    # Filter for chr, XYM for annotation
+    steps = [
+        'grep "^chr" %s' % (overlapping_peaks_fn),
+        r""" awk '{OFS = "\t"} $1 !~ /_/ {print $0}'""",
+        ]
+    out, err = utils.run_pipe(steps, chr_rm_fn)
+
    # Remove temporary files
    os.remove(overlap_tr_fn)
    os.remove(overlap_pr_fn)

-    return os.path.abspath(overlapping_peaks_fn)
+    return os.path.abspath(overlapping_peaks_fn), os.path.abspath(chr_rm_fn)


 def blacklist_peaks(experiment, blacklist):
@@ -182,23 +191,35 @@ def blacklist_peaks(experiment, blacklist):

    logger.info("Removing blacklist regions from replicated peaks")

+    # Assign input
    narrowpead_input = "%s.replicated.narrowPeak" % (experiment)
+
    # Assign output
    blpo_filename = "%s.replicated_noblacklist.narrowPeak" % (experiment)
+    chr_rm_fn = "%s.replicated_noblacklist_onlyChr.narrowPeak" % (experiment)

-    # Remove
+    # Remove blacklist peaks
    steps = [
        "bedtools intersect -v -a %s -b %s" % (narrowpead_input, blacklist),
        r"""awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}'""",
        ]
    out, err = utils.run_pipe(steps, blpo_filename)

+    # Filter for chr, XYM for annotation
+    steps = [
+        "grep '^chr' %s" % (blpo_filename),
+        r""" awk '{OFS = "\t"} $1 !~ /_/ {print $0}'""",
+        ]
+    out, err = utils.run_pipe(steps, chr_rm_fn)
+
+    return os.path.abspath(blpo_filename), os.path.abspath(chr_rm_fn)
+

 def main():
    args = get_args()
    design = args.design
    files = args.files
-    blacklist = args.files
+    blacklist = args.blacklist

    # Create a file handler
    handler = logging.FileHandler('consensus_peaks.log')
@@ -211,13 +232,25 @@ def main():
    design_peaks_df = pd.read_csv(design, sep='\t')
    design_files_df = pd.read_csv(files, sep='\t')

-    # Make a design file for
+    # Make a design file for QC
    design_diff = update_design(design_files_df)

+    # Make a design file for annotating Peaks
+    anno_cols = ['Condition', 'Peaks']
+    design_anno = pd.DataFrame(columns=anno_cols)
+
    # Find consenus overlap peaks for each experiment
    for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
-        replicated_peak = overlap(experiment, df_experiment)
+        replicated_peak, chr_peak = overlap(experiment, df_experiment)
        design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
+        design_anno.loc[experiment] = [experiment, chr_peak]
+
+    # Remove blacklist regions; if blacklist = True
+    if os.path.exists(blacklist):
+        for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
+            bl_peaks, bl_chr_peak = blacklist_peaks(experiment, blacklist)
+            design_diff.loc[design_diff.experiment_id == experiment, "peak"] = bl_peaks
+            design_anno.loc[design_anno.Condition == experiment, "Peaks"] = bl_chr_peak

    # Write out file
    design_diff.columns = ['SampleID',
@@ -229,10 +262,7 @@ def main():

    design_diff.to_csv("design_exQC.tsv", header=True, sep='\t', index=False)

-    # Remove blacklist regions; if blacklist = True
-    if blacklist:
-        for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
-            bl_peaks = blacklist_peaks(experiment, blacklist)
+    design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)

 if __name__ == '__main__':
    main()
--- a/workflow/tests/test_annotate_peaks.py
+++ b/workflow/tests/test_annotate_peaks.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+import utils
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/peakAnnotation/'
+
+
+@pytest.mark.singleend_human
+def test_annotation_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX.chipseeker_pie.pdf'))
+
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX.chipseeker_upsetplot.pdf'))
+
+    annotation_file = test_output_path + 'ENCSR265ZXX.chipseeker_annotation.tsv'
+    assert os.path.exists(annotation_file)
+    assert utils.count_lines(annotation_file) >= 116265
+
+
+@pytest.mark.pairedend_mouse
+def test_annotation_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.chipseeker_pie.pdf'))
+
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.chipseeker_upsetplot.pdf'))
+
+    annotation_file = test_output_path + 'ENCSR451NAE.chipseeker_annotation.tsv'
+    assert os.path.exists(annotation_file)
+    assert utils.count_lines(annotation_file) >= 77424
--- a/workflow/tests/test_overlap_peaks.py
+++ b/workflow/tests/test_overlap_peaks.py
@@ -38,12 +38,21 @@ def test_overlap_peaks_singleend_human():
    peak_file = test_output_path + 'ENCSR265ZXX.replicated.narrowPeak'
    assert utils.count_lines(peak_file) >= 116680

+    onlychr_file = test_output_path + 'ENCSR265ZXX.replicated_onlyChr.narrowPeak'
+    assert utils.count_lines(peak_file) >= 116264
+

 @pytest.mark.pairedend_mouse
 def test_overlap_peaks_pairedend_mouse():
    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.rejected.narrowPeak'))
+
    peak_file = test_output_path + 'ENCSR451NAE.replicated.narrowPeak'
    assert utils.count_lines(peak_file) >= 78039
-    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.replicated_noblacklist.narrowPeak'))
+
    bl_peak_file = test_output_path + 'ENCSR451NAE.replicated_noblacklist.narrowPeak'
    assert utils.count_lines(bl_peak_file) >= 77532
+
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.replicated_onlyChr.narrowPeak'))
+
+    bl_chr_file = test_output_path + 'ENCSR451NAE.replicated_noblacklist_onlyChr.narrowPeak'
+    assert utils.count_lines(bl_peak_file) >= 77423