Analysis of ChIP-seq Enhancer Prediction ===================================== ## Setup and Imports ```{r init} source("http://bioconductor.org/biocLite.R") biocLite("GenomicFeatures") biocLite("org.Hs.eg.db") library(groHMM) library(org.Hs.eg.db) library(GenomicAlignments) library(GenomicFeatures) ``` ## Alignments ```{r alignments} ES_D10_R1 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D10/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145823.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D10_R2 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D10/H3K27ac/remove-duplicates.sh-1.1.0/SRR2130154.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D2_R1 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D2/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145802.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D2_R2 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D2/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145803.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D0_R1 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D0/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145796.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D0_R2 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D0/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145795.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D7_R1 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D7/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145817.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D7_R2 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D7/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145816.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D5_R1 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D5/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145810.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") ES_D5_R2 <- as(readGAlignments("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/ES_D5/H3K27ac/remove-duplicates.sh-1.1.0/SRR1145809.fastq.gz.sorted.bam.filtered.no_dups.bam"), "GRanges") # Combine replicates ES_D10 <- c(ES_D10_R1, ES_D10_R2) ES_D2 <- c(ES_D2_R1, ES_D2_R2) ES_D0 <- c(ES_D0_R1, ES_D0_R1) ES_D7 <- c(ES_D7_R1, ES_D7_R2) ES_D5 <- c(ES_D5_R1, ES_D5_R2) # Library library_ES_D10 <- NROW(ES_D10) library_ES_D2 <- NROW(ES_D2) library_ES_D0 <- NROW(ES_D0) library_ES_D7 <- NROW(ES_D7) library_ES_D5 <- NROW(ES_D5) ``` ## Read in bed files ```{r alignments} ES_D0_true <- import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D0_true.bed", format = "BED") ES_D10_true <-import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D10_true.bed", format = "BED") ES_D2_true <-import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D2_true.bed", format = "BED") ES_D5_true <-import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D5_true.bed", format = "BED") ES_D7_true <-import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D7_true.bed", format = "BED") ES_D0_false <- import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D0_false.bed", format = "BED") ES_D10_false <- import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D10_false.bed", format = "BED") ES_D2_false <- import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D2_false.bed", format = "BED") ES_D5_false <- import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D5_false.bed", format = "BED") ES_D7_false <- import("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/ChIP-seq/universe_h3k27ac/ES_D7_false.bed", format = "BED") ``` ## RPKM = numReads / ( geneLength/1000 * totalNumReads/1,000,000 ) ```{r RPKM} # True rpkm_ES_D10_true <- countOverlaps(ES_D10_true,ES_D10)/ (width(ES_D10_true)/1000 * library_ES_D10/1000000) rpkm_ES_D0_true <- countOverlaps(ES_D0_true,ES_D0)/ (width(ES_D0_true)/1000 * library_ES_D0/1000000) rpkm_ES_D2_true <- countOverlaps(ES_D2_true,ES_D2)/ (width(ES_D2_true)/1000 * library_ES_D2/1000000) rpkm_ES_D5_true <- countOverlaps(ES_D5_true,ES_D5)/ (width(ES_D5_true)/1000 * library_ES_D5/1000000) rpkm_ES_D7_true <- countOverlaps(ES_D7_true,ES_D7)/ (width(ES_D7_true)/1000 * library_ES_D7/1000000) # FALSE rpkm_ES_D10_false <- countOverlaps(ES_D10_false,ES_D10)/ (width(ES_D10_false)/1000 * library_ES_D10/1000000) rpkm_ES_D0_false <- countOverlaps(ES_D0_false,ES_D0)/ (width(ES_D0_false)/1000 * library_ES_D0/1000000) rpkm_ES_D2_false <- countOverlaps(ES_D2_false,ES_D2)/ (width(ES_D2_false)/1000 * library_ES_D2/1000000) rpkm_ES_D5_false <- countOverlaps(ES_D5_false,ES_D5)/ (width(ES_D5_false)/1000 * library_ES_D5/1000000) rpkm_ES_D7_false <- countOverlaps(ES_D7_false,ES_D7)/ (width(ES_D7_false)/1000 * library_ES_D7/1000000) # Combine and plot rpkm_true <- c(rpkm_ES_D10_true,rpkm_ES_D0_true,rpkm_ES_D2_true,rpkm_ES_D5_true,rpkm_ES_D7_true) rpkm_false <- c(rpkm_ES_D10_false,rpkm_ES_D0_false,rpkm_ES_D2_false,rpkm_ES_D5_false,rpkm_ES_D7_false) hist(rpkm_true, col="red") hist(rpkm_false, add=T, col=rgb(0, 1, 0, 0.5) ) summary(rpkm_true) summary(rpkm_false) # Leads to an RPKM cutoff of 1 ```