Commit 35c417bb authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Analysis of basal cell state

parent 721d6ff4
RIP-seq full analysis
=====================================
## Setup and Imports
```{r init}
source("http://bioconductor.org/biocLite.R")
biocLite("GenomicFeatures")
biocLite("groHMM")
biocLite("org.Hs.eg.db")
biocLite("edgeR")
biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
biocLite("goseq")
install.packages("ggplot2")
install.packages("VennDiagram")
install.packages("reshape")
library(groHMM)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(edgeR)
library(org.Hs.eg.db)
library(GenomicAlignments)
library(GenomicFeatures)
library(ggplot2)
library(VennDiagram)
library(reshape)
```
## Functions
```{r functions}
reverseStrandGA <- function(ga) {
temp <- ga
strand(temp[as.character(strand(ga)) == "+",]) <- "-"
strand(temp[as.character(strand(ga)) == "-",]) <- "+"
return(temp)
}
readBam <- function(bam_file, reverse=TRUE) {
#print(sam)
bam <- file.path(bam_file)
reads <- as(readGAlignments(bam), "GRanges")
gr <- granges(reads)
gr <- sortSeqlevels(gr)
#print(paste("Lib:", NROW(gr)), quote=FALSE)
if (reverse)
reads <- reverseStrandGA(reads)
return(reads)
}
countReads <- function(anno, sam, snrna_anno, removeSNORNA=TRUE) {
bam <- readBam(sam, reverse=TRUE)
lib <- NROW(bam)
if (removeSNORNA) {
o <- findOverlaps(bam, snrna_anno)
bam <- bam[-unique(queryHits(o)),]
}
counts <- countOverlaps(anno, bam)
return(list(lib=lib, counts=counts))
}
CountRefseq <- function(anno, sam, snrna_anno, removeSNORNA=TRUE) {
cnt1 <- countReads(anno=anno, sam=sam, snrna_anno=snrna_anno, removeSNORNA=removeSNORNA)
cnt2 <- countReads(anno=reduce(anno), sam=sam, snrna_anno=snrna_anno, removeSNORNA=removeSNORNA)
return(cnt1)
}
CountSNRNA <- function(anno, sam) {
cnt <- countReads(anno=anno, sam=sam, removeSNORNA=FALSE)
return(cnt)
}
```
## Transcripts
```{r transcripts}
## Load Annotation File
pc_file <- file.path("./", "ucsc_refseq_genes.gtf")
txdb <- makeTxDbFromGFF(pc_file, format="gtf")
pc_tx <- transcripts(txdb, columns=c("GENEID", "TXID", "TXNAME"))
pc_mapping <- read.csv("./refseq_mapping.txt", header=T, sep='\t')
pc_inx <- match(unlist(mcols(pc_tx)$TXNAME), pc_mapping$name)
mcols(pc_tx)$gene_name <- as.character(pc_mapping[pc_inx,"name2"])
seqlevels(pc_tx,force=TRUE) <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9", "chr10", "chr11", "chr12","chr13", "chr14", "chr15","chr16", "chr17", "chr18","chr19", "chr20", "chr21","chr22", "chrX","chrY" )
names(mcols(pc_tx)) <- c("gene_id", "tx_id", "tx_name","gene_name")
ref_tx <- pc_tx[grep("NM_*",mcols(pc_tx)$gene_id)]
NROW(unique(unlist(mcols(ref_tx)$gene_id)))
NROW(unique(unlist(mcols(ref_tx)$gene_name)))
lnc_file <- file.path("./", "lncipedia_4_0_hg19.gtf")
txdb <- makeTxDbFromGFF(lnc_file, format="gtf")
lnc_tx <- transcripts(txdb, columns=c("GENEID", "TXID", "TXNAME"))
names(mcols(lnc_tx)) <- c("gene_id", "tx_id", "tx_name")
seqlevels(lnc_tx,force=TRUE) <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9", "chr10", "chr11", "chr12","chr13", "chr14", "chr15","chr16", "chr17", "chr18","chr19", "chr20", "chr21","chr22", "chrX","chrY" )
NROW(unique(unlist(mcols(lnc_tx)$tx_name)))
NROW(unique(unlist(mcols(lnc_tx)$gene_id)))
sn_file <- file.path("./", "gencode.v19.annotation_snRNA.gtf")
txdb <- makeTxDbFromGFF(sn_file, format="gtf")
sn_tx <- transcripts(txdb, columns=c("GENEID", "TXID", "TXNAME"))
names(mcols(sn_tx)) <- c("gene_id", "tx_id", "tx_name")
seqlevels(sn_tx,force=TRUE) <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9", "chr10", "chr11", "chr12","chr13", "chr14", "chr15","chr16", "chr17", "chr18","chr19", "chr20", "chr21","chr22", "chrX","chrY" )
NROW(unique(unlist(mcols(sn_tx)$tx_name)))
NROW(unique(unlist(mcols(sn_tx)$gene_id)))
sno_file <- file.path("./", "gencode.v19.annotation_snoRNA.gtf")
txdb <- makeTxDbFromGFF(sno_file, format="gtf")
sno_tx <- transcripts(txdb, columns=c("GENEID", "TXID", "TXNAME"))
names(mcols(sno_tx)) <- c("gene_id", "tx_id", "tx_name")
seqlevels(sno_tx,force=TRUE) <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9", "chr10", "chr11", "chr12","chr13", "chr14", "chr15","chr16", "chr17", "chr18","chr19", "chr20", "chr21","chr22", "chrX","chrY" )
NROW(unique(unlist(mcols(sno_tx)$tx_name)))
NROW(unique(unlist(mcols(sno_tx)$gene_id)))
all_sn_file <- file.path("./", "gencode.v19.annotation_all_snRNA.gtf")
txdb <- makeTxDbFromGFF(all_sn_file, format="gtf")
all_sn_tx <- transcripts(txdb, columns=c("GENEID", "TXID", "TXNAME"))
names(mcols(all_sn_tx)) <- c("gene_id", "tx_id", "tx_name")
seqlevels(all_sn_tx,force=TRUE) <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9", "chr10", "chr11", "chr12","chr13", "chr14", "chr15","chr16", "chr17", "chr18","chr19", "chr20", "chr21","chr22", "chrX","chrY" )
```
```{r alignments}
MLK_RNA1 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RNAseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MLK_RNA1/align-star-se.sh-1.0.0/MLK_RNA1_AGTCAA_L008_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MLK_RNA2 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RNAseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MLK_RNA2/align-star-se.sh-1.0.0/MLK_RNA2_GTCCGC_L008_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MLK_RIP_PI1 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RIPseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MLK_RIP1_PI/align-star-se.sh-2.0.0/MLK_RIP1_PI_ACAGTG_L006_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MLK_RIP_PI2 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RIPseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MLK_RIP2_PI/align-star-se.sh-2.0.0/MLK_RIP2_PI_GTTTCG_L008_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MLK_RIP_PARP1 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RIPseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MLK_RIP1_PARP1/align-star-se.sh-2.0.0/MLK_RIP1_PARP1_GCCAAT_L005_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MLK_RIP_PARP2 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RIPseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MLK_RIP2_PARP1/align-star-se.sh-2.0.0/MLK_RIP2_PARP1_CGTACG_L005_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MPK_RIP_PARP1 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RIPseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MPK_RIP1_PARP1/align-star-se.sh-2.0.0/MPK_RIP1_PARP1_TAGCTT_L008_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
MPK_RIP_PARP2 <- "/Volumes/project/GCRB/Lee_Lab/s163035/DK_RIP-seq/PARP1_RIPseq_in_AC16_MCF7_withTNFa_E2/raw/Sample_MPK_RIP2_PARP1/align-star-se.sh-2.0.0/MPK_RIP2_PARP1_ATGAGC_L008_R1.rDNA.filtered.fastq.gz_filtered_chrMAligned.sortedByCoord.out.bam"
```
```{r counts}
MLK_RNA1_counts_ref <- CountRefseq(ref_tx, MLK_RNA1, all_sn_tx, removeSNORNA=TRUE)
MLK_RNA2_counts_ref <- CountRefseq(ref_tx, MLK_RNA2, all_sn_tx, removeSNORNA=TRUE)
MLK_RNA1_counts_lnc <- CountRefseq(lnc_tx, MLK_RNA1, all_sn_tx, removeSNORNA=TRUE)
MLK_RNA2_counts_lnc <- CountRefseq(lnc_tx, MLK_RNA2, all_sn_tx, removeSNORNA=TRUE)
MLK_RNA1_counts_sn <- CountSNRNA(sn_tx, MLK_RNA1)
MLK_RNA2_counts_sn <- CountSNRNA(sn_tx, MLK_RNA2)
MLK_RNA1_counts_sno <- CountSNRNA(sno_tx, MLK_RNA1)
MLK_RNA2_counts_sno <- CountSNRNA(sno_tx, MLK_RNA2)
MLK_RIP_PI1_counts_ref <- CountRefseq(ref_tx, MLK_RIP_PI1, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PI2_counts_ref <- CountRefseq(ref_tx, MLK_RIP_PI2, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PI1_counts_lnc <- CountRefseq(lnc_tx, MLK_RIP_PI1, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PI2_counts_lnc <- CountRefseq(lnc_tx, MLK_RIP_PI2, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PI1_counts_sn <- CountSNRNA(sn_tx, MLK_RIP_PI1)
MLK_RIP_PI2_counts_sn <- CountSNRNA(sn_tx, MLK_RIP_PI2)
MLK_RIP_PI1_counts_sno <- CountSNRNA(sno_tx, MLK_RIP_PI1)
MLK_RIP_PI2_counts_sno <- CountSNRNA(sno_tx, MLK_RIP_PI2)
MLK_RIP_PARP1_counts_ref <- CountRefseq(ref_tx, MLK_RIP_PARP1, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PARP2_counts_ref <- CountRefseq(ref_tx, MLK_RIP_PARP2, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PARP1_counts_lnc <- CountRefseq(lnc_tx, MLK_RIP_PARP1, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PARP2_counts_lnc <- CountRefseq(lnc_tx, MLK_RIP_PARP2, all_sn_tx, removeSNORNA=TRUE)
MLK_RIP_PARP1_counts_sn <- CountSNRNA(sn_tx, MLK_RIP_PARP1)
MLK_RIP_PARP2_counts_sn <- CountSNRNA(sn_tx, MLK_RIP_PARP2)
MLK_RIP_PARP1_counts_sno <- CountSNRNA(sno_tx, MLK_RIP_PARP1)
MLK_RIP_PARP2_counts_sno <- CountSNRNA(sno_tx, MLK_RIP_PARP2)
MPK_RIP_PARP1_counts_sn <- CountSNRNA(sn_tx, MPK_RIP_PARP1)
MPK_RIP_PARP2_counts_sn <- CountSNRNA(sn_tx, MPK_RIP_PARP2)
MPK_RIP_PARP1_counts_sno <- CountSNRNA(sno_tx, MPK_RIP_PARP1)
MPK_RIP_PARP2_counts_sno <- CountSNRNA(sno_tx, MLK_RIP_PARP2)
```
# RPKM Cutoff
``` {r rpkm}
MLK_RNA_counts_ref <- MLK_RNA1_counts_ref$counts + MLK_RNA2_counts_ref$counts
MLK_RNA_lib_ref <- MLK_RNA1_counts_ref$lib + MLK_RNA2_counts_ref$lib
MLK_RNA_ref_d <- DGEList(counts=as.matrix(MLK_RNA_counts_ref), lib.size=MLK_RNA_lib_ref)
MLK_RNA_ref_rpkm <- data.frame(rpkm(MLK_RNA_ref_d,width(ref_tx)))
mlk_pc_rpkm <- ref_tx[MLK_RNA_ref_rpkm>=1]
NROW(unique(unlist(mcols(mlk_pc_rpkm)$tx_name)))
NROW(unique(unlist(mcols(mlk_pc_rpkm)$gene_name)))
MLK_RIP_PARP_counts_ref <- MLK_RIP_PARP1_counts_ref$counts + MLK_RIP_PARP2_counts_ref$counts
MLK_RIP_PARP_lib_ref <- MLK_RIP_PARP1_counts_ref$lib + MLK_RIP_PARP2_counts_ref$lib
MLK_RIP_PI_counts_ref <- MLK_RIP_PI1_counts_ref$counts + MLK_RIP_PI2_counts_ref$counts
MLK_RIP_PI_lib_ref <- MLK_RIP_PI1_counts_ref$lib + MLK_RIP_PI1_counts_ref$lib
MLK_RIP_PI_d <- DGEList(counts=as.matrix(MLK_RIP_PI_counts_ref), lib.size=MLK_RIP_PI_lib_ref)
MLK_RIP_PI_rpkm <- data.frame(rpkm(MLK_RIP_PI_d,width(ref_tx)))
MLK_RIP_PARP_d <- DGEList(counts=as.matrix(MLK_RIP_PARP_counts_ref), lib.size=MLK_RIP_PARP_lib_ref)
MLK_RIP_PARP_rpkm <- data.frame(rpkm(MLK_RIP_PARP_d,width(ref_tx)))
MLK_RIP_FC <- data.frame((MLK_RIP_PARP_rpkm+1)/(MLK_RIP_PI_rpkm+1))
mlk_pc_rpkm_fc <- ref_tx[MLK_RNA_ref_rpkm>=1 & MLK_RIP_FC>=2 ]
NROW(unique(unlist(mcols(mlk_pc_rpkm_fc)$tx_name)))
NROW(unique(unlist(mcols(mlk_pc_rpkm_fc)$gene_name)))
pc_ratio <- NROW(unique(unlist(mcols(mlk_pc_rpkm_fc)$gene_name)))/NROW(unique(unlist(mcols(mlk_pc_rpkm)$gene_name)))
MLK_RNA_counts_lnc <- MLK_RNA1_counts_lnc$counts + MLK_RNA2_counts_lnc$counts
MLK_RNA_lib_lnc <- MLK_RNA1_counts_lnc$lib + MLK_RNA2_counts_lnc$lib
MLK_RNA_lnc_d <- DGEList(counts=as.matrix(MLK_RNA_counts_lnc), lib.size=MLK_RNA_lib_lnc)
MLK_RNA_lnc_rpkm <- data.frame(rpkm(MLK_RNA_lnc_d,width(lnc_tx)))
mlk_lnc_rpkm <- lnc_tx[MLK_RNA_lnc_rpkm>=1]
NROW(unique(unlist(mcols(mlk_lnc_rpkm)$tx_name)))
NROW(unique(unlist(mcols(mlk_lnc_rpkm)$gene_id)))
MLK_RIP_PARP_counts_lnc <- MLK_RIP_PARP1_counts_lnc$counts + MLK_RIP_PARP2_counts_lnc$counts
MLK_RIP_PARP_lib_lnc<- MLK_RIP_PARP1_counts_lnc$lib + MLK_RIP_PARP2_counts_lnc$lib
MLK_RIP_PI_counts_lnc <- MLK_RIP_PI1_counts_lnc$counts + MLK_RIP_PI2_counts_lnc$counts
MLK_RIP_PI_lib_lnc <- MLK_RIP_PI1_counts_lnc$lib + MLK_RIP_PI1_counts_lnc$lib
MLK_RIP_PI_d <- DGEList(counts=as.matrix(MLK_RIP_PI_counts_lnc), lib.size=MLK_RIP_PI_lib_lnc)
MLK_RIP_PI_rpkm <- data.frame(rpkm(MLK_RIP_PI_d,width(lnc_tx)))
MLK_RIP_PARP_d <- DGEList(counts=as.matrix(MLK_RIP_PARP_counts_lnc), lib.size=MLK_RIP_PARP_lib_lnc)
MLK_RIP_PARP_rpkm <- data.frame(rpkm(MLK_RIP_PARP_d,width(lnc_tx)))
MLK_RIP_FC <- data.frame((MLK_RIP_PARP_rpkm+1)/(MLK_RIP_PI_rpkm+1))
mlk_lnc_rpkm_fc <- lnc_tx[MLK_RNA_lnc_rpkm>=1 & MLK_RIP_FC>=2 ]
NROW(unique(unlist(mcols(mlk_lnc_rpkm_fc)$tx_name)))
NROW(unique(unlist(mcols(mlk_lnc_rpkm_fc)$gene_id)))
lnc_ratio <- NROW(unique(unlist(mcols(mlk_lnc_rpkm_fc)$gene_id)))/NROW(unique(unlist(mcols(mlk_lnc_rpkm)$gene_id)))
MLK_RNA_counts_sno <- MLK_RNA1_counts_sno$counts + MLK_RNA2_counts_sno$counts
MLK_RNA_lib_sno <- MLK_RNA1_counts_sno$lib + MLK_RNA1_counts_sno$lib
MLK_RNA_sno_d <- DGEList(counts=as.matrix(MLK_RNA_counts_sno), lib.size=MLK_RNA_lib_sno)
MLK_RNA_sno_rpkm <- data.frame(rpkm(MLK_RNA_sno_d,width(sno_tx)))
mlk_sno_rpkm <- sno_tx[MLK_RNA_sno_rpkm>=1]
NROW(unique(unlist(mcols(mlk_sno_rpkm)$tx_name)))
MLK_RIP_PARP_counts_sno <- MLK_RIP_PARP1_counts_sno$counts + MLK_RIP_PARP2_counts_sno$counts
MLK_RIP_PARP_lib_sno<- MLK_RIP_PARP1_counts_sno$lib + MLK_RIP_PARP2_counts_sno$lib
MLK_RIP_PI_counts_sno <- MLK_RIP_PI1_counts_sno$counts + MLK_RIP_PI2_counts_sno$counts
MLK_RIP_PI_lib_sno <- MLK_RIP_PI1_counts_sno$lib + MLK_RIP_PI1_counts_sno$lib
MPK_RIP_PARP_counts_sno <- MPK_RIP_PARP1_counts_sno$counts + MPK_RIP_PARP2_counts_sno$counts
MPK_RIP_PARP_lib_sno<- MPK_RIP_PARP1_counts_sno$lib + MPK_RIP_PARP2_counts_sno$lib
MLK_RIP_PI_d <- DGEList(counts=as.matrix(MLK_RIP_PI_counts_sno), lib.size=MLK_RIP_PI_lib_sno)
MLK_RIP_PI_rpkm <- data.frame(rpkm(MLK_RIP_PI_d,width(sno_tx)))
MLK_RIP_PARP_d <- DGEList(counts=as.matrix(MLK_RIP_PARP_counts_sno), lib.size=MLK_RIP_PARP_lib_sno)
MLK_RIP_PARP_rpkm <- data.frame(rpkm(MLK_RIP_PARP_d,width(sno_tx)))
MPK_RIP_PARP_d <- DGEList(counts=as.matrix(MPK_RIP_PARP_counts_sno), lib.size=MPK_RIP_PARP_lib_sno)
MPK_RIP_PARP_rpkm <- data.frame(rpkm(MPK_RIP_PARP_d,width(sno_tx)))
MLK_RIP_FC <- data.frame((MLK_RIP_PARP_rpkm+1)/(MLK_RIP_PI_rpkm+1))
mlk_sno_rpkm_fc <- sno_tx[MLK_RNA_sno_rpkm>=1 & MLK_RIP_FC>=2 ]
NROW(unique(unlist(mcols(mlk_sno_rpkm_fc)$tx_name)))
sno_ratio <- NROW(unique(unlist(mcols(mlk_sno_rpkm_fc)$tx_name)))/NROW(unique(unlist(mcols(mlk_sno_rpkm)$tx_name)))
MLK_RIP_sno <- data.frame(MLK_RIP_PI_rpkm,MLK_RIP_PARP_rpkm)
colnames(MLK_RIP_sno) <- c('PI', 'PARP1')
MLK_RIP_sno_rpkm <- MLK_RIP_sno[MLK_RNA_sno_rpkm>=1 & MLK_RIP_FC>=2,]
MLK_RNA_counts_sn <- MLK_RNA1_counts_sn$counts + MLK_RNA2_counts_sn$counts
MLK_RNA_lib_sn <- MLK_RNA1_counts_sn$lib + MLK_RNA1_counts_sn$lib
MLK_RNA_sn_d <- DGEList(counts=as.matrix(MLK_RNA_counts_sn), lib.size=MLK_RNA_lib_sn)
MLK_RNA_sn_rpkm <- data.frame(rpkm(MLK_RNA_sn_d,width(sn_tx)))
mlk_sn_rpkm <- sn_tx[MLK_RNA_sn_rpkm>=1]
NROW(unique(unlist(mcols(mlk_sn_rpkm)$tx_name)))
MLK_RIP_PARP_counts_sn <- MLK_RIP_PARP1_counts_sn$counts + MLK_RIP_PARP2_counts_sn$counts
MLK_RIP_PARP_lib_sn<- MLK_RIP_PARP1_counts_sn$lib + MLK_RIP_PARP2_counts_sn$lib
MLK_RIP_PI_counts_sn <- MLK_RIP_PI1_counts_sn$counts + MLK_RIP_PI2_counts_sn$counts
MLK_RIP_PI_lib_sn <- MLK_RIP_PI1_counts_sn$lib + MLK_RIP_PI1_counts_sn$lib
MLK_RIP_PI_d <- DGEList(counts=as.matrix(MLK_RIP_PI_counts_sn), lib.size=MLK_RIP_PI_lib_sn)
MLK_RIP_PI_rpkm <- data.frame(rpkm(MLK_RIP_PI_d,width(sn_tx)))
MLK_RIP_PARP_d <- DGEList(counts=as.matrix(MLK_RIP_PARP_counts_sn), lib.size=MLK_RIP_PARP_lib_sn)
MLK_RIP_PARP_rpkm <- data.frame(rpkm(MLK_RIP_PARP_d,width(sn_tx)))
MLK_RIP_FC <- data.frame((MLK_RIP_PARP_rpkm+1)/(MLK_RIP_PI_rpkm+1))
mlk_sn_rpkm_fc <- sn_tx[MLK_RNA_sn_rpkm>=1 & MLK_RIP_FC>=2 ]
NROW(unique(unlist(mcols(mlk_sn_rpkm_fc)$tx_name)))
sn_ratio <- NROW(unique(unlist(mcols(mlk_sn_rpkm_fc)$tx_name)))/NROW(unique(unlist(mcols(mlk_sn_rpkm)$tx_name)))
allsno <- c(mlk_sno_rpkm_fc,mlk_sn_rpkm_fc)
# bar plot figure (B)
# Grouped Bar Plot
jpeg('barchart-mcf-7-basal-species.jpg')
ratios <- c(pc_ratio,lnc_ratio,sno_ratio,sn_ratio)
barplot(ratios, main="SNORNA MCF-7 Basal", col=c("red","black", "dimgrey", "lightgrey"))
dev.off()
boundTesting <-matrix(c(952,5471,205,239),nrow = 2,dimnames = list(Guess = c("bound", "unbound"),Truth = c("mRNA", "snoRNA")))
fisher.test(boundTesting)
# Boxplot of Preimmune vs PARP-1 (C)
jpeg('boxplot-mcf-7-basal-species.jpg')
boxplot(MLK_RIP_sno_rpkm,col=(c("orange","blue")),outline=FALSE,ylim = c(0,400),yaxt="n", cex.axis=1,las=2,lwd=4,lty=1)
axis(2, at=seq(0,400,100))
dev.off()
wilcox.test(MLK_RIP_sno_rpkm[,1], MLK_RIP_sno_rpkm[,2],paired=T)
# Get Categories of snRNA
mlk_gr_snotx <- data.frame(seqnames=seqnames(mlk_sno_rpkm),
starts=start(mlk_sno_rpkm)-1,
ends=end(mlk_sno_rpkm),
strand=strand(mlk_sno_rpkm),
tx_name = elementMetadata(mlk_sno_rpkm)$tx_name,
gene_id = unlist(elementMetadata(mlk_sno_rpkm)$gene_id))
sno_annotations <- read.table('RNA_small_nucleolar.txt',header = TRUE, sep = "\t")
sno_inx <- match(matrix(unlist(strsplit(as.character(mlk_gr_snotx$gene_id),split='.',fixed=T)),ncol=2,byrow=T)[,1], sno_annotations$ensembl_gene_id)
mlk_gr_snotx$symbol <- as.character(sno_annotations[sno_inx,"symbol"])
mlk_gr_snotx$gene_family <- as.character(sno_annotations[sno_inx,"gene_family"])
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == "Small nucleolar RNAs, H/ACA box"] <- "H/ACA box"
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == "Small nucleolar RNAs, C/D box"] <- "C/D box"
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == "Small Cajal body-specific RNAs"] <- "scaRNA"
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == ""] <- "Other"
mlk_gr_snotx$gene_family[is.na(mlk_gr_snotx$gene_family)] <- "Other"
ha_rna <- dim(mlk_gr_snotx[mlk_gr_snotx$gene_family == "H/ACA box",])[1]
cd_rna <- dim(mlk_gr_snotx[mlk_gr_snotx$gene_family == "C/D box",])[1]
sc_rna <- dim(mlk_gr_snotx[mlk_gr_snotx$gene_family == "scaRNA",])[1]
dat = data.frame(count=c(ha_rna, cd_rna, sc_rna), category=c("H/ACA box", "C/D box", "scaRNA"))
dat$fraction = dat$count / sum(dat$count)
dat$ymax = cumsum(dat$fraction)
dat$ymin = c(0, head(dat$ymax, n=-1))
dat$category <- factor(dat$category, levels = c("H/ACA box", "C/D box", "scaRNA"))
# (E)
jpeg('piechart-mcf-7-basal-species.jpg')
ggplot(dat, aes(fill=category, ymax=ymax, ymin=ymin, xmax=4, xmin=2)) +
geom_rect(fill=c('darkorchid1', "darkorange1","darkgreen")) +
coord_polar(theta="y") +
xlim(c(0, 4)) +
theme_bw() +
theme(panel.grid=element_blank()) +
theme(axis.text=element_blank()) +
theme(axis.ticks=element_blank())
dev.off()
mlk_gr_snotx <- data.frame(seqnames=seqnames(mlk_sno_rpkm_fc),
starts=start(mlk_sno_rpkm_fc)-1,
ends=end(mlk_sno_rpkm_fc),
strand=strand(mlk_sno_rpkm_fc),
tx_name = elementMetadata(mlk_sno_rpkm_fc)$tx_name,
gene_id = unlist(elementMetadata(mlk_sno_rpkm_fc)$gene_id))
sno_inx <- match(matrix(unlist(strsplit(as.character(mlk_gr_snotx$gene_id),split='.',fixed=T)),ncol=2,byrow=T)[,1], sno_annotations$ensembl_gene_id)
mlk_gr_snotx$symbol <- as.character(sno_annotations[sno_inx,"symbol"])
mlk_gr_snotx$gene_family <- as.character(sno_annotations[sno_inx,"gene_family"])
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == "Small nucleolar RNAs, H/ACA box"] <- "H/ACA box"
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == "Small nucleolar RNAs, C/D box"] <- "C/D box"
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == "Small Cajal body-specific RNAs"] <- "scaRNA"
mlk_gr_snotx$gene_family[mlk_gr_snotx$gene_family == ""] <- "Other"
mlk_gr_snotx$gene_family[is.na(mlk_gr_snotx$gene_family)] <- "Other"
ha_rip <- dim(mlk_gr_snotx[mlk_gr_snotx$gene_family == "H/ACA box",])[1]
cd_rip <- dim(mlk_gr_snotx[mlk_gr_snotx$gene_family == "C/D box",])[1]
sc_rip<- dim(mlk_gr_snotx[mlk_gr_snotx$gene_family == "scaRNA",])[1]
# bar plot figure (F)
# Bar plot
jpeg('barchart-mcf-7-basal-species_snospecies.jpg')
ratios <- c(sn_ratio,ha_rip/ha_rna,cd_rip/cd_rna,sc_rip/sc_rna)
barplot(ratios, main="snoRNA species MCF-7 Basal", col=c('pink1','darkorchid1', "darkorange1","darkgreen"),ylim=c(0,1))
dev.off()
typeTesting <-matrix(c(63,19,38,76),nrow = 2,dimnames = list(Guess = c("bound", "unbound"),Truth = c("HA", "CD")))
fisher.test(typeTesting)
```
This diff is collapsed.
No preview for this file type
No preview for this file type
This diff is collapsed.
This diff is collapsed.
...@@ -13,3 +13,16 @@ grep transcript gencode.v19.annotation_snoRNA.gtf | cut -f9 | cut -f1,2,5,8 -d ...@@ -13,3 +13,16 @@ grep transcript gencode.v19.annotation_snoRNA.gtf | cut -f9 | cut -f1,2,5,8 -d
grep transcript gencode.v19.annotation.gtf | cut -f9 | cut -f1,2,5,8 -d ";" | cut -f2,4,6,8 -d " " | sed 's/"//g' | sed 's/;//g' | sort -u | sed 's/ /,/g'| grep ENST > mapping.txt grep transcript gencode.v19.annotation.gtf | cut -f9 | cut -f1,2,5,8 -d ";" | cut -f2,4,6,8 -d " " | sed 's/"//g' | sed 's/;//g' | sort -u | sed 's/ /,/g'| grep ENST > mapping.txt
# Filter for Transcripts that also play host to ncRNA
bedtools intersect -wa -a gencode.v19.annotation_protein_coding.gtf -b gencode.v19.annotation_snRNA.gtf gencode.v19.annotation_snoRNA.gtf > ncRNA_overlap.gtf
grep transcript ncRNA_overlap.gtf | cut -f9 | cut -f1,2,5,8 -d ";" | cut -f2,4,6,8 -d " " | sed 's/"//g' | sed 's/;//g' | sort -u | sed 's/ /,/g' | grep ENST > gencode.v19.annotation_protein_coding_ncRNA_host.txt
bedtools intersect -wa -a gencode.v19.annotation_lncRNA.gtf -b gencode.v19.annotation_snRNA.gtf gencode.v19.annotation_snoRNA.gtf > ncRNA_overlap.gtf
grep transcript ncRNA_overlap.gtf | cut -f9 | cut -f1,2,5,8 -d ";" | cut -f2,4,6,8 -d " " | sed 's/"//g' | sed 's/;//g' | sort -u | sed 's/ /,/g' | grep ENST > gencode.v19.annotation_lncRNA_mapping_ncRNA_host.txt
# Make bed file format
gtf2bed < gencode.v19.annotation.gtf > gencode.v19.bed
grep -v exon gencode.v19.bed | grep ENST | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"(substr($13, index($10,'transcript_id')))}' | sed 's/"//g' | sed 's/;//g' > gencode.v19.transcripts.bed
awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"(substr($0, index($0,$10)))}'
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment