diff --git a/diff_exp/DEA_htseq.R b/diff_exp/DEA_htseq.R deleted file mode 100755 index 058dd1817f6d448baf3a9c84fdd22f8fc197ad3b..0000000000000000000000000000000000000000 --- a/diff_exp/DEA_htseq.R +++ /dev/null @@ -1,236 +0,0 @@ -#!/cm/shared/apps/R/intel/3.2.1/bin/Rscript -library(edgeR) -library(DESeq2) -library("RColorBrewer") -library("gplots") -library(qusage) - -rowMax <- function(x) apply(x,1,max) -col.grp <- function(n,b) { - colorvec <- vector(mode="character", length=length(n)) - plotcols <- rainbow(length(b)) - for (i in 1:length(n)) { - for (j in 1:length(b)) { - if ( n[i] == b[j] ) { - colorvec[i] = plotcols[j] - } - } - } - c(colorvec) -} - - -#################### Read in Data ################################ -genenames <- read.table(file="genenames.txt",header=TRUE,sep='\t') - -tbl <- read.table('countTable.txt',header=TRUE,sep="\t") -tbl2 <- read.table('countTable.logCPM.txt',header=TRUE,sep="\t") -ct <- tbl[,4:length(tbl)] -row.names(ct) <- tbl$ENSEMBL - -samples<- names(ct) - -dtbl <- read.table('design.txt',header=TRUE,sep="\t") -samtbl <- merge(as.data.frame(samples),dtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE) -grpnames <- levels(factor(as.character(samtbl$SampleGroup))) -samtbl$SampleGroup <- factor(samtbl$SampleGroup, levels=grpnames) - -colData <- samtbl[c('SampleGroup','SubjectID')] -row.names(colData) <- samtbl$samples - -dds <- DESeqDataSetFromMatrix(countData=ct,colData= colData,design= ~ SampleGroup) -dds <- dds[ rowMax(counts(dds)) > 30, ] -dds <- dds[ colSums(counts(dds)) > 1000000] - -if (nrow(counts(dds)) < 1) { -print(paste("Samples are filtered if there is < 1M reads. There are less than no remaining sample(s) after this filter",sep=' ')) -q() -} - -countTable <- counts(dds) -#grps <- as.character(levels(colData(dds)$SampleGroup)) -grps <- as.character(colData(dds)$SampleGroup) -col.blocks <-col.grp(grps,levels(factor(grps))) -libSizes <- as.vector(colSums(countTable)) - -logcpm <- tbl2[,4:length(tbl2)] -row.names(logcpm) <- tbl2$SYMBOL - -tmp.tab <- aggregate(t(logcpm),by=list(as.character(samtbl$SampleGroup)),FUN=mean) -row.names(tmp.tab) <- tmp.tab$Group.1 -mean.by.group <- round(log2(t(tmp.tab[,c(2:ncol(tmp.tab))])+1), digits = 2) - -#################### Run DESEQ2 ################################ -dds <- DESeq(dds) -rld <- rlogTransformation(dds, blind=TRUE) -sampleDists <- dist(t(assay(rld))) - -png(file="samples_heatmap.png",bg ="transparent",height=768,width=1024) -heatmap.2(as.matrix(sampleDists), col = bluered(100),RowSideColors = col.blocks,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) -dev.off() - -#Compare Samples using PCA -png(file="pca.png",bg ="transparent",height=768,width=1024) -print(plotPCA(rld, intgroup="SampleGroup"),col.hab=col.blocks) -dev.off() - -#Do all pairwise comparisons -contrast <- resultsNames(dds) -cond<- levels(colData(dds)$SampleGroup) -a <- length(cond)-1 -for (i in 1:a) { - for (j in 2:length(cond)) { - if (i == j) { - next - } else { - res <- as.data.frame(results(dds,contrast=c("SampleGroup",cond[i],cond[j]))) - res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) - output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') - output$rawP <- output$pvalue - output$logFC <- output$log2FoldChange - output$fdr <- output$padj - output$bonf <- p.adjust(output$rawP, method ='bonferroni') - write.table(output,file=paste(cond[i],'_',cond[j],'.deseq2.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') - filt.out <- na.omit(output[output$fdr < 0.05,]) - if (nrow(filt.out) > 2) { - subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] - gnames <- filt.out[c('ensembl','symbol')] - s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) - STREE <- hclust(dist(t(subset))) - zscores <- scale(t(subset)) - ngenes <- length(colnames(zscores)) - textscale <- (1/(ngenes/30)) - if (textscale > 1) { - textscale <-1 - } - if (textscale < 0.1) { - textscale <- 0.1 - } - png(file=paste(cond[i],'_',cond[j],'.heatmap.deseq2.png',sep=""),height=768,width=1024) - heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) - legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) - dev.off() - } - } - } -} - -###################### Run EdgeR ######################## -###################### Run QuSage ###################### - -MSIG.geneSets <- read.gmt('geneset.gmt') - -design <- model.matrix(~grps) -d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) -d <- calcNormFactors(d) -d <- estimateCommonDisp(d) -png(file="mds.png",bg ="transparent",height=768,width=1024) -plotMDS(d, labels=grps,col=col.blocks) -legend("topleft",legend=grpnames,col=rainbow(length(grpnames)),pch=20) -dev.off() -cond <-levels(d$samples$group) -colnames(design) <- levels(d$samples$group) -a <- length(cond)-1 -for (i in 1:a) { - for (j in 2:length(cond)) { - if (i == j) { - next - } else { - c <- exactTest(d, pair=c(cond[j],cond[i])) - res <- c$table - res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) - output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') - output$rawP <- output$PValue - output$logFC <- output$logFC - output$fdr <- p.adjust(output$rawP, method ='fdr') - output$bonf <- p.adjust(output$rawP, method ='bonferroni') - write.table(output,file=paste(cond[i],'_',cond[j],'.edgeR.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') - filt.out <- na.omit(output[output$fdr < 0.05,]) - if (nrow(filt.out) > 2) { - subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] - gnames <- filt.out[c('ensembl','symbol')] - s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) - STREE <- hclust(dist(t(subset))) - zscores <- scale(t(subset)) - ngenes <- length(colnames(zscores)) - textscale <- (1/(ngenes/30)) - if (textscale > 1) { - textscale <-1 - } - if (textscale < 0.1) { - textscale <- 0.1 - } - png(file=paste(cond[i],'_',cond[j],'.heatmap.edgeR.png',sep=""),height=768,width=1024) - heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) - legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) - dev.off() - gcont <- paste(cond[j],cond[i],sep='-') - qs.results = qusage(logcpm, grps,gcont,MSIG.geneSets) - save(qs.results,file=paste(cond[i],'_',cond[j],'.qusage.rda',sep="")) - } - } - } -} - -###################### Run Limma VOOM ######################## - -# design <- model.matrix(~0+grps) -# colnames(design) <- grpnames -# a <- length(cond)-1 -# design.pairs <- c() -# k <- 0 -# for (i in 1:a) { -# for (j in 2:length(cond)) { -# if (i == j) { -# next -# } else { -# k <- k+1 -# design.pairs[k] <- paste(cond[i],'-',cond[j],sep='') -# } -# } -# } - -#contrast.matrix <- makeContrasts(design.pairs,levels=design) - -#d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) -#d <- calcNormFactors(d) -#d <- estimateCommonDisp(d) - -# v <- voom(d,design,plot=TRUE) -# fit <- lmFit(v,design) -# fit2 <- contrasts.fit(fit, contrast.matrix) -# fit2 <- eBayes(fit2) - -# comps <- colnames(fit2$coefficients) -# for (i in 1:length(comps)) { -# res <- topTable(fit2,coef=i,number=Inf,sort.by="P") -# res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) -# output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') -# output$rawP <- output$P.Value -# output$logFC <- output$adj.P.Val -# output$fdr <- p.adjust(output$rawP, method ='fdr') -# output$bonf <- p.adjust(output$rawP, method ='bonferroni') -# write.table(output,file=paste(cond[i],'_',cond[j],'.voom.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') -# filt.out <- na.omit(output[output$fdr < 0.05,]) -# if (nrow(filt.out) > 2) { -# subset <- logcpm[row.names(logcpm) %in% filt.out$ensembl,] -# gnames <- filt.out[c('ensembl','symbol')] -# s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) -# STREE <- hclust(dist(t(subset))) -# zscores <- scale(t(subset)) -# ngenes <- length(colnames(zscores)) -# textscale <- (1/(ngenes/30)) -# if (textscale > 1) { -# textscale <-1 -# } -# if (textscale < 0.1) { -# textscale <- 0.1 -# } -# png(file=paste(cond[i],'_',cond[j],'.heatmap.voom.png',sep=""),height=768,width=1024) -# heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) -# legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) -# dev.off() -# } -# } - - diff --git a/diff_exp/DEA_htseq.R b/diff_exp/DEA_htseq.R new file mode 120000 index 0000000000000000000000000000000000000000..c7e331c456c6c43b82300cb40a88762a0f849218 --- /dev/null +++ b/diff_exp/DEA_htseq.R @@ -0,0 +1 @@ +../genect_rnaseq/DEA_htseq.R \ No newline at end of file diff --git a/diff_exp/build_ballgown.R b/diff_exp/build_ballgown.R deleted file mode 100755 index b74bfa29b69e8c491e23545e0bf249cd781ef7f0..0000000000000000000000000000000000000000 --- a/diff_exp/build_ballgown.R +++ /dev/null @@ -1,16 +0,0 @@ -library(ballgown) - -args<-commandArgs(TRUE) - -sample_dirs <- args -samples <- gsub('_stringtie','',sample_dirs) -design <- "design.txt" - -samtbl <- read.table(file=design,header=TRUE,sep='\t') - -bg <- ballgown(samples=sample_dirs, meas='all') -samples <- gsub('_stringtie','',sampleNames(bg)) -mergetbl <- merge(as.data.frame(samples),samtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE) -pData(bg) = data.frame(id=samples, group=as.character(mergetbl$SampleGroup)) - -save(bg,file="bg.rda") diff --git a/diff_exp/build_ballgown.R b/diff_exp/build_ballgown.R new file mode 120000 index 0000000000000000000000000000000000000000..9a7a90160726d8e6ead4cbb016e7de28473fae53 --- /dev/null +++ b/diff_exp/build_ballgown.R @@ -0,0 +1 @@ +../genect_rnaseq/build_ballgown.R \ No newline at end of file diff --git a/diff_exp/concat_edgeR.pl b/diff_exp/concat_edgeR.pl deleted file mode 100755 index e736e822cc07a4225d19bdb936a7f6f3fc342055..0000000000000000000000000000000000000000 --- a/diff_exp/concat_edgeR.pl +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/perl -w -#concat_edgeR.pl - -#symbol chrom start end ensembl type logFC logCPM PValue monocytes neutrophils rawP fdr bonf - -my @files = @ARGV; -open OUT, ">edgeR.results.txt" or die $!; -print OUT join("\t",'G1','G2','ENSEMBL','SYMBOL','TYPE','logFC','logCPM', - 'pval','fdr','G1.Mean','G2.Mean'),"\n"; -foreach $f (@files) { - open IN, "<$f" or die $!; - my ($fname,$ext) = split(/\.edgeR\./,$f); - $fname =~ s/edgeR\///; - my ($g1,$g2) = split(/_/,$fname); - my $head = <IN>; - chomp($head); - my @colnames = split(/\t/,$head); - while (my $line = <IN>) { - chomp($line); - my @row = split(/\t/,$line); - my %hash; - foreach my $i (0..$#row) { - $hash{$colnames[$i]} = $row[$i]; - } - print OUT join("\t",$g1,$g2,$hash{ensembl},$hash{symbol},$hash{type}, - sprintf("%.2f",$hash{logFC}), - sprintf("%.2f",$hash{logCPM}), - sprintf("%.1e",$hash{rawP}),sprintf("%.1e",$hash{fdr}), - sprintf("%.0f",$hash{$g1}),sprintf("%.0f",$hash{$g2})),"\n"; - } -} diff --git a/diff_exp/concat_edgeR.pl b/diff_exp/concat_edgeR.pl new file mode 120000 index 0000000000000000000000000000000000000000..f6a17c415097ac34a5301c6452e7fc6a743efe26 --- /dev/null +++ b/diff_exp/concat_edgeR.pl @@ -0,0 +1 @@ +../genect_rnaseq/concat_edgeR.pl \ No newline at end of file diff --git a/diff_exp/dea.R b/diff_exp/dea.R deleted file mode 100755 index 058dd1817f6d448baf3a9c84fdd22f8fc197ad3b..0000000000000000000000000000000000000000 --- a/diff_exp/dea.R +++ /dev/null @@ -1,236 +0,0 @@ -#!/cm/shared/apps/R/intel/3.2.1/bin/Rscript -library(edgeR) -library(DESeq2) -library("RColorBrewer") -library("gplots") -library(qusage) - -rowMax <- function(x) apply(x,1,max) -col.grp <- function(n,b) { - colorvec <- vector(mode="character", length=length(n)) - plotcols <- rainbow(length(b)) - for (i in 1:length(n)) { - for (j in 1:length(b)) { - if ( n[i] == b[j] ) { - colorvec[i] = plotcols[j] - } - } - } - c(colorvec) -} - - -#################### Read in Data ################################ -genenames <- read.table(file="genenames.txt",header=TRUE,sep='\t') - -tbl <- read.table('countTable.txt',header=TRUE,sep="\t") -tbl2 <- read.table('countTable.logCPM.txt',header=TRUE,sep="\t") -ct <- tbl[,4:length(tbl)] -row.names(ct) <- tbl$ENSEMBL - -samples<- names(ct) - -dtbl <- read.table('design.txt',header=TRUE,sep="\t") -samtbl <- merge(as.data.frame(samples),dtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE) -grpnames <- levels(factor(as.character(samtbl$SampleGroup))) -samtbl$SampleGroup <- factor(samtbl$SampleGroup, levels=grpnames) - -colData <- samtbl[c('SampleGroup','SubjectID')] -row.names(colData) <- samtbl$samples - -dds <- DESeqDataSetFromMatrix(countData=ct,colData= colData,design= ~ SampleGroup) -dds <- dds[ rowMax(counts(dds)) > 30, ] -dds <- dds[ colSums(counts(dds)) > 1000000] - -if (nrow(counts(dds)) < 1) { -print(paste("Samples are filtered if there is < 1M reads. There are less than no remaining sample(s) after this filter",sep=' ')) -q() -} - -countTable <- counts(dds) -#grps <- as.character(levels(colData(dds)$SampleGroup)) -grps <- as.character(colData(dds)$SampleGroup) -col.blocks <-col.grp(grps,levels(factor(grps))) -libSizes <- as.vector(colSums(countTable)) - -logcpm <- tbl2[,4:length(tbl2)] -row.names(logcpm) <- tbl2$SYMBOL - -tmp.tab <- aggregate(t(logcpm),by=list(as.character(samtbl$SampleGroup)),FUN=mean) -row.names(tmp.tab) <- tmp.tab$Group.1 -mean.by.group <- round(log2(t(tmp.tab[,c(2:ncol(tmp.tab))])+1), digits = 2) - -#################### Run DESEQ2 ################################ -dds <- DESeq(dds) -rld <- rlogTransformation(dds, blind=TRUE) -sampleDists <- dist(t(assay(rld))) - -png(file="samples_heatmap.png",bg ="transparent",height=768,width=1024) -heatmap.2(as.matrix(sampleDists), col = bluered(100),RowSideColors = col.blocks,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) -dev.off() - -#Compare Samples using PCA -png(file="pca.png",bg ="transparent",height=768,width=1024) -print(plotPCA(rld, intgroup="SampleGroup"),col.hab=col.blocks) -dev.off() - -#Do all pairwise comparisons -contrast <- resultsNames(dds) -cond<- levels(colData(dds)$SampleGroup) -a <- length(cond)-1 -for (i in 1:a) { - for (j in 2:length(cond)) { - if (i == j) { - next - } else { - res <- as.data.frame(results(dds,contrast=c("SampleGroup",cond[i],cond[j]))) - res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) - output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') - output$rawP <- output$pvalue - output$logFC <- output$log2FoldChange - output$fdr <- output$padj - output$bonf <- p.adjust(output$rawP, method ='bonferroni') - write.table(output,file=paste(cond[i],'_',cond[j],'.deseq2.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') - filt.out <- na.omit(output[output$fdr < 0.05,]) - if (nrow(filt.out) > 2) { - subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] - gnames <- filt.out[c('ensembl','symbol')] - s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) - STREE <- hclust(dist(t(subset))) - zscores <- scale(t(subset)) - ngenes <- length(colnames(zscores)) - textscale <- (1/(ngenes/30)) - if (textscale > 1) { - textscale <-1 - } - if (textscale < 0.1) { - textscale <- 0.1 - } - png(file=paste(cond[i],'_',cond[j],'.heatmap.deseq2.png',sep=""),height=768,width=1024) - heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) - legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) - dev.off() - } - } - } -} - -###################### Run EdgeR ######################## -###################### Run QuSage ###################### - -MSIG.geneSets <- read.gmt('geneset.gmt') - -design <- model.matrix(~grps) -d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) -d <- calcNormFactors(d) -d <- estimateCommonDisp(d) -png(file="mds.png",bg ="transparent",height=768,width=1024) -plotMDS(d, labels=grps,col=col.blocks) -legend("topleft",legend=grpnames,col=rainbow(length(grpnames)),pch=20) -dev.off() -cond <-levels(d$samples$group) -colnames(design) <- levels(d$samples$group) -a <- length(cond)-1 -for (i in 1:a) { - for (j in 2:length(cond)) { - if (i == j) { - next - } else { - c <- exactTest(d, pair=c(cond[j],cond[i])) - res <- c$table - res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) - output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') - output$rawP <- output$PValue - output$logFC <- output$logFC - output$fdr <- p.adjust(output$rawP, method ='fdr') - output$bonf <- p.adjust(output$rawP, method ='bonferroni') - write.table(output,file=paste(cond[i],'_',cond[j],'.edgeR.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') - filt.out <- na.omit(output[output$fdr < 0.05,]) - if (nrow(filt.out) > 2) { - subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] - gnames <- filt.out[c('ensembl','symbol')] - s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) - STREE <- hclust(dist(t(subset))) - zscores <- scale(t(subset)) - ngenes <- length(colnames(zscores)) - textscale <- (1/(ngenes/30)) - if (textscale > 1) { - textscale <-1 - } - if (textscale < 0.1) { - textscale <- 0.1 - } - png(file=paste(cond[i],'_',cond[j],'.heatmap.edgeR.png',sep=""),height=768,width=1024) - heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) - legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) - dev.off() - gcont <- paste(cond[j],cond[i],sep='-') - qs.results = qusage(logcpm, grps,gcont,MSIG.geneSets) - save(qs.results,file=paste(cond[i],'_',cond[j],'.qusage.rda',sep="")) - } - } - } -} - -###################### Run Limma VOOM ######################## - -# design <- model.matrix(~0+grps) -# colnames(design) <- grpnames -# a <- length(cond)-1 -# design.pairs <- c() -# k <- 0 -# for (i in 1:a) { -# for (j in 2:length(cond)) { -# if (i == j) { -# next -# } else { -# k <- k+1 -# design.pairs[k] <- paste(cond[i],'-',cond[j],sep='') -# } -# } -# } - -#contrast.matrix <- makeContrasts(design.pairs,levels=design) - -#d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) -#d <- calcNormFactors(d) -#d <- estimateCommonDisp(d) - -# v <- voom(d,design,plot=TRUE) -# fit <- lmFit(v,design) -# fit2 <- contrasts.fit(fit, contrast.matrix) -# fit2 <- eBayes(fit2) - -# comps <- colnames(fit2$coefficients) -# for (i in 1:length(comps)) { -# res <- topTable(fit2,coef=i,number=Inf,sort.by="P") -# res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) -# output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') -# output$rawP <- output$P.Value -# output$logFC <- output$adj.P.Val -# output$fdr <- p.adjust(output$rawP, method ='fdr') -# output$bonf <- p.adjust(output$rawP, method ='bonferroni') -# write.table(output,file=paste(cond[i],'_',cond[j],'.voom.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') -# filt.out <- na.omit(output[output$fdr < 0.05,]) -# if (nrow(filt.out) > 2) { -# subset <- logcpm[row.names(logcpm) %in% filt.out$ensembl,] -# gnames <- filt.out[c('ensembl','symbol')] -# s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) -# STREE <- hclust(dist(t(subset))) -# zscores <- scale(t(subset)) -# ngenes <- length(colnames(zscores)) -# textscale <- (1/(ngenes/30)) -# if (textscale > 1) { -# textscale <-1 -# } -# if (textscale < 0.1) { -# textscale <- 0.1 -# } -# png(file=paste(cond[i],'_',cond[j],'.heatmap.voom.png',sep=""),height=768,width=1024) -# heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) -# legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) -# dev.off() -# } -# } - - diff --git a/diff_exp/dea.R b/diff_exp/dea.R new file mode 120000 index 0000000000000000000000000000000000000000..3c564e6b4a23f0fcba9f3d0287b5ad404759b834 --- /dev/null +++ b/diff_exp/dea.R @@ -0,0 +1 @@ +../genect_rnaseq/dea.R \ No newline at end of file diff --git a/diff_exp/gencode_genename.pl b/diff_exp/gencode_genename.pl deleted file mode 100755 index f2a81f5a6b3214dfad1de511839c8a53b49a8024..0000000000000000000000000000000000000000 --- a/diff_exp/gencode_genename.pl +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/perl -w -#parse_gencode.pl - -open OUT, ">genenames.txt" or die $!; -print OUT join("\t",'chrom','start','end','ensembl','symbol','type'),"\n"; -my $gtf = shift @ARGV; -open GCODE, "<$gtf" or die $!; -while (my $line = <GCODE>) { - chomp($line); - next if ($line =~ m/^#/); - my ($chrom,$source,$feature,$start,$end,$filter,$phase,$frame,$info) = - split(/\t/,$line); - next unless ($feature eq 'gene'); - $info =~ s/\"//g; - my %hash; - foreach $a (split(/;\s*/,$info)) { - my ($key,$val) = split(/ /,$a); - $hash{$key} = $val; - } - $hash{gene_id} =~ s/\.\d+//; - print OUT join("\t",$chrom,$start,$end,$hash{gene_id},$hash{gene_name},$hash{gene_type}),"\n"; -} diff --git a/diff_exp/gencode_genename.pl b/diff_exp/gencode_genename.pl new file mode 120000 index 0000000000000000000000000000000000000000..2cd704644eb079851e1fa46f634b0c4660f8f721 --- /dev/null +++ b/diff_exp/gencode_genename.pl @@ -0,0 +1 @@ +../genect_rnaseq/gencode_genename.pl \ No newline at end of file diff --git a/diff_exp/geneabundance.sh b/diff_exp/geneabundance.sh deleted file mode 100644 index e763ef7c10a6e3ec2f595fd1deb97f99d435b1c7..0000000000000000000000000000000000000000 --- a/diff_exp/geneabundance.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -#rnaseqalign.sh - -usage() { - echo "-h Help documentation for hisat.sh" - echo "-b --BAM File" - echo "-g --GTF File" - echo "-s --stranded" - echo "-p --Prefix for output file name" - echo "Example: bash geneabudance.sh genect_rnaseq/geneabundance.sh -s stranded -g gtf_file -p pair_id -b bam" - exit 1 -} -OPTIND=1 # Reset OPTIND -while getopts :b:g:p:s:h opt -do - case $opt in - b) sbam=$OPTARG;; - g) gtf=$OPTARG;; - p) pair_id=$OPTARG;; - s) stranded=$OPTARG;; - h) usage;; - esac -done - -shift $(($OPTIND -1)) - -# Check for mandatory options -if [[ -z $pair_id ]] || [[ -z $sbam ]] -then - usage -fi -if [[ -z $SLURM_CPUS_ON_NODE ]] -then - SLURM_CPUS_ON_NODE=1 -fi -if [[ $SLURM_CPUS_ON_NODE > 64 ]] -then - SLURM_CPUS_ON_NODE=64 -fi -source /etc/profile.d/modules.sh -module load subread/1.6.1 stringtie/1.3.2d-intel -featureCounts -s $stranded -M --fraction -J --ignoreDup -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} -mkdir ${pair_id}_stringtie -cd ${pair_id}_stringtie -stringtie ../${sbam} -p $SLURM_CPUS_ON_NODE -G ${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt - diff --git a/diff_exp/geneabundance.sh b/diff_exp/geneabundance.sh new file mode 120000 index 0000000000000000000000000000000000000000..831e0e8fbadd7a08da89004eaf6fe13319109849 --- /dev/null +++ b/diff_exp/geneabundance.sh @@ -0,0 +1 @@ +../genect_rnaseq/geneabundance.sh \ No newline at end of file diff --git a/genect_rnaseq/DEA_htseq.R b/genect_rnaseq/DEA_htseq.R new file mode 100755 index 0000000000000000000000000000000000000000..058dd1817f6d448baf3a9c84fdd22f8fc197ad3b --- /dev/null +++ b/genect_rnaseq/DEA_htseq.R @@ -0,0 +1,236 @@ +#!/cm/shared/apps/R/intel/3.2.1/bin/Rscript +library(edgeR) +library(DESeq2) +library("RColorBrewer") +library("gplots") +library(qusage) + +rowMax <- function(x) apply(x,1,max) +col.grp <- function(n,b) { + colorvec <- vector(mode="character", length=length(n)) + plotcols <- rainbow(length(b)) + for (i in 1:length(n)) { + for (j in 1:length(b)) { + if ( n[i] == b[j] ) { + colorvec[i] = plotcols[j] + } + } + } + c(colorvec) +} + + +#################### Read in Data ################################ +genenames <- read.table(file="genenames.txt",header=TRUE,sep='\t') + +tbl <- read.table('countTable.txt',header=TRUE,sep="\t") +tbl2 <- read.table('countTable.logCPM.txt',header=TRUE,sep="\t") +ct <- tbl[,4:length(tbl)] +row.names(ct) <- tbl$ENSEMBL + +samples<- names(ct) + +dtbl <- read.table('design.txt',header=TRUE,sep="\t") +samtbl <- merge(as.data.frame(samples),dtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE) +grpnames <- levels(factor(as.character(samtbl$SampleGroup))) +samtbl$SampleGroup <- factor(samtbl$SampleGroup, levels=grpnames) + +colData <- samtbl[c('SampleGroup','SubjectID')] +row.names(colData) <- samtbl$samples + +dds <- DESeqDataSetFromMatrix(countData=ct,colData= colData,design= ~ SampleGroup) +dds <- dds[ rowMax(counts(dds)) > 30, ] +dds <- dds[ colSums(counts(dds)) > 1000000] + +if (nrow(counts(dds)) < 1) { +print(paste("Samples are filtered if there is < 1M reads. There are less than no remaining sample(s) after this filter",sep=' ')) +q() +} + +countTable <- counts(dds) +#grps <- as.character(levels(colData(dds)$SampleGroup)) +grps <- as.character(colData(dds)$SampleGroup) +col.blocks <-col.grp(grps,levels(factor(grps))) +libSizes <- as.vector(colSums(countTable)) + +logcpm <- tbl2[,4:length(tbl2)] +row.names(logcpm) <- tbl2$SYMBOL + +tmp.tab <- aggregate(t(logcpm),by=list(as.character(samtbl$SampleGroup)),FUN=mean) +row.names(tmp.tab) <- tmp.tab$Group.1 +mean.by.group <- round(log2(t(tmp.tab[,c(2:ncol(tmp.tab))])+1), digits = 2) + +#################### Run DESEQ2 ################################ +dds <- DESeq(dds) +rld <- rlogTransformation(dds, blind=TRUE) +sampleDists <- dist(t(assay(rld))) + +png(file="samples_heatmap.png",bg ="transparent",height=768,width=1024) +heatmap.2(as.matrix(sampleDists), col = bluered(100),RowSideColors = col.blocks,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) +dev.off() + +#Compare Samples using PCA +png(file="pca.png",bg ="transparent",height=768,width=1024) +print(plotPCA(rld, intgroup="SampleGroup"),col.hab=col.blocks) +dev.off() + +#Do all pairwise comparisons +contrast <- resultsNames(dds) +cond<- levels(colData(dds)$SampleGroup) +a <- length(cond)-1 +for (i in 1:a) { + for (j in 2:length(cond)) { + if (i == j) { + next + } else { + res <- as.data.frame(results(dds,contrast=c("SampleGroup",cond[i],cond[j]))) + res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) + output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') + output$rawP <- output$pvalue + output$logFC <- output$log2FoldChange + output$fdr <- output$padj + output$bonf <- p.adjust(output$rawP, method ='bonferroni') + write.table(output,file=paste(cond[i],'_',cond[j],'.deseq2.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') + filt.out <- na.omit(output[output$fdr < 0.05,]) + if (nrow(filt.out) > 2) { + subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] + gnames <- filt.out[c('ensembl','symbol')] + s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) + STREE <- hclust(dist(t(subset))) + zscores <- scale(t(subset)) + ngenes <- length(colnames(zscores)) + textscale <- (1/(ngenes/30)) + if (textscale > 1) { + textscale <-1 + } + if (textscale < 0.1) { + textscale <- 0.1 + } + png(file=paste(cond[i],'_',cond[j],'.heatmap.deseq2.png',sep=""),height=768,width=1024) + heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) + legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) + dev.off() + } + } + } +} + +###################### Run EdgeR ######################## +###################### Run QuSage ###################### + +MSIG.geneSets <- read.gmt('geneset.gmt') + +design <- model.matrix(~grps) +d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) +d <- calcNormFactors(d) +d <- estimateCommonDisp(d) +png(file="mds.png",bg ="transparent",height=768,width=1024) +plotMDS(d, labels=grps,col=col.blocks) +legend("topleft",legend=grpnames,col=rainbow(length(grpnames)),pch=20) +dev.off() +cond <-levels(d$samples$group) +colnames(design) <- levels(d$samples$group) +a <- length(cond)-1 +for (i in 1:a) { + for (j in 2:length(cond)) { + if (i == j) { + next + } else { + c <- exactTest(d, pair=c(cond[j],cond[i])) + res <- c$table + res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) + output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') + output$rawP <- output$PValue + output$logFC <- output$logFC + output$fdr <- p.adjust(output$rawP, method ='fdr') + output$bonf <- p.adjust(output$rawP, method ='bonferroni') + write.table(output,file=paste(cond[i],'_',cond[j],'.edgeR.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') + filt.out <- na.omit(output[output$fdr < 0.05,]) + if (nrow(filt.out) > 2) { + subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] + gnames <- filt.out[c('ensembl','symbol')] + s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) + STREE <- hclust(dist(t(subset))) + zscores <- scale(t(subset)) + ngenes <- length(colnames(zscores)) + textscale <- (1/(ngenes/30)) + if (textscale > 1) { + textscale <-1 + } + if (textscale < 0.1) { + textscale <- 0.1 + } + png(file=paste(cond[i],'_',cond[j],'.heatmap.edgeR.png',sep=""),height=768,width=1024) + heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) + legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) + dev.off() + gcont <- paste(cond[j],cond[i],sep='-') + qs.results = qusage(logcpm, grps,gcont,MSIG.geneSets) + save(qs.results,file=paste(cond[i],'_',cond[j],'.qusage.rda',sep="")) + } + } + } +} + +###################### Run Limma VOOM ######################## + +# design <- model.matrix(~0+grps) +# colnames(design) <- grpnames +# a <- length(cond)-1 +# design.pairs <- c() +# k <- 0 +# for (i in 1:a) { +# for (j in 2:length(cond)) { +# if (i == j) { +# next +# } else { +# k <- k+1 +# design.pairs[k] <- paste(cond[i],'-',cond[j],sep='') +# } +# } +# } + +#contrast.matrix <- makeContrasts(design.pairs,levels=design) + +#d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) +#d <- calcNormFactors(d) +#d <- estimateCommonDisp(d) + +# v <- voom(d,design,plot=TRUE) +# fit <- lmFit(v,design) +# fit2 <- contrasts.fit(fit, contrast.matrix) +# fit2 <- eBayes(fit2) + +# comps <- colnames(fit2$coefficients) +# for (i in 1:length(comps)) { +# res <- topTable(fit2,coef=i,number=Inf,sort.by="P") +# res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) +# output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') +# output$rawP <- output$P.Value +# output$logFC <- output$adj.P.Val +# output$fdr <- p.adjust(output$rawP, method ='fdr') +# output$bonf <- p.adjust(output$rawP, method ='bonferroni') +# write.table(output,file=paste(cond[i],'_',cond[j],'.voom.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') +# filt.out <- na.omit(output[output$fdr < 0.05,]) +# if (nrow(filt.out) > 2) { +# subset <- logcpm[row.names(logcpm) %in% filt.out$ensembl,] +# gnames <- filt.out[c('ensembl','symbol')] +# s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) +# STREE <- hclust(dist(t(subset))) +# zscores <- scale(t(subset)) +# ngenes <- length(colnames(zscores)) +# textscale <- (1/(ngenes/30)) +# if (textscale > 1) { +# textscale <-1 +# } +# if (textscale < 0.1) { +# textscale <- 0.1 +# } +# png(file=paste(cond[i],'_',cond[j],'.heatmap.voom.png',sep=""),height=768,width=1024) +# heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) +# legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) +# dev.off() +# } +# } + + diff --git a/genect_rnaseq/build_ballgown.R b/genect_rnaseq/build_ballgown.R new file mode 100755 index 0000000000000000000000000000000000000000..b74bfa29b69e8c491e23545e0bf249cd781ef7f0 --- /dev/null +++ b/genect_rnaseq/build_ballgown.R @@ -0,0 +1,16 @@ +library(ballgown) + +args<-commandArgs(TRUE) + +sample_dirs <- args +samples <- gsub('_stringtie','',sample_dirs) +design <- "design.txt" + +samtbl <- read.table(file=design,header=TRUE,sep='\t') + +bg <- ballgown(samples=sample_dirs, meas='all') +samples <- gsub('_stringtie','',sampleNames(bg)) +mergetbl <- merge(as.data.frame(samples),samtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE) +pData(bg) = data.frame(id=samples, group=as.character(mergetbl$SampleGroup)) + +save(bg,file="bg.rda") diff --git a/genect_rnaseq/concat_edgeR.pl b/genect_rnaseq/concat_edgeR.pl new file mode 100755 index 0000000000000000000000000000000000000000..e736e822cc07a4225d19bdb936a7f6f3fc342055 --- /dev/null +++ b/genect_rnaseq/concat_edgeR.pl @@ -0,0 +1,31 @@ +#!/usr/bin/perl -w +#concat_edgeR.pl + +#symbol chrom start end ensembl type logFC logCPM PValue monocytes neutrophils rawP fdr bonf + +my @files = @ARGV; +open OUT, ">edgeR.results.txt" or die $!; +print OUT join("\t",'G1','G2','ENSEMBL','SYMBOL','TYPE','logFC','logCPM', + 'pval','fdr','G1.Mean','G2.Mean'),"\n"; +foreach $f (@files) { + open IN, "<$f" or die $!; + my ($fname,$ext) = split(/\.edgeR\./,$f); + $fname =~ s/edgeR\///; + my ($g1,$g2) = split(/_/,$fname); + my $head = <IN>; + chomp($head); + my @colnames = split(/\t/,$head); + while (my $line = <IN>) { + chomp($line); + my @row = split(/\t/,$line); + my %hash; + foreach my $i (0..$#row) { + $hash{$colnames[$i]} = $row[$i]; + } + print OUT join("\t",$g1,$g2,$hash{ensembl},$hash{symbol},$hash{type}, + sprintf("%.2f",$hash{logFC}), + sprintf("%.2f",$hash{logCPM}), + sprintf("%.1e",$hash{rawP}),sprintf("%.1e",$hash{fdr}), + sprintf("%.0f",$hash{$g1}),sprintf("%.0f",$hash{$g2})),"\n"; + } +} diff --git a/genect_rnaseq/dea.R b/genect_rnaseq/dea.R new file mode 100755 index 0000000000000000000000000000000000000000..058dd1817f6d448baf3a9c84fdd22f8fc197ad3b --- /dev/null +++ b/genect_rnaseq/dea.R @@ -0,0 +1,236 @@ +#!/cm/shared/apps/R/intel/3.2.1/bin/Rscript +library(edgeR) +library(DESeq2) +library("RColorBrewer") +library("gplots") +library(qusage) + +rowMax <- function(x) apply(x,1,max) +col.grp <- function(n,b) { + colorvec <- vector(mode="character", length=length(n)) + plotcols <- rainbow(length(b)) + for (i in 1:length(n)) { + for (j in 1:length(b)) { + if ( n[i] == b[j] ) { + colorvec[i] = plotcols[j] + } + } + } + c(colorvec) +} + + +#################### Read in Data ################################ +genenames <- read.table(file="genenames.txt",header=TRUE,sep='\t') + +tbl <- read.table('countTable.txt',header=TRUE,sep="\t") +tbl2 <- read.table('countTable.logCPM.txt',header=TRUE,sep="\t") +ct <- tbl[,4:length(tbl)] +row.names(ct) <- tbl$ENSEMBL + +samples<- names(ct) + +dtbl <- read.table('design.txt',header=TRUE,sep="\t") +samtbl <- merge(as.data.frame(samples),dtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE) +grpnames <- levels(factor(as.character(samtbl$SampleGroup))) +samtbl$SampleGroup <- factor(samtbl$SampleGroup, levels=grpnames) + +colData <- samtbl[c('SampleGroup','SubjectID')] +row.names(colData) <- samtbl$samples + +dds <- DESeqDataSetFromMatrix(countData=ct,colData= colData,design= ~ SampleGroup) +dds <- dds[ rowMax(counts(dds)) > 30, ] +dds <- dds[ colSums(counts(dds)) > 1000000] + +if (nrow(counts(dds)) < 1) { +print(paste("Samples are filtered if there is < 1M reads. There are less than no remaining sample(s) after this filter",sep=' ')) +q() +} + +countTable <- counts(dds) +#grps <- as.character(levels(colData(dds)$SampleGroup)) +grps <- as.character(colData(dds)$SampleGroup) +col.blocks <-col.grp(grps,levels(factor(grps))) +libSizes <- as.vector(colSums(countTable)) + +logcpm <- tbl2[,4:length(tbl2)] +row.names(logcpm) <- tbl2$SYMBOL + +tmp.tab <- aggregate(t(logcpm),by=list(as.character(samtbl$SampleGroup)),FUN=mean) +row.names(tmp.tab) <- tmp.tab$Group.1 +mean.by.group <- round(log2(t(tmp.tab[,c(2:ncol(tmp.tab))])+1), digits = 2) + +#################### Run DESEQ2 ################################ +dds <- DESeq(dds) +rld <- rlogTransformation(dds, blind=TRUE) +sampleDists <- dist(t(assay(rld))) + +png(file="samples_heatmap.png",bg ="transparent",height=768,width=1024) +heatmap.2(as.matrix(sampleDists), col = bluered(100),RowSideColors = col.blocks,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) +dev.off() + +#Compare Samples using PCA +png(file="pca.png",bg ="transparent",height=768,width=1024) +print(plotPCA(rld, intgroup="SampleGroup"),col.hab=col.blocks) +dev.off() + +#Do all pairwise comparisons +contrast <- resultsNames(dds) +cond<- levels(colData(dds)$SampleGroup) +a <- length(cond)-1 +for (i in 1:a) { + for (j in 2:length(cond)) { + if (i == j) { + next + } else { + res <- as.data.frame(results(dds,contrast=c("SampleGroup",cond[i],cond[j]))) + res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) + output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') + output$rawP <- output$pvalue + output$logFC <- output$log2FoldChange + output$fdr <- output$padj + output$bonf <- p.adjust(output$rawP, method ='bonferroni') + write.table(output,file=paste(cond[i],'_',cond[j],'.deseq2.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') + filt.out <- na.omit(output[output$fdr < 0.05,]) + if (nrow(filt.out) > 2) { + subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] + gnames <- filt.out[c('ensembl','symbol')] + s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) + STREE <- hclust(dist(t(subset))) + zscores <- scale(t(subset)) + ngenes <- length(colnames(zscores)) + textscale <- (1/(ngenes/30)) + if (textscale > 1) { + textscale <-1 + } + if (textscale < 0.1) { + textscale <- 0.1 + } + png(file=paste(cond[i],'_',cond[j],'.heatmap.deseq2.png',sep=""),height=768,width=1024) + heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) + legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) + dev.off() + } + } + } +} + +###################### Run EdgeR ######################## +###################### Run QuSage ###################### + +MSIG.geneSets <- read.gmt('geneset.gmt') + +design <- model.matrix(~grps) +d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) +d <- calcNormFactors(d) +d <- estimateCommonDisp(d) +png(file="mds.png",bg ="transparent",height=768,width=1024) +plotMDS(d, labels=grps,col=col.blocks) +legend("topleft",legend=grpnames,col=rainbow(length(grpnames)),pch=20) +dev.off() +cond <-levels(d$samples$group) +colnames(design) <- levels(d$samples$group) +a <- length(cond)-1 +for (i in 1:a) { + for (j in 2:length(cond)) { + if (i == j) { + next + } else { + c <- exactTest(d, pair=c(cond[j],cond[i])) + res <- c$table + res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) + output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') + output$rawP <- output$PValue + output$logFC <- output$logFC + output$fdr <- p.adjust(output$rawP, method ='fdr') + output$bonf <- p.adjust(output$rawP, method ='bonferroni') + write.table(output,file=paste(cond[i],'_',cond[j],'.edgeR.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') + filt.out <- na.omit(output[output$fdr < 0.05,]) + if (nrow(filt.out) > 2) { + subset <- logcpm[row.names(logcpm) %in% filt.out$symbol,] + gnames <- filt.out[c('ensembl','symbol')] + s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) + STREE <- hclust(dist(t(subset))) + zscores <- scale(t(subset)) + ngenes <- length(colnames(zscores)) + textscale <- (1/(ngenes/30)) + if (textscale > 1) { + textscale <-1 + } + if (textscale < 0.1) { + textscale <- 0.1 + } + png(file=paste(cond[i],'_',cond[j],'.heatmap.edgeR.png',sep=""),height=768,width=1024) + heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) + legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) + dev.off() + gcont <- paste(cond[j],cond[i],sep='-') + qs.results = qusage(logcpm, grps,gcont,MSIG.geneSets) + save(qs.results,file=paste(cond[i],'_',cond[j],'.qusage.rda',sep="")) + } + } + } +} + +###################### Run Limma VOOM ######################## + +# design <- model.matrix(~0+grps) +# colnames(design) <- grpnames +# a <- length(cond)-1 +# design.pairs <- c() +# k <- 0 +# for (i in 1:a) { +# for (j in 2:length(cond)) { +# if (i == j) { +# next +# } else { +# k <- k+1 +# design.pairs[k] <- paste(cond[i],'-',cond[j],sep='') +# } +# } +# } + +#contrast.matrix <- makeContrasts(design.pairs,levels=design) + +#d <- DGEList(counts=countTable,group=grps,lib.size=libSizes) +#d <- calcNormFactors(d) +#d <- estimateCommonDisp(d) + +# v <- voom(d,design,plot=TRUE) +# fit <- lmFit(v,design) +# fit2 <- contrasts.fit(fit, contrast.matrix) +# fit2 <- eBayes(fit2) + +# comps <- colnames(fit2$coefficients) +# for (i in 1:length(comps)) { +# res <- topTable(fit2,coef=i,number=Inf,sort.by="P") +# res2 <- merge(genenames,res,by.x='ensembl',by.y='row.names',all.y=TRUE,all.x=FALSE) +# output <- merge(res2,mean.by.group,by.y="row.names",by.x='symbol') +# output$rawP <- output$P.Value +# output$logFC <- output$adj.P.Val +# output$fdr <- p.adjust(output$rawP, method ='fdr') +# output$bonf <- p.adjust(output$rawP, method ='bonferroni') +# write.table(output,file=paste(cond[i],'_',cond[j],'.voom.txt',sep=""),quote=FALSE,row.names=FALSE,sep='\t') +# filt.out <- na.omit(output[output$fdr < 0.05,]) +# if (nrow(filt.out) > 2) { +# subset <- logcpm[row.names(logcpm) %in% filt.out$ensembl,] +# gnames <- filt.out[c('ensembl','symbol')] +# s <- merge(gnames,subset,by.x="ensembl",by.y="row.names",all.x=FALSE,all.y=TRUE,sort=FALSE) +# STREE <- hclust(dist(t(subset))) +# zscores <- scale(t(subset)) +# ngenes <- length(colnames(zscores)) +# textscale <- (1/(ngenes/30)) +# if (textscale > 1) { +# textscale <-1 +# } +# if (textscale < 0.1) { +# textscale <- 0.1 +# } +# png(file=paste(cond[i],'_',cond[j],'.heatmap.voom.png',sep=""),height=768,width=1024) +# heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,labCol=s$symbol,srtRow=45,srtCol=45,trace="none", margins=c(5, 5)) +# legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20,cex=0.5) +# dev.off() +# } +# } + + diff --git a/genect_rnaseq/gencode_genename.pl b/genect_rnaseq/gencode_genename.pl new file mode 100755 index 0000000000000000000000000000000000000000..f2a81f5a6b3214dfad1de511839c8a53b49a8024 --- /dev/null +++ b/genect_rnaseq/gencode_genename.pl @@ -0,0 +1,22 @@ +#!/usr/bin/perl -w +#parse_gencode.pl + +open OUT, ">genenames.txt" or die $!; +print OUT join("\t",'chrom','start','end','ensembl','symbol','type'),"\n"; +my $gtf = shift @ARGV; +open GCODE, "<$gtf" or die $!; +while (my $line = <GCODE>) { + chomp($line); + next if ($line =~ m/^#/); + my ($chrom,$source,$feature,$start,$end,$filter,$phase,$frame,$info) = + split(/\t/,$line); + next unless ($feature eq 'gene'); + $info =~ s/\"//g; + my %hash; + foreach $a (split(/;\s*/,$info)) { + my ($key,$val) = split(/ /,$a); + $hash{$key} = $val; + } + $hash{gene_id} =~ s/\.\d+//; + print OUT join("\t",$chrom,$start,$end,$hash{gene_id},$hash{gene_name},$hash{gene_type}),"\n"; +} diff --git a/genect_rnaseq/geneabundance.sh b/genect_rnaseq/geneabundance.sh new file mode 100644 index 0000000000000000000000000000000000000000..e763ef7c10a6e3ec2f595fd1deb97f99d435b1c7 --- /dev/null +++ b/genect_rnaseq/geneabundance.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#rnaseqalign.sh + +usage() { + echo "-h Help documentation for hisat.sh" + echo "-b --BAM File" + echo "-g --GTF File" + echo "-s --stranded" + echo "-p --Prefix for output file name" + echo "Example: bash geneabudance.sh genect_rnaseq/geneabundance.sh -s stranded -g gtf_file -p pair_id -b bam" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :b:g:p:s:h opt +do + case $opt in + b) sbam=$OPTARG;; + g) gtf=$OPTARG;; + p) pair_id=$OPTARG;; + s) stranded=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $sbam ]] +then + usage +fi +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi +if [[ $SLURM_CPUS_ON_NODE > 64 ]] +then + SLURM_CPUS_ON_NODE=64 +fi +source /etc/profile.d/modules.sh +module load subread/1.6.1 stringtie/1.3.2d-intel +featureCounts -s $stranded -M --fraction -J --ignoreDup -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} +mkdir ${pair_id}_stringtie +cd ${pair_id}_stringtie +stringtie ../${sbam} -p $SLURM_CPUS_ON_NODE -G ${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt + diff --git a/genect_rnaseq/statanal.sh b/genect_rnaseq/statanal.sh index e221f1b8bd7b0fa72964cbf07a515065cef81988..ad5cab388e0f1edbe81cb8af30c4515dc6b6d14a 100644 --- a/genect_rnaseq/statanal.sh +++ b/genect_rnaseq/statanal.sh @@ -17,6 +17,7 @@ do done shift $(($OPTIND -1)) +baseDir="`dirname \"$0\"`" # Check for mandatory options if [[ -z $SLURM_CPUS_ON_NODE ]] @@ -25,9 +26,9 @@ then fi source /etc/profile.d/modules.sh -perl $baseDir/scripts/concat_cts.pl -o ./ *.cts -perl $baseDir/scripts/concat_fpkm.pl -o ./ *.fpkm.txt -perl $baseDir/scripts/concat_ctsum.pl -o ./ *.cts.summary +perl $baseDir/concat_cts.pl -o ./ *.cts +perl $baseDir/concat_fpkm.pl -o ./ *.fpkm.txt +perl $baseDir/concat_ctsum.pl -o ./ *.cts.summary cp design.txt design.shiny.txt cp geneset.gmt geneset.shiny.gmt @@ -37,7 +38,7 @@ then touch bg.rda else module load R/3.2.1-intel - Rscript $baseDir/scripts/dea.R - Rscript $baseDir/scripts/build_ballgown.R *_stringtie - perl $baseDir/scripts/concat_edgeR.pl *.edgeR.txt + Rscript $baseDir/dea.R + Rscript $baseDir/build_ballgown.R *_stringtie + perl $baseDir/concat_edgeR.pl *.edgeR.txt fi