call-transcripts.R

#!/bin/Rscript

# Load libraries
if(!require("argparse")){
    install.packages("argparse", repos="http://cran.r-project.org")
    library(argparse)
}
if(!require("jsonlite")){
    install.packages("jsonlite", repos="http://cran.r-project.org")
    library("jsonlite")
}
source("http://bioconductor.org/biocLite.R")
if (!require("groHMM")){
    biocLite("groHMM")
    library("groHMM")
}
if (!require("GenomicFeatures")){
    biocLite("GenomicFeatures")
    library("GenomicFeatures")
}
if (!require("rtracklayer")){
    biocLite("rtracklayer")
    library("rtracklayer")
}
if(!require("jsonlite")){
    install.packages("jsonlite", repos="http://cran.r-project.org")
    library("jsonlite")
}

# Currently mouse or human
if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){
    biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
    library("TxDb.Hsapiens.UCSC.hg19.knownGene")
}
if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){
    biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene")
    library("TxDb.Mmusculus.UCSC.mm10.knownGene")
}
if (!require("org.Hs.eg.db")){
    biocLite("org.Hs.eg.db")
    library("org.Hs.eg.db")
}
if (!require("org.Mm.eg.db")){
    biocLite("org.Mm.eg.db")
    library("org.Mm.eg.db")
}

# Create parser object
parser <- ArgumentParser()

# Specify our desired options
parser$add_argument("-r1", "--replicate1", nargs=1, help = "File paths to Replicate 1", required = TRUE)
parser$add_argument("-g", "--genome", help = "The ucsc genome", required = TRUE)
parser$add_argument("-o", "--out", help = "The output directory", required = TRUE)
parser$add_argument("-b", "--ltprob", type="integer", help = "The LtsProbB", required = TRUE)
parser$add_argument("-u", "--uts", type="integer", help = "The UTS", required = TRUE)

# Parse arguments
args <- parser$parse_args()

# Set the working directory to output directory
setwd(args$out)

# Set mc.cores to 4
options(mc.cores=getCores(4))

# Load alignment files
alignment_1 <- as(readGAlignments(args$replicate1), "GRanges")
alignments <- sort(c(alignment_1))

# Load UCSC Genes
if(args$genome=='hg19') {
    kgdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
    org <- org.Hs.eg.db
} else if(args$genome=='mm10')  {
    kgdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
    org <- org.Mm.eg.db
}
kgtx <- transcripts(kgdb,columns=c("gene_id", "tx_id", "tx_name"))

# Collapse overlapping annotations
kgConsensus <- makeConsensusAnnotations(kgtx, keytype="gene_id",
        mc.cores=getOption("mc.cores"))
map <- select(org, keys=unlist(mcols(kgConsensus)$gene_id),
        columns=c("SYMBOL"), keytype=c("ENTREZID"))
mcols(kgConsensus)$symbol <- map$SYMBOL
mcols(kgConsensus)$type <- "gene"

# Detect transcripts
hmmResult <- detectTranscripts(alignments, LtProbB=args$ltprob, UTS=args$uts)
txHMM <- hmmResult$transcripts

getExpressedAnnotations <- function(features, reads) {
       fLimit <- limitToXkb(features)
       count <- countOverlaps(fLimit, reads)
       features <- features[count!=0,]
       return(features[(quantile(width(features), .05) < width(features))
      & (width(features) < quantile(width(features), .95)),])}

conExpressed <- getExpressedAnnotations(features=kgConsensus,reads=alignments)

# Plot Density
png('transcript-density-plot.png') # TODO: Fix plot output
td <- getTxDensity(txHMM, conExpressed, mc.cores=getOption("mc.cores"))
u <- par("usr")
lines(c(u[1], 0, 0, 1000, 1000, u[2]), c(0,0,u[4]-.04,u[4]-.04,0,0),col="red")
legend("topright", lty=c(1,1), col=c(2,1), c("ideal", "groHMM"))
text(c(-500,500), c(.05,.5), c("FivePrimeFP", "TP"))
dev.off()
td_JSON <- toJSON(td, null='null',auto_unbox = TRUE, pretty=TRUE)
write(td_JSON,"transcript-density-quality-metrics.json", ncolumns = 1, append = FALSE, sep = " ")

# Repairing called transcripts
break_plus <- breakTranscriptsOnGenes(txHMM, kgConsensus, strand="+")
break_minus <- breakTranscriptsOnGenes(txHMM, kgConsensus, strand="-")
txBroken <- c(break_plus, break_minus)
txFinal <- combineTranscripts(txBroken, kgConsensus)
export(txFinal, con="final-transcripts.bed")