tune-hmm.R

#!/bin/Rscript

# Load libraries
if(!require("optparse")){
    install.packages("optparse", repos="http://cran.r-project.org")
    library("optparse")
}
source("http://bioconductor.org/biocLite.R")
if (!require("groHMM")){
    biocLite("groHMM")
    library("groHMM")
}
if (!require("GenomicFeatures")){
    biocLite("GenomicFeatures")
    library("GenomicFeatures")
}

# Currently mouse or human
if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){
    biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
    library("TxDb.Hsapiens.UCSC.hg19.knownGene")
}
if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){
    biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene")
    library("TxDb.Mmusculus.UCSC.mm10.knownGene")
}
if (!require("org.Hs.eg.db")){
    biocLite("org.Hs.eg.db")
    library("org.Hs.eg.db")
}
if (!require("org.Mm.eg.db")){
    biocLite("org.Mm.eg.db")
    library("org.Mm.eg.db")
}

# Specify our desired options
option_list <- list(
    make_option(c("-a1", "--alignment1"), action="store", type='character', help = "File path to Replicate 1"),
    make_option(c("-a2", "--alignment2"), action="store", type='character', help = "File path to Replicate 2"),
    make_option(c("-g", "--genome"), action="store", type='character', help = "The UCSC genome to use"),
    make_option(c("-o", "--out"), action="store", type='character', help = "The output directory")
    )

# Parse arguments
args = parse_args(OptionParser(option_list=option_list))

# Set the working directory to output directory
setwd(args$out)

# Set mc.cores to 1
options(mc.cores=getCores(1))

# Load alignment files
alignment_1 <- as(readGAlignments(args$alignment1), "GRanges")
alignments <- sort(c(alignment_1))

# Load UCSC Genes
if(args$genome=='hg19') {
    kgdb <- loadDb("/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/GRO-seq/gencode.sqlite")
} else if(args$genome=='mm10')  {
    kgdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
    org <- org.Mm.eg.db
}

kgtx <- transcripts(kgdb,columns=c("gene_id", "tx_id", "tx_name"))

# Collapse overlapping annotations
kgConsensus <- makeConsensusAnnotations(kgtx, keytype="gene_id",
        mc.cores=getOption("mc.cores"))
#map <- select(org, keys=unlist(mcols(kgConsensus)$gene_id),
#        columns=c("SYMBOL"), keytype=c("ENTREZID"))
#mcols(kgConsensus)$symbol <- map$SYMBOL
#mcols(kgConsensus)$type <- "gene"


# Tune HMM
tune <- data.frame(LtProbB=c(rep(-100,9), rep(-150,9), rep(-200,9), rep(-250,9), rep(-300,9),rep(-350,9),rep(-400,9) ),
           UTS=rep(c(5,10,15,20,25,30,35,40,45), 7))

evals <- mclapply(seq_len(63), function(x) {
        hmm <- detectTranscripts(reads=alignments, LtProbB=tune$LtProbB[x], UTS=tune$UTS[x])
        e <- evaluateHMMInAnnotations(hmm$transcripts, kgConsensus)
        e$eval
        }, mc.cores=getOption("mc.cores"),  mc.silent=TRUE)
tune <- cbind(tune, do.call(rbind, evals))


write.table(tune,file="tune.tsv", quote=F, sep="\t", row.names=F, col.names=T)