Skip to content
Snippets Groups Projects
lca.R 4.72 KiB
Newer Older
Brandi Cantarel's avatar
Brandi Cantarel committed
#!/usr/bin/Rscript

library(R2jags)
library(getopt)
library(reshape2)

discard = runif(1) # fixes this error: 'Error in set.seed() : argument "seed" is missing, with no default'

# get options, using the spec as defined by the enclosed list.
# we read the options from the default: commandArgs(TRUE).
spec = matrix(c(
'verbose', 'v', 2, "logical",
'countsFile', 'c', 1, "character",
'statsOutFile', 's', 1, "character",
'help' , 'h', 0, "logical"
), byrow=TRUE, ncol=4);
opt = getopt(spec);

# if help was asked for print a friendly message and exit with a non-zero error code
if ( !is.null(opt$help) ) {
  cat(getopt(spec, usage=TRUE));
  q(status=1);
}

if ( is.null(opt$countsFile ) ) { 
  cat("please specify a --countsFile\n")
  cat(getopt(spec, usage=TRUE));
  q(status=1);
}

if ( is.null(opt$statsOutFile ) ) { 
  cat("please specify a --statsOutFile\n")
  cat(getopt(spec, usage=TRUE));
  q(status=1);
}

########################################
########################################

# This function generates a JAGS file, to be written out and used by R2Jags
# The JAGS file changes depending on the number of SNP calls
generateJagsFile <- function(numberOfSnpCallSets, outJagsFile) {

	 # numberOfSnpCallSets is the number of VCF files the user gives us

jagCode = 
"model {

  theta ~ dbeta(1,2);

  for (i in 1:N) {
    fp[i] ~ dbeta(1,2);
    fn[i] ~ dbeta(1,2);
    tp[i] <- 1-fn[i];
    tn[i] <- 1-fp[i];
  }

";

  jagCode = paste(jagCode, "for (j in 1:M) {\n\tmargprobs[j] <- theta * \n" )
  for (i in 1:numberOfSnpCallSets){ 
     if ( i == numberOfSnpCallSets ){
       jagCode = paste(jagCode, "\t(fn[", i, "]^(1-x[j,", i, "])) * (tp[", i, "]^x[j,", i, "]) \n", sep="") 
     }
     else {
       jagCode = paste(jagCode, "\t(fn[", i, "]^(1-x[j,", i, "])) * (tp[", i, "]^x[j,", i, "]) *\n", sep="") 
     }
  }
  jagCode = paste(jagCode, "\t+ (1-theta) * \n")
  for (i in 1:numberOfSnpCallSets){ 
     if ( i == numberOfSnpCallSets ){
          jagCode = paste(jagCode, "\t(tn[", i, "]^(1-x[j,", i, "])) * (fp[", i, "]^x[j,", i, "]);\n\n", sep="")
     }
     else {
          jagCode = paste(jagCode, "\t(tn[", i, "]^(1-x[j,", i, "])) * (fp[", i, "]^x[j,", i, "]) *\n", sep="")
     }
  }

  jagCode = paste(jagCode, "\tpostprobs[j] <- theta * \n", sep="")

  for (i in 1:numberOfSnpCallSets){ 
     if ( i == numberOfSnpCallSets ){
          jagCode = paste(jagCode, "\t(fn[", i, "]^(1-x[j,", i, "])) * (tp[", i, "]^x[j,", i, "])\n\t/ margprobs[j];\n", sep="")
     }
     else {
          jagCode = paste(jagCode, "\t(fn[", i, "]^(1-x[j,", i, "])) * (tp[", i, "]^x[j,", i, "]) *\n", sep="")
     }
  }
  jagCode = paste(jagCode, "}

counts ~ dmulti(margprobs, total);
}", sep="")

  fileConn<-file(outJagsFile)
  writeLines(jagCode, fileConn)
  close(fileConn)

}

########################################
########################################

# read in counts file
raw.data = read.table(opt$countsFile, header=F, skip=1 )

# counts <- c(2942808473, 17491655, 21576, 23189, 339805, 89159,
#             168214, 76044, 43138288, 530963, 22682, 22169,
#             462052, 129472, 2804257, 3454104);

counts = raw.data[,length(raw.data)]
M <- length(counts)
N <- log(M, base=2)
total <- sum(counts);

# make matrix of all possible 0/1 values for each SNP caller
cmdstring =  paste( "expand.grid(", paste( rep("c(0,1)", N), collapse = ", " ), ")", sep="")
x <- eval( parse(text = cmdstring ) )

jags.data <- c("N", "M", "x", "counts", "total");
jags.params <- c("theta", "fp", "fn", "margprobs", "postprobs");

jags.inits <- NULL;

# write out lca.jags file 
tempJagFile = tempfile();
generateJagsFile(N, tempJagFile);

jagsfit1 <- jags(data=jags.data, inits=jags.inits, param=jags.params, DIC=FALSE,
                                n.chains=1, n.iter=120000, n.thin=10, n.burnin=10000,
                                model.file=tempJagFile)

mcmc <- as.mcmc(jagsfit1);

# below is what Aaron was doing before, but I'm just going to write out the whole summary statistics file
# print(params <- summary(mcmc)$statistics[c(1:8,41),1, drop=F])
# print(pp <- summary(mcmc)$statistics[25:40,1, drop=F])
# print(cbind(x, pp, pp >= 0.5))

# write out summary statistics
summaryStats = summary(mcmc)$statistics

idx <- 2*N + 1 # skip the fp/fn parameters
data <- cbind(x, cts = counts, margprobs = summary(mcmc)$statistics[idx:(idx+M-1), 1])

bvrs <- matrix(NA, ncol=N, nrow=N);

for (i in 2:N) {
  for (j in 1:(i-1)) {
    obs <- dcast(data[,i]+data[,j] ~ ., sum, value.var="cts", data=data)[,3]
    exp <- dcast(data[,i]+data[,j] ~ ., sum, value.var="margprobs", data=data)[,3]
    bvrs[i,j] <- chisq.test(obs, p=exp, rescale.p=T)$statistic / 3 # BVR == Chi-square/d.f.

  }
}

sink(opt$statsOutFile)
cat("==== Summary Statistics ====\n")
summaryStats
cat("==== Bivariate residuals ====\n")
bvrs
sink()