Commit 66071e02 authored by zhanxw's avatar zhanxw

fix test lib/test to incorporate htslib

parent a3192827
......@@ -14,128 +14,33 @@
#include "third/htslib/include/htslib/kstring.h"
#include "third/htslib/include/htslib/vcf.h"
// KSTREAM_INIT(gzFile, gzread, 4096)
// typedef struct {
// gzFile fp;
// FILE *fpout;
// kstream_t *ks;
// void *refhash;
// kstring_t line;
// int max_ref;
// } vcf_t;
/**
* Adopt some functions from vcf.c
*/
// extern "C" {
// void *bcf_str2id_init();
// extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
// }
// static void my_write_header(bcf_hdr_t *h);
htsFile *my_vcf_open(const char *fn, const char *mode) {
if (strchr(mode, 'b')) return hts_open(fn, mode);
htsFile *bp; // bcf file handle
htsFile *v; // vcdf file handle
// write uncompress vcf as outputs
// bp = (htsFile *)calloc(1, sizeof(htsFile));
// v = (htsFile *)calloc(1, sizeof(htsFile));
// bp->is_vcf = 1;
// bp->v = v;
// v->refhash = bcf_str2id_init();
// if (strchr(mode, 'r')) {
// v->fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
// v->ks = ks_init(v->fp);
// } else if (strchr(mode, 'w')) {
// // disable open stdout/external file
// // v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
// v->fpout = NULL;
// }
return hts_open(fn, mode);
return bp;
}
#define my_vcf_read vcf_read
// vcf_hdr_read
// vcf_hdr_write(bout, hout, &header);
int my_vcf_hdr_write(htsFile *bp, const bcf_hdr_t *h, std::string *hdr) {
// vcf_t *v = (vcf_t*)bp->v;
int i, has_ver = 0;
// if (!bp->is_vcf) {
// fprintf(stderr, "Something is wrong when reading BCF header at %s:%d\n",
// __FILE__, __LINE__);
// return bcf_hdr_write(bp, h);
// }
if (hts_get_format(bp)->format != bcf) {
fprintf(stderr, "Something is wrong when reading BCF header at %s:%d\n",
__FILE__, __LINE__);
}
std::string &s = *hdr;
// copied from htslib vcf.c
kstring_t htxt = {0, 0, 0};
bcf_hdr_format(h, 0, &htxt);
while (htxt.l && htxt.s[htxt.l - 1] == '\0') --htxt.l; // kill trailing zeros
int ret;
s.append(htxt.s, htxt.l);
free(htxt.s);
return ret < 0 ? -1 : 0;
// if (h->l_txt > 0) {
// if (strstr(h->txt, "##fileformat=")) has_ver = 1;
// if (has_ver == 0) {
// // fprintf(v->fpout, "##fileformat=VCFv4.1\n");
// s = "##fileformat=VCFv4.1\n";
// }
// // fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
// s += h->txt;
// }
// if (h->l_txt == 0) {
// // fprintf(v->fpout, "##fileformat=VCFv4.1\n");
// s = "##fileformat=VCFv4.1\n";
// }
// // fprintf(v->fpout,
// "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
// s += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
// for (i = 0; i < h->n_smpl; ++i) {
// // fprintf(v->fpout, "\t%s", h->sns[i]);
// s += "\t";
// s += h->sns[i];
// }
// // fputc('\n', v->fpout);
// s += "\n";
return 0;
}
// adopted from vcf_write() in vcf.c
int my_vcf_write(htsFile *bp, bcf_hdr_t *h, bcf1_t *b, std::string *line) {
// vcf_t *v = (vcf_t*)bp->v;
// if (!bp->is_vcf) {
// fprintf(stderr, "Something is wrong when reading BCF at %s:%d\n",
// __FILE__,
// __LINE__);
// return bcf_write(bp, h, b);
// }
if (hts_get_format(bp)->format != bcf) {
fprintf(stderr, "Something is wrong when reading BCF header at %s:%d\n",
__FILE__, __LINE__);
// copied from bcf_write (equivalent to bcf_write1) from vcf.c in htslib
if (h->dirty) bcf_hdr_sync(h);
if (bcf_hdr_nsamples(h) != b->n_sample) {
hts_log_error(
"Broken VCF record, the number of columns at %s:%d does not match the "
"number of samples (%d vs %d)",
bcf_seqname(h, b), b->pos + 1, b->n_sample, bcf_hdr_nsamples(h));
return -1;
}
// kstring_t str;
// memset(&str, 0, sizeof(kstring_t));
// bcf_fmt_core(h, b, &str);
// // bcf_fmt_core(h, b, &v->line);
// // fwrite(v->line.s, 1, v->line.l, v->fpout);
// // fputc('\n', v->fpout);
// now copy from vcf_write
kstring_t ks = {0, 0, 0};
vcf_format(h, b, &ks);
line->assign(ks.s, ks.l);
return ks.l + 1;
// return v->line.l + 1;
if (vcf_format1(h, b, &ks) != 0) {
return -1;
}
(*line) = ks.s;
return 0;
}
int BCFReader::open(const std::string &fn) {
......@@ -145,21 +50,19 @@ int BCFReader::open(const std::string &fn) {
this->cannotOpen = true;
return -1;
}
b = (bcf1_t *)calloc(1, sizeof(bcf1_t)); // each bcf read block
b = bcf_init();
if (b == 0) {
return -1;
}
// write header
hin = hout = vcf_hdr_read(bp);
bout = my_vcf_open("-", "wu");
hin = bcf_hdr_read(bp);
// my_write_header(
// hout); // always print the header, put certain fields in header.
// write results out
// vcf_hdr_write(bout, hout);
my_vcf_hdr_write(bout, hout, &header);
// adapted from htslib vcf.c vcf_hdr_write()
kstring_t htxt = {0, 0, 0};
bcf_hdr_format(hin, 0, &htxt);
while (htxt.l && htxt.s[htxt.l - 1] == '\0') --htxt.l; // kill trailing zeros
header = htxt.s;
free(htxt.s);
// open index
this->hasIndex = this->openIndex(fn);
......@@ -176,13 +79,14 @@ bool BCFReader::readLine(std::string *line) {
// openOK?
if (cannotOpen) return false;
// read
// check read mode
if (range.empty()) {
// read line by line
if (my_vcf_read(bp, hin, b) > 0) {
my_vcf_write(bout, hout, b, line);
return true;
// if (my_vcf_read(bp, hin, b) > 0) {
if (bcf_read(bp, hin, b) == 0) {
if (my_vcf_write(bp, hin, b, line) == 0) {
return true;
}
}
return false;
}
......@@ -197,19 +101,10 @@ bool BCFReader::readLine(std::string *line) {
if (iter) {
// read a record
while (my_vcf_read(bp, hin, b) > 0) {
// if (tid >= 0) {
// int l = strlen(b->ref);
// l = b->pos + (l > 0 ? l : 1);
// if (b->tid != tid || b->pos >= end)
// break; // current record has passed prespecified region
// if (!(l > begin && end > b->pos))
// continue; // not sure when this will happen
vcf_format(hin, b, &ks);
// my_vcf_write(bout, hout, b, line);
line->append(ks.s, ks.l);
return true;
// }
while (bcf_itr_next(bp, iter, b) >= 0) {
if (my_vcf_write(bp, hin, b, line) == 0) {
return true;
}
}
}
......@@ -220,35 +115,6 @@ bool BCFReader::readLine(std::string *line) {
this->rangeIterator.getChrom().c_str(),
this->rangeIterator.getBegin(), this->rangeIterator.getEnd());
rangeBuffer[127] = '\0';
// // // int tid, beg, end, len;
// // if (!str2id) {
// // str2id = bcf_build_refhash(hout);
// // }
// if (bcf_parse_region(str2id, rangeBuffer, &tid, &begin, &end) >= 0) {
// off = bcf_idx_query(idx, tid, begin);
// if (off == 0) {
// // fprintf(stderr, "[%s] no records in the query region.\n",
// __func__);
// // return 1; // FIXME: a lot of memory leaks...
// continue;
// }
// bgzf_seek(bp->fp, off, SEEK_SET);
// while (my_vcf_read(bp, hin, b) > 0) {
// if (tid >= 0) {
// int l = strlen(b->ref);
// l = b->pos + (l > 0 ? l : 1);
// if (b->tid != tid || b->pos >= end)
// break; // current record has passed prespecified region
// if (!(l > begin && end > b->pos))
// continue; // not sure when this will happen
// ++rangeIterator;
// my_vcf_write(bout, hout, b, line);
// return true;
// }
// }
// }
iter = bcf_itr_querys(idx, hin, rangeBuffer);
if (!iter) {
......@@ -256,186 +122,10 @@ bool BCFReader::readLine(std::string *line) {
}
while (bcf_itr_next(bp, iter, b) >= 0) {
++rangeIterator;
vcf_format(hin, b, &ks);
line->append(ks.s, ks.l);
return true;
if (my_vcf_write(bp, hin, b, line) == 0) {
return true;
}
}
}
return false;
};
#if 0
static void my_write_header(bcf_hdr_t *h) {
kstring_t str;
memset(&str, 0, sizeof(kstring_t));
str.l = h->l_txt ? h->l_txt - 1 : 0;
str.m = str.l + 1;
str.s = h->txt;
if (!strstr(str.s, "##INFO=<ID=DP,"))
kputs(
"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=DP4,"))
kputs(
"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"# high-quality "
"ref-forward bases, ref-reverse, alt-forward and alt-reverse "
"bases\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=MQ,"))
kputs(
"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square "
"mapping quality of covering reads\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=FQ,"))
kputs(
"##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of "
"all samples being the same\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
kputs(
"##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood "
"estimate of the first ALT allele frequency (assuming HWE)\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=AC1,"))
kputs(
"##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood "
"estimate of the first ALT allele count (no HWE assumption)\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=AN,"))
kputs(
"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of "
"alleles in called genotypes\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=IS,"))
kputs(
"##INFO=<ID=IS,Number=2,Type=Float,Description=\"Maximum number of "
"reads supporting an indel and fraction of indel reads\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=AC,"))
kputs(
"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in "
"genotypes for each ALT allele, in the same order as listed\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=G3,"))
kputs(
"##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of "
"genotype frequencies\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=HWE,"))
kputs(
"##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test "
"P-value based on G3\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=CLR,"))
kputs(
"##INFO=<ID=CLR,Number=1,Type=Integer,Description=\"Log ratio of "
"genotype likelihoods with and without the constraint\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=UGT,"))
kputs(
"##INFO=<ID=UGT,Number=1,Type=String,Description=\"The most probable "
"unconstrained genotype configuration in the trio\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=CGT,"))
kputs(
"##INFO=<ID=CGT,Number=1,Type=String,Description=\"The most probable "
"constrained genotype configuration in the trio\">\n",
&str);
// if (!strstr(str.s, "##INFO=<ID=CI95,"))
// kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail
//Bayesian credible interval of the site allele frequency at the 95%
//level\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=PV4,"))
kputs(
"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand "
"bias, baseQ bias, mapQ bias and tail distance bias\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=INDEL,"))
kputs(
"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the "
"variant is an INDEL.\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=PC2,"))
kputs(
"##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability "
"of the nonRef allele frequency in group1 samples being larger "
"(,smaller) than in group2.\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=PCHI2,"))
kputs(
"##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted "
"chi^2 P-value for testing the association between group1 and group2 "
"samples.\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
kputs(
"##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled "
"PCHI2.\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=RP,"))
kputs(
"##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations "
"yielding a smaller PCHI2.\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=QBD,"))
kputs(
"##INFO=<ID=QBD,Number=1,Type=Float,Description=\"Quality by Depth: "
"QUAL/#reads\">\n",
&str);
// if (!strstr(str.s, "##INFO=<ID=RPS,"))
// kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position
// Stats: depth, average, stddev\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=RPB,"))
kputs(
"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Read Position "
"Bias\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=MDV,"))
kputs(
"##INFO=<ID=MDV,Number=1,Type=Integer,Description=\"Maximum number of "
"high-quality nonRef reads in samples\">\n",
&str);
if (!strstr(str.s, "##INFO=<ID=VDB,"))
kputs(
"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance "
"Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: "
"this version may be broken.\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=GT,"))
kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
kputs(
"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype "
"Quality\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=GL,"))
kputs(
"##FORMAT=<ID=GL,Number=3,Type=Float,Description=\"Likelihoods for "
"RR,RA,AA genotypes (R=ref,A=alt)\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=DP,"))
kputs(
"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"# high-quality "
"bases\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=DV,"))
kputs(
"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality "
"non-reference bases\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=SP,"))
kputs(
"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled "
"strand bias P-value\">\n",
&str);
if (!strstr(str.s, "##FORMAT=<ID=PL,"))
kputs(
"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of "
"Phred-scaled genotype likelihoods\">\n",
&str);
h->l_txt = str.l + 1;
h->txt = str.s;
}
#endif
......@@ -4,19 +4,18 @@
#include "base/RangeList.h"
#include "third/htslib/include/htslib/vcf.h"
// static void write_header(bcf_hdr_t *h);
class BCFReader {
public:
BCFReader(const std::string& fn)
: cannotOpen(false),
hasIndex(false),
readyToRead(false),
bp(0),
b(0),
bout(0) // ,
// off(0) // ,
// str2id(0)
{
hin(0),
idx(0),
iter(0) {
ks = {0, 0, 0};
open(fn);
};
......@@ -87,16 +86,8 @@ class BCFReader {
// destroy range iterator
// close index
bcf_hdr_destroy(hin);
bcf_destroy(b); // bcf_destroy(blast);
hts_close(bp); // close bcf handle for input
hts_close(bout); // close bcf handle for output
/* // resume stdout */
/* stdout = fdopen(this->origStdout, "w"); */
/* assert(stdout); */
// if (str2id) {
// hts_str2id_destroy(str2id);
// }
bcf_destroy(b); // bcf_destroy(blast);
hts_close(bp); // close bcf handle for input
closeIndex();
};
void resetRangeIterator() {
......@@ -119,21 +110,10 @@ class BCFReader {
// bcftools part
htsFile* bp;
bcf1_t* b;
htsFile* bout;
bcf_hdr_t* hin;
bcf_hdr_t* hout;
hts_idx_t* idx;
hts_itr_t* iter;
// int tid, begin, end;
// uint64_t off;
kstring_t ks;
// void* str2id;
/* BCF_t* BCFHandle; */
/* ti_iter_t iter; */
// const char* line;
// int line_len;
// int origStdout;
std::string header;
};
......
......@@ -3,6 +3,7 @@
#include "base/RangeList.h"
#include "third/htslib/include/htslib/hts.h"
#include "third/htslib/include/htslib/kseq.h" // defined KS_SEP_LINE
#include "third/htslib/include/htslib/tbx.h"
#include "third/htslib/include/htslib/vcf.h"
......@@ -32,25 +33,41 @@ class TabixReader {
// check read mode
if (range.empty()) {
// read line by line
if (!iter) {
// iter = ti_query(this->tabixHandle, 0, 0, 0);
iter = tbx_itr_querys(tabixIndex, ".");
if (!iter) return false;
}
// if (!iter) {
// // iter = ti_query(this->tabixHandle, 0, 0, 0);
// iter = tbx_itr_querys(tabixIndex, ".");
// if (!iter) return false;
// }
// if (!this->firstLine.empty()) {
// (*line) = this->firstLine;
// this->firstLine.clear();
// return true;
// }
// // while ((ti_line = ti_read(this->tabixHandle, iter, &ti_line_len)) !=
// 0)
// // {
// while (tbx_itr_next(tabixHandle, tabixIndex, iter, &tabixLine) >= 0) {
// // need to skip header here
// if (tabixLine.l > 0 &&
// (int)(tabixLine.s[0]) == this->tabixIndex->conf.meta_char)
// continue;
// // (*line) = (ti_line);
// return true;
// }
// return false;
if (!this->firstLine.empty()) {
(*line) = this->firstLine;
this->firstLine.clear();
return true;
}
// while ((ti_line = ti_read(this->tabixHandle, iter, &ti_line_len)) != 0)
// {
while (tbx_itr_next(tabixHandle, tabixIndex, iter, &tabixLine) >= 0) {
// need to skip header here
while (hts_getline(this->tabixHandle, KS_SEP_LINE, &tabixLine) >= 0) {
// skip header lines
if (tabixLine.l > 0 &&
(int)(tabixLine.s[0]) == this->tabixIndex->conf.meta_char)
continue;
// (*line) = (ti_line);
(*line) = tabixLine.s;
return true;
}
return false;
......@@ -69,7 +86,7 @@ class TabixReader {
// if (this->ti_line) {
if (tbx_itr_next(tabixHandle, tabixIndex, iter, &tabixLine) > 0) {
// (*line) = ti_line;
(*line) = tabixLine.s;
return true;
}
}
......@@ -87,7 +104,7 @@ class TabixReader {
// continue;
// }
// ti_iter_destroy(iter);
tbx_destroy(this->tabixIndex);
// do not destroy index: tbx_destroy(this->tabixIndex);
// iter = 0;
// this->iter = ti_queryi(this->tabixHandle, tid, beg, end);
this->iter = tbx_itr_querys(this->tabixIndex, rangeBuffer);
......@@ -167,7 +184,7 @@ class TabixReader {
this->hasIndex = true;
return true;
};
}
void closeIndex() {
// fpritnf(stderr, "close index...");
......@@ -215,31 +232,40 @@ class TabixReader {
if (!this->hasIndex) {
return -1;
}
// do not attempt to read
// // this->iter = ti_query(this->tabixHandle, 0, 0, 0);
// this->iter = tbx_itr_querys(this->tabixIndex, ".");
// if (!this->iter) {
// return -1;
// }
// this->iter = ti_query(this->tabixHandle, 0, 0, 0);
this->iter = tbx_itr_querys(this->tabixIndex, ".");
if (!this->iter) {
return -1;
}
// while ((ti_line = ti_read(this->tabixHandle, this->iter,
// &this->ti_line_len)) != 0) {
// if ((int)(*ti_line) != idxconf->meta_char) {
// this->firstLine = ti_line;
// // while ((ti_line = ti_read(this->tabixHandle, this->iter,
// // &this->ti_line_len)) != 0) {
// // if ((int)(*ti_line) != idxconf->meta_char) {
// // this->firstLine = ti_line;
// // break;
// // }
// // // fputs(ti_line, stdout); fputc('\n', stdout);
// // this->header += ti_line;
// // this->header += "\n";
// // }
// while (tbx_itr_next(this->tabixHandle, this->tabixHandle, this->iter,
// &this->tabixLine) >= 0) {
// if (tabixLine.s[0] == this->tabixIndex->conf.meta_char) {
// this->firstLine = tabixLine.s;
// break;
// }
// // fputs(ti_line, stdout); fputc('\n', stdout);
// this->header += ti_line;
// this->header += "\n";
// this->header += tabixLine.s;
// this->header += '\n';
// }
while (tbx_itr_next(this->tabixHandle, this->tabixHandle, this->iter,
&this->tabixLine) >= 0) {
if (tabixLine.s[0] == this->tabixIndex->conf.meta_char) {
while (hts_getline(this->tabixHandle, KS_SEP_LINE, &tabixLine) >= 0) {
if (tabixLine.l > 0 &&
(int)(tabixLine.s[0]) != this->tabixIndex->conf.meta_char) {
this->firstLine = tabixLine.s;
break;
}
this->header += tabixLine.s;
this->header += '\n';
this->header += "\n";
}
cannotOpen = false;
......
......@@ -18,13 +18,12 @@ CXX_FLAGS = -O0 -ggdb -fopenmp \
-I../../third/samtools/bcftools \
../../libVcf/lib-dbg-vcf.a \
../../base/lib-dbg-base.a ../../libsrc/lib-dbg-goncalo.a \
../../third/tabix/libtabix.a \
../../third/htslib/lib/libhts.a \
../../third/libdeflate/lib/libdeflate.a \
../../third/pcre/lib/libpcreposix.a ../../third/pcre/lib/libpcre.a \