Commit 413083be authored by zhanxw's avatar zhanxw
Browse files

optimize VCF parsing speed

parent f94cb163
......@@ -77,7 +77,7 @@ class VCFBuffer {
private:
char* buf; // pointer to string
size_t len; // len (not including the trailing '\0'
size_t len; // len (not including the trailing '\0')
size_t bufLen; // memory capacity
};
......
......@@ -6,7 +6,7 @@ char VCFIndividual::defaultValue[2] = {'.', '\0'};
VCFValue VCFIndividual::defaultVCFValue(&(defaultValue[0]), 0, 0);
void VCFIndividual::output(FileWriter* fp) const {
for (unsigned int i = 0; i < fd.size(); ++i) {
for (unsigned int i = 0; i < fdLen; ++i) {
if (i) fp->write(':');
this->fd[i].output(fp);
}
......@@ -14,7 +14,7 @@ void VCFIndividual::output(FileWriter* fp) const {
void VCFIndividual::toStr(std::string* fp) const {
std::string s;
for (unsigned int i = 0; i < fd.size(); ++i) {
for (unsigned int i = 0; i < fdLen; ++i) {
if (i) fp->push_back(':');
this->fd[i].toStr(&s);
fp->append(s);
......
......@@ -13,7 +13,7 @@ class FileWriter;
class VCFIndividual {
public:
// FUNC parseFunction[4];
VCFIndividual() {
VCFIndividual() : fdLen(0) {
this->include(); // by default, enable everyone
}
/**
......@@ -35,24 +35,25 @@ class VCFIndividual {
this->parsed.attach(&vcfValue.line[vcfValue.beg],
vcfValue.end - vcfValue.beg);
// need to consider missing field
this->fd.resize(0);
VCFValue v;
v.line = this->parsed.getBuffer();
size_t idx = 0;
int beg = 0;
int ret;
while ((ret = v.parseTill(this->parsed, beg, ':')) == 0) {
this->parsed[v.end] = '\0';
beg = v.end + 1;
while (true) {
if (idx == fdLen) {
VCFValue v;
fd.push_back(v);
++fdLen;
}
if (ret == 1) {
VCFValue& v = fd[idx++];
ret = v.parseTill(this->parsed, beg, ':');
this->parsed[v.end] = '\0';
fd.push_back(v);
beg = v.end + 1;
if (ret == 1) { // last element
break;
}
if (fd.size() == 0) {
}
fdLen = idx;
if (fdLen == 0) {
fprintf(stderr, "Empty individual column - very strange!!\n");
fprintf(stderr, "vcfValue = %s\n", vcfValue.toStr());
}
......@@ -66,13 +67,13 @@ class VCFIndividual {
const VCFValue& operator[](const unsigned int i) const
__attribute__((deprecated)) {
if (i >= fd.size()) {
if (i >= fdLen) {
FATAL("index out of bound!");
}
return (this->fd[i]);
}
VCFValue& operator[](const unsigned int i) __attribute__((deprecated)) {
if (i >= fd.size()) {
if (i >= fdLen) {
FATAL("index out of bound!");
}
return (this->fd[i]);
......@@ -82,7 +83,7 @@ class VCFIndividual {
* in ith field is missing
*/
const VCFValue& get(unsigned int i, bool* isMissing) const {
if (i >= fd.size()) {
if (i >= fdLen) {
*isMissing = true;
return VCFIndividual::defaultVCFValue;
}
......@@ -93,7 +94,7 @@ class VCFIndividual {
* @return VCFValue without checking missingness
*/
const VCFValue& justGet(unsigned int i) {
if (i >= fd.size()) {
if (i >= fdLen) {
return VCFIndividual::defaultVCFValue;
}
return (this->fd[i]);
......@@ -102,12 +103,12 @@ class VCFIndividual {
// VCFValue& getSelf() { return this->self; }
// const VCFValue& getSelf() const { return this->self; }
size_t size() const { return this->fd.size(); }
size_t size() const { return this->fdLen; }
/**
* dump the content of VCFIndividual column
*/
void output(FILE* fp) const {
for (unsigned int i = 0; i < fd.size(); ++i) {
for (size_t i = 0; i < fdLen; ++i) {
if (i) fputc(':', fp);
this->fd[i].output(fp);
}
......@@ -120,7 +121,10 @@ class VCFIndividual {
std::string name; // id name
// VCFValue self; // whole field for the individual (unparsed)
VCFBuffer parsed; // store parsed string (where \0 added)
std::vector<VCFValue> fd; // each field separated by ':'
std::vector<VCFValue> fd; // each field separated by ':', for optimizaiton,
// do not use clear(), resize()
size_t fdLen; // number of elements in fd
static VCFValue defaultVCFValue;
static char defaultValue[2];
}; // end VCFIndividual
......
......@@ -176,12 +176,20 @@ bool VCFInputFile::readRecord() {
if (!nRead) return false;
// star parsing
bool ret = this->record.parse(&this->line);
int ret;
this->record.attach(&this->line);
ret = this->record.parseSite();
if (ret) {
reportReadError(this->line);
}
if (!this->passFilter()) continue;
if (!this->isAllowedSite()) continue;
ret = this->record.parseIndividual();
if (ret) {
reportReadError(this->line);
}
if (!this->passFilter()) continue;
// break;
return true;
}
......
......@@ -24,6 +24,20 @@ class VCFRecord {
* @return 0: if success
*/
int parse(std::string* pVcfLine) {
attach(pVcfLine);
// go through VCF sites (first 9 columns)
int ret = parseSite();
if (ret) {
return -1;
}
ret = parseIndividual();
if (ret) {
return -1;
}
}
void attach(std::string* pVcfLine) {
std::string& vcfLine = *pVcfLine;
this->vcfInfo.reset();
// this->parsed = vcfLine.c_str();
......@@ -31,49 +45,52 @@ class VCFRecord {
// this->self.beg = 0;
// this->self.end = this->parsed.size();
this->parsed.attach(&vcfLine[0], (int)vcfLine.size());
}
// go through VCF sites (first 9 columns)
int parseSite() {
int ret;
if ((ret = this->chrom.parseTill(this->parsed, 0, '\t'))) {
fprintf(stderr, "Error when parsing CHROM [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing CHROM [ %s ]\n",
this->parsed.c_str());
return -1;
}
this->parsed[this->chrom.end] = '\0';
if ((ret =
(this->pos.parseTill(this->parsed, this->chrom.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing POS [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing POS [ %s ]\n", this->parsed.c_str());
return -1;
}
this->parsed[this->pos.end] = '\0';
if ((ret = (this->id.parseTill(this->parsed, this->pos.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing ID [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing ID [ %s ]\n", this->parsed.c_str());
return -1;
}
this->parsed[this->id.end] = '\0';
if ((ret = (this->ref.parseTill(this->parsed, this->id.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing REF [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing REF [ %s ]\n", this->parsed.c_str());
return -1;
}
this->parsed[this->ref.end] = '\0';
if ((ret = (this->alt.parseTill(this->parsed, this->ref.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing ALT [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing ALT [ %s ]\n", this->parsed.c_str());
return -1;
}
this->parsed[this->alt.end] = '\0';
if ((ret = (this->qual.parseTill(this->parsed, this->alt.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing QUAL [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing QUAL [ %s ]\n", this->parsed.c_str());
return -1;
}
this->parsed[this->qual.end] = '\0';
if ((ret =
(this->filt.parseTill(this->parsed, this->qual.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing FILTER [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing FILTER [ %s ]\n",
this->parsed.c_str());
return -1;
}
this->parsed[this->filt.end] = '\0';
......@@ -82,7 +99,8 @@ class VCFRecord {
(this->info.parseTill(this->parsed, this->filt.end + 1, '\t')))) {
// either parsing error, or we are dealing with site only VCFs
if (ret < 0) { // error happens
fprintf(stderr, "Error when parsing INFO [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing INFO [ %s ]\n",
this->parsed.c_str());
return -1;
}
}
......@@ -94,15 +112,21 @@ class VCFRecord {
if ((ret = (this->format.parseTill(this->parsed, this->info.end + 1,
'\t')))) {
fprintf(stderr, "Error when parsing FORMAT [ %s ]\n", vcfLine.c_str());
fprintf(stderr, "Error when parsing FORMAT [ %s ]\n",
this->parsed.c_str());
return -1;
}
this->parsed[this->format.end] = '\0';
return 0;
}
int parseIndividual() {
int ret;
// now comes each individual genotype
unsigned int idx = 0; // peopleIdx
VCFIndividual* p;
VCFValue indv;
static VCFValue indv;
int beg = this->format.end + 1;
while ((ret = indv.parseTill(this->parsed, beg, '\t')) == 0) {
if (idx >= this->allIndv.size()) {
......@@ -114,13 +138,18 @@ class VCFRecord {
}
this->parsed[indv.end] = '\0';
p = this->allIndv[idx];
p->parse(indv);
// p = this->allIndv[idx];
// p->parse(indv);
beg = indv.end + 1;
idx++;
}
#pragma openmp parallel for
for (int i = 0; i < idx; ++i) {
this->allIndv[i]->parse(indv);
}
// if ret == 1 menas reaches end of this->parsed
if (ret != 1) {
fputs("Parsing error in line: ", stderr);
......@@ -147,6 +176,7 @@ class VCFRecord {
return 0;
}
void createIndividual(const std::string& line) {
std::vector<std::string> sa;
stringTokenize(line, '\t', &sa);
......@@ -163,6 +193,7 @@ class VCFRecord {
p->setName(sa[i]);
}
}
void deleteIndividual() {
for (unsigned int i = 0; i < this->allIndv.size(); i++) {
if (this->allIndv[i]) delete this->allIndv[i];
......
......@@ -7,6 +7,7 @@
#include "VCFConstant.h"
#include "base/Exception.h"
#include "base/TypeConversion.h"
#include "base/Utils.h"
class FileWriter;
......@@ -264,7 +265,8 @@ class VCFValue {
this->beg = b;
this->end = b;
char* r = strchr(line + beg, c);
// char* r = strchr(line + beg, c);
const char* r = ssechr(line + beg, c);
if (r != NULL) {
end = r - line;
return 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment