Commit 413083be authored by zhanxw's avatar zhanxw
Browse files

optimize VCF parsing speed

parent f94cb163
...@@ -77,7 +77,7 @@ class VCFBuffer { ...@@ -77,7 +77,7 @@ class VCFBuffer {
private: private:
char* buf; // pointer to string char* buf; // pointer to string
size_t len; // len (not including the trailing '\0' size_t len; // len (not including the trailing '\0')
size_t bufLen; // memory capacity size_t bufLen; // memory capacity
}; };
......
...@@ -6,7 +6,7 @@ char VCFIndividual::defaultValue[2] = {'.', '\0'}; ...@@ -6,7 +6,7 @@ char VCFIndividual::defaultValue[2] = {'.', '\0'};
VCFValue VCFIndividual::defaultVCFValue(&(defaultValue[0]), 0, 0); VCFValue VCFIndividual::defaultVCFValue(&(defaultValue[0]), 0, 0);
void VCFIndividual::output(FileWriter* fp) const { void VCFIndividual::output(FileWriter* fp) const {
for (unsigned int i = 0; i < fd.size(); ++i) { for (unsigned int i = 0; i < fdLen; ++i) {
if (i) fp->write(':'); if (i) fp->write(':');
this->fd[i].output(fp); this->fd[i].output(fp);
} }
...@@ -14,7 +14,7 @@ void VCFIndividual::output(FileWriter* fp) const { ...@@ -14,7 +14,7 @@ void VCFIndividual::output(FileWriter* fp) const {
void VCFIndividual::toStr(std::string* fp) const { void VCFIndividual::toStr(std::string* fp) const {
std::string s; std::string s;
for (unsigned int i = 0; i < fd.size(); ++i) { for (unsigned int i = 0; i < fdLen; ++i) {
if (i) fp->push_back(':'); if (i) fp->push_back(':');
this->fd[i].toStr(&s); this->fd[i].toStr(&s);
fp->append(s); fp->append(s);
......
...@@ -13,7 +13,7 @@ class FileWriter; ...@@ -13,7 +13,7 @@ class FileWriter;
class VCFIndividual { class VCFIndividual {
public: public:
// FUNC parseFunction[4]; // FUNC parseFunction[4];
VCFIndividual() { VCFIndividual() : fdLen(0) {
this->include(); // by default, enable everyone this->include(); // by default, enable everyone
} }
/** /**
...@@ -35,24 +35,25 @@ class VCFIndividual { ...@@ -35,24 +35,25 @@ class VCFIndividual {
this->parsed.attach(&vcfValue.line[vcfValue.beg], this->parsed.attach(&vcfValue.line[vcfValue.beg],
vcfValue.end - vcfValue.beg); vcfValue.end - vcfValue.beg);
size_t idx = 0;
// need to consider missing field
this->fd.resize(0);
VCFValue v;
v.line = this->parsed.getBuffer();
int beg = 0; int beg = 0;
int ret; int ret;
while ((ret = v.parseTill(this->parsed, beg, ':')) == 0) { while (true) {
if (idx == fdLen) {
VCFValue v;
fd.push_back(v);
++fdLen;
}
VCFValue& v = fd[idx++];
ret = v.parseTill(this->parsed, beg, ':');
this->parsed[v.end] = '\0'; this->parsed[v.end] = '\0';
beg = v.end + 1; beg = v.end + 1;
fd.push_back(v); if (ret == 1) { // last element
break;
}
} }
if (ret == 1) { fdLen = idx;
this->parsed[v.end] = '\0'; if (fdLen == 0) {
fd.push_back(v);
}
if (fd.size() == 0) {
fprintf(stderr, "Empty individual column - very strange!!\n"); fprintf(stderr, "Empty individual column - very strange!!\n");
fprintf(stderr, "vcfValue = %s\n", vcfValue.toStr()); fprintf(stderr, "vcfValue = %s\n", vcfValue.toStr());
} }
...@@ -66,13 +67,13 @@ class VCFIndividual { ...@@ -66,13 +67,13 @@ class VCFIndividual {
const VCFValue& operator[](const unsigned int i) const const VCFValue& operator[](const unsigned int i) const
__attribute__((deprecated)) { __attribute__((deprecated)) {
if (i >= fd.size()) { if (i >= fdLen) {
FATAL("index out of bound!"); FATAL("index out of bound!");
} }
return (this->fd[i]); return (this->fd[i]);
} }
VCFValue& operator[](const unsigned int i) __attribute__((deprecated)) { VCFValue& operator[](const unsigned int i) __attribute__((deprecated)) {
if (i >= fd.size()) { if (i >= fdLen) {
FATAL("index out of bound!"); FATAL("index out of bound!");
} }
return (this->fd[i]); return (this->fd[i]);
...@@ -82,7 +83,7 @@ class VCFIndividual { ...@@ -82,7 +83,7 @@ class VCFIndividual {
* in ith field is missing * in ith field is missing
*/ */
const VCFValue& get(unsigned int i, bool* isMissing) const { const VCFValue& get(unsigned int i, bool* isMissing) const {
if (i >= fd.size()) { if (i >= fdLen) {
*isMissing = true; *isMissing = true;
return VCFIndividual::defaultVCFValue; return VCFIndividual::defaultVCFValue;
} }
...@@ -93,7 +94,7 @@ class VCFIndividual { ...@@ -93,7 +94,7 @@ class VCFIndividual {
* @return VCFValue without checking missingness * @return VCFValue without checking missingness
*/ */
const VCFValue& justGet(unsigned int i) { const VCFValue& justGet(unsigned int i) {
if (i >= fd.size()) { if (i >= fdLen) {
return VCFIndividual::defaultVCFValue; return VCFIndividual::defaultVCFValue;
} }
return (this->fd[i]); return (this->fd[i]);
...@@ -102,12 +103,12 @@ class VCFIndividual { ...@@ -102,12 +103,12 @@ class VCFIndividual {
// VCFValue& getSelf() { return this->self; } // VCFValue& getSelf() { return this->self; }
// const VCFValue& getSelf() const { return this->self; } // const VCFValue& getSelf() const { return this->self; }
size_t size() const { return this->fd.size(); } size_t size() const { return this->fdLen; }
/** /**
* dump the content of VCFIndividual column * dump the content of VCFIndividual column
*/ */
void output(FILE* fp) const { void output(FILE* fp) const {
for (unsigned int i = 0; i < fd.size(); ++i) { for (size_t i = 0; i < fdLen; ++i) {
if (i) fputc(':', fp); if (i) fputc(':', fp);
this->fd[i].output(fp); this->fd[i].output(fp);
} }
...@@ -120,7 +121,10 @@ class VCFIndividual { ...@@ -120,7 +121,10 @@ class VCFIndividual {
std::string name; // id name std::string name; // id name
// VCFValue self; // whole field for the individual (unparsed) // VCFValue self; // whole field for the individual (unparsed)
VCFBuffer parsed; // store parsed string (where \0 added) VCFBuffer parsed; // store parsed string (where \0 added)
std::vector<VCFValue> fd; // each field separated by ':' std::vector<VCFValue> fd; // each field separated by ':', for optimizaiton,
// do not use clear(), resize()
size_t fdLen; // number of elements in fd
static VCFValue defaultVCFValue; static VCFValue defaultVCFValue;
static char defaultValue[2]; static char defaultValue[2];
}; // end VCFIndividual }; // end VCFIndividual
......
...@@ -176,12 +176,20 @@ bool VCFInputFile::readRecord() { ...@@ -176,12 +176,20 @@ bool VCFInputFile::readRecord() {
if (!nRead) return false; if (!nRead) return false;
// star parsing // star parsing
bool ret = this->record.parse(&this->line); int ret;
this->record.attach(&this->line);
ret = this->record.parseSite();
if (ret) { if (ret) {
reportReadError(this->line); reportReadError(this->line);
} }
if (!this->passFilter()) continue;
if (!this->isAllowedSite()) continue; if (!this->isAllowedSite()) continue;
ret = this->record.parseIndividual();
if (ret) {
reportReadError(this->line);
}
if (!this->passFilter()) continue;
// break; // break;
return true; return true;
} }
......
...@@ -24,6 +24,20 @@ class VCFRecord { ...@@ -24,6 +24,20 @@ class VCFRecord {
* @return 0: if success * @return 0: if success
*/ */
int parse(std::string* pVcfLine) { int parse(std::string* pVcfLine) {
attach(pVcfLine);
// go through VCF sites (first 9 columns)
int ret = parseSite();
if (ret) {
return -1;
}
ret = parseIndividual();
if (ret) {
return -1;
}
}
void attach(std::string* pVcfLine) {
std::string& vcfLine = *pVcfLine; std::string& vcfLine = *pVcfLine;
this->vcfInfo.reset(); this->vcfInfo.reset();
// this->parsed = vcfLine.c_str(); // this->parsed = vcfLine.c_str();
...@@ -31,49 +45,52 @@ class VCFRecord { ...@@ -31,49 +45,52 @@ class VCFRecord {
// this->self.beg = 0; // this->self.beg = 0;
// this->self.end = this->parsed.size(); // this->self.end = this->parsed.size();
this->parsed.attach(&vcfLine[0], (int)vcfLine.size()); this->parsed.attach(&vcfLine[0], (int)vcfLine.size());
}
// go through VCF sites (first 9 columns) int parseSite() {
int ret; int ret;
if ((ret = this->chrom.parseTill(this->parsed, 0, '\t'))) { if ((ret = this->chrom.parseTill(this->parsed, 0, '\t'))) {
fprintf(stderr, "Error when parsing CHROM [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing CHROM [ %s ]\n",
this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->chrom.end] = '\0'; this->parsed[this->chrom.end] = '\0';
if ((ret = if ((ret =
(this->pos.parseTill(this->parsed, this->chrom.end + 1, '\t')))) { (this->pos.parseTill(this->parsed, this->chrom.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing POS [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing POS [ %s ]\n", this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->pos.end] = '\0'; this->parsed[this->pos.end] = '\0';
if ((ret = (this->id.parseTill(this->parsed, this->pos.end + 1, '\t')))) { if ((ret = (this->id.parseTill(this->parsed, this->pos.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing ID [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing ID [ %s ]\n", this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->id.end] = '\0'; this->parsed[this->id.end] = '\0';
if ((ret = (this->ref.parseTill(this->parsed, this->id.end + 1, '\t')))) { if ((ret = (this->ref.parseTill(this->parsed, this->id.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing REF [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing REF [ %s ]\n", this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->ref.end] = '\0'; this->parsed[this->ref.end] = '\0';
if ((ret = (this->alt.parseTill(this->parsed, this->ref.end + 1, '\t')))) { if ((ret = (this->alt.parseTill(this->parsed, this->ref.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing ALT [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing ALT [ %s ]\n", this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->alt.end] = '\0'; this->parsed[this->alt.end] = '\0';
if ((ret = (this->qual.parseTill(this->parsed, this->alt.end + 1, '\t')))) { if ((ret = (this->qual.parseTill(this->parsed, this->alt.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing QUAL [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing QUAL [ %s ]\n", this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->qual.end] = '\0'; this->parsed[this->qual.end] = '\0';
if ((ret = if ((ret =
(this->filt.parseTill(this->parsed, this->qual.end + 1, '\t')))) { (this->filt.parseTill(this->parsed, this->qual.end + 1, '\t')))) {
fprintf(stderr, "Error when parsing FILTER [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing FILTER [ %s ]\n",
this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->filt.end] = '\0'; this->parsed[this->filt.end] = '\0';
...@@ -82,7 +99,8 @@ class VCFRecord { ...@@ -82,7 +99,8 @@ class VCFRecord {
(this->info.parseTill(this->parsed, this->filt.end + 1, '\t')))) { (this->info.parseTill(this->parsed, this->filt.end + 1, '\t')))) {
// either parsing error, or we are dealing with site only VCFs // either parsing error, or we are dealing with site only VCFs
if (ret < 0) { // error happens if (ret < 0) { // error happens
fprintf(stderr, "Error when parsing INFO [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing INFO [ %s ]\n",
this->parsed.c_str());
return -1; return -1;
} }
} }
...@@ -94,15 +112,21 @@ class VCFRecord { ...@@ -94,15 +112,21 @@ class VCFRecord {
if ((ret = (this->format.parseTill(this->parsed, this->info.end + 1, if ((ret = (this->format.parseTill(this->parsed, this->info.end + 1,
'\t')))) { '\t')))) {
fprintf(stderr, "Error when parsing FORMAT [ %s ]\n", vcfLine.c_str()); fprintf(stderr, "Error when parsing FORMAT [ %s ]\n",
this->parsed.c_str());
return -1; return -1;
} }
this->parsed[this->format.end] = '\0'; this->parsed[this->format.end] = '\0';
return 0;
}
int parseIndividual() {
int ret;
// now comes each individual genotype // now comes each individual genotype
unsigned int idx = 0; // peopleIdx unsigned int idx = 0; // peopleIdx
VCFIndividual* p; VCFIndividual* p;
VCFValue indv; static VCFValue indv;
int beg = this->format.end + 1; int beg = this->format.end + 1;
while ((ret = indv.parseTill(this->parsed, beg, '\t')) == 0) { while ((ret = indv.parseTill(this->parsed, beg, '\t')) == 0) {
if (idx >= this->allIndv.size()) { if (idx >= this->allIndv.size()) {
...@@ -114,13 +138,18 @@ class VCFRecord { ...@@ -114,13 +138,18 @@ class VCFRecord {
} }
this->parsed[indv.end] = '\0'; this->parsed[indv.end] = '\0';
p = this->allIndv[idx]; // p = this->allIndv[idx];
p->parse(indv); // p->parse(indv);
beg = indv.end + 1; beg = indv.end + 1;
idx++; idx++;
} }
#pragma openmp parallel for
for (int i = 0; i < idx; ++i) {
this->allIndv[i]->parse(indv);
}
// if ret == 1 menas reaches end of this->parsed // if ret == 1 menas reaches end of this->parsed
if (ret != 1) { if (ret != 1) {
fputs("Parsing error in line: ", stderr); fputs("Parsing error in line: ", stderr);
...@@ -147,6 +176,7 @@ class VCFRecord { ...@@ -147,6 +176,7 @@ class VCFRecord {
return 0; return 0;
} }
void createIndividual(const std::string& line) { void createIndividual(const std::string& line) {
std::vector<std::string> sa; std::vector<std::string> sa;
stringTokenize(line, '\t', &sa); stringTokenize(line, '\t', &sa);
...@@ -163,6 +193,7 @@ class VCFRecord { ...@@ -163,6 +193,7 @@ class VCFRecord {
p->setName(sa[i]); p->setName(sa[i]);
} }
} }
void deleteIndividual() { void deleteIndividual() {
for (unsigned int i = 0; i < this->allIndv.size(); i++) { for (unsigned int i = 0; i < this->allIndv.size(); i++) {
if (this->allIndv[i]) delete this->allIndv[i]; if (this->allIndv[i]) delete this->allIndv[i];
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "VCFConstant.h" #include "VCFConstant.h"
#include "base/Exception.h" #include "base/Exception.h"
#include "base/TypeConversion.h" #include "base/TypeConversion.h"
#include "base/Utils.h"
class FileWriter; class FileWriter;
...@@ -264,7 +265,8 @@ class VCFValue { ...@@ -264,7 +265,8 @@ class VCFValue {
this->beg = b; this->beg = b;
this->end = b; this->end = b;
char* r = strchr(line + beg, c); // char* r = strchr(line + beg, c);
const char* r = ssechr(line + beg, c);
if (r != NULL) { if (r != NULL) {
end = r - line; end = r - line;
return 0; return 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment