Commit f94cb163 authored by zhanxw's avatar zhanxw
Browse files

Improve VCF parsing speed

parent 444e2f1e
#include "IO.h"
// cannot forward declare a typedef'd anonymous struct
// http://stackoverflow.com/questions/804894/forward-declaration-of-a-typedef-in-c
// so include the header file
#include "third/samtools/bgzf.h"
#include <algorithm>
#include "base/Utils.h"
//////////////////////////////////////////////////
// Plain file reader
......@@ -430,7 +432,7 @@ int BufferedReader::search(int left, int right, const char* sep) {
assert(right <= bufEnd);
const char* p;
for (int i = left; i < right; ++i) {
p = strchr(sep, buf[i]);
p = ssechr(sep, buf[i]);
if (p != NULL) {
bufPtr = i + 1;
return i;
......@@ -446,12 +448,12 @@ int BufferedReader::search(int left, int right, const char* sep1,
assert(right <= bufEnd);
const char* p;
for (int i = left; i < right; ++i) {
p = strchr(sep1, buf[i]);
p = ssechr(sep1, buf[i]);
if (p != NULL) {
bufPtr = i + 1;
return i;
}
p = strchr(sep2, buf[i]);
p = ssechr(sep2, buf[i]);
if (p != NULL) {
bufPtr = i + 1;
return i;
......@@ -507,7 +509,7 @@ int BufferedReader::readLineBySep(std::vector<std::string>* fields,
fields->resize(fields->size() - 1);
return fields->size();
}
} else if (strchr(sep, buf[ptr])) { // separator
} else if (ssechr(sep, buf[ptr])) { // separator
fields->resize(fields->size() + 1);
fields->back().resize(0);
} else if (buf[ptr] == '\r') {
......
......@@ -11,6 +11,8 @@
#include <string>
#include <vector>
#include "base/Utils.h"
// #define IO_DEBUG
typedef enum FileType {
......
#include "Utils.h"

#include <stdint.h>
#include <string.h>
#ifdef __SSE2__
#pragma message "Enable SSE2 optimized ssechr"
// copy from:
// https://mischasan.wordpress.com/2011/06/22/what-the-is-sse2-good-for-char-search-in-long-strings/
#include <emmintrin.h>
/**
 * SSE2-accelerated search for a character in a NUL-terminated string.
 *
 * @param s  NUL-terminated string to scan; no alignment requirement.
 * @param ch character to look for.
 * @return pointer to the first occurrence of @p ch before the terminator,
 *         or NULL when @p ch does not occur.
 *
 * Unlike strchr(), searching for '\0' returns NULL, because the terminator
 * itself is never treated as a match.
 *
 * Adapted from:
 * https://mischasan.wordpress.com/2011/06/22/what-the-is-sse2-good-for-char-search-in-long-strings/
 *
 * The scan uses 16-byte *aligned* loads only. An aligned 16-byte block never
 * straddles a page boundary, so reading the whole block that contains a valid
 * byte cannot touch an unmapped page. An unaligned load starting at @p s
 * could run past the terminator into the next page.
 */
char const* ssechr(char const* s, char ch) {
  const __m128i zero = _mm_setzero_si128();  // sixteen 0x00 bytes
  const __m128i cx16 = _mm_set1_epi8(ch);    // `ch` replicated sixteen times

  // Round the start address down to the enclosing 16-byte block.
  const uintptr_t addr = reinterpret_cast<uintptr_t>(s);
  const unsigned misalign = static_cast<unsigned>(addr & 15u);
  char const* block = reinterpret_cast<char const*>(addr - misalign);

  // Bit i of a movemask corresponds to block[i]. In the first block the low
  // `misalign` bits belong to bytes located before `s` and must be ignored.
  unsigned valid = (0xFFFFu << misalign) & 0xFFFFu;

  while (true) {
    const __m128i x = _mm_load_si128(reinterpret_cast<__m128i const*>(block));

    // _mm_cmpeq_epi8 yields 0xff per equal byte; _mm_movemask_epi8 gathers
    // the top bit of each byte. `u` marks where '\0' occurs.
    const unsigned u = _mm_movemask_epi8(_mm_cmpeq_epi8(zero, x)) & valid;

    // ~u & (u - 1) keeps only the bits below the lowest set bit of u
    // (e.g. 0110 0000 -> 0001 1111; all ones when u == 0), so matches
    // at or after the terminator are discarded.
    const unsigned v =
        _mm_movemask_epi8(_mm_cmpeq_epi8(cx16, x)) & valid & ~u & (u - 1);

    // ffs() returns the 1-based index of the lowest set bit.
    if (v) return block + ffs(v) - 1;
    if (u) return NULL;  // hit the terminator without finding `ch`

    block += 16;
    valid = 0xFFFFu;  // every byte of subsequent blocks is part of the string
  }
}
#else
#pragma message "Disabled SSE2 => no optimized ssechr"
// Portable fallback used when SSE2 is unavailable. Utils.h declares ssechr as
// an extern function, so a real definition is required here: a macro alias
// would only be visible inside this translation unit and would leave every
// other caller with an unresolved symbol at link time.
// Behaves exactly like strchr(), including returning a pointer to the
// terminator when ch == '\0'.
char const* ssechr(char const* s, char ch) { return strchr(s, ch); }
#endif
......@@ -400,4 +400,6 @@ inline bool endsWith(const std::string& s, const std::string& tail) {
return true;
}
/**
 * Find the first occurrence of @p ch in the NUL-terminated string @p s.
 *
 * @return pointer to the matching character, or NULL if it is not found.
 *
 * NOTE(review): the SSE2 implementation returns NULL when ch == '\0', whereas
 * the non-SSE2 fallback maps to strchr(), which returns a pointer to the
 * terminator in that case -- confirm callers never rely on either behavior.
 */
extern char const* ssechr(char const* s, char ch);
#endif /* _UTILS_H_ */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment