Commit 732e8102 authored by zhanxw's avatar zhanxw
Browse files

support bgen

parent 96be9b56
...@@ -310,9 +310,11 @@ bool BGenFile::parseLayout2() { ...@@ -310,9 +310,11 @@ bool BGenFile::parseLayout2() {
const uint8_t isPhased = *(uint8_t*)(buf.data() + 8 + N); const uint8_t isPhased = *(uint8_t*)(buf.data() + 8 + N);
var.isPhased = isPhased != 0; var.isPhased = isPhased != 0;
const uint8_t B = *(uint8_t*)(buf.data() + 8 + N + 1); // bits const uint8_t B = *(uint8_t*)(buf.data() + 8 + N + 1); // bits
assert(1 <= B && B <= 32);
#ifdef DEBUG
int totalBit = 0; int totalBit = 0;
int cumBit = 0; int cumBit = 0;
assert(1 <= B && B <= 32); #endif
var.missing.resize(N); var.missing.resize(N);
var.ploidy.resize(N); var.ploidy.resize(N);
...@@ -333,9 +335,9 @@ bool BGenFile::parseLayout2() { ...@@ -333,9 +335,9 @@ bool BGenFile::parseLayout2() {
// ploidy = Z, allele = K // ploidy = Z, allele = K
if (var.isPhased) { if (var.isPhased) {
// total Z * (K- 1) * B bits bits used // total Z * (K- 1) * B bits bits used
totalBit = Z * (var.K - 1) * B;
#ifdef DEBUG #ifdef DEBUG
totalBit = Z * (var.K - 1) * B;
printf("Total phased bits = %d\n", totalBit); printf("Total phased bits = %d\n", totalBit);
#endif #endif
...@@ -351,8 +353,8 @@ bool BGenFile::parseLayout2() { ...@@ -351,8 +353,8 @@ bool BGenFile::parseLayout2() {
} else { } else {
// total ( ( (Z+K-1) choose (K-1) ) -1 ) * B bits used // total ( ( (Z+K-1) choose (K-1) ) -1 ) * B bits used
const int nCombination = choose((Z + var.K - 1), (var.K - 1)); const int nCombination = choose((Z + var.K - 1), (var.K - 1));
totalBit = (nCombination - 1) * B;
#ifdef DEBUG #ifdef DEBUG
totalBit = (nCombination - 1) * B;
printf("Total unphased bits = %d\n", totalBit); printf("Total unphased bits = %d\n", totalBit);
#endif #endif
float remainProb = 1.0; float remainProb = 1.0;
...@@ -363,7 +365,9 @@ bool BGenFile::parseLayout2() { ...@@ -363,7 +365,9 @@ bool BGenFile::parseLayout2() {
} }
var.prob.push_back(remainProb); var.prob.push_back(remainProb);
} }
#ifdef DEBUG
cumBit += totalBit; cumBit += totalBit;
#endif
} }
var.index.push_back(var.prob.size()); var.index.push_back(var.prob.size());
...@@ -394,8 +398,7 @@ void BGenFile::parseString(FILE* fp, int lenByte, std::string* out) { ...@@ -394,8 +398,7 @@ void BGenFile::parseString(FILE* fp, int lenByte, std::string* out) {
int nRead = fread(&siLen, sizeof(siLen), 1, fp); int nRead = fread(&siLen, sizeof(siLen), 1, fp);
assert(nRead == 1); assert(nRead == 1);
(*out).resize(siLen + 1); (*out).resize(siLen);
(*out)[siLen] = '\0';
nRead = fread(&(*out)[0], sizeof((*out)[0]), siLen, fp); nRead = fread(&(*out)[0], sizeof((*out)[0]), siLen, fp);
#ifdef DEBUG #ifdef DEBUG
printf("nRead = %d, read = %s\n", nRead, (*out).c_str()); printf("nRead = %d, read = %s\n", nRead, (*out).c_str());
...@@ -449,6 +452,7 @@ void BGenFile::setPeopleMask(const std::string& s, bool b) { ...@@ -449,6 +452,7 @@ void BGenFile::setPeopleMask(const std::string& s, bool b) {
sampleMask[i] = b; sampleMask[i] = b;
} }
} }
buildEffectiveIndex();
} }
void BGenFile::includePeople(const std::string& s) { setPeopleMask(s, false); } void BGenFile::includePeople(const std::string& s) { setPeopleMask(s, false); }
...@@ -468,12 +472,14 @@ void BGenFile::setPeopleMaskFromFile(const char* fn, bool b) { ...@@ -468,12 +472,14 @@ void BGenFile::setPeopleMaskFromFile(const char* fn, bool b) {
setPeopleMask(fd[i].c_str(), b); setPeopleMask(fd[i].c_str(), b);
} }
} }
buildEffectiveIndex();
} }
void BGenFile::includePeopleFromFile(const char* fn) { void BGenFile::includePeopleFromFile(const char* fn) {
setPeopleMaskFromFile(fn, false); setPeopleMaskFromFile(fn, false);
} }
void BGenFile::includeAllPeople() { void BGenFile::includeAllPeople() {
std::fill(sampleMask.begin(), sampleMask.end(), false); std::fill(sampleMask.begin(), sampleMask.end(), false);
buildEffectiveIndex();
} }
void BGenFile::excludePeople(const std::string& s) { setPeopleMask(s, true); } void BGenFile::excludePeople(const std::string& s) { setPeopleMask(s, true); }
...@@ -487,6 +493,7 @@ void BGenFile::excludePeopleFromFile(const char* fn) { ...@@ -487,6 +493,7 @@ void BGenFile::excludePeopleFromFile(const char* fn) {
} }
void BGenFile::excludeAllPeople() { void BGenFile::excludeAllPeople() {
std::fill(sampleMask.begin(), sampleMask.end(), true); std::fill(sampleMask.begin(), sampleMask.end(), true);
buildEffectiveIndex();
} }
////////////////////////////////////////////////// //////////////////////////////////////////////////
...@@ -567,3 +574,33 @@ int BGenFile::setSiteFile(const std::string& fn) { ...@@ -567,3 +574,33 @@ int BGenFile::setSiteFile(const std::string& fn) {
} }
return 0; return 0;
} }
int BGenFile::getNumEffectiveSample() const {
size_t ret = 0;
for (size_t i = 0; i != sampleMask.size(); ++i) {
if (sampleMask[i]) continue;
ret++;
}
return ret;
}
void BGenFile::getIncludedSampleName(std::vector<std::string>* p) const {
if (!p) return;
p->clear();
for (size_t i = 0; i != sampleMask.size(); ++i) {
if (sampleMask[i]) continue;
p->push_back(sampleIdentifier[i]);
}
}
void BGenFile::buildEffectiveIndex() {
effectiveIndex.resize(0);
for (size_t i = 0; i != N; ++i) {
if (sampleMask[i]) continue;
effectiveIndex.push_back(i);
}
}
int BGenFile::getEffectiveIndex(int idx) const {
return this->effectiveIndex[idx];
}
...@@ -12,6 +12,11 @@ ...@@ -12,6 +12,11 @@
#include "BGenIndex.h" #include "BGenIndex.h"
#include "BGenVariant.h" #include "BGenVariant.h"
// copied from libVcf/VCFConstant.h
#define MISSING_GENOTYPE -9
#define PLINK_MALE 1
#define PLINK_FEMALE 2
class RangeList; class RangeList;
class BGenFile { class BGenFile {
...@@ -63,10 +68,13 @@ class BGenFile { ...@@ -63,10 +68,13 @@ class BGenFile {
public: public:
int getNumMarker() const { return M; } int getNumMarker() const { return M; }
int getNumSample() const { return N; } int getNumSample() const { return N; }
int getNumEffectiveSample() const;
const std::vector<std::string>& getSampleIdentifier() const { const std::vector<std::string>& getSampleIdentifier() const {
return sampleIdentifier; return sampleIdentifier;
} }
void getIncludedSampleName(std::vector<std::string>* p) const;
const BGenVariant& getVariant() const { return var; } const BGenVariant& getVariant() const { return var; }
int getEffectiveIndex(int idx) const;
private: private:
BGenFile(const BGenFile&); BGenFile(const BGenFile&);
...@@ -89,6 +97,8 @@ class BGenFile { ...@@ -89,6 +97,8 @@ class BGenFile {
void setPeopleMaskFromFile(const char* fn, bool b); void setPeopleMaskFromFile(const char* fn, bool b);
void setRangeMode(); void setRangeMode();
// range list related // range list related
void buildEffectiveIndex();
private: private:
std::string bgenFileName; std::string bgenFileName;
FILE* fp; FILE* fp;
...@@ -119,6 +129,7 @@ class BGenFile { ...@@ -119,6 +129,7 @@ class BGenFile {
bool autoMergeRange; bool autoMergeRange;
Mode mode; /// read consecutively or read by index Mode mode; /// read consecutively or read by index
std::vector<bool> sampleMask; // true means exclusion std::vector<bool> sampleMask; // true means exclusion
std::vector<int> effectiveIndex;
// allow chromosomal sites // allow chromosomal sites
std::set<std::string> allowedSite; std::set<std::string> allowedSite;
}; // class BGenFile }; // class BGenFile
......
...@@ -273,6 +273,23 @@ struct BGenVariant { ...@@ -273,6 +273,23 @@ struct BGenVariant {
fprintf(fp, "%g", prob[i]); fprintf(fp, "%g", prob[i]);
} }
} }
/// Handle dosage //////////////////////////////////////////////////
void printDosage(int i, FILE* fp) const {
if (missing[i]) {
fputs(".", fp);
return;
}
if (ploidy[i] == 2 && K == 2) {
// const float prob0 = prob[index[i]];
const float prob1 = prob[index[i] + 1];
const float prob2 = prob[index[i] + 2];
fprintf(fp, "%g", prob1 + 2.0 * prob2);
} else {
fputs(".", fp);
}
}
}; };
#endif /* _BGENVARIANT_H_ */ #endif /* _BGENVARIANT_H_ */
...@@ -233,15 +233,15 @@ GenotypeExtractor::~GenotypeExtractor() {} ...@@ -233,15 +233,15 @@ GenotypeExtractor::~GenotypeExtractor() {}
// } // }
// } // }
// void GenotypeExtractor::assign(const std::vector<double>& from, int nrow, void GenotypeExtractor::assign(const std::vector<double>& from, int nrow,
// int ncol, Matrix* to) { int ncol, Matrix* to) {
// assert(to); assert(to);
// Matrix& out = *to; Matrix& out = *to;
// out.Dimension(nrow, ncol); out.Dimension(nrow, ncol);
// assert((int)from.size() == nrow * ncol); assert((int)from.size() == nrow * ncol);
// for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
// for (int j = 0; j < ncol; ++j) { for (int j = 0; j < ncol; ++j) {
// out[i][j] = from[nrow * j + i]; out[i][j] = from[nrow * j + i];
// } }
// } }
// } }
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
class Matrix; class Matrix;
class RangeList; class RangeList;
class Result; class Result;
class VCFExtractor; // class VCFExtractor;
class VCFIndividual; // class VCFIndividual;
class GenotypeCounter; class GenotypeCounter;
class GenotypeExtractor { class GenotypeExtractor {
...@@ -45,8 +45,6 @@ class GenotypeExtractor { ...@@ -45,8 +45,6 @@ class GenotypeExtractor {
virtual void setSiteDepthMax(int d) = 0; virtual void setSiteDepthMax(int d) = 0;
// @return true if GD is valid // @return true if GD is valid
// if GD is missing, we will take GD = 0 // if GD is missing, we will take GD = 0
virtual bool checkGD(VCFIndividual& indv, int gdIdx) = 0;
virtual bool checkGQ(VCFIndividual& indv, int gqIdx) = 0;
virtual void setGDmin(int m) = 0; virtual void setGDmin(int m) = 0;
virtual void setGDmax(int m) = 0; virtual void setGDmax(int m) = 0;
virtual void setGQmin(int m) = 0; virtual void setGQmin(int m) = 0;
...@@ -109,8 +107,8 @@ class GenotypeExtractor { ...@@ -109,8 +107,8 @@ class GenotypeExtractor {
// assign extracted genotype @param from to a @param nrow by @param ncol // assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to // output matrix @param to
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
#endif #endif
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
void enableMultiAllelicMode() { this->multiAllelicMode = true; } void enableMultiAllelicMode() { this->multiAllelicMode = true; }
public: public:
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "libsrc/MathMatrix.h" #include "libsrc/MathMatrix.h"
#include "libsrc/MathVector.h" #include "libsrc/MathVector.h"
#include "src/BGenGenotypeExtractor.h"
#include "src/DataConsolidator.h" #include "src/DataConsolidator.h"
#include "src/DataLoader.h" #include "src/DataLoader.h"
#include "src/GenotypeExtractor.h" #include "src/GenotypeExtractor.h"
...@@ -28,7 +29,6 @@ ...@@ -28,7 +29,6 @@
#include "src/ModelManager.h" #include "src/ModelManager.h"
#include "src/Result.h" #include "src/Result.h"
#include "src/VCFGenotypeExtractor.h" #include "src/VCFGenotypeExtractor.h"
// #include "src/BGenGenotypeExtractor.h"
Logger* logger = NULL; Logger* logger = NULL;
...@@ -247,6 +247,7 @@ void welcome() { ...@@ -247,6 +247,7 @@ void welcome() {
BEGIN_PARAMETER_LIST(); BEGIN_PARAMETER_LIST();
ADD_PARAMETER_GROUP("Basic Input/Output"); ADD_PARAMETER_GROUP("Basic Input/Output");
ADD_STRING_PARAMETER(inVcf, "--inVcf", "Input VCF File"); ADD_STRING_PARAMETER(inVcf, "--inVcf", "Input VCF File");
ADD_STRING_PARAMETER(inBgen, "--inBgen", "Input BGEN File");
ADD_STRING_PARAMETER(outPrefix, "--out", "Output prefix"); ADD_STRING_PARAMETER(outPrefix, "--out", "Output prefix");
ADD_BOOL_PARAMETER(outputRaw, "--outputRaw", ADD_BOOL_PARAMETER(outputRaw, "--outputRaw",
"Output genotypes, phenotype, covariates(if any); and " "Output genotypes, phenotype, covariates(if any); and "
...@@ -439,8 +440,15 @@ int main(int argc, char** argv) { ...@@ -439,8 +440,15 @@ int main(int argc, char** argv) {
if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest"; if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest";
REQUIRE_STRING_PARAMETER(FLAG_inVcf, if (FLAG_inVcf.empty() && FLAG_inBgen.empty()) {
"Please provide input file using: --inVcf"); fprintf(stderr, "Please provide one input file using: --inVcf or --inBgen");
exit(1);
}
if (!FLAG_inVcf.empty() && !FLAG_inBgen.empty()) {
fprintf(stderr,
"Please provide one kind of input file using: --inVcf or --inBgen");
exit(1);
}
// check new version // check new version
if (!FLAG_noweb) { if (!FLAG_noweb) {
...@@ -496,6 +504,8 @@ int main(int argc, char** argv) { ...@@ -496,6 +504,8 @@ int main(int argc, char** argv) {
GenotypeExtractor* ge = NULL; GenotypeExtractor* ge = NULL;
if (!FLAG_inVcf.empty()) { if (!FLAG_inVcf.empty()) {
ge = new VCFGenotypeExtractor(FLAG_inVcf); ge = new VCFGenotypeExtractor(FLAG_inVcf);
} else if (!FLAG_inBgen.empty()) {
ge = new BGenGenotypeExtractor(FLAG_inBgen);
} else { } else {
assert(false); assert(false);
} }
......
...@@ -26,6 +26,7 @@ BASE = Main \ ...@@ -26,6 +26,7 @@ BASE = Main \
Model \ Model \
GenotypeExtractor \ GenotypeExtractor \
VCFGenotypeExtractor \ VCFGenotypeExtractor \
BGenGenotypeExtractor \
DataLoader \ DataLoader \
GenotypeCounter \ GenotypeCounter \
......
...@@ -481,16 +481,3 @@ double VCFGenotypeExtractor::getGenotypeForAltAllele( ...@@ -481,16 +481,3 @@ double VCFGenotypeExtractor::getGenotypeForAltAllele(
return MISSING_GENOTYPE; return MISSING_GENOTYPE;
} }
} }
void VCFGenotypeExtractor::assign(const std::vector<double>& from, int nrow,
int ncol, Matrix* to) {
assert(to);
Matrix& out = *to;
out.Dimension(nrow, ncol);
assert((int)from.size() == nrow * ncol);
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
out[i][j] = from[nrow * j + i];
}
}
}
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// class Matrix; // class Matrix;
// class RangeList; // class RangeList;
// class Result; // class Result;
// class VCFExtractor; class VCFExtractor;
// class VCFIndividual; class VCFIndividual;
// class GenotypeCounter; // class GenotypeCounter;
/** /**
...@@ -83,9 +83,6 @@ class VCFGenotypeExtractor : public GenotypeExtractor { ...@@ -83,9 +83,6 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
void getPeopleName(std::vector<std::string>* p); void getPeopleName(std::vector<std::string>* p);
void getIncludedPeopleName(std::vector<std::string>* p) const; void getIncludedPeopleName(std::vector<std::string>* p) const;
const std::vector<GenotypeCounter>& getGenotypeCounter() const {
return this->counter;
}
/** /**
* @return weigth, its length equals to # of markers * @return weigth, its length equals to # of markers
*/ */
...@@ -95,10 +92,10 @@ class VCFGenotypeExtractor : public GenotypeExtractor { ...@@ -95,10 +92,10 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
this->dosageTag = tag; this->dosageTag = tag;
} }
void unsetDosageTag() { this->dosageTag.clear(); } void unsetDosageTag() { this->dosageTag.clear(); }
bool isDosage() const { return !this->dosageTag.empty(); } // bool isDosage() const { return !this->dosageTag.empty(); }
void setParRegion(ParRegion* p) { this->parRegion = p; } // void setParRegion(ParRegion* p) { this->parRegion = p; }
// Sex (1=male; 2=female; other=unknown) // // Sex (1=male; 2=female; other=unknown)
void setSex(const std::vector<int>* sex) { this->sex = sex; } // void setSex(const std::vector<int>* sex) { this->sex = sex; }
// coding male chromX as 0/2 instead of 0/1 // coding male chromX as 0/2 instead of 0/1
// similarly, for dosage, just multiply 2.0 from original dosage // similarly, for dosage, just multiply 2.0 from original dosage
// void enableClaytonCoding() { this->claytonCoding = true; } // void enableClaytonCoding() { this->claytonCoding = true; }
...@@ -118,14 +115,9 @@ class VCFGenotypeExtractor : public GenotypeExtractor { ...@@ -118,14 +115,9 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
// assign extracted genotype @param from to a @param nrow by @param ncol // assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to // output matrix @param to
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to); // void assign(const std::vector<double>& from, int nrow, int ncol, Matrix*
void enableMultiAllelicMode() { this->multiAllelicMode = true; } // to);
// void enableMultiAllelicMode() { this->multiAllelicMode = true; }
public:
const static int SUCCEED = 0;
const static int ERROR = -1;
const static int FILE_END = -2;
const static int FAIL_FILTER = -3;
private: private:
VCFExtractor* vin; VCFExtractor* vin;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment