Commit 732e8102 authored by zhanxw's avatar zhanxw

support bgen

parent 96be9b56
......@@ -310,9 +310,11 @@ bool BGenFile::parseLayout2() {
const uint8_t isPhased = *(uint8_t*)(buf.data() + 8 + N);
var.isPhased = isPhased != 0;
const uint8_t B = *(uint8_t*)(buf.data() + 8 + N + 1); // bits
assert(1 <= B && B <= 32);
#ifdef DEBUG
int totalBit = 0;
int cumBit = 0;
assert(1 <= B && B <= 32);
#endif
var.missing.resize(N);
var.ploidy.resize(N);
......@@ -333,9 +335,9 @@ bool BGenFile::parseLayout2() {
// ploidy = Z, allele = K
if (var.isPhased) {
// total Z * (K- 1) * B bits bits used
totalBit = Z * (var.K - 1) * B;
// total Z * (K- 1) * B bits bits used
#ifdef DEBUG
totalBit = Z * (var.K - 1) * B;
printf("Total phased bits = %d\n", totalBit);
#endif
......@@ -351,8 +353,8 @@ bool BGenFile::parseLayout2() {
} else {
// total ( ( (Z+K-1) choose (K-1) ) -1 ) * B bits used
const int nCombination = choose((Z + var.K - 1), (var.K - 1));
totalBit = (nCombination - 1) * B;
#ifdef DEBUG
totalBit = (nCombination - 1) * B;
printf("Total unphased bits = %d\n", totalBit);
#endif
float remainProb = 1.0;
......@@ -363,7 +365,9 @@ bool BGenFile::parseLayout2() {
}
var.prob.push_back(remainProb);
}
#ifdef DEBUG
cumBit += totalBit;
#endif
}
var.index.push_back(var.prob.size());
......@@ -394,8 +398,7 @@ void BGenFile::parseString(FILE* fp, int lenByte, std::string* out) {
int nRead = fread(&siLen, sizeof(siLen), 1, fp);
assert(nRead == 1);
(*out).resize(siLen + 1);
(*out)[siLen] = '\0';
(*out).resize(siLen);
nRead = fread(&(*out)[0], sizeof((*out)[0]), siLen, fp);
#ifdef DEBUG
printf("nRead = %d, read = %s\n", nRead, (*out).c_str());
......@@ -449,6 +452,7 @@ void BGenFile::setPeopleMask(const std::string& s, bool b) {
sampleMask[i] = b;
}
}
buildEffectiveIndex();
}
void BGenFile::includePeople(const std::string& s) { setPeopleMask(s, false); }
......@@ -468,12 +472,14 @@ void BGenFile::setPeopleMaskFromFile(const char* fn, bool b) {
setPeopleMask(fd[i].c_str(), b);
}
}
buildEffectiveIndex();
}
void BGenFile::includePeopleFromFile(const char* fn) {
setPeopleMaskFromFile(fn, false);
}
void BGenFile::includeAllPeople() {
std::fill(sampleMask.begin(), sampleMask.end(), false);
buildEffectiveIndex();
}
void BGenFile::excludePeople(const std::string& s) { setPeopleMask(s, true); }
......@@ -487,6 +493,7 @@ void BGenFile::excludePeopleFromFile(const char* fn) {
}
void BGenFile::excludeAllPeople() {
std::fill(sampleMask.begin(), sampleMask.end(), true);
buildEffectiveIndex();
}
//////////////////////////////////////////////////
......@@ -567,3 +574,33 @@ int BGenFile::setSiteFile(const std::string& fn) {
}
return 0;
}
int BGenFile::getNumEffectiveSample() const {
size_t ret = 0;
for (size_t i = 0; i != sampleMask.size(); ++i) {
if (sampleMask[i]) continue;
ret++;
}
return ret;
}
void BGenFile::getIncludedSampleName(std::vector<std::string>* p) const {
if (!p) return;
p->clear();
for (size_t i = 0; i != sampleMask.size(); ++i) {
if (sampleMask[i]) continue;
p->push_back(sampleIdentifier[i]);
}
}
void BGenFile::buildEffectiveIndex() {
effectiveIndex.resize(0);
for (size_t i = 0; i != N; ++i) {
if (sampleMask[i]) continue;
effectiveIndex.push_back(i);
}
}
int BGenFile::getEffectiveIndex(int idx) const {
return this->effectiveIndex[idx];
}
......@@ -12,6 +12,11 @@
#include "BGenIndex.h"
#include "BGenVariant.h"
// copied from libVcf/VCFConstant.h
#define MISSING_GENOTYPE -9
#define PLINK_MALE 1
#define PLINK_FEMALE 2
class RangeList;
class BGenFile {
......@@ -63,10 +68,13 @@ class BGenFile {
public:
int getNumMarker() const { return M; }
int getNumSample() const { return N; }
int getNumEffectiveSample() const;
const std::vector<std::string>& getSampleIdentifier() const {
return sampleIdentifier;
}
void getIncludedSampleName(std::vector<std::string>* p) const;
const BGenVariant& getVariant() const { return var; }
int getEffectiveIndex(int idx) const;
private:
BGenFile(const BGenFile&);
......@@ -89,6 +97,8 @@ class BGenFile {
void setPeopleMaskFromFile(const char* fn, bool b);
void setRangeMode();
// range list related
void buildEffectiveIndex();
private:
std::string bgenFileName;
FILE* fp;
......@@ -119,6 +129,7 @@ class BGenFile {
bool autoMergeRange;
Mode mode; /// read consecutively or read by index
std::vector<bool> sampleMask; // true means exclusion
std::vector<int> effectiveIndex;
// allow chromosomal sites
std::set<std::string> allowedSite;
}; // class BGenFile
......
......@@ -273,6 +273,23 @@ struct BGenVariant {
fprintf(fp, "%g", prob[i]);
}
}
/// Handle dosage //////////////////////////////////////////////////
void printDosage(int i, FILE* fp) const {
if (missing[i]) {
fputs(".", fp);
return;
}
if (ploidy[i] == 2 && K == 2) {
// const float prob0 = prob[index[i]];
const float prob1 = prob[index[i] + 1];
const float prob2 = prob[index[i] + 2];
fprintf(fp, "%g", prob1 + 2.0 * prob2);
} else {
fputs(".", fp);
}
}
};
#endif /* _BGENVARIANT_H_ */
......@@ -233,15 +233,15 @@ GenotypeExtractor::~GenotypeExtractor() {}
// }
// }
// void GenotypeExtractor::assign(const std::vector<double>& from, int nrow,
// int ncol, Matrix* to) {
// assert(to);
// Matrix& out = *to;
// out.Dimension(nrow, ncol);
// assert((int)from.size() == nrow * ncol);
// for (int i = 0; i < nrow; ++i) {
// for (int j = 0; j < ncol; ++j) {
// out[i][j] = from[nrow * j + i];
// }
// }
// }
void GenotypeExtractor::assign(const std::vector<double>& from, int nrow,
int ncol, Matrix* to) {
assert(to);
Matrix& out = *to;
out.Dimension(nrow, ncol);
assert((int)from.size() == nrow * ncol);
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
out[i][j] = from[nrow * j + i];
}
}
}
......@@ -11,8 +11,8 @@
class Matrix;
class RangeList;
class Result;
class VCFExtractor;
class VCFIndividual;
// class VCFExtractor;
// class VCFIndividual;
class GenotypeCounter;
class GenotypeExtractor {
......@@ -45,8 +45,6 @@ class GenotypeExtractor {
virtual void setSiteDepthMax(int d) = 0;
// @return true if GD is valid
// if GD is missing, we will take GD = 0
virtual bool checkGD(VCFIndividual& indv, int gdIdx) = 0;
virtual bool checkGQ(VCFIndividual& indv, int gqIdx) = 0;
virtual void setGDmin(int m) = 0;
virtual void setGDmax(int m) = 0;
virtual void setGQmin(int m) = 0;
......@@ -109,8 +107,8 @@ class GenotypeExtractor {
// assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
#endif
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
void enableMultiAllelicMode() { this->multiAllelicMode = true; }
public:
......
......@@ -21,6 +21,7 @@
#include "libsrc/MathMatrix.h"
#include "libsrc/MathVector.h"
#include "src/BGenGenotypeExtractor.h"
#include "src/DataConsolidator.h"
#include "src/DataLoader.h"
#include "src/GenotypeExtractor.h"
......@@ -28,7 +29,6 @@
#include "src/ModelManager.h"
#include "src/Result.h"
#include "src/VCFGenotypeExtractor.h"
// #include "src/BGenGenotypeExtractor.h"
Logger* logger = NULL;
......@@ -247,6 +247,7 @@ void welcome() {
BEGIN_PARAMETER_LIST();
ADD_PARAMETER_GROUP("Basic Input/Output");
ADD_STRING_PARAMETER(inVcf, "--inVcf", "Input VCF File");
ADD_STRING_PARAMETER(inBgen, "--inBgen", "Input BGEN File");
ADD_STRING_PARAMETER(outPrefix, "--out", "Output prefix");
ADD_BOOL_PARAMETER(outputRaw, "--outputRaw",
"Output genotypes, phenotype, covariates(if any); and "
......@@ -439,8 +440,15 @@ int main(int argc, char** argv) {
if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest";
REQUIRE_STRING_PARAMETER(FLAG_inVcf,
"Please provide input file using: --inVcf");
if (FLAG_inVcf.empty() && FLAG_inBgen.empty()) {
fprintf(stderr, "Please provide one input file using: --inVcf or --inBgen");
exit(1);
}
if (!FLAG_inVcf.empty() && !FLAG_inBgen.empty()) {
fprintf(stderr,
"Please provide one kind of input file using: --inVcf or --inBgen");
exit(1);
}
// check new version
if (!FLAG_noweb) {
......@@ -496,6 +504,8 @@ int main(int argc, char** argv) {
GenotypeExtractor* ge = NULL;
if (!FLAG_inVcf.empty()) {
ge = new VCFGenotypeExtractor(FLAG_inVcf);
} else if (!FLAG_inBgen.empty()) {
ge = new BGenGenotypeExtractor(FLAG_inBgen);
} else {
assert(false);
}
......
......@@ -26,6 +26,7 @@ BASE = Main \
Model \
GenotypeExtractor \
VCFGenotypeExtractor \
BGenGenotypeExtractor \
DataLoader \
GenotypeCounter \
......
......@@ -481,16 +481,3 @@ double VCFGenotypeExtractor::getGenotypeForAltAllele(
return MISSING_GENOTYPE;
}
}
void VCFGenotypeExtractor::assign(const std::vector<double>& from, int nrow,
int ncol, Matrix* to) {
assert(to);
Matrix& out = *to;
out.Dimension(nrow, ncol);
assert((int)from.size() == nrow * ncol);
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
out[i][j] = from[nrow * j + i];
}
}
}
......@@ -12,8 +12,8 @@
// class Matrix;
// class RangeList;
// class Result;
// class VCFExtractor;
// class VCFIndividual;
class VCFExtractor;
class VCFIndividual;
// class GenotypeCounter;
/**
......@@ -83,9 +83,6 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
void getPeopleName(std::vector<std::string>* p);
void getIncludedPeopleName(std::vector<std::string>* p) const;
const std::vector<GenotypeCounter>& getGenotypeCounter() const {
return this->counter;
}
/**
* @return weigth, its length equals to # of markers
*/
......@@ -95,10 +92,10 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
this->dosageTag = tag;
}
void unsetDosageTag() { this->dosageTag.clear(); }
bool isDosage() const { return !this->dosageTag.empty(); }
void setParRegion(ParRegion* p) { this->parRegion = p; }
// Sex (1=male; 2=female; other=unknown)
void setSex(const std::vector<int>* sex) { this->sex = sex; }
// bool isDosage() const { return !this->dosageTag.empty(); }
// void setParRegion(ParRegion* p) { this->parRegion = p; }
// // Sex (1=male; 2=female; other=unknown)
// void setSex(const std::vector<int>* sex) { this->sex = sex; }
// coding male chromX as 0/2 instead of 0/1
// similarly, for dosage, just multiply 2.0 from original dosage
// void enableClaytonCoding() { this->claytonCoding = true; }
......@@ -118,14 +115,9 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
// assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
void enableMultiAllelicMode() { this->multiAllelicMode = true; }
public:
const static int SUCCEED = 0;
const static int ERROR = -1;
const static int FILE_END = -2;
const static int FAIL_FILTER = -3;
// void assign(const std::vector<double>& from, int nrow, int ncol, Matrix*
// to);
// void enableMultiAllelicMode() { this->multiAllelicMode = true; }
private:
VCFExtractor* vin;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment