Commit 31df5d39 authored by zhanxw's avatar zhanxw
Browse files

support kgg input

parent c6022595
......@@ -2,6 +2,7 @@
#include "base/Exception.h"
#include "base/IO.h"
#include "base/TypeConversion.h"
#ifndef UNUSED
#define UNUSED(x) (void)(x)
......@@ -209,6 +210,143 @@ bool KGGInputFile::readRecord() {
return true;
}
//////////////////////////////////////////////////
// Sample inclusion/exclusion
void KGGInputFile::setPeopleMask(const std::string& s, bool b) {
for (size_t i = 0; i != indv.size(); ++i) {
if (indv[i] == s) {
sampleMask[i] = b;
}
}
buildEffectiveIndex();
}
void KGGInputFile::includePeople(const std::string& s) {
setPeopleMask(s, false);
}
void KGGInputFile::includePeople(const std::vector<std::string>& v) {
for (size_t i = 0; i != v.size(); ++i) {
includePeople(v[i].c_str());
}
}
void KGGInputFile::setPeopleMaskFromFile(const char* fn, bool b) {
if (!fn || strlen(fn) == 0) {
return;
}
LineReader lr(fn);
std::vector<std::string> fd;
while (lr.readLineBySep(&fd, "\t ")) {
for (unsigned int i = 0; i < fd.size(); i++) {
setPeopleMask(fd[i].c_str(), b);
}
}
buildEffectiveIndex();
}
void KGGInputFile::includePeopleFromFile(const char* fn) {
setPeopleMaskFromFile(fn, false);
}
void KGGInputFile::includeAllPeople() {
std::fill(sampleMask.begin(), sampleMask.end(), false);
buildEffectiveIndex();
}
void KGGInputFile::excludePeople(const std::string& s) {
setPeopleMask(s, true);
}
void KGGInputFile::excludePeople(const std::vector<std::string>& v) {
for (size_t i = 0; i != v.size(); ++i) {
excludePeople(v[i]);
}
}
void KGGInputFile::excludePeopleFromFile(const char* fn) {
setPeopleMaskFromFile(fn, true);
}
void KGGInputFile::excludeAllPeople() {
std::fill(sampleMask.begin(), sampleMask.end(), true);
buildEffectiveIndex();
}
//////////////////////////////////////////////////
// Adjust range collections
#if 0
void KGGInputFile::enableAutoMerge() { warnUnsupported("enableAutoMerge"); }
void KGGInputFile::disableAutoMerge() { warnUnsupported("disableAutoMerge"); }
// void clearRange();
void KGGInputFile::setRangeFile(const char* fn) {
warnUnsupported("setRangeFile");
}
// @param l is a string of range(s)
void KGGInputFile::setRange(const char* chrom, int begin, int end) {
warnUnsupported("setRange");
}
void KGGInputFile::setRange(const RangeList& rl) {
warnUnsupported("setRange");
}
void KGGInputFile::setRangeList(const std::string& l) {
warnUnsupported("setRangeList");
}
// this function the entry point for all function add/change region list
void KGGInputFile::setRangeList(const RangeList& rl) {
warnUnsupported("setRangeList");
}
void KGGInputFile::setRangeMode() { warnUnsupported("setRangeMode"); }
#endif
int KGGInputFile::setSiteFile(const std::string& fn) {
if (fn.empty()) return 0;
std::vector<std::string> fd;
LineReader lr(fn);
int pos;
std::string chromPos;
while (lr.readLineBySep(&fd, "\t ")) {
if (fd.empty()) continue;
if (fd[0].find(':') != std::string::npos) {
this->allowedSite.insert(fd[0]);
continue;
}
if (fd.size() >= 2 && str2int(fd[1], &pos) && pos > 0) {
chromPos = fd[0];
chromPos += ':';
chromPos += fd[1];
this->allowedSite.insert(chromPos);
continue;
}
}
return 0;
}
int KGGInputFile::getNumEffectiveSample() const {
size_t ret = 0;
for (size_t i = 0; i != sampleMask.size(); ++i) {
if (sampleMask[i]) continue;
ret++;
}
return ret;
}
void KGGInputFile::getIncludedSampleName(std::vector<std::string>* p) const {
if (!p) return;
p->clear();
for (size_t i = 0; i != sampleMask.size(); ++i) {
if (sampleMask[i]) continue;
p->push_back(getSampleName()[i]);
}
}
void KGGInputFile::buildEffectiveIndex() {
effectiveIndex.resize(0);
const size_t N = getNumSample();
for (size_t i = 0; i != N; ++i) {
if (sampleMask[i]) continue;
effectiveIndex.push_back(i);
}
}
int KGGInputFile::getEffectiveIndex(int idx) const {
return this->effectiveIndex[idx];
}
int KGGInputFile::getGenotype(int indvIdx) {
const int nAllele = alt[variantIdx].size() + 1;
......@@ -266,3 +404,7 @@ void KGGInputFile::buildPhasedTable(int allele) {
m[val].x[0] = -9;
m[val].x[1] = -9;
}
void KGGInputFile::warnUnsupported(const char* tag) {
fprintf(stderr, "Please remove unsupported features related to %s", tag);
}
......@@ -2,6 +2,7 @@
#define _KGGINPUTFILE_H_
#include <map>
#include <set>
#include <string>
#include <vector>
......@@ -17,6 +18,20 @@ class KGGInputFile {
// @return false if reached end
bool readRecord();
//////////////////////////////////////////////////
// Sample inclusion/exclusion
void includePeople(const std::string& s);
void includePeople(const std::vector<std::string>& v);
void includePeopleFromFile(const char* fn);
void includeAllPeople();
void excludePeople(const std::string& s);
void excludePeople(const std::vector<std::string>& v);
void excludePeopleFromFile(const char* fn);
void excludeAllPeople();
// No range related function
int setSiteFile(const std::string& fn);
int getGenotype(int indvIdx);
void getAllele(int indvIdx, int* a1, int* a2);
......@@ -40,9 +55,13 @@ class KGGInputFile {
}
int getNumIndv() const { return this->indv.size(); }
int getNumSample() const { return this->indv.size(); }
int getNumEffectiveSample() const;
int getNumMarker() const { return this->snp2Idx.size(); }
const std::vector<std::string>& getIndv() const { return this->indv; }
const std::vector<std::string>& getSampleName() const { return this->indv; }
void getIncludedSampleName(std::vector<std::string>* p) const;
int getEffectiveIndex(int idx) const;
const std::vector<std::string>& getIID() const { return this->indv; }
const std::vector<std::string>& getChrom() const { return this->chrom; }
const std::vector<std::string>& getMarkerName() const { return this->snp; }
......@@ -71,6 +90,14 @@ class KGGInputFile {
void buildUnphasedTable(int numAllele);
void buildPhasedTable(int numAllele);
// sample inclusion/exclusion related
void setPeopleMask(const std::string& s, bool b);
void setPeopleMaskFromFile(const char* fn, bool b);
void setRangeMode();
// range list related
void buildEffectiveIndex();
void warnUnsupported(const char* tag);
private:
typedef struct TwoChar { unsigned char x[2]; } TwoChar;
std::map<std::string, int> snp2Idx;
......@@ -89,6 +116,11 @@ class KGGInputFile {
bool phased;
std::map<int, std::map<char, TwoChar> > unphasedTable;
std::map<int, std::map<char, TwoChar> > phasedTable;
std::vector<bool> sampleMask; // true means exclusion
std::vector<int> effectiveIndex;
// allow chromosomal sites
std::set<std::string> allowedSite;
};
#endif /* _KGGINPUTFILE_H_ */
#include "KGGGenotypeExtractor.h"
#include "GenotypeCounter.h"
#include "Result.h"
#include "base/Argument.h"
#include "base/Logger.h"
#include "libVcf/KGGInputFile.h"
#include "libVcf/VCFConstant.h"
#include "libsrc/MathMatrix.h"
#include "libsrc/MathVector.h"
DECLARE_BOOL_PARAMETER(outputID);
extern Logger* logger;
KGGGenotypeExtractor::KGGGenotypeExtractor(const std::string& fn)
: GenotypeExtractor(fn),
kggIn(NULL),
altAlleleToParse(-1),
currentVariant(0) {
this->kggIn = new KGGInputFile(fn.c_str(), ".gz");
}
KGGGenotypeExtractor::~KGGGenotypeExtractor() {
if (this->kggIn) {
delete this->kggIn;
this->kggIn = NULL;
}
}
int KGGGenotypeExtractor::extractMultipleGenotype(Matrix* g) {
warnUnsupported("extractMultipleGenotype");
return -1;
#if 0
assert(g);
assert(g->rows >= 0 && g->cols >= 0);
g->Dimension(0, 0);
int row = 0;
this->genotype.clear();
this->altAlleleToParse = -1;
const bool useDosage = false; // TODO: later may support genotype calls
while (true) {
if (this->altAlleleToParse <= 0) {
if (!this->kggIn->readRecord()) {
// reached the end
break;
} else {
// const char* alt = this->kggIn->getKGGRecord().getAlt();
if (multiAllelicMode) {
// parseAltAllele(alt);
// this->altAlleleToParse = altAllele.size();
this->altAlleleToParse =
var.alleles.size() -
1; // num of alternative alleles to be parsed/processed
} else {
// this->altAllele.resize(1);
// this->altAllele[0] = alt;
this->altAlleleToParse = 1;
}
}
}
assert(this->altAlleleToParse > 0);
// parse alt allele one at a time
this->sampleSize = this->kggIn->getNumEffectiveSample();
row++;
this->variantName.resize(row);
this->counter.resize(row);
this->counter.back().reset();
this->hemiRegion.resize(row);
assert(this->parRegion);
const bool isHemiRegion =
this->parRegion->isHemiRegion(var.chrom.c_str(), var.pos);
// e.g.: Loop each (selected) people in the same order as in the KGG
double geno;
/// const int altAlleleGT = this->altAllele.size() - this->altAlleleToParse;
const int altAlleleGT =
var.alleles.size() -
this->altAlleleToParse; // when allele = K, alleleGT = 1, 2. ..., K-1
for (int i = 0; i < sampleSize; i++) {
// indv = people[i];
const int indvIdx = this->kggIn->getEffectiveIndex(i);
if (multiAllelicMode) {
geno = getGenotypeForAltAllele(var, indvIdx, useDosage, isHemiRegion,
(*sex)[i], altAlleleGT);
} else {
geno = getGenotype(var, indvIdx, useDosage, isHemiRegion, (*sex)[i]);
}
this->genotype.push_back(geno);
this->counter.back().add(geno);
} // end for i
// check frequency cutoffs
const double maf = counter.back().getMAF();
if ((this->freqMin > 0. && this->freqMin > maf) ||
(this->freqMax > 0. && this->freqMax < maf)) {
// undo loaded contents
row--;
this->variantName.resize(row);
this->counter.resize(row);
this->hemiRegion.resize(row);
this->genotype.resize(this->genotype.size() - this->sampleSize);
this->altAlleleToParse--;
continue;
}
this->variantName.back() = var.chrom;
this->variantName.back() += ":";
this->variantName.back() += var.pos;
if (multiAllelicMode) {
this->variantName.back() += var.alleles[0];
this->variantName.back() += "/";
this->variantName.back() +=
var.alleles[altAllele.size() - this->altAlleleToParse];
}
this->hemiRegion.back() = (isHemiRegion);
this->altAlleleToParse--;
} // end while (this->kggIn->readRecord())
// now transpose (marker by people -> people by marker)
if (row > 0) {
assert((int)genotype.size() == this->sampleSize * row);
assign(this->genotype, sampleSize, row, g);
for (int i = 0; i < row; ++i) {
g->SetColumnLabel(i, variantName[i].c_str());
}
}
return SUCCEED;
#endif
} // end extractMultipleGenotype(Matrix* g)
int KGGGenotypeExtractor::extractSingleGenotype(Matrix* g, Result* b) {
this->genotype.clear();
Result& buf = *b;
const bool useDosage = true;
int numAltAllele = this->kggIn->getAlt()[currentVariant].size();
if (this->altAlleleToParse <= 0) {
if (!this->kggIn->readRecord()) {
return FILE_END;
} else {
if (multiAllelicMode) {
// parseAltAllele(alt);
this->altAlleleToParse = numAltAllele - 1;
} else {
// this->altAllele.resize(1);
// this->altAllele[0] = alt;
this->altAlleleToParse = 1;
}
}
}
assert(this->altAlleleToParse >= 0);
buf.updateValue("CHROM", this->kggIn->getChrom()[currentVariant]);
buf.updateValue("POS", this->kggIn->getPosition()[currentVariant]);
if (FLAG_outputID) {
buf.updateValue("ID", this->kggIn->getMarkerName()[currentVariant]);
}
buf.updateValue("REF", this->kggIn->getRef()[currentVariant]);
buf.updateValue("ALT",
this->kggIn->getAlt()[currentVariant]
[numAltAllele - this->altAlleleToParse]);
// genotype.Dimension(people.size(), 1);
this->sampleSize = this->kggIn->getNumEffectiveSample();
this->variantName.resize(1);
this->counter.resize(1);
this->counter.back().reset();
this->hemiRegion.resize(1);
bool isHemiRegion = this->parRegion->isHemiRegion(
this->kggIn->getChrom()[currentVariant].c_str(),
this->kggIn->getPosition()[currentVariant]);
// e.g.: Loop each (selected) people in the same order as in the KGG
double geno;
// const int altAlleleGT = this->altAllele.size() - this->altAlleleToParse +
// 1;
const int altAlleleGT = numAltAllele - this->altAlleleToParse;
for (int i = 0; i < sampleSize; i++) {
// indv = people[i];
const int indvIdx = this->kggIn->getEffectiveIndex(i);
if (multiAllelicMode) {
geno = getGenotypeForAltAllele(indvIdx, useDosage, isHemiRegion,
(*sex)[i], altAlleleGT);
} else {
geno = getGenotype(indvIdx, useDosage, isHemiRegion, (*sex)[i]);
}
genotype.push_back(geno);
counter.back().add(geno);
}
// check frequency cutoffs
const double maf = counter[0].getMAF();
if ((this->freqMin > 0. && this->freqMin > maf) ||
(this->freqMax > 0. && this->freqMax < maf)) {
--this->altAlleleToParse;
return FAIL_FILTER;
}
variantName.back() = this->kggIn->getChrom()[currentVariant];
variantName.back() += ':';
variantName.back() += toString(this->kggIn->getPosition()[currentVariant]);
hemiRegion.back() = isHemiRegion;
assert((int)genotype.size() == sampleSize);
assign(genotype, sampleSize, 1, g);
g->SetColumnLabel(0, variantName.back().c_str());
--this->altAlleleToParse;
if (this->altAlleleToParse == 0) {
++currentVariant;
}
return SUCCEED;
} // end extractSingleGenotype()
#if 0
int loadMarkerFromKGG(const std::string& fileName, const std::string& marker,
std::vector<std::string>* rowLabel, Matrix* genotype) {
if (!rowLabel || !genotype) {
// invalid parameter
return -1;
}
Matrix& m = *genotype;
int col = 0;
KGGInputFile kggIn(fileName);
kggIn.setRangeList(marker);
while (kggIn.readRecord()) {
KGGRecord& r = kggIn.getKGGRecord();
KGGPeople& people = r.getPeople();
KGGIndividual* indv;
m.Dimension(people.size(), col + 1);
int GTidx = r.getFormatIndex("GT");
for (int i = 0; i < (int)people.size(); i++) {
indv = people[i];
// get GT index. if you are sure the index will not change,
// call this function only once!
if (GTidx >= 0) {
// printf("%s ", indv->justGet(0).toStr()); // [0] meaning the first
// field of each individual
m[i][col] = indv->justGet(GTidx).getGenotype();
} else {
logger->error("Cannot find GT field!");
return -1;
}
}
if (col == 0) {
// set-up names
rowLabel->resize(people.size());
for (size_t i = 0; i < people.size(); ++i) {
(*rowLabel)[i] = people[i]->getName();
}
}
std::string colLabel = r.getChrom();
colLabel += ":";
colLabel += r.getPosStr();
m.SetColumnLabel(col, colLabel.c_str());
++col;
}
return 0;
}
#endif
bool KGGGenotypeExtractor::setSiteFreqMin(const double f) {
if (f < 0.0 || f > 1.0) {
return false;
}
this->freqMin = f - 1e-10; // allow rounding error
return true;
}
bool KGGGenotypeExtractor::setSiteFreqMax(const double f) {
if (f < 0.0 || f > 1.0) {
return false;
}
this->freqMax = f + 1e-10; // allow rounding error
return true;
}
void KGGGenotypeExtractor::setSiteDepthMin(int d) {
warnUnsupported("site depth");
}
void KGGGenotypeExtractor::setSiteDepthMax(int d) {
warnUnsupported("site depth");
}
// // @return true if GD is valid
// // if GD is missing, we will take GD = 0
// bool KGGGenotypeExtractor::checkGD(KGGIndividual& indv, int gdIdx) {
// warnUnsupported("GD");
// return true;
// }
// bool KGGGenotypeExtractor::checkGQ(KGGIndividual& indv, int gqIdx) {
// warnUnsupported("GQ");
// return true;
// }
void KGGGenotypeExtractor::setGDmin(int m) { warnUnsupported("GD"); }
void KGGGenotypeExtractor::setGDmax(int m) { warnUnsupported("GD"); }
void KGGGenotypeExtractor::setGQmin(int m) { warnUnsupported("GQ"); }
void KGGGenotypeExtractor::setGQmax(int m) { warnUnsupported("GQ"); }
void KGGGenotypeExtractor::setSiteFile(const std::string& fn) {
this->kggIn->setSiteFile(fn);
}
void KGGGenotypeExtractor::setSiteQualMin(int q) {
warnUnsupported("SiteQual");
}
void KGGGenotypeExtractor::setSiteMACMin(int n) { warnUnsupported("SiteMAC"); }
int KGGGenotypeExtractor::setAnnoType(const std::string& s) {
warnUnsupported("anno");
return -1;
}
void KGGGenotypeExtractor::setRange(const RangeList& l) {
warnUnsupported("setRange");
}
void KGGGenotypeExtractor::setRangeList(const std::string& l) {
warnUnsupported("setRangeList");
}
void KGGGenotypeExtractor::setRangeFile(const std::string& fn) {
warnUnsupported("setRangeFile");
}
void KGGGenotypeExtractor::includePeople(const std::string& v) {
this->kggIn->includePeople(v.c_str());
}
void KGGGenotypeExtractor::includePeople(const std::vector<std::string>& v) {
this->kggIn->includePeople(v);
}
void KGGGenotypeExtractor::includePeopleFromFile(const std::string& fn) {
this->kggIn->includePeopleFromFile(fn.c_str());
}
void KGGGenotypeExtractor::excludePeople(const std::string& v) {
this->kggIn->excludePeople(v.c_str());
}
void KGGGenotypeExtractor::excludePeople(const std::vector<std::string>& v) {
this->kggIn->excludePeople(v);
}
void KGGGenotypeExtractor::excludePeopleFromFile(const std::string& fn) {
this->kggIn->excludePeopleFromFile(fn.c_str());
}
void KGGGenotypeExtractor::excludePeople(const std::vector<std::string>& sample,
const std::vector<int>& index) {
for (size_t i = 0; i < index.size(); ++i) {
this->excludePeople(sample[i]);
}
}
void KGGGenotypeExtractor::excludeAllPeople() {
this->kggIn->excludeAllPeople();
}
void KGGGenotypeExtractor::enableAutoMerge() {
warnUnsupported("enableAutoMerge");
}
void KGGGenotypeExtractor::getPeopleName(std::vector<std::string>* p) {
*p = this->kggIn->getSampleName();
return;
}
void KGGGenotypeExtractor::getIncludedPeopleName(
std::vector<std::string>* p) const {
this->kggIn->getIncludedSampleName(p);
return;
}
void KGGGenotypeExtractor::parseAltAllele(const char* s) {
stringTokenize(s, ",", &altAllele);
}
double KGGGenotypeExtractor::getGenotype(int indvIdx, const bool useDosage,
const bool hemiRegion, const int sex) {
const int idx = this->kggIn->getEffectiveIndex(indvIdx);
double ret;
assert(!useDosage);
int a1, a2;
this->kggIn->getAllele(idx, &a1, &a2);
if (a1 < 0 || a2 < 0) {
return MISSING_GENOTYPE;
}
if (hemiRegion && sex == PLINK_MALE && a1 != a2) {