Commit 31df5d39 authored by zhanxw's avatar zhanxw

support kgg input

parent c6022595
......@@ -2,6 +2,7 @@
#include "base/Exception.h"
#include "base/IO.h"
#include "base/TypeConversion.h"
#ifndef UNUSED
#define UNUSED(x) (void)(x)
......@@ -209,6 +210,143 @@ bool KGGInputFile::readRecord() {
return true;
}
//////////////////////////////////////////////////
// Sample inclusion/exclusion
// Set the mask entry of every sample whose name equals @param s to @param b
// (in sampleMask, true means the sample is excluded), then rebuild the list
// of effective (non-excluded) sample indices.
void KGGInputFile::setPeopleMask(const std::string& s, bool b) {
  const size_t numIndv = indv.size();
  for (size_t idx = 0; idx < numIndv; ++idx) {
    if (indv[idx] != s) continue;
    sampleMask[idx] = b;
  }
  buildEffectiveIndex();
}
// Include the sample named @param s by clearing its exclusion flag.
void KGGInputFile::includePeople(const std::string& s) {
  const bool excluded = false;
  setPeopleMask(s, excluded);
}
void KGGInputFile::includePeople(const std::vector<std::string>& v) {
for (size_t i = 0; i != v.size(); ++i) {
includePeople(v[i].c_str());
}
}
void KGGInputFile::setPeopleMaskFromFile(const char* fn, bool b) {
if (!fn || strlen(fn) == 0) {
return;
}
LineReader lr(fn);
std::vector<std::string> fd;
while (lr.readLineBySep(&fd, "\t ")) {
for (unsigned int i = 0; i < fd.size(); i++) {
setPeopleMask(fd[i].c_str(), b);
}
}
buildEffectiveIndex();
}
// Include every sample whose name is listed in the file @param fn.
void KGGInputFile::includePeopleFromFile(const char* fn) {
  const bool excluded = false;
  setPeopleMaskFromFile(fn, excluded);
}
// Clear the exclusion flag of every sample and rebuild the effective index.
void KGGInputFile::includeAllPeople() {
  sampleMask.assign(sampleMask.size(), false);
  buildEffectiveIndex();
}
// Exclude the sample named @param s by setting its exclusion flag.
void KGGInputFile::excludePeople(const std::string& s) {
  const bool excluded = true;
  setPeopleMask(s, excluded);
}
void KGGInputFile::excludePeople(const std::vector<std::string>& v) {
for (size_t i = 0; i != v.size(); ++i) {
excludePeople(v[i]);
}
}
// Exclude every sample whose name is listed in the file @param fn.
void KGGInputFile::excludePeopleFromFile(const char* fn) {
  const bool excluded = true;
  setPeopleMaskFromFile(fn, excluded);
}
// Set the exclusion flag of every sample and rebuild the effective index.
void KGGInputFile::excludeAllPeople() {
  sampleMask.assign(sampleMask.size(), true);
  buildEffectiveIndex();
}
//////////////////////////////////////////////////
// Adjust range collections
#if 0
// Range-related setters. This whole section is compiled out by the
// surrounding #if 0; each stub only passes its own feature name to
// warnUnsupported() and ignores its arguments.
void KGGInputFile::enableAutoMerge() { warnUnsupported("enableAutoMerge"); }
void KGGInputFile::disableAutoMerge() { warnUnsupported("disableAutoMerge"); }
// void clearRange();
// @param fn is ignored by this stub
void KGGInputFile::setRangeFile(const char* fn) {
warnUnsupported("setRangeFile");
}
// @param chrom, @param begin and @param end are ignored by this stub
void KGGInputFile::setRange(const char* chrom, int begin, int end) {
warnUnsupported("setRange");
}
// @param rl is ignored by this stub
void KGGInputFile::setRange(const RangeList& rl) {
warnUnsupported("setRange");
}
// @param l is a string of range(s); ignored by this stub
void KGGInputFile::setRangeList(const std::string& l) {
warnUnsupported("setRangeList");
}
// This function is the entry point for all functions that add to or change
// the region list; here it is only a stub.
void KGGInputFile::setRangeList(const RangeList& rl) {
warnUnsupported("setRangeList");
}
void KGGInputFile::setRangeMode() { warnUnsupported("setRangeMode"); }
#endif
int KGGInputFile::setSiteFile(const std::string& fn) {
if (fn.empty()) return 0;
std::vector<std::string> fd;
LineReader lr(fn);
int pos;
std::string chromPos;
while (lr.readLineBySep(&fd, "\t ")) {
if (fd.empty()) continue;
if (fd[0].find(':') != std::string::npos) {
this->allowedSite.insert(fd[0]);
continue;
}
if (fd.size() >= 2 && str2int(fd[1], &pos) && pos > 0) {
chromPos = fd[0];
chromPos += ':';
chromPos += fd[1];
this->allowedSite.insert(chromPos);
continue;
}
}
return 0;
}
// @return the number of samples whose mask entry is false (i.e. samples that
// are not excluded)
int KGGInputFile::getNumEffectiveSample() const {
  size_t numIncluded = 0;
  for (std::vector<bool>::const_iterator it = sampleMask.begin();
       it != sampleMask.end(); ++it) {
    if (!*it) {
      ++numIncluded;
    }
  }
  return numIncluded;
}
// Replace the contents of @param p with the names of all samples that are
// not excluded, in sample order. A null @param p is ignored.
void KGGInputFile::getIncludedSampleName(std::vector<std::string>* p) const {
  if (!p) return;
  p->clear();
  const std::vector<std::string>& names = getSampleName();
  const size_t total = sampleMask.size();
  for (size_t k = 0; k < total; ++k) {
    if (!sampleMask[k]) {
      p->push_back(names[k]);
    }
  }
}
void KGGInputFile::buildEffectiveIndex() {
effectiveIndex.resize(0);
const size_t N = getNumSample();
for (size_t i = 0; i != N; ++i) {
if (sampleMask[i]) continue;
effectiveIndex.push_back(i);
}
}
int KGGInputFile::getEffectiveIndex(int idx) const {
return this->effectiveIndex[idx];
}
int KGGInputFile::getGenotype(int indvIdx) {
const int nAllele = alt[variantIdx].size() + 1;
......@@ -266,3 +404,7 @@ void KGGInputFile::buildPhasedTable(int allele) {
m[val].x[0] = -9;
m[val].x[1] = -9;
}
// Print a warning to stderr saying that the feature named @param tag is not
// supported.
// The message ends with a newline so that consecutive warnings do not run
// together, and a NULL @param tag is replaced by a placeholder because
// passing a null pointer to "%s" is undefined behaviour.
void KGGInputFile::warnUnsupported(const char* tag) {
  fprintf(stderr, "Please remove unsupported features related to %s\n",
          tag ? tag : "(unknown)");
}
......@@ -2,6 +2,7 @@
#define _KGGINPUTFILE_H_
#include <map>
#include <set>
#include <string>
#include <vector>
......@@ -17,6 +18,20 @@ class KGGInputFile {
// @return false if reached end
bool readRecord();
//////////////////////////////////////////////////
// Sample inclusion/exclusion
void includePeople(const std::string& s);
void includePeople(const std::vector<std::string>& v);
void includePeopleFromFile(const char* fn);
void includeAllPeople();
void excludePeople(const std::string& s);
void excludePeople(const std::vector<std::string>& v);
void excludePeopleFromFile(const char* fn);
void excludeAllPeople();
// No range related function
int setSiteFile(const std::string& fn);
int getGenotype(int indvIdx);
void getAllele(int indvIdx, int* a1, int* a2);
......@@ -40,9 +55,13 @@ class KGGInputFile {
}
int getNumIndv() const { return this->indv.size(); }
int getNumSample() const { return this->indv.size(); }
int getNumEffectiveSample() const;
int getNumMarker() const { return this->snp2Idx.size(); }
const std::vector<std::string>& getIndv() const { return this->indv; }
const std::vector<std::string>& getSampleName() const { return this->indv; }
void getIncludedSampleName(std::vector<std::string>* p) const;
int getEffectiveIndex(int idx) const;
const std::vector<std::string>& getIID() const { return this->indv; }
const std::vector<std::string>& getChrom() const { return this->chrom; }
const std::vector<std::string>& getMarkerName() const { return this->snp; }
......@@ -71,6 +90,14 @@ class KGGInputFile {
void buildUnphasedTable(int numAllele);
void buildPhasedTable(int numAllele);
// sample inclusion/exclusion related
void setPeopleMask(const std::string& s, bool b);
void setPeopleMaskFromFile(const char* fn, bool b);
void setRangeMode();
// range list related
void buildEffectiveIndex();
void warnUnsupported(const char* tag);
private:
typedef struct TwoChar { unsigned char x[2]; } TwoChar;
std::map<std::string, int> snp2Idx;
......@@ -89,6 +116,11 @@ class KGGInputFile {
bool phased;
std::map<int, std::map<char, TwoChar> > unphasedTable;
std::map<int, std::map<char, TwoChar> > phasedTable;
std::vector<bool> sampleMask; // true means exclusion
std::vector<int> effectiveIndex;
// allow chromosomal sites
std::set<std::string> allowedSite;
};
#endif /* _KGGINPUTFILE_H_ */
This diff is collapsed.
#ifndef KGGGENOTYPEEXTRACTOR_H
#define KGGGENOTYPEEXTRACTOR_H
#include <string>
#include <vector>
#include "src/GenotypeExtractor.h"
class KGGInputFile;
/**
* Extract genotype from file @param fileName at the marker @param marker,
* and store sample names in @param rowLabel and genotypes in @param genotype
* (dimension is numSample x 1)
* @return 0 if succeed
*/
// int loadMarkerFromKGG(const std::string& fileName, const std::string&
// marker,
// std::vector<std::string>* rowLabel, Matrix* genotype);
/**
 * GenotypeExtractor subclass constructed from a file name; it keeps a pointer
 * to a KGGInputFile (kggIn) together with alt-allele parsing state.
 */
class KGGGenotypeExtractor : public GenotypeExtractor {
public:
// @param fn: name of the input file
explicit KGGGenotypeExtractor(const std::string& fn);
virtual ~KGGGenotypeExtractor();
private:
// Copy constructor and copy assignment are declared private, so instances
// cannot be copied from outside the class.
KGGGenotypeExtractor(const KGGGenotypeExtractor&);
KGGGenotypeExtractor& operator=(const KGGGenotypeExtractor&);
public:
/**
 * @param g, store people by marker matrix
 * @return 0 for success
 */
int extractMultipleGenotype(Matrix* g);
/**
 * @return 0 for success
 * @return -2 for reach end.
 * @param g: people by 1 matrix, where column name is like "chr:pos"
 * @param b: extract information, e.g. "1\t100\tA\tC"
 */
int extractSingleGenotype(Matrix* g, Result* b);
/* Site filters */
bool setSiteFreqMin(const double f);
bool setSiteFreqMax(const double f);
void setSiteDepthMin(int d);
void setSiteDepthMax(int d);
// @return true if GD is valid
// if GD is missing, we will take GD = 0
// NOTE(review): the GD/GQ setters below are declared void, so the "@return"
// remark above does not describe them -- confirm what it refers to.
void setGDmin(int m);
void setGDmax(int m);
void setGQmin(int m);
void setGQmax(int m);
void setSiteFile(const std::string& fn);
void setSiteQualMin(int q);
void setSiteMACMin(int n);
int setAnnoType(const std::string& s);
/* Range selection */
void setRange(const RangeList& l);
void setRangeList(const std::string& l);
void setRangeFile(const std::string& fn);
/* Sample inclusion/exclusion */
void includePeople(const std::string& v);
void includePeople(const std::vector<std::string>& v);
void includePeopleFromFile(const std::string& fn);
void excludePeople(const std::string& v);
void excludePeopleFromFile(const std::string& fn);
void excludePeople(const std::vector<std::string>& sample);
void excludePeople(const std::vector<std::string>& sample,
const std::vector<int>& index);
void excludeAllPeople();
void enableAutoMerge();
void getPeopleName(std::vector<std::string>* p);
void getIncludedPeopleName(std::vector<std::string>* p) const;
// Dosage is not supported here: @param tag is ignored and the call only
// reports "Dosage" through warnUnsupported().
void setDosageTag(const std::string& tag) { warnUnsupported("Dosage"); }
/**
 * @return weight, its length equals to # of markers
 */
// std::vector<double>& getWeight() { return this->weight; };
// void setDosageTag(const std::string& tag);
// void unsetDosageTag() ;
// bool isDosage() const ;
// void setParRegion(ParRegion* p) { this->parRegion = p; }
// // Sex (1=male; 2=female; other=unknown)
// void setSex(const std::vector<int>* sex) { this->sex = sex; }
// coding male chromX as 0/2 instead of 0/1
// similarly, for dosage, just multiply 2.0 from original dosage
// void enableClaytonCoding() { this->claytonCoding = true; }
// void disableClaytonCoding() { this->claytonCoding = false; }
// check how many alt alleles at this site
void parseAltAllele(const char* s);
// extract genotype for @param indv
// NOTE(review): declared inline but not defined in this header -- confirm
// that it is only called from the translation unit that defines it.
inline double getGenotype(int indvIdx, const bool useDosage,
const bool hemiRegion, const int sex);
double getGenotypeForAltAllele(int indvIdx, const bool useDosage,
const bool hemiRegion, const int sex,
const int alt);
// report that the feature named @param tag is not supported
void warnUnsupported(const char* tag);
// assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to
// void assign(const std::vector<double>& from, int nrow, int ncol, Matrix*
// to);
// void enableMultiAllelicMode() { this->multiAllelicMode = true; }
private:
KGGInputFile* kggIn; // the KGG input file used by this extractor
std::vector<std::string> altAllele; // store alt alleles
int altAlleleToParse; // number of alleles to parse
int currentVariant; // record which variant to process
}; // class KGGGenotypeExtractor
#endif /* KGGGENOTYPEEXTRACTOR_H */
......@@ -25,6 +25,7 @@
#include "src/DataConsolidator.h"
#include "src/DataLoader.h"
#include "src/GenotypeExtractor.h"
#include "src/KGGGenotypeExtractor.h"
#include "src/ModelFitter.h"
#include "src/ModelManager.h"
#include "src/Result.h"
......@@ -32,7 +33,7 @@
Logger* logger = NULL;
const char* VERSION = "20170818";
const char* VERSION = "20170905";
void banner(FILE* fp) {
const char* string =
......@@ -248,6 +249,7 @@ BEGIN_PARAMETER_LIST();
ADD_PARAMETER_GROUP("Basic Input/Output");
ADD_STRING_PARAMETER(inVcf, "--inVcf", "Input VCF File");
ADD_STRING_PARAMETER(inBgen, "--inBgen", "Input BGEN File");
ADD_STRING_PARAMETER(inKgg, "--inKgg", "Input KGG File");
ADD_STRING_PARAMETER(outPrefix, "--out", "Output prefix");
ADD_BOOL_PARAMETER(outputRaw, "--outputRaw",
"Output genotypes, phenotype, covariates(if any); and "
......@@ -440,13 +442,12 @@ int main(int argc, char** argv) {
if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest";
if (FLAG_inVcf.empty() && FLAG_inBgen.empty()) {
fprintf(stderr, "Please provide one input file using: --inVcf or --inBgen");
exit(1);
}
if (!FLAG_inVcf.empty() && !FLAG_inBgen.empty()) {
if ((FLAG_inVcf.empty() ? 0 : 1) + (FLAG_inBgen.empty() ? 0 : 1) +
(FLAG_inKgg.empty() ? 0 : 1) !=
1) {
fprintf(stderr,
"Please provide one kind of input file using: --inVcf or --inBgen");
"Please provide one type of input file using: --inVcf, --inBgen or "
"--inKgg");
exit(1);
}
......@@ -506,6 +507,8 @@ int main(int argc, char** argv) {
ge = new VCFGenotypeExtractor(FLAG_inVcf);
} else if (!FLAG_inBgen.empty()) {
ge = new BGenGenotypeExtractor(FLAG_inBgen);
} else if (!FLAG_inKgg.empty()) {
ge = new KGGGenotypeExtractor(FLAG_inKgg);
} else {
assert(false);
}
......
......@@ -27,6 +27,7 @@ BASE = Main \
GenotypeExtractor \
VCFGenotypeExtractor \
BGenGenotypeExtractor \
KGGGenotypeExtractor \
DataLoader \
GenotypeCounter \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment