Commit 2fb11df8 authored by zhanxw's avatar zhanxw

refactor codes

parent 04d84469
......@@ -43,7 +43,6 @@ ZSTD_LIB = ../third/zstd/lib/libzstd.a
$(ZSTD_INC) $(ZSTD_LIB):
(cd ../third; $(MAKE) zstd)
THIRD_INC = $(BCF_INC) $(TABIX_INC) $(PCRE_INC) $(BZIP2_INC) $(ZLIB_INC) $(GSL_INC) $(ZSTD_INC) $(SQLITE_INC)
THIRD_INC_FLAGS = $(addprefix -I,$(THIRD_INC))
......
......@@ -4,9 +4,9 @@
#include <vector>
#include "CommonFunction.h"
#include "GenotypeExtractor.h"
#include "Indexer.h"
#include "ModelUtil.h" // copy()
#include "VCFGenotypeExtractor.h"
#include "base/IO.h"
#include "base/Logger.h"
......@@ -313,7 +313,7 @@ int DataLoader::loadMarkerAsCovariate(const std::string& inVcf,
this->FLAG_condition = marker;
Matrix geno;
GenotypeExtractor ge(FLAG_inVcf);
VCFGenotypeExtractor ge(FLAG_inVcf);
ge.excludeAllPeople();
ge.includePeople(phenotype.getRowName());
ge.setRangeList(marker);
......@@ -616,12 +616,12 @@ int DataLoader::inverseNormalizePhenotype() {
return 0;
}
logger->info("Now applying inverse normalize transformation");
logger->info("Now applying inverse normalization transformation");
std::vector<double> v;
phenotype.extractCol(0, &v);
inverseNormalizeLikeMerlin(&v);
phenotype.setCol(0, v);
logger->info("DONE: inverse normal transformation finished");
logger->info("DONE: inverse normalization transformation finished");
return 0;
}
......
This diff is collapsed.
......@@ -15,15 +15,6 @@ class VCFExtractor;
class VCFIndividual;
class GenotypeCounter;
/**
* Extract genotype from file @param fileName at the marker @param marker,
* and store sample names in @param rowLabel and genotypes in @param genotype
* (dimension is numSample x 1)
* @return 0 if succeed
*/
int loadMarkerFromVCF(const std::string& fileName, const std::string& marker,
std::vector<std::string>* rowLabel, Matrix* genotype);
class GenotypeExtractor {
public:
explicit GenotypeExtractor(const std::string& fn);
......@@ -38,49 +29,49 @@ class GenotypeExtractor {
* @param g, store people by marker matrix
* @return 0 for success
*/
int extractMultipleGenotype(Matrix* g);
virtual int extractMultipleGenotype(Matrix* g) = 0;
/**
* @return 0 for success
* @return -2 for reach end.
* @param g: people by 1 matrix, where column name is like "chr:pos"
* @param b: extract information, e.g. "1\t100\tA\tC"
*/
int extractSingleGenotype(Matrix* g, Result* b);
virtual int extractSingleGenotype(Matrix* g, Result* b) = 0;
/* Site filters */
bool setSiteFreqMin(const double f);
bool setSiteFreqMax(const double f);
void setSiteDepthMin(int d);
void setSiteDepthMax(int d);
virtual bool setSiteFreqMin(const double f) = 0;
virtual bool setSiteFreqMax(const double f) = 0;
virtual void setSiteDepthMin(int d) = 0;
virtual void setSiteDepthMax(int d) = 0;
// @return true if GD is valid
// if GD is missing, we will take GD = 0
bool checkGD(VCFIndividual& indv, int gdIdx);
bool checkGQ(VCFIndividual& indv, int gqIdx);
void setGDmin(int m);
void setGDmax(int m);
void setGQmin(int m);
void setGQmax(int m);
void setSiteFile(const std::string& fn);
void setSiteQualMin(int q);
void setSiteMACMin(int n);
int setAnnoType(const std::string& s);
void setRange(const RangeList& l);
void setRangeList(const std::string& l);
void setRangeFile(const std::string& fn);
void includePeople(const std::string& v);
void includePeople(const std::vector<std::string>& v);
void includePeopleFromFile(const std::string& fn);
void excludePeople(const std::string& v);
void excludePeopleFromFile(const std::string& fn);
void excludePeople(const std::vector<std::string>& sample);
void excludePeople(const std::vector<std::string>& sample,
const std::vector<int>& index);
void excludeAllPeople();
void enableAutoMerge();
void getPeopleName(std::vector<std::string>* p);
void getIncludedPeopleName(std::vector<std::string>* p) const;
virtual bool checkGD(VCFIndividual& indv, int gdIdx) = 0;
virtual bool checkGQ(VCFIndividual& indv, int gqIdx) = 0;
virtual void setGDmin(int m) = 0;
virtual void setGDmax(int m) = 0;
virtual void setGQmin(int m) = 0;
virtual void setGQmax(int m) = 0;
virtual void setSiteFile(const std::string& fn) = 0;
virtual void setSiteQualMin(int q) = 0;
virtual void setSiteMACMin(int n) = 0;
virtual int setAnnoType(const std::string& s) = 0;
virtual void setRange(const RangeList& l) = 0;
virtual void setRangeList(const std::string& l) = 0;
virtual void setRangeFile(const std::string& fn) = 0;
virtual void includePeople(const std::string& v) = 0;
virtual void includePeople(const std::vector<std::string>& v) = 0;
virtual void includePeopleFromFile(const std::string& fn) = 0;
virtual void excludePeople(const std::string& v) = 0;
virtual void excludePeopleFromFile(const std::string& fn) = 0;
virtual void excludePeople(const std::vector<std::string>& sample) = 0;
virtual void excludePeople(const std::vector<std::string>& sample,
const std::vector<int>& index) = 0;
virtual void excludeAllPeople() = 0;
virtual void enableAutoMerge() = 0;
virtual void getPeopleName(std::vector<std::string>* p) = 0;
virtual void getIncludedPeopleName(std::vector<std::string>* p) const = 0;
const std::vector<GenotypeCounter>& getGenotypeCounter() const {
return this->counter;
......@@ -98,11 +89,12 @@ class GenotypeExtractor {
void setParRegion(ParRegion* p) { this->parRegion = p; }
// Sex (1=male; 2=female; other=unknown)
void setSex(const std::vector<int>* sex) { this->sex = sex; }
// coding male chromX as 0/2 instead of 0/1
// similarly, for dosage, just multiply 2.0 from original dosage
// void enableClaytonCoding() { this->claytonCoding = true; }
// void disableClaytonCoding() { this->claytonCoding = false; }
// coding male chromX as 0/2 instead of 0/1
// similarly, for dosage, just multiply 2.0 from original dosage
// void enableClaytonCoding() { this->claytonCoding = true; }
// void disableClaytonCoding() { this->claytonCoding = false; }
#if 0
// check how many alt alleles at this site
void parseAltAllele(const char* s);
// extract genotype for @param indv
......@@ -114,10 +106,11 @@ class GenotypeExtractor {
const bool hemiRegion, const int sex,
const int genoIdx, const int GDidx,
const int GQidx, const int alt);
// assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
#endif
void enableMultiAllelicMode() { this->multiAllelicMode = true; }
public:
......@@ -126,8 +119,8 @@ class GenotypeExtractor {
const static int FILE_END = -2;
const static int FAIL_FILTER = -3;
private:
VCFExtractor* vin;
protected:
// VCFExtractor* vin;
double freqMin;
double freqMax;
int GDmin;
......@@ -153,9 +146,11 @@ class GenotypeExtractor {
std::vector<std::string> variantName; // store extracted variant names
int sampleSize; // number of extracted vcf samples
// for multiallelic
bool multiAllelicMode; // default is false
bool multiAllelicMode; // default is false
#if 0
std::vector<std::string> altAllele; // store alt alleles
int altAlleleToParse; // number of alleles to parse
}; // class GenotypeExtractor
#endif
}; // class GenotypeExtractor
#endif /* GENOTYPEEXTRACTOR_H */
......@@ -27,6 +27,8 @@
#include "src/ModelFitter.h"
#include "src/ModelManager.h"
#include "src/Result.h"
#include "src/VCFGenotypeExtractor.h"
// #include "src/BGenGenotypeExtractor.h"
Logger* logger = NULL;
......@@ -491,45 +493,50 @@ int main(int argc, char** argv) {
time_t startTime = time(0);
logger->info("Analysis started at: %s", currentTime().c_str());
GenotypeExtractor ge(FLAG_inVcf);
GenotypeExtractor* ge = NULL;
if (!FLAG_inVcf.empty()) {
ge = new VCFGenotypeExtractor(FLAG_inVcf);
} else {
assert(false);
}
// set range filters here
ge.setRangeList(FLAG_rangeList.c_str());
ge.setRangeFile(FLAG_rangeFile.c_str());
ge->setRangeList(FLAG_rangeList.c_str());
ge->setRangeFile(FLAG_rangeFile.c_str());
// set people filters here
if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) {
ge.excludeAllPeople();
ge.includePeople(FLAG_peopleIncludeID.c_str());
ge.includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
ge->excludeAllPeople();
ge->includePeople(FLAG_peopleIncludeID.c_str());
ge->includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
}
ge.excludePeople(FLAG_peopleExcludeID.c_str());
ge.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());
ge->excludePeople(FLAG_peopleExcludeID.c_str());
ge->excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());
if (!FLAG_siteFile.empty()) {
ge.setSiteFile(FLAG_siteFile);
ge->setSiteFile(FLAG_siteFile);
logger->info("Restrict analysis based on specified site file [ %s ]",
FLAG_siteFile.c_str());
}
if (FLAG_siteDepthMin > 0) {
ge.setSiteDepthMin(FLAG_siteDepthMin);
ge->setSiteDepthMin(FLAG_siteDepthMin);
logger->info("Set site depth minimum to %d", FLAG_siteDepthMin);
}
if (FLAG_siteDepthMax > 0) {
ge.setSiteDepthMax(FLAG_siteDepthMax);
ge->setSiteDepthMax(FLAG_siteDepthMax);
logger->info("Set site depth maximum to %d", FLAG_siteDepthMax);
}
if (FLAG_siteMACMin > 0) {
ge.setSiteMACMin(FLAG_siteMACMin);
ge->setSiteMACMin(FLAG_siteMACMin);
logger->info("Set site minimum MAC to %d", FLAG_siteDepthMin);
}
if (FLAG_annoType != "") {
ge.setAnnoType(FLAG_annoType.c_str());
ge->setAnnoType(FLAG_annoType.c_str());
logger->info("Set annotype type filter to %s", FLAG_annoType.c_str());
}
std::vector<std::string> vcfSampleNames;
ge.getPeopleName(&vcfSampleNames);
ge->getPeopleName(&vcfSampleNames);
logger->info("Loaded [ %zu ] samples from VCF files", vcfSampleNames.size());
DataLoader dataLoader;
......@@ -579,7 +586,7 @@ int main(int argc, char** argv) {
// rearrange phenotypes
// drop samples from phenotype or vcf
matchPhenotypeAndVCF("missing phenotype", &dataLoader, &ge);
matchPhenotypeAndVCF("missing phenotype", &dataLoader, ge);
// // phenotype names (vcf sample names) arranged in the same order as in
// VCF
......@@ -591,7 +598,7 @@ int main(int argc, char** argv) {
// &phenotypeInOrder, FLAG_imputePheno);
// if (vcfSampleToDrop.size()) {
// // exclude this sample from parsing VCF
// ge.excludePeople(vcfSampleToDrop);
// ge->excludePeople(vcfSampleToDrop);
// // output dropped samples
// for (size_t i = 0; i < vcfSampleToDrop.size(); ++i) {
// if (i == 0)
......@@ -625,7 +632,7 @@ int main(int argc, char** argv) {
// // phenotypeNameInOrder
// }
dataLoader.loadCovariate(FLAG_cov, FLAG_covName);
matchCovariateAndVCF("missing covariate", &dataLoader, &ge);
matchCovariateAndVCF("missing covariate", &dataLoader, ge);
// // load covariate
// Matrix covariate;
......@@ -676,20 +683,20 @@ int main(int argc, char** argv) {
// for (std::set<std::string>::const_iterator iter =
// sampleToDropInCovariate.begin();
// iter != sampleToDropInCovariate.end(); ++iter) {
// ge.excludePeople(iter->c_str());
// ge->excludePeople(iter->c_str());
// }
// }
} else {
dataLoader.loadMultiplePhenotype(FLAG_multiplePheno, FLAG_pheno, FLAG_cov);
matchPhenotypeAndVCF("missing phenotype", &dataLoader, &ge);
matchCovariateAndVCF("missing covariate", &dataLoader, &ge);
matchPhenotypeAndVCF("missing phenotype", &dataLoader, ge);
matchCovariateAndVCF("missing covariate", &dataLoader, ge);
}
dataLoader.loadSex();
if (FLAG_sex) {
dataLoader.useSexAsCovariate();
matchCovariateAndVCF("missing sex", &dataLoader, &ge);
matchCovariateAndVCF("missing sex", &dataLoader, ge);
}
// // load sex
// std::vector<int> sex;
......@@ -711,7 +718,7 @@ int main(int argc, char** argv) {
if (!FLAG_condition.empty()) {
dataLoader.loadMarkerAsCovariate(FLAG_inVcf, FLAG_condition);
matchCovariateAndVCF("missing in conditioned marker(s)", &dataLoader, &ge);
matchCovariateAndVCF("missing in conditioned marker(s)", &dataLoader, ge);
}
// // load conditional markers
// if (!FLAG_condition.empty()) {
......@@ -833,7 +840,7 @@ int main(int argc, char** argv) {
// logger->info("Now applying inverse normalize transformation.");
// inverseNormalizeLikeMerlin(&phenotypeInOrder);
// g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal);
// logger->info("DONE: inverse normal transformation finished.");
// logger->info("DONE: inverse normalization transformation finished.");
// }
// }
......@@ -865,7 +872,7 @@ int main(int argc, char** argv) {
if (dataLoader.isBinaryPhenotype()) {
modelManager.setBinaryOutcome();
matchPhenotypeAndVCF("missing phenotype (not case/control)", &dataLoader,
&ge);
ge);
} else {
modelManager.setQuantitativeOutcome();
}
......@@ -943,7 +950,7 @@ int main(int argc, char** argv) {
DataConsolidator dc;
dc.setSex(&dataLoader.getSex());
dc.setFormula(&dataLoader.getFormula());
dc.setGenotypeCounter(ge.getGenotypeCounter());
dc.setGenotypeCounter(ge->getGenotypeCounter());
// load kinshp if needed by family models
if (modelManager.hasFamilyModel() ||
......@@ -1000,44 +1007,44 @@ int main(int argc, char** argv) {
// genotype will be extracted and stored
Matrix& genotype = dc.getOriginalGenotype();
if (FLAG_freqUpper > 0) {
ge.setSiteFreqMax(FLAG_freqUpper);
ge->setSiteFreqMax(FLAG_freqUpper);
logger->info("Set upper minor allele frequency limit to %g",
FLAG_freqUpper);
}
if (FLAG_freqLower > 0) {
ge.setSiteFreqMin(FLAG_freqLower);
ge->setSiteFreqMin(FLAG_freqLower);
logger->info("Set lower minor allele frequency limit to %g",
FLAG_freqLower);
}
// handle sex chromosome
ge.setParRegion(&parRegion);
ge.setSex(&dataLoader.getSex());
ge->setParRegion(&parRegion);
ge->setSex(&dataLoader.getSex());
// use dosage instead GT
if (!FLAG_dosageTag.empty()) {
ge.setDosageTag(FLAG_dosageTag);
ge->setDosageTag(FLAG_dosageTag);
logger->info("Use dosage genotype from VCF flag %s.",
FLAG_dosageTag.c_str());
}
if (FLAG_multiAllele) {
ge.enableMultiAllelicMode();
ge->enableMultiAllelicMode();
logger->info("Enable analysis using multiple allelic models");
}
// genotype QC options
if (FLAG_indvDepthMin > 0) {
ge.setGDmin(FLAG_indvDepthMin);
ge->setGDmin(FLAG_indvDepthMin);
logger->info("Minimum GD set to %d (or marked as missing genotype).",
FLAG_indvDepthMin);
}
if (FLAG_indvDepthMax > 0) {
ge.setGDmax(FLAG_indvDepthMax);
ge->setGDmax(FLAG_indvDepthMax);
logger->info("Maximum GD set to %d (or marked as missing genotype).",
FLAG_indvDepthMax);
}
if (FLAG_indvQualMin > 0) {
ge.setGQmin(FLAG_indvQualMin);
ge->setGQmin(FLAG_indvQualMin);
logger->info("Minimum GQ set to %d (or marked as missing genotype).",
FLAG_indvQualMin);
}
......@@ -1083,7 +1090,7 @@ int main(int argc, char** argv) {
int variantProcessed = 0;
while (true) {
buf.clearValue();
int ret = ge.extractSingleGenotype(&genotype, &buf);
int ret = ge->extractSingleGenotype(&genotype, &buf);
if (ret == GenotypeExtractor::FILE_END) { // reach file end
break;
......@@ -1137,11 +1144,11 @@ int main(int argc, char** argv) {
int variantProcessed = 0;
for (size_t i = 0; i < geneRange.size(); ++i) {
geneRange.at(i, &geneName, &rangeList);
ge.setRange(rangeList);
ge->setRange(rangeList);
while (true) {
buf.clearValue();
int ret = ge.extractSingleGenotype(&genotype, &buf);
int ret = ge->extractSingleGenotype(&genotype, &buf);
if (ret == GenotypeExtractor::FILE_END) { // reach end of this region
break;
}
......@@ -1190,12 +1197,12 @@ int main(int argc, char** argv) {
std::string geneName;
RangeList rangeList;
int variantProcessed = 0;
ge.enableAutoMerge();
ge->enableAutoMerge();
for (size_t i = 0; i < geneRange.size(); ++i) {
geneRange.at(i, &geneName, &rangeList);
ge.setRange(rangeList);
ge->setRange(rangeList);
buf.clearValue();
int ret = ge.extractMultipleGenotype(&genotype);
int ret = ge->extractMultipleGenotype(&genotype);
if (ret != GenotypeExtractor::SUCCEED) {
logger->error("Extract genotype failed for gene %s!", geneName.c_str());
......
......@@ -25,6 +25,7 @@ BASE = Main \
ModelParser \
Model \
GenotypeExtractor \
VCFGenotypeExtractor \
DataLoader \
GenotypeCounter \
......
......@@ -544,7 +544,7 @@ class MetaCovUnrelatedQtl : public MetaCovBase {
REF_TO_EIGEN(x1, x1E);
REF_TO_EIGEN(x2, x2E);
*covXX = x1E.col(0).dot(x2E.col(0));
*covXX = x1E.col(0).dot(x2E.col(0)) / this->sigma2;
return 0;
}
int calculateXZ(FloatMatrixRef& x, FloatMatrixRef& covXZ) {
......@@ -684,7 +684,7 @@ class MetaCovFamBinary : public MetaCovBase {
const EigenMatrix* U;
const EigenMatrix* S;
FastLMM metaCov;
};
}; // end MetaCovFamBinary
class MetaCovUnrelatedBinary : public MetaCovBase {
int FitNullModel(Matrix& genotype, DataConsolidator* dc) {
......@@ -853,7 +853,7 @@ int MetaCovTest::fitWithGivenGenotype(Matrix& genotype, DataConsolidator* dc) {
if (nSample < 0) { // uninitialized
// calculate variance of y
nSample = genotype.rows;
nCovariate = dc->getCovariate().cols;
nCovariate = dc->getCovariate().cols + 1; // intercept
genoPool.setChunkSize(nSample);
genoCovPool.setChunkSize(nCovariate);
}
......@@ -909,9 +909,9 @@ int MetaCovTest::fitWithGivenGenotype(Matrix& genotype, DataConsolidator* dc) {
if (!useBolt) {
// model->transformGenotype(&loci.geno, dc);
// model->calculateXZ(loci.geno, &loci.covXZ);
const int numCovariate = dc->getCovariate().cols;
// const int numCovariate = dc->getCovariate().cols;
FloatMatrixRef x(genoPool.chunk(loci.geno), nSample, 1);
FloatMatrixRef xz(genoCovPool.chunk(loci.geno), 1, numCovariate);
FloatMatrixRef xz(genoCovPool.chunk(loci.geno), 1, nCovariate);
model->transformGenotype(x, dc);
if (nCovariate) {
model->calculateXZ(x, xz);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment