Commit e9ef491c authored by zhanxw's avatar zhanxw

add KGGInputFile

parent 75c090f1
......@@ -221,6 +221,21 @@ inline int stringTokenize(const std::string& str, const char delim,
return (stringTokenize(str, d, result));
};
inline std::vector<std::string> stringTokenize(const std::string& str,
const std::string& delim) {
std::vector<std::string> result;
stringTokenize(str, delim, &result);
return result;
};
inline std::vector<std::string> stringTokenize(const std::string& str,
const char delim) {
std::vector<std::string> result;
std::string d(1, delim);
stringTokenize(str, d, &result);
return result;
};
// pretty much like stringTokenize, but @param result will not contain empty
// string
inline int stringNaturalTokenize(const std::string& str,
......
#include "KGGInputFile.h"
#include "base/Exception.h"
#include "base/IO.h"
#ifndef UNUSED
#define UNUSED(x) (void)(x)
#endif
#define ROUND_UP_TO_4X(x) (((x) + 3) & ~0x03)
KGGInputFile::KGGInputFile(const std::string& fnPrefix) { init(fnPrefix, ""); }
KGGInputFile::KGGInputFile(const std::string& fnPrefix,
const std::string& fnSuffix) {
if (!fnSuffix.empty() && fnSuffix[0] != '.') {
std::string s = ".";
s += fnSuffix;
init(fnPrefix, s);
} else {
init(fnPrefix, fnSuffix);
}
}
int KGGInputFile::init(const std::string& fnPrefix,
const std::string& fnSuffix) {
this->prefix = fnPrefix;
this->fpKed = new BufferedReader((prefix + ".ked" + fnSuffix).c_str(), 1024);
this->fpKim = new BufferedReader((prefix + ".kim" + fnSuffix).c_str(), 1024);
this->fpKam = new BufferedReader((prefix + ".kam" + fnSuffix).c_str(), 1024);
if (!this->fpKed || !this->fpKim || !this->fpKam) {
REPORT("Cannot open binary KGG file!");
abort();
}
// read KED header
int c;
// magic number
unsigned char magic[] = {0x9e, 0x82};
c = this->fpKed->getc();
if ((unsigned char)c != magic[0]) {
fprintf(stderr, "Magic number of binary KGG file does not match!\n");
abort();
}
c = this->fpKed->getc();
if ((unsigned char)c != magic[1]) {
fprintf(stderr, "Magic number of binary KGG file does not match!\n");
abort();
}
// snp major mode
const int PHASED_MODE = 0x01; // 0b00000001;
const int UNPHASED_MODE = 0x00;
c = this->fpKed->getc();
if (c == UNPHASED_MODE) {
this->phased = false;
} else if (c == PHASED_MODE) {
this->phased = true;
} else {
fprintf(stderr, "Unrecognized major mode in binary KGG file.\n");
exit(1);
}
// read bim
LineReader* lr = new LineReader((this->prefix + ".kim" + fnSuffix).c_str());
std::string chrPos;
std::vector<std::string> fd;
while (lr->readLineBySep(&fd, " \t")) {
if (fd.size() != 7) {
fprintf(stderr, "Wrong format in bim file.\n");
exit(1);
}
// when rsid == ".", use "chr:pos" as dict key
if (fd[1] == ".") {
chrPos = fd[0];
chrPos += ":";
chrPos += fd[3];
} else {
chrPos = fd[1];
}
if (snp2Idx.find(chrPos) == snp2Idx.end()) {
chrom.push_back(fd[0]);
snp.push_back(fd[1]);
snp2Idx[chrPos] = 0;
const int val = snp2Idx.size() - 1;
snp2Idx[chrPos] = val;
mapDist.push_back(atof(fd[2].c_str()));
pos.push_back(atoi(fd[3].c_str()));
ref.push_back(fd[4]);
alt.push_back(stringTokenize(fd[5], ","));
} else {
fprintf(stderr,
"Error found: duplicated marker name or chromosomal position [ "
"%s ]!\n",
fd[1].c_str());
exit(1);
}
}
delete lr;
// read kam
lr = new LineReader((this->prefix + ".kam" + fnSuffix).c_str());
while (lr->readLineBySep(&fd, " \t")) {
if (fd.size() != 6) {
fprintf(stderr, "Wrong format in kam file.\n");
exit(1);
}
// will skip loading kam, fatherid, motherid
if (pid2Idx.find(fd[1]) == pid2Idx.end()) {
pid2Idx[fd[1]] = -1;
const int val = pid2Idx.size() - 1;
pid2Idx[fd[1]] = val;
indv.push_back(fd[1]);
sex.push_back(atoi(fd[4].c_str()));
pheno.push_back(atof(fd[5].c_str()));
} else {
fprintf(stderr, "duplicated person id [ %s ], ignore!\n", fd[1].c_str());
exit(1);
}
}
delete lr;
data.resize(getNumSample());
variantIdx = 0;
fprintf(stderr,
"Finished loading %s.{kam,kim,ked}, %zu markers, %zu samples\n",
fnPrefix.c_str(), snp2Idx.size(), indv.size());
return 0;
}
KGGInputFile::~KGGInputFile() {
delete (this->fpKed);
delete (this->fpKim);
delete (this->fpKam);
}
bool KGGInputFile::readRecord() {
// read a record
if (variantIdx >= getNumMarker()) {
return false;
}
if (this->fpKed->isEof()) {
return false;
}
const int numAllele = alt[variantIdx].size() + 1;
const int sampleSize = getNumSample();
int bytes;
if (phased) {
switch (numAllele) {
case 2:
bits = 3;
break;
case 3:
bits = 4;
break;
case 4:
bits = 5;
break;
default:
bits = ceil(log(numAllele * numAllele + 1) / log(2));
break;
}
} else { // unphased, genotype
switch (numAllele) {
case 2:
bits = 2;
break;
case 3:
bits = 3;
break;
case 4:
bits = 4;
break;
case 5:
bits = ceil(log(((numAllele + 1) * numAllele / 2) + 1) / log(2));
break;
}
}
bytes = (bits * sampleSize + 8) / 8;
if (phased) {
buildPhasedTable(numAllele);
} else {
buildUnphasedTable(numAllele);
}
buffer.resize(bytes);
this->fpKed->read(buffer.data(), bytes);
for (int i = 0; i < sampleSize; ++i) {
data[i] = 0;
}
int block = 0;
unsigned char mask = 1 << 7;
for (int j = 0; j < bits; ++j) {
for (int i = 0; i < sampleSize; ++i) {
data[i] <<= 1;
data[i] |= (buffer[block] & mask) ? 1 : 0;
mask >>= 1;
if (mask == 0) {
mask = 1 << 7;
++block;
assert(block < bytes);
}
}
}
return true;
}
int KGGInputFile::getGenotype(int indvIdx) {
const int nAllele = alt[variantIdx].size() + 1;
if (phased) {
return phasedTable[nAllele][data[indvIdx]].x[0] +
phasedTable[nAllele][data[indvIdx]].x[1];
} else {
return unphasedTable[nAllele][data[indvIdx]].x[0] +
unphasedTable[nAllele][data[indvIdx]].x[1];
}
}
void KGGInputFile::getAllele(int indvIdx, int* a1, int* a2) {
const int nAllele = alt[variantIdx].size() + 1;
if (phased) {
*a1 = phasedTable[nAllele][data[indvIdx]].x[0];
*a2 = phasedTable[nAllele][data[indvIdx]].x[1];
} else {
*a1 = unphasedTable[nAllele][data[indvIdx]].x[0];
*a2 = unphasedTable[nAllele][data[indvIdx]].x[1];
}
}
void KGGInputFile::buildUnphasedTable(int allele) {
if (unphasedTable.count(allele)) {
return;
}
std::map<char, TwoChar>& m = unphasedTable[allele];
int val = 0;
for (int i = 0; i < allele; ++i) {
for (int j = 0; j <= i; ++j) {
m[val].x[0] = j;
m[val].x[1] = i;
val++;
}
}
m[val].x[0] = -9;
m[val].x[1] = -9;
}
void KGGInputFile::buildPhasedTable(int allele) {
if (phasedTable.count(allele)) {
return;
}
std::map<char, TwoChar>& m = phasedTable[allele];
int val = 0;
for (int i = 0; i < allele; ++i) {
for (int j = 0; j < allele; ++j) {
m[val].x[0] = j;
m[val].x[1] = i;
val++;
}
}
m[val].x[0] = -9;
m[val].x[1] = -9;
}
#ifndef _KGGINPUTFILE_H_
#define _KGGINPUTFILE_H_
#include <map>
#include <string>
#include <vector>
class BufferedReader;
class KGGInputFile {
public:
KGGInputFile(const std::string& fnPrefix);
KGGInputFile(const std::string& fnPrefix, const std::string& fnSuffix);
~KGGInputFile();
int init(const std::string& fnPrefix, const std::string& fnSuffix);
// @return false if reached end
bool readRecord();
int getGenotype(int indvIdx);
void getAllele(int indvIdx, int* a1, int* a2);
// @param m is the maker name.
// can be used to check whether a marker exists
int getMarkerIdx(const std::string& m) {
if (this->snp2Idx.find(m) == this->snp2Idx.end()) {
return -1;
} else {
return (this->snp2Idx[m]);
}
}
// @param p is the sample name
// can be used to check whether a sample exists
int getSampleIdx(const std::string& p) {
if (this->pid2Idx.find(p) == this->pid2Idx.end()) {
return -1;
} else {
return (this->pid2Idx[p]);
}
}
int getNumIndv() const { return this->indv.size(); }
int getNumSample() const { return this->indv.size(); }
int getNumMarker() const { return this->snp2Idx.size(); }
const std::vector<std::string>& getIndv() const { return this->indv; }
const std::vector<std::string>& getSampleName() const { return this->indv; }
const std::vector<std::string>& getIID() const { return this->indv; }
const std::vector<std::string>& getChrom() const { return this->chrom; }
const std::vector<std::string>& getMarkerName() const { return this->snp; }
const std::vector<double>& getMapDist() const { return this->mapDist; }
const std::vector<int>& getPosition() const { return this->pos; }
const std::vector<std::string>& getRef() const { return this->ref; }
const std::vector<std::vector<std::string> >& getAlt() const {
return this->alt;
}
const std::vector<int>& getSex() const { return this->sex; }
const std::vector<double>& getPheno() const { return this->pheno; }
public:
std::vector<std::string> chrom;
std::vector<std::string> snp;
std::vector<double> mapDist;
std::vector<int> pos;
std::vector<std::string> ref;
std::vector<std::vector<std::string> > alt;
std::vector<std::string> indv; /// people ids
std::vector<int> sex;
std::vector<double> pheno;
private:
void buildUnphasedTable(int numAllele);
void buildPhasedTable(int numAllele);
private:
typedef struct TwoChar { unsigned char x[2]; } TwoChar;
std::map<std::string, int> snp2Idx;
std::map<std::string, int> pid2Idx;
BufferedReader* fpKed;
BufferedReader* fpKim;
BufferedReader* fpKam;
std::string prefix;
int bits; // bits[0, 1, ... (bits-1)] is a block
std::vector<unsigned char> buffer; // raw data from ked file
std::vector<unsigned char> data; // genotype data
int variantIdx;
bool phased;
std::map<int, std::map<char, TwoChar> > unphasedTable;
std::map<int, std::map<char, TwoChar> > phasedTable;
};
#endif /* _KGGINPUTFILE_H_ */
......@@ -5,7 +5,8 @@ LIB = lib-vcf.a
LIB_DBG = lib-dbg-vcf.a
BASE = PeopleSet VCFUtil PlinkInputFile PlinkOutputFile VCFInfo VCFInputFile \
VCFIndividual SiteSet VCFHeader BCFReader VCFExtractor VCFFilter VCFValue \
VCFBuffer
VCFBuffer KGGInputFile
OBJ = $(BASE:=.o)
OBJ_DBG = $(BASE:%=%_dbg.o)
......
......@@ -7,7 +7,8 @@ EXE = testReadVCF testReadVCFInfo testReadVCFByIndividual testReadVCFByRange \
testVCFSite \
testVCFExtractChromXPar \
testPlinkOutputFile \
testPlinkOutputFile2
testPlinkOutputFile2 \
testKGGInputFile
all: $(EXE)
debug: all
......@@ -40,7 +41,7 @@ endef
$(foreach s, $(EXE), $(eval $(call BUILD_each, $(s))))
check: check1 check2 check3 check4 check5 check6 check7 check8 check9 check10 \
check11 check12 check13 check14 check15
check11 check12 check13 check14 check15 check16
check1:
./testPlinkInputFile > testPlinkInputFile.output
diff -q testPlinkInputFile.output testPlinkInputFile.output.correct
......@@ -96,6 +97,8 @@ check15:
-awk '{$$1 = NR; $$3 = 0; $$4 = 0; print ;}' testPlinkInputFile.fam | diff -b testPlinkOutputFile.output.fam -
diff -q testPlinkOutputFile.output.bim testPlinkInputFile.bim
diff -q testPlinkOutputFile.output.bed testPlinkInputFile.bed
check16:
./testKGGInputFile > testKGGInputFile.vcf.output
diff testKGGInputFile.vcf.output testKGGInputFile.vcf.output.correct
clean:
-rm -f $(EXE) *.d *.output
#include <stdio.h>
#include <stdlib.h>
#include "KGGInputFile.h"
#include "base/Utils.h"
void printVCFMeta() {
// GP is between 0 and 1 in VCF v4.3, but phred-scaled value in VCF v4.2
printf("##fileformat=VCFv4.3\n");
printf(
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype "
"calls\">\n");
}
void printVCFHeader(const std::vector<std::string>& sm) {
printf("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
for (size_t i = 0; i != sm.size(); ++i) {
printf("\t%s", sm[i].c_str());
}
printf("\n");
}
void printGeno(int i, const std::vector<int>& g) {
printf("%d => ", i);
for (int i = 0; i < g.size(); ++i) {
printf(" %d", g[i]);
}
printf("\n");
}
int main(int argc, char* argv[]) {
KGGInputFile read("kggseq1", "gz");
const int N = read.getNumSample();
const int M = read.getNumMarker();
const std::vector<std::string> sm = read.getSampleName();
printVCFMeta();
printVCFHeader(sm);
int a1, a2;
for (int i = 0; i < M; ++i) {
if (!read.readRecord()) {
continue;
}
printf("%s", read.getChrom()[i].c_str());
printf("\t%d", read.getPosition()[i]);
printf("\t%s", read.getMarkerName()[i].c_str());
printf("\t%s", read.getRef()[i].c_str());
printf("\t%s", stringJoin(read.getAlt()[i], ',').c_str());
printf("\t.\t.\t.\tGT");
for (int j = 0; j < N; ++j) {
read.getAllele(j, &a1, &a2);
printf("\t%d/%d", a1, a2);
}
printf("\n");
} // loop marker
return 0;
}
##fileformat=VCFv4.3
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype calls">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT P1 P2 P3 P4 P5 P6 P7 P8 P9
1 1 chr1:1 A G . . . GT 0/1 0/0 0/0 1/1 0/1 0/0 0/1 0/1 0/0
1 2 chr1:2 A G . . . GT 0/0 0/0 0/0 0/0 0/0 0/0 0/1 0/0 0/1
1 3 chr1:3 A G . . . GT 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment