Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
xiaowei zhan
RVTESTS
Commits
732e8102
Commit
732e8102
authored
Sep 01, 2017
by
zhanxw
Browse files
support bgen
parent
96be9b56
Changes
9
Hide whitespace changes
Inline
Side-by-side
libBgen/BGenFile.cpp
View file @
732e8102
...
...
@@ -310,9 +310,11 @@ bool BGenFile::parseLayout2() {
const
uint8_t
isPhased
=
*
(
uint8_t
*
)(
buf
.
data
()
+
8
+
N
);
var
.
isPhased
=
isPhased
!=
0
;
const
uint8_t
B
=
*
(
uint8_t
*
)(
buf
.
data
()
+
8
+
N
+
1
);
// bits
assert
(
1
<=
B
&&
B
<=
32
);
#ifdef DEBUG
int
totalBit
=
0
;
int
cumBit
=
0
;
assert
(
1
<=
B
&&
B
<=
32
);
#endif
var
.
missing
.
resize
(
N
);
var
.
ploidy
.
resize
(
N
);
...
...
@@ -333,9 +335,9 @@ bool BGenFile::parseLayout2() {
// ploidy = Z, allele = K
if
(
var
.
isPhased
)
{
// total Z * (K- 1) * B bits bits used
totalBit
=
Z
*
(
var
.
K
-
1
)
*
B
;
// total Z * (K- 1) * B bits bits used
#ifdef DEBUG
totalBit
=
Z
*
(
var
.
K
-
1
)
*
B
;
printf
(
"Total phased bits = %d
\n
"
,
totalBit
);
#endif
...
...
@@ -351,8 +353,8 @@ bool BGenFile::parseLayout2() {
}
else
{
// total ( ( (Z+K-1) choose (K-1) ) -1 ) * B bits used
const
int
nCombination
=
choose
((
Z
+
var
.
K
-
1
),
(
var
.
K
-
1
));
totalBit
=
(
nCombination
-
1
)
*
B
;
#ifdef DEBUG
totalBit
=
(
nCombination
-
1
)
*
B
;
printf
(
"Total unphased bits = %d
\n
"
,
totalBit
);
#endif
float
remainProb
=
1.0
;
...
...
@@ -363,7 +365,9 @@ bool BGenFile::parseLayout2() {
}
var
.
prob
.
push_back
(
remainProb
);
}
#ifdef DEBUG
cumBit
+=
totalBit
;
#endif
}
var
.
index
.
push_back
(
var
.
prob
.
size
());
...
...
@@ -394,8 +398,7 @@ void BGenFile::parseString(FILE* fp, int lenByte, std::string* out) {
int
nRead
=
fread
(
&
siLen
,
sizeof
(
siLen
),
1
,
fp
);
assert
(
nRead
==
1
);
(
*
out
).
resize
(
siLen
+
1
);
(
*
out
)[
siLen
]
=
'\0'
;
(
*
out
).
resize
(
siLen
);
nRead
=
fread
(
&
(
*
out
)[
0
],
sizeof
((
*
out
)[
0
]),
siLen
,
fp
);
#ifdef DEBUG
printf
(
"nRead = %d, read = %s
\n
"
,
nRead
,
(
*
out
).
c_str
());
...
...
@@ -449,6 +452,7 @@ void BGenFile::setPeopleMask(const std::string& s, bool b) {
sampleMask
[
i
]
=
b
;
}
}
buildEffectiveIndex
();
}
void
BGenFile
::
includePeople
(
const
std
::
string
&
s
)
{
setPeopleMask
(
s
,
false
);
}
...
...
@@ -468,12 +472,14 @@ void BGenFile::setPeopleMaskFromFile(const char* fn, bool b) {
setPeopleMask
(
fd
[
i
].
c_str
(),
b
);
}
}
buildEffectiveIndex
();
}
void
BGenFile
::
includePeopleFromFile
(
const
char
*
fn
)
{
setPeopleMaskFromFile
(
fn
,
false
);
}
void
BGenFile
::
includeAllPeople
()
{
std
::
fill
(
sampleMask
.
begin
(),
sampleMask
.
end
(),
false
);
buildEffectiveIndex
();
}
void
BGenFile
::
excludePeople
(
const
std
::
string
&
s
)
{
setPeopleMask
(
s
,
true
);
}
...
...
@@ -487,6 +493,7 @@ void BGenFile::excludePeopleFromFile(const char* fn) {
}
void
BGenFile
::
excludeAllPeople
()
{
std
::
fill
(
sampleMask
.
begin
(),
sampleMask
.
end
(),
true
);
buildEffectiveIndex
();
}
//////////////////////////////////////////////////
...
...
@@ -567,3 +574,33 @@ int BGenFile::setSiteFile(const std::string& fn) {
}
return
0
;
}
int
BGenFile
::
getNumEffectiveSample
()
const
{
size_t
ret
=
0
;
for
(
size_t
i
=
0
;
i
!=
sampleMask
.
size
();
++
i
)
{
if
(
sampleMask
[
i
])
continue
;
ret
++
;
}
return
ret
;
}
void
BGenFile
::
getIncludedSampleName
(
std
::
vector
<
std
::
string
>*
p
)
const
{
if
(
!
p
)
return
;
p
->
clear
();
for
(
size_t
i
=
0
;
i
!=
sampleMask
.
size
();
++
i
)
{
if
(
sampleMask
[
i
])
continue
;
p
->
push_back
(
sampleIdentifier
[
i
]);
}
}
void
BGenFile
::
buildEffectiveIndex
()
{
effectiveIndex
.
resize
(
0
);
for
(
size_t
i
=
0
;
i
!=
N
;
++
i
)
{
if
(
sampleMask
[
i
])
continue
;
effectiveIndex
.
push_back
(
i
);
}
}
int
BGenFile
::
getEffectiveIndex
(
int
idx
)
const
{
return
this
->
effectiveIndex
[
idx
];
}
libBgen/BGenFile.h
View file @
732e8102
...
...
@@ -12,6 +12,11 @@
#include "BGenIndex.h"
#include "BGenVariant.h"
// copied from libVcf/VCFConstant.h
#define MISSING_GENOTYPE -9
#define PLINK_MALE 1
#define PLINK_FEMALE 2
class
RangeList
;
class
BGenFile
{
...
...
@@ -63,10 +68,13 @@ class BGenFile {
public:
int
getNumMarker
()
const
{
return
M
;
}
int
getNumSample
()
const
{
return
N
;
}
int
getNumEffectiveSample
()
const
;
const
std
::
vector
<
std
::
string
>&
getSampleIdentifier
()
const
{
return
sampleIdentifier
;
}
void
getIncludedSampleName
(
std
::
vector
<
std
::
string
>*
p
)
const
;
const
BGenVariant
&
getVariant
()
const
{
return
var
;
}
int
getEffectiveIndex
(
int
idx
)
const
;
private:
BGenFile
(
const
BGenFile
&
);
...
...
@@ -89,6 +97,8 @@ class BGenFile {
void
setPeopleMaskFromFile
(
const
char
*
fn
,
bool
b
);
void
setRangeMode
();
// range list related
void
buildEffectiveIndex
();
private:
std
::
string
bgenFileName
;
FILE
*
fp
;
...
...
@@ -119,6 +129,7 @@ class BGenFile {
bool
autoMergeRange
;
Mode
mode
;
/// read consecutively or read by index
std
::
vector
<
bool
>
sampleMask
;
// true means exclusion
std
::
vector
<
int
>
effectiveIndex
;
// allow chromosomal sites
std
::
set
<
std
::
string
>
allowedSite
;
};
// class BGenFile
...
...
libBgen/BGenVariant.h
View file @
732e8102
...
...
@@ -273,6 +273,23 @@ struct BGenVariant {
fprintf
(
fp
,
"%g"
,
prob
[
i
]);
}
}
/// Handle dosage //////////////////////////////////////////////////
void
printDosage
(
int
i
,
FILE
*
fp
)
const
{
if
(
missing
[
i
])
{
fputs
(
"."
,
fp
);
return
;
}
if
(
ploidy
[
i
]
==
2
&&
K
==
2
)
{
// const float prob0 = prob[index[i]];
const
float
prob1
=
prob
[
index
[
i
]
+
1
];
const
float
prob2
=
prob
[
index
[
i
]
+
2
];
fprintf
(
fp
,
"%g"
,
prob1
+
2
.
0
*
prob2
);
}
else
{
fputs
(
"."
,
fp
);
}
}
};
#endif
/* _BGENVARIANT_H_ */
src/GenotypeExtractor.cpp
View file @
732e8102
...
...
@@ -233,15 +233,15 @@ GenotypeExtractor::~GenotypeExtractor() {}
// }
// }
//
void GenotypeExtractor::assign(const std::vector<double>& from, int nrow,
//
int ncol, Matrix* to) {
//
assert(to);
//
Matrix& out = *to;
//
out.Dimension(nrow, ncol);
//
assert((int)from.size() == nrow * ncol);
//
for (int i = 0; i < nrow; ++i) {
//
for (int j = 0; j < ncol; ++j) {
//
out[i][j] = from[nrow * j + i];
//
}
//
}
//
}
void
GenotypeExtractor
::
assign
(
const
std
::
vector
<
double
>&
from
,
int
nrow
,
int
ncol
,
Matrix
*
to
)
{
assert
(
to
);
Matrix
&
out
=
*
to
;
out
.
Dimension
(
nrow
,
ncol
);
assert
((
int
)
from
.
size
()
==
nrow
*
ncol
);
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
for
(
int
j
=
0
;
j
<
ncol
;
++
j
)
{
out
[
i
][
j
]
=
from
[
nrow
*
j
+
i
];
}
}
}
src/GenotypeExtractor.h
View file @
732e8102
...
...
@@ -11,8 +11,8 @@
class
Matrix
;
class
RangeList
;
class
Result
;
class
VCFExtractor
;
class
VCFIndividual
;
//
class VCFExtractor;
//
class VCFIndividual;
class
GenotypeCounter
;
class
GenotypeExtractor
{
...
...
@@ -45,8 +45,6 @@ class GenotypeExtractor {
virtual
void
setSiteDepthMax
(
int
d
)
=
0
;
// @return true if GD is valid
// if GD is missing, we will take GD = 0
virtual
bool
checkGD
(
VCFIndividual
&
indv
,
int
gdIdx
)
=
0
;
virtual
bool
checkGQ
(
VCFIndividual
&
indv
,
int
gqIdx
)
=
0
;
virtual
void
setGDmin
(
int
m
)
=
0
;
virtual
void
setGDmax
(
int
m
)
=
0
;
virtual
void
setGQmin
(
int
m
)
=
0
;
...
...
@@ -109,8 +107,8 @@ class GenotypeExtractor {
// assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to
void assign(const std::vector<double>& from, int nrow, int ncol, Matrix* to);
#endif
void
assign
(
const
std
::
vector
<
double
>&
from
,
int
nrow
,
int
ncol
,
Matrix
*
to
);
void
enableMultiAllelicMode
()
{
this
->
multiAllelicMode
=
true
;
}
public:
...
...
src/Main.cpp
View file @
732e8102
...
...
@@ -21,6 +21,7 @@
#include "libsrc/MathMatrix.h"
#include "libsrc/MathVector.h"
#include "src/BGenGenotypeExtractor.h"
#include "src/DataConsolidator.h"
#include "src/DataLoader.h"
#include "src/GenotypeExtractor.h"
...
...
@@ -28,7 +29,6 @@
#include "src/ModelManager.h"
#include "src/Result.h"
#include "src/VCFGenotypeExtractor.h"
// #include "src/BGenGenotypeExtractor.h"
Logger
*
logger
=
NULL
;
...
...
@@ -247,6 +247,7 @@ void welcome() {
BEGIN_PARAMETER_LIST
();
ADD_PARAMETER_GROUP
(
"Basic Input/Output"
);
ADD_STRING_PARAMETER
(
inVcf
,
"--inVcf"
,
"Input VCF File"
);
ADD_STRING_PARAMETER
(
inBgen
,
"--inBgen"
,
"Input BGEN File"
);
ADD_STRING_PARAMETER
(
outPrefix
,
"--out"
,
"Output prefix"
);
ADD_BOOL_PARAMETER
(
outputRaw
,
"--outputRaw"
,
"Output genotypes, phenotype, covariates(if any); and "
...
...
@@ -439,8 +440,15 @@ int main(int argc, char** argv) {
if
(
!
FLAG_outPrefix
.
size
())
FLAG_outPrefix
=
"rvtest"
;
REQUIRE_STRING_PARAMETER
(
FLAG_inVcf
,
"Please provide input file using: --inVcf"
);
if
(
FLAG_inVcf
.
empty
()
&&
FLAG_inBgen
.
empty
())
{
fprintf
(
stderr
,
"Please provide one input file using: --inVcf or --inBgen"
);
exit
(
1
);
}
if
(
!
FLAG_inVcf
.
empty
()
&&
!
FLAG_inBgen
.
empty
())
{
fprintf
(
stderr
,
"Please provide one kind of input file using: --inVcf or --inBgen"
);
exit
(
1
);
}
// check new version
if
(
!
FLAG_noweb
)
{
...
...
@@ -496,6 +504,8 @@ int main(int argc, char** argv) {
GenotypeExtractor
*
ge
=
NULL
;
if
(
!
FLAG_inVcf
.
empty
())
{
ge
=
new
VCFGenotypeExtractor
(
FLAG_inVcf
);
}
else
if
(
!
FLAG_inBgen
.
empty
())
{
ge
=
new
BGenGenotypeExtractor
(
FLAG_inBgen
);
}
else
{
assert
(
false
);
}
...
...
src/Makefile
View file @
732e8102
...
...
@@ -26,6 +26,7 @@ BASE = Main \
Model
\
GenotypeExtractor
\
VCFGenotypeExtractor
\
BGenGenotypeExtractor
\
DataLoader
\
GenotypeCounter
\
...
...
src/VCFGenotypeExtractor.cpp
View file @
732e8102
...
...
@@ -481,16 +481,3 @@ double VCFGenotypeExtractor::getGenotypeForAltAllele(
return
MISSING_GENOTYPE
;
}
}
void
VCFGenotypeExtractor
::
assign
(
const
std
::
vector
<
double
>&
from
,
int
nrow
,
int
ncol
,
Matrix
*
to
)
{
assert
(
to
);
Matrix
&
out
=
*
to
;
out
.
Dimension
(
nrow
,
ncol
);
assert
((
int
)
from
.
size
()
==
nrow
*
ncol
);
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
for
(
int
j
=
0
;
j
<
ncol
;
++
j
)
{
out
[
i
][
j
]
=
from
[
nrow
*
j
+
i
];
}
}
}
src/VCFGenotypeExtractor.h
View file @
732e8102
...
...
@@ -12,8 +12,8 @@
// class Matrix;
// class RangeList;
// class Result;
//
class VCFExtractor;
//
class VCFIndividual;
class
VCFExtractor
;
class
VCFIndividual
;
// class GenotypeCounter;
/**
...
...
@@ -83,9 +83,6 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
void
getPeopleName
(
std
::
vector
<
std
::
string
>*
p
);
void
getIncludedPeopleName
(
std
::
vector
<
std
::
string
>*
p
)
const
;
const
std
::
vector
<
GenotypeCounter
>&
getGenotypeCounter
()
const
{
return
this
->
counter
;
}
/**
* @return weigth, its length equals to # of markers
*/
...
...
@@ -95,10 +92,10 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
this
->
dosageTag
=
tag
;
}
void
unsetDosageTag
()
{
this
->
dosageTag
.
clear
();
}
bool
isDosage
()
const
{
return
!
this
->
dosageTag
.
empty
();
}
void
setParRegion
(
ParRegion
*
p
)
{
this
->
parRegion
=
p
;
}
// Sex (1=male; 2=female; other=unknown)
void
setSex
(
const
std
::
vector
<
int
>*
sex
)
{
this
->
sex
=
sex
;
}
//
bool isDosage() const { return !this->dosageTag.empty(); }
//
void setParRegion(ParRegion* p) { this->parRegion = p; }
//
//
Sex (1=male; 2=female; other=unknown)
//
void setSex(const std::vector<int>* sex) { this->sex = sex; }
// coding male chromX as 0/2 instead of 0/1
// similarly, for dosage, just multiply 2.0 from original dosage
// void enableClaytonCoding() { this->claytonCoding = true; }
...
...
@@ -118,14 +115,9 @@ class VCFGenotypeExtractor : public GenotypeExtractor {
// assign extracted genotype @param from to a @param nrow by @param ncol
// output matrix @param to
void
assign
(
const
std
::
vector
<
double
>&
from
,
int
nrow
,
int
ncol
,
Matrix
*
to
);
void
enableMultiAllelicMode
()
{
this
->
multiAllelicMode
=
true
;
}
public:
const
static
int
SUCCEED
=
0
;
const
static
int
ERROR
=
-
1
;
const
static
int
FILE_END
=
-
2
;
const
static
int
FAIL_FILTER
=
-
3
;
// void assign(const std::vector<double>& from, int nrow, int ncol, Matrix*
// to);
// void enableMultiAllelicMode() { this->multiAllelicMode = true; }
private:
VCFExtractor
*
vin
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment