Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
BioHPC
slopetree
Commits
f7316f49
Commit
f7316f49
authored
Aug 12, 2015
by
David Trudgian
Browse files
Initial bug fixes and optimizations so that the code works on BioHPC
parent
38ed0730
Changes
10
Hide whitespace changes
Inline
Side-by-side
amino_acids.h
View file @
f7316f49
...
...
@@ -13,12 +13,16 @@ class amino_acids
amino_acids
();
char
get_aa
(
int
i
);
int
get_aa
(
char
c
);
static
const
string
valid_aas
;
};
// Out-of-class definition of the static member amino_acids::valid_aas.
// The literal holds 20 distinct upper-case letters (presumably the one-letter
// amino-acid codes this class accepts — confirm against get_aa()).
// NOTE(review): this is a non-inline definition; if it sits in a header that is
// included by more than one translation unit of the same program it would be
// defined more than once — confirm each program is built from a single .cpp.
const
string
amino_acids
::
valid_aas
=
"ACDEFGHIKLMNPQRSTVWY"
;
// Default constructor. The body is empty, so it performs no work of its own.
amino_acids
::
amino_acids
()
{
}
char
amino_acids
::
get_aa
(
int
i
)
{
if
(
i
==
0
)
...
...
filter.h
View file @
f7316f49
...
...
@@ -80,7 +80,6 @@ class filter
void
read_in_files
();
void
open_tag_files
();
void
read_in_tags
(
vector
<
string
>&
tags
,
char
current_amino_acid
,
int
&
num_refs
);
void
quicksort
(
vector
<
string
>&
tags
,
int
left
,
int
right
);
bool
compare_tags
(
string
s1
,
string
s2
);
void
form_clusters
(
vector
<
string
>&
tags
);
//, string s);
void
view_clusters2
(
vector
<
cluster2
>&
clusters2
,
vector
<
string
>&
tags
);
...
...
makefile
View file @
f7316f49
CPPFLAGS
=
-O3
all
:
mif sttag tmerg filt fmerg cm mdist fh dbc mtax
mif
:
mif.cpp directory_reader.h
g++ mif.cpp
-o
mif
g++
$(CPPFLAGS)
mif.cpp
-o
mif
sttag
:
sttag.cpp sttagger.h
g++
sttag.cp
p
-o
sttag
sttag
:
sttag.cpp sttagger.h
org.h util.h amino_acids.h
g++
$(CPPFLAGS)
sttag.cpp
-fopenm
p
-o
sttag
tmerg
:
tmerg.cpp tmerg.h
g++ tmerg.cpp
-o
tmerg
g++
$(CPPFLAGS)
tmerg.cpp
-o
tmerg
filt
:
filter.cpp filter.h
g++ filter.cpp
-o
filt
g++
$(CPPFLAGS)
filter.cpp
-o
filt
fmerg
:
fmerg.cpp util.h
g++ fmerg.cpp
-o
fmerg
g++
$(CPPFLAGS)
fmerg.cpp
-o
fmerg
cm
:
cm.cpp util.h
g++ cm.cpp
-fopenmp
-o
cm
# g++ cm.cpp -o cm
g++
$(CPPFLAGS)
cm.cpp
-fopenmp
-o
cm
# g++
$(CPPFLAGS)
cm.cpp -o cm
dbc
:
dbc.cpp
g++ dbc.cpp
-fopenmp
-o
dbc
g++
$(CPPFLAGS)
dbc.cpp
-fopenmp
-o
dbc
mdist
:
mdist.cpp mdist.h
g++ mdist.cpp
-fopenmp
-o
mdist
# g++ mdist.cpp -o mdist
g++
$(CPPFLAGS)
mdist.cpp
-fopenmp
-o
mdist
# g++
$(CPPFLAGS)
mdist.cpp -o mdist
fh
:
fix_hgt.cpp mdist.h mctr.h
g++ fix_hgt.cpp
-fopenmp
-o
fh
# g++ fix_hgt.cpp -o fh
g++
$(CPPFLAGS)
fix_hgt.cpp
-fopenmp
-o
fh
# g++
$(CPPFLAGS)
fix_hgt.cpp -o fh
mtax
:
mtax.cpp
g++ mtax.cpp
-o
mtax
g++
$(CPPFLAGS)
mtax.cpp
-o
mtax
clean
:
\r
m mif
...
...
mctr.h
View file @
f7316f49
...
...
@@ -6,6 +6,7 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <map>
#include <sstream>
#include <math.h>
...
...
@@ -41,7 +42,6 @@ struct krow
};
//int find_top_string(vector<file_aa_pair> v);
void
sort_problem_tags
(
vector
<
string
>&
tags
,
int
left
,
int
right
);
class
mctr
{
...
...
@@ -2831,7 +2831,10 @@ void mctr::setup_it1(ifstream& infostream)
get_problem_tags
(
files1
,
"0"
);
get_problem_tags
(
files2
,
"1"
);
sort_problem_tags
(
problem_tags
,
0
,
problem_tags
.
size
()
-
1
);
std
::
sort
(
problem_tags
.
begin
(),
problem_tags
.
end
());
if
(
MCTR_DEBUG
)
{
...
...
@@ -2935,7 +2938,7 @@ void mctr::setup_it2(ifstream& infostream)
get_problem_tags
(
files2
,
"1"
,
hgt_genes1
);
//cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl;
//cout<<"size of problem_tags = "<<problem_tags.size()<<endl;
sort
_
problem_tags
(
problem_tags
,
0
,
problem_tags
.
size
()
-
1
);
std
::
sort
(
problem_tags
.
begin
(),
problem_tags
.
end
());
//cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl;
/* for(int i=0; i<10; i++)
{
...
...
@@ -3523,38 +3526,7 @@ bool mctr::check_composition(string s)
return
true
;
}
/// Sorts the inclusive index range tags[left..right] into ascending order,
/// using std::string's operator< (case-sensitive lexicographic comparison).
///
/// @param tags  Vector whose sub-range is reordered in place.
/// @param left  Index of the first element of the range.
/// @param right Index of the last element of the range (inclusive).
/// @throws std::out_of_range if left < right and either bound is not a valid
///         index into a non-empty `tags`.
///
/// An empty vector, or a range holding fewer than two elements, is a no-op.
/// Callers pass (0, size() - 1), which produces right == -1 for an empty
/// vector; the previous hand-written quicksort evaluated tags.at(0) in that
/// situation and threw. Delegating to std::sort also removes the recursive
/// partitioning and the per-swap string copies.
void sort_problem_tags(std::vector<std::string>& tags, int left, int right)
{
    // Nothing to order: empty input or a range of zero/one element.
    if (tags.empty() || left >= right)
    {
        return;
    }

    // Keep the checked-access contract of the old at()-based code: an invalid
    // bound raises std::out_of_range instead of forming a bad iterator range.
    (void)tags.at(left);
    (void)tags.at(right);

    // right is inclusive, so the past-the-end iterator is one beyond it.
    std::sort(tags.begin() + left, tags.begin() + right + 1);
}
int
mctr
::
get_sos_cutoff
()
{
...
...
mdist.h
View file @
f7316f49
...
...
@@ -1825,6 +1825,8 @@ void mdist::read_in_orgs()
ifstream
instream
;
string
orgfile
=
path
+
"/"
+
bfn
+
"/"
+
bfn
+
"_saved_orgs.txt"
;
u
.
open_ifile
(
instream
,
orgfile
,
"orgfile"
);
cout
<<
orgfile
<<
endl
;
string
eat
;
int
ord
;
...
...
@@ -1840,6 +1842,8 @@ void mdist::read_in_orgs()
//Set the ordinal.
new_org
.
set_ordinal
(
ord
);
cout
<<
"Organism: "
<<
ord
<<
endl
;
//Set the path.
instream
>>
eat
>>
s
;
new_org
.
set_path
(
s
);
...
...
@@ -1857,13 +1861,17 @@ void mdist::read_in_orgs()
instream
>>
eat
>>
i
;
new_org
.
set_tag_length
(
i
);
//Set bool conserved.
bool
b
;
instream
>>
eat
>>
b
;
instream
>>
eat
>>
b
;
new_org
.
set_conserved
(
b
);
//Set the files.
instream
>>
eat
>>
i
;
cout
<<
"numfiles: "
<<
i
<<
endl
;
for
(
int
q
=
0
;
q
<
i
;
q
++
)
{
instream
>>
s
;
...
...
@@ -1871,7 +1879,10 @@ void mdist::read_in_orgs()
}
//Set the file sizes.
instream
>>
eat
>>
i
;
instream
>>
eat
>>
i
;
cout
<<
"numfilesizes: "
<<
i
<<
endl
;
for
(
int
q
=
0
;
q
<
i
;
q
++
)
{
int
fs
;
...
...
@@ -1879,8 +1890,13 @@ void mdist::read_in_orgs()
new_org
.
push_back_file_size
(
fs
);
}
//Set the amino acid counts.
instream
>>
eat
>>
i
;
cout
<<
"numaacounts: "
<<
i
<<
endl
;
for
(
int
q
=
0
;
q
<
i
;
q
++
)
{
int
aacount
;
...
...
@@ -1895,7 +1911,10 @@ void mdist::read_in_orgs()
double
d
;
instream
>>
eat
>>
d
;
//"discrepancy <double>"
new_org
.
setp_discrepancy
(
d
);
cout
<<
"discrepancy: "
<<
d
<<
endl
;
instream
>>
eat
>>
i
;
//"conserved <int>"
new_org
.
setp_conserved
(
i
);
...
...
@@ -1906,13 +1925,22 @@ void mdist::read_in_orgs()
new_org
.
set_ref
(
b
);
instream
>>
eat
>>
i
;
//"proteins.size() <integer which should be 0>"
cout
<<
"proteins.size: "
<<
i
<<
endl
;
instream
>>
eat
>>
i
;
//"total_num_proteins <integer which should be greater than 0>"
new_org
.
set_total_num_proteins
(
i
);
cout
<<
"total_num_proteins: "
<<
i
<<
endl
;
new_org
.
set_total_num_proteins
(
i
);
instream
>>
eat
>>
i
;
//"pv.size() <integer which should not be 0>"
new_org
.
setp_set_pv_size
(
i
);
for
(
int
q
=
0
;
q
<
i
;
q
++
)
cout
<<
"ppv_size: "
<<
i
<<
endl
;
for
(
int
q
=
0
;
q
<
i
;
q
++
)
{
int
keep
;
instream
>>
eat
>>
keep
;
...
...
@@ -1922,10 +1950,15 @@ void mdist::read_in_orgs()
int
fs
;
//filtering_steps
instream
>>
eat
>>
fs
;
cout
<<
"filtering_steps: "
<<
fs
<<
endl
;
int
pvv_size
;
instream
>>
eat
>>
pvv_size
;
//"pvv.size() <integer which should match the size above>"
new_org
.
setp_set_pvv_size
(
pvv_size
);
cout
<<
"pvvsize: "
<<
pvv_size
<<
endl
;
for
(
int
q
=
0
;
q
<
pvv_size
;
q
++
)
{
instream
>>
eat
>>
i
;
//keeper
...
...
@@ -1943,6 +1976,9 @@ void mdist::read_in_orgs()
}
instream
>>
eat
>>
i
;
//"sosv.size() <int like 401>"
cout
<<
"sosv_size: "
<<
pvv_size
<<
endl
;
new_org
.
setp_set_sosv_size
(
i
);
vector
<
int
>
sosv_vals
;
...
...
mif.cpp
View file @
f7316f49
...
...
@@ -7,6 +7,7 @@ Identifies redundancy in the input (i.e. exact same proteome present in both ref
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <cstdlib>
#include <getopt.h>
...
...
@@ -127,8 +128,8 @@ int main(int argc, char* argv[])
logstream
<<
"Number of directories in FAA/ : directories.size() = "
<<
directories
.
size
()
<<
endl
;
logstream
<<
"Number of directories in FAA_ref/ : directoreis_ref.size() = "
<<
directories_ref
.
size
()
<<
endl
;
sort
_
directories
(
directories
);
sort
_
directories
(
directories_ref
);
std
::
sort
(
directories
.
begin
(),
directories
.
end
()
);
std
::
sort
(
directories
_ref
.
begin
(),
directories_ref
.
end
()
);
if
(
MIF_DEBUG
||
verbose
)
...
...
@@ -234,19 +235,4 @@ int main(int argc, char* argv[])
repstream
.
close
();
}
//Alphabetically sort the directories.
//
// Reorders `directories` in place into ascending order as defined by
// std::string's operator< (case-sensitive lexicographic comparison).
// An empty or single-element vector is left untouched.
//
// Uses std::sort instead of the former nested-loop exchange sort, which
// performed O(n^2) comparisons with a string copy per swap and compared a
// signed int index against the unsigned size(). The resulting order is the
// same.
void sort_directories(std::vector<std::string>& directories)
{
    std::sort(directories.begin(), directories.end());
}
org.h
View file @
f7316f49
...
...
@@ -133,6 +133,11 @@ org::org()
array
.
push_back
(
0
);
}
total_num_proteins
=
0
;
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
conserved
=
false
;
}
org
::~
org
()
...
...
proteome_v2.h
View file @
f7316f49
...
...
@@ -4,9 +4,11 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <set>
#include <cstdlib>
#include <sstream>
#include <iomanip>
#include <cmath>
#include "protein.h"
...
...
@@ -19,6 +21,7 @@ const double KEEPERS_CUTOFF=0.6;
const
bool
SUMOFSQUARES
=
true
;
const
double
CONSERVATION_CUTOFF
=
1.3
;
const
bool
PROTEOME_DEBUG
=
false
;
const
int
MAXORDDIGITS
=
5
;
using
namespace
std
;
...
...
@@ -71,7 +74,6 @@ class proteome_v2
void
populate_proteins
(
string
file
);
void
populate_kmers
();
void
quicksort
(
vector
<
string
>&
tv
,
int
left
,
int
right
);
bool
is_valid
(
string
s
);
amino_acids
aas
;
double
get_ratio
(
int
pindex
);
...
...
@@ -156,9 +158,15 @@ proteome_v2::proteome_v2()
// {
// sosv.push_back(0);
// }
discrepancy
=
0
;
conserved
=
0
;
filtering_steps
=
10.0
;
//Default.
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
chosen_column
=
0
;
ref
=
false
;
}
int
proteome_v2
::
get_tags_size
()
...
...
@@ -242,6 +250,7 @@ void proteome_v2::set_sosv_size(int i)
{
cout
<<
"proteome_v2::set_sosv_size(int)"
<<
endl
;
cout
<<
"i = "
<<
i
<<
endl
;
cout
<<
"Current sosv.size: "
<<
sosv
.
size
()
<<
endl
;
cout
<<
"Error condition"
<<
endl
;
exit
(
1
);
}
...
...
@@ -353,30 +362,32 @@ void proteome_v2::set_pvv_gnums_val(int index,vector<int> vals)
void
proteome_v2
::
generate_tags
(
int
ordinal
,
bool
do_scrambled
)
{
stringstream
ss1
;
ss1
<<
ordinal
;
string
so
=
ss1
.
str
();
while
(
so
.
length
()
<
MAXORDDIGITS
)
{
so
=
'0'
+
so
;
}
stringstream
sso
;
sso
<<
setfill
(
'0'
)
<<
setw
(
MAXORDDIGITS
)
<<
ordinal
;
string
so
=
sso
.
str
();
for
(
int
i
=
0
;
i
<
proteins
.
size
();
i
++
)
{
stringstream
ss
;
ss
<<
i
;
string
is
=
ss
.
str
();
for
(
int
j
=
0
;
j
<
proteins
.
at
(
i
).
get_sequence
().
length
();
j
++
)
{
string
s
=
proteins
.
at
(
i
).
get_sequence
().
substr
(
j
,
tag_length
);
if
(
is_valid
(
s
))
{
while
(
s
.
length
()
<
tag_length
)
{
s
+=
"^"
;
}
stringstream
ss
;
ss
<<
i
;
string
is
=
ss
.
str
();
if
(
s
.
length
()
<
tag_length
){
s
.
append
(
tag_length
-
s
.
length
(),
'^'
);
}
s
+=
" "
+
so
+
" "
+
is
;
tags
.
push_back
(
s
);
}
}
...
...
@@ -386,6 +397,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
ostemp
.
keeper
=
false
;
ostemp
.
deleter
=
false
;
ostemp
.
internal_repeats
=
0
;
pv
.
push_back
(
ostemp
);
osv
ostemp2
;
...
...
@@ -399,6 +411,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
ostemp2
.
gnums
.
push_back
(
0
);
ostemp2
.
gis
.
push_back
(
0
);
}
pvv
.
push_back
(
ostemp2
);
if
(
do_scrambled
)
...
...
@@ -409,14 +422,14 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
string
s
=
scrambled
.
substr
(
j
,
tag_length
);
if
(
is_valid
(
s
))
{
while
(
s
.
length
()
<
tag_length
)
{
s
+=
"^"
;
}
if
(
s
.
length
()
<
tag_length
){
s
.
append
(
tag_length
-
s
.
length
(),
'^'
);
}
stringstream
ss
;
ss
<<
i
;
string
is
=
ss
.
str
();
s
+=
" "
+
so
+
" "
+
is
;
tags_scr
.
push_back
(
s
);
}
}
...
...
@@ -443,11 +456,13 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
}
}
quicksort
(
tags
,
0
,
tags
.
size
()
-
1
);
std
::
sort
(
tags
.
begin
(),
tags
.
end
());
if
(
do_scrambled
)
{
quick
sort
(
tags_scr
,
0
,
tags_scr
.
size
()
-
1
);
std:
sort
(
tags_scr
.
begin
(),
tags_scr
.
end
()
);
}
else
{
...
...
@@ -604,6 +619,11 @@ void proteome_v2::generate_scrambled_tags(int ordinal)
for
(
int
i
=
0
;
i
<
proteins
.
size
();
i
++
)
{
stringstream
ss
;
ss
<<
i
;
string
is
=
ss
.
str
();
string
q
=
proteins
.
at
(
i
).
get_sequence
();
scramble_v2
(
q
);
...
...
@@ -612,13 +632,10 @@ void proteome_v2::generate_scrambled_tags(int ordinal)
string
s
=
q
.
substr
(
j
,
tag_length
);
if
(
is_valid
(
s
))
{
while
(
s
.
length
()
<
tag_length
)
{
s
+=
"^"
;
}
stringstream
ss
;
ss
<<
i
;
string
is
=
ss
.
str
();
if
(
s
.
length
()
<
tag_length
){
s
.
append
(
tag_length
-
s
.
length
()
,
'^'
);
}
s
+=
" "
+
so
+
" "
+
is
;
//cout<<"s = "<<s<<endl;
tags
.
push_back
(
s
);
...
...
@@ -630,7 +647,7 @@ void proteome_v2::generate_scrambled_tags(int ordinal)
cout
<<
"Size of size_real_tags = "
<<
size_real_tags
<<
endl
;
exit
(
1
);
cout
<<
"Sorting tags: "
<<
endl
;
quick
sort
(
tags
,
0
,
tags
.
size
()
-
1
);
std
::
sort
(
tags
.
begin
(),
tags
.
end
()
);
cout
<<
"Done sorting. First elements = "
<<
tags
.
at
(
0
)
<<
endl
<<
tags
.
at
(
1
)
<<
endl
<<
tags
.
at
(
2
)
<<
endl
;
}
...
...
@@ -657,12 +674,12 @@ vector<string> proteome_v2::get_scr_tags()
/*
void proteome_v2::read_proteins(string infile)
{
populate_proteins(infile);
populate_proteins(infile);
for(int i=0; i<proteins.size(); i++)
{
proteins.at(i).set_id(i);
}
populate_kmers();
cout<<"size of kmers = "<<kmers.size()<<endl;
}
...
...
@@ -681,12 +698,12 @@ void proteome_v2::add_protein(string info_line, string aaseq)
void proteome_v2::mark_gene(string s, int hit_genome_id)
{
cout<<"In mark_gene s="<<s<<" hit_genome_id="<<hit_genome_id<<endl;
int bottom=0;
int top=kmers.size();
int middle=top/2;
int former_middle=-1;
bool found=false;
while(!found && former_middle!=middle)
{
...
...
@@ -704,7 +721,7 @@ void proteome_v2::add_protein(string info_line, string aaseq)
}
else if(kmers.at(middle).s==s)
{
found=true;
found=true;
proteins.at(kmers.at(middle).source_protein_index).add_hit(hit_genome_id);
cout<<"protein info: "<<proteins.at(kmers.at(middle).source_protein_index).get_genome_hits_size()<<endl;
cout<<"The code is making it in here. kmer = "<<s<<endl;
...
...
@@ -1564,38 +1581,7 @@ void proteome_v2::write_out_gene(int index)
}
}
/// Sorts the inclusive index range tv[left..right] into ascending order,
/// using std::string's operator< (case-sensitive lexicographic comparison).
///
/// @param tv    Vector whose sub-range is reordered in place.
/// @param left  Index of the first element of the range.
/// @param right Index of the last element of the range (inclusive).
/// @throws std::out_of_range if left < right and either bound is not a valid
///         index into a non-empty `tv`.
///
/// An empty vector, or a range holding fewer than two elements, is a no-op.
/// Callers pass (0, size() - 1), which produces right == -1 for an empty
/// vector; the previous hand-written partitioning evaluated tv.at(0) in that
/// situation and threw. Delegating to std::sort also removes the recursion
/// and the per-swap string copies.
void proteome_v2::quicksort(std::vector<std::string>& tv, int left, int right)
{
    // Nothing to order: empty input or a range of zero/one element.
    if (tv.empty() || left >= right)
    {
        return;
    }

    // Keep the checked-access contract of the old at()-based code: an invalid
    // bound raises std::out_of_range instead of forming a bad iterator range.
    (void)tv.at(left);
    (void)tv.at(right);

    // right is inclusive, so the past-the-end iterator is one beyond it.
    std::sort(tv.begin() + left, tv.begin() + right + 1);
}
/*string proteome_v2::scramble(string s)
{
...
...
@@ -1627,13 +1613,10 @@ void proteome_v2::quicksort(vector<string>& tv,int left, int right)
bool
proteome_v2
::
is_valid
(
string
s
)
{
for
(
int
i
=
0
;
i
<
s
.
length
();
i
++
)
{
int
index
=
aas
.
get_aa
(
s
[
i
]);
if
(
index
<
0
||
index
>
19
)
{
return
false
;
}
std
::
size_t
found
=
s
.
find_first_not_of
(
aas
.
valid_aas
);
if
(
found
!=
std
::
string
::
npos
){
return
false
;
}
if
(
SUMOFSQUARES
)
...
...
@@ -1647,20 +1630,12 @@ bool proteome_v2::is_valid(string s)
bool
proteome_v2
::
check_composition
(
string
s
)
{
vector
<
int
>
array
;
//int array[20];
for
(
int
i
=
0
;
i
<
20
;
i
++
)
{
//array[i]=0;
array
.
push_back
(
0
);
}
for
(
int
i
=
0
;
i
<
s
.
length
()
;
i
++
)
for
(
int
i
=
0
;
i
<
20
;
i
++
)
{
int
index
=
aas
.
get_aa
(
s
[
i
]);
if
(
index
>=
0
&&
index
<
20
)
{
array
[
index
]
++
;
}
char
aa
=
aas
.
get_aa
(
i
);
int
aa_count
=
std
::
count
(
s
.
begin
(),
s
.
end
(),
aa
);
array
.
push_back
(
aa_count
);
}
int
result
=
0
;
...
...
sttagger.h
View file @
f7316f49
...
...
@@ -18,6 +18,7 @@ Class reads in proteins from FAA files specified in info file generated by make_
#include <cmath>
#include <omp.h>
#include "org.h"
#include "util.h"
...
...
@@ -40,7 +41,7 @@ const int NUM_FILES_CUTOFF = 5; //NUM_FILES_CUTOFF: For each directory of .faa
//problematic_inputs.txt. This is to facilitate identifying bad downloads