Commit f7316f49 authored by David Trudgian's avatar David Trudgian

Initial bug fixes and optimizations so working on BioHPC

parent 38ed0730
...@@ -13,12 +13,16 @@ class amino_acids ...@@ -13,12 +13,16 @@ class amino_acids
amino_acids(); amino_acids();
char get_aa(int i); char get_aa(int i);
int get_aa(char c); int get_aa(char c);
static const string valid_aas;
}; };
const string amino_acids::valid_aas ="ACDEFGHIKLMNPQRSTVWY";
amino_acids::amino_acids() amino_acids::amino_acids()
{ {
} }
char amino_acids::get_aa(int i) char amino_acids::get_aa(int i)
{ {
if(i==0) if(i==0)
......
...@@ -80,7 +80,6 @@ class filter ...@@ -80,7 +80,6 @@ class filter
void read_in_files(); void read_in_files();
void open_tag_files(); void open_tag_files();
void read_in_tags(vector<string>& tags, char current_amino_acid, int& num_refs); void read_in_tags(vector<string>& tags, char current_amino_acid, int& num_refs);
void quicksort(vector<string>& tags, int left, int right);
bool compare_tags(string s1, string s2); bool compare_tags(string s1, string s2);
void form_clusters(vector<string>& tags);//, string s); void form_clusters(vector<string>& tags);//, string s);
void view_clusters2(vector<cluster2>& clusters2,vector<string>& tags); void view_clusters2(vector<cluster2>& clusters2,vector<string>& tags);
......
CPPFLAGS=-O3
all: mif sttag tmerg filt fmerg cm mdist fh dbc mtax all: mif sttag tmerg filt fmerg cm mdist fh dbc mtax
mif: mif.cpp directory_reader.h mif: mif.cpp directory_reader.h
g++ mif.cpp -o mif g++ $(CPPFLAGS) mif.cpp -o mif
sttag: sttag.cpp sttagger.h sttag: sttag.cpp sttagger.h org.h util.h amino_acids.h
g++ sttag.cpp -o sttag g++ $(CPPFLAGS) sttag.cpp -fopenmp -o sttag
tmerg: tmerg.cpp tmerg.h tmerg: tmerg.cpp tmerg.h
g++ tmerg.cpp -o tmerg g++ $(CPPFLAGS) tmerg.cpp -o tmerg
filt: filter.cpp filter.h filt: filter.cpp filter.h
g++ filter.cpp -o filt g++ $(CPPFLAGS) filter.cpp -o filt
fmerg: fmerg.cpp util.h fmerg: fmerg.cpp util.h
g++ fmerg.cpp -o fmerg g++ $(CPPFLAGS) fmerg.cpp -o fmerg
cm: cm.cpp util.h cm: cm.cpp util.h
g++ cm.cpp -fopenmp -o cm g++ $(CPPFLAGS) cm.cpp -fopenmp -o cm
# g++ cm.cpp -o cm # g++ $(CPPFLAGS) cm.cpp -o cm
dbc: dbc.cpp dbc: dbc.cpp
g++ dbc.cpp -fopenmp -o dbc g++ $(CPPFLAGS) dbc.cpp -fopenmp -o dbc
mdist: mdist.cpp mdist.h mdist: mdist.cpp mdist.h
g++ mdist.cpp -fopenmp -o mdist g++ $(CPPFLAGS) mdist.cpp -fopenmp -o mdist
# g++ mdist.cpp -o mdist # g++ $(CPPFLAGS) mdist.cpp -o mdist
fh: fix_hgt.cpp mdist.h mctr.h fh: fix_hgt.cpp mdist.h mctr.h
g++ fix_hgt.cpp -fopenmp -o fh g++ $(CPPFLAGS) fix_hgt.cpp -fopenmp -o fh
# g++ fix_hgt.cpp -o fh # g++ $(CPPFLAGS) fix_hgt.cpp -o fh
mtax: mtax.cpp mtax: mtax.cpp
g++ mtax.cpp -o mtax g++ $(CPPFLAGS) mtax.cpp -o mtax
clean: clean:
\rm mif \rm mif
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include <algorithm>
#include <map> #include <map>
#include <sstream> #include <sstream>
#include <math.h> #include <math.h>
...@@ -41,7 +42,6 @@ struct krow ...@@ -41,7 +42,6 @@ struct krow
}; };
//int find_top_string(vector<file_aa_pair> v); //int find_top_string(vector<file_aa_pair> v);
void sort_problem_tags(vector<string>& tags, int left, int right);
class mctr class mctr
{ {
...@@ -2831,7 +2831,10 @@ void mctr::setup_it1(ifstream& infostream) ...@@ -2831,7 +2831,10 @@ void mctr::setup_it1(ifstream& infostream)
get_problem_tags(files1,"0"); get_problem_tags(files1,"0");
get_problem_tags(files2,"1"); get_problem_tags(files2,"1");
sort_problem_tags(problem_tags,0,problem_tags.size()-1);
std::sort(problem_tags.begin(), problem_tags.end());
if(MCTR_DEBUG) if(MCTR_DEBUG)
{ {
...@@ -2935,7 +2938,7 @@ void mctr::setup_it2(ifstream& infostream) ...@@ -2935,7 +2938,7 @@ void mctr::setup_it2(ifstream& infostream)
get_problem_tags(files2,"1",hgt_genes1); get_problem_tags(files2,"1",hgt_genes1);
//cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl; //cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl;
//cout<<"size of problem_tags = "<<problem_tags.size()<<endl; //cout<<"size of problem_tags = "<<problem_tags.size()<<endl;
sort_problem_tags(problem_tags,0,problem_tags.size()-1); std::sort(problem_tags.begin(), problem_tags.end());
//cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl; //cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl;
/* for(int i=0; i<10; i++) /* for(int i=0; i<10; i++)
{ {
...@@ -3523,38 +3526,7 @@ bool mctr::check_composition(string s) ...@@ -3523,38 +3526,7 @@ bool mctr::check_composition(string s)
return true; return true;
} }
void sort_problem_tags(vector<string>& tags, int left, int right)
{
int i=left;
int j=right;
string pivot=tags.at((left+right)/2);
while(i<=j)
{
while(pivot>tags.at(i))
{
i++;
}
while(pivot<tags.at(j))
{
j--;
}
if(i<=j)
{
string temp = tags.at(i);
tags.at(i) = tags.at(j);
tags.at(j) = temp;
i++;
j--;
}
};
if(left<j)
sort_problem_tags(tags,left,j);
if(i<right)
sort_problem_tags(tags,i,right);
}
int mctr::get_sos_cutoff() int mctr::get_sos_cutoff()
{ {
......
...@@ -1825,6 +1825,8 @@ void mdist::read_in_orgs() ...@@ -1825,6 +1825,8 @@ void mdist::read_in_orgs()
ifstream instream; ifstream instream;
string orgfile = path+"/"+bfn+"/"+bfn+"_saved_orgs.txt"; string orgfile = path+"/"+bfn+"/"+bfn+"_saved_orgs.txt";
u.open_ifile(instream,orgfile,"orgfile"); u.open_ifile(instream,orgfile,"orgfile");
cout << orgfile << endl;
string eat; string eat;
int ord; int ord;
...@@ -1840,6 +1842,8 @@ void mdist::read_in_orgs() ...@@ -1840,6 +1842,8 @@ void mdist::read_in_orgs()
//Set the ordinal. //Set the ordinal.
new_org.set_ordinal(ord); new_org.set_ordinal(ord);
cout << "Organism: " << ord << endl;
//Set the path. //Set the path.
instream>>eat>>s; instream>>eat>>s;
new_org.set_path(s); new_org.set_path(s);
...@@ -1857,13 +1861,17 @@ void mdist::read_in_orgs() ...@@ -1857,13 +1861,17 @@ void mdist::read_in_orgs()
instream>>eat>>i; instream>>eat>>i;
new_org.set_tag_length(i); new_org.set_tag_length(i);
//Set bool conserved.
bool b; bool b;
instream>>eat>>b; instream>>eat>>b;
new_org.set_conserved(b); new_org.set_conserved(b);
//Set the files. //Set the files.
instream>>eat>>i; instream>>eat>>i;
cout << "numfiles: " << i << endl;
for(int q=0; q<i; q++) for(int q=0; q<i; q++)
{ {
instream>>s; instream>>s;
...@@ -1871,7 +1879,10 @@ void mdist::read_in_orgs() ...@@ -1871,7 +1879,10 @@ void mdist::read_in_orgs()
} }
//Set the file sizes. //Set the file sizes.
instream>>eat>>i; instream>>eat>>i;
cout << "numfilesizes: " << i << endl;
for(int q=0; q<i; q++) for(int q=0; q<i; q++)
{ {
int fs; int fs;
...@@ -1879,8 +1890,13 @@ void mdist::read_in_orgs() ...@@ -1879,8 +1890,13 @@ void mdist::read_in_orgs()
new_org.push_back_file_size(fs); new_org.push_back_file_size(fs);
} }
//Set the amino acid counts. //Set the amino acid counts.
instream>>eat>>i; instream>>eat>>i;
cout << "numaacounts: " << i << endl;
for(int q=0; q<i; q++) for(int q=0; q<i; q++)
{ {
int aacount; int aacount;
...@@ -1895,7 +1911,10 @@ void mdist::read_in_orgs() ...@@ -1895,7 +1911,10 @@ void mdist::read_in_orgs()
double d; double d;
instream>>eat>>d; //"discrepancy <double>" instream>>eat>>d; //"discrepancy <double>"
new_org.setp_discrepancy(d); new_org.setp_discrepancy(d);
cout << "discrepancy: " << d << endl;
instream>>eat>>i; //"conserved <int>" instream>>eat>>i; //"conserved <int>"
new_org.setp_conserved(i); new_org.setp_conserved(i);
...@@ -1906,13 +1925,22 @@ void mdist::read_in_orgs() ...@@ -1906,13 +1925,22 @@ void mdist::read_in_orgs()
new_org.set_ref(b); new_org.set_ref(b);
instream>>eat>>i; //"proteins.size() <integer which should be 0>" instream>>eat>>i; //"proteins.size() <integer which should be 0>"
cout << "proteins.size: " << i << endl;
instream>>eat>>i; //"total_num_proteins <integer which should be greater than 0>" instream>>eat>>i; //"total_num_proteins <integer which should be greater than 0>"
new_org.set_total_num_proteins(i);
cout << "total_num_proteins: " << i << endl;
new_org.set_total_num_proteins(i);
instream>>eat>>i; //"pv.size() <integer which should not be 0>" instream>>eat>>i; //"pv.size() <integer which should not be 0>"
new_org.setp_set_pv_size(i); new_org.setp_set_pv_size(i);
for(int q=0; q<i; q++) cout << "ppv_size: " << i << endl;
for(int q=0; q<i; q++)
{ {
int keep; int keep;
instream>>eat>>keep; instream>>eat>>keep;
...@@ -1922,10 +1950,15 @@ void mdist::read_in_orgs() ...@@ -1922,10 +1950,15 @@ void mdist::read_in_orgs()
int fs; //filtering_steps int fs; //filtering_steps
instream>>eat>>fs; instream>>eat>>fs;
cout << "filtering_steps: " << fs << endl;
int pvv_size; int pvv_size;
instream>>eat>>pvv_size; //"pvv.size() <integer which should match the size above>" instream>>eat>>pvv_size; //"pvv.size() <integer which should match the size above>"
new_org.setp_set_pvv_size(pvv_size); new_org.setp_set_pvv_size(pvv_size);
cout << "pvvsize: " << pvv_size << endl;
for(int q=0; q<pvv_size; q++) for(int q=0; q<pvv_size; q++)
{ {
instream>>eat>>i; //keeper instream>>eat>>i; //keeper
...@@ -1943,6 +1976,9 @@ void mdist::read_in_orgs() ...@@ -1943,6 +1976,9 @@ void mdist::read_in_orgs()
} }
instream>>eat>>i; //"sosv.size() <int like 401>" instream>>eat>>i; //"sosv.size() <int like 401>"
cout << "sosv_size: " << pvv_size << endl;
new_org.setp_set_sosv_size(i); new_org.setp_set_sosv_size(i);
vector<int> sosv_vals; vector<int> sosv_vals;
......
...@@ -7,6 +7,7 @@ Identifies redundancy in the input (i.e. exact same proteome present in both ref ...@@ -7,6 +7,7 @@ Identifies redundancy in the input (i.e. exact same proteome present in both ref
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <getopt.h> #include <getopt.h>
...@@ -127,8 +128,8 @@ int main(int argc, char* argv[]) ...@@ -127,8 +128,8 @@ int main(int argc, char* argv[])
logstream<<"Number of directories in FAA/ : directories.size() = "<<directories.size()<<endl; logstream<<"Number of directories in FAA/ : directories.size() = "<<directories.size()<<endl;
logstream<<"Number of directories in FAA_ref/ : directoreis_ref.size() = "<<directories_ref.size()<<endl; logstream<<"Number of directories in FAA_ref/ : directoreis_ref.size() = "<<directories_ref.size()<<endl;
sort_directories(directories); std::sort(directories.begin(), directories.end());
sort_directories(directories_ref); std::sort(directories_ref.begin(), directories_ref.end());
if(MIF_DEBUG || verbose) if(MIF_DEBUG || verbose)
...@@ -234,19 +235,4 @@ int main(int argc, char* argv[]) ...@@ -234,19 +235,4 @@ int main(int argc, char* argv[])
repstream.close(); repstream.close();
} }
//Alphabetically sort the directories.
void sort_directories(vector<string>& directories)
{
for(int i=0; i<directories.size(); i++)
{
for(int j=i+1; j<directories.size(); j++)
{
if(directories.at(i)>directories.at(j))
{
string temp = directories.at(i);
directories.at(i) = directories.at(j);
directories.at(j) = temp;
}
}
}
}
...@@ -133,6 +133,11 @@ org::org() ...@@ -133,6 +133,11 @@ org::org()
array.push_back(0); array.push_back(0);
} }
total_num_proteins=0; total_num_proteins=0;
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
conserved = false;
} }
org::~org() org::~org()
......
...@@ -4,9 +4,11 @@ ...@@ -4,9 +4,11 @@
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include <algorithm>
#include <set> #include <set>
#include <cstdlib> #include <cstdlib>
#include <sstream> #include <sstream>
#include <iomanip>
#include <cmath> #include <cmath>
#include "protein.h" #include "protein.h"
...@@ -19,6 +21,7 @@ const double KEEPERS_CUTOFF=0.6; ...@@ -19,6 +21,7 @@ const double KEEPERS_CUTOFF=0.6;
const bool SUMOFSQUARES=true; const bool SUMOFSQUARES=true;
const double CONSERVATION_CUTOFF=1.3; const double CONSERVATION_CUTOFF=1.3;
const bool PROTEOME_DEBUG=false; const bool PROTEOME_DEBUG=false;
const int MAXORDDIGITS=5; const int MAXORDDIGITS=5;
using namespace std; using namespace std;
...@@ -71,7 +74,6 @@ class proteome_v2 ...@@ -71,7 +74,6 @@ class proteome_v2
void populate_proteins(string file); void populate_proteins(string file);
void populate_kmers(); void populate_kmers();
void quicksort(vector<string>& tv, int left, int right);
bool is_valid(string s); bool is_valid(string s);
amino_acids aas; amino_acids aas;
double get_ratio(int pindex); double get_ratio(int pindex);
...@@ -156,9 +158,15 @@ proteome_v2::proteome_v2() ...@@ -156,9 +158,15 @@ proteome_v2::proteome_v2()
// { // {
// sosv.push_back(0); // sosv.push_back(0);
// } // }
discrepancy=0; discrepancy=0;
conserved=0; conserved=0;
filtering_steps=10.0; //Default. filtering_steps=10.0; //Default.
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
chosen_column = 0;
ref = false;
} }
int proteome_v2::get_tags_size() int proteome_v2::get_tags_size()
...@@ -242,6 +250,7 @@ void proteome_v2::set_sosv_size(int i) ...@@ -242,6 +250,7 @@ void proteome_v2::set_sosv_size(int i)
{ {
cout<<"proteome_v2::set_sosv_size(int)"<<endl; cout<<"proteome_v2::set_sosv_size(int)"<<endl;
cout<<"i = "<<i<<endl; cout<<"i = "<<i<<endl;
cout<<"Current sosv.size: " << sosv.size() << endl;
cout<<"Error condition"<<endl; cout<<"Error condition"<<endl;
exit(1); exit(1);
} }
...@@ -353,30 +362,32 @@ void proteome_v2::set_pvv_gnums_val(int index,vector<int> vals) ...@@ -353,30 +362,32 @@ void proteome_v2::set_pvv_gnums_val(int index,vector<int> vals)
void proteome_v2::generate_tags(int ordinal,bool do_scrambled) void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
{ {
stringstream ss1;
ss1<<ordinal; stringstream sso;
string so=ss1.str(); sso << setfill('0') << setw(MAXORDDIGITS) << ordinal;
while(so.length()<MAXORDDIGITS) string so = sso.str();
{
so='0'+so;
}
for(int i=0; i<proteins.size(); i++) for(int i=0; i<proteins.size(); i++)
{ {
stringstream ss;
ss<<i;
string is = ss.str();
for(int j=0; j<proteins.at(i).get_sequence().length(); j++) for(int j=0; j<proteins.at(i).get_sequence().length(); j++)
{ {
string s = proteins.at(i).get_sequence().substr(j,tag_length); string s = proteins.at(i).get_sequence().substr(j,tag_length);
if(is_valid(s)) if(is_valid(s))
{ {
while(s.length()<tag_length)
{ if(s.length() < tag_length){
s+="^"; s.append(tag_length - s.length(), '^');
} }
stringstream ss;
ss<<i;
string is = ss.str();
s += " "+so+" "+is; s += " "+so+" "+is;
tags.push_back(s); tags.push_back(s);
} }
} }
...@@ -386,6 +397,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled) ...@@ -386,6 +397,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
ostemp.keeper=false; ostemp.keeper=false;
ostemp.deleter=false; ostemp.deleter=false;
ostemp.internal_repeats=0; ostemp.internal_repeats=0;
pv.push_back(ostemp); pv.push_back(ostemp);
osv ostemp2; osv ostemp2;
...@@ -399,6 +411,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled) ...@@ -399,6 +411,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
ostemp2.gnums.push_back(0); ostemp2.gnums.push_back(0);
ostemp2.gis.push_back(0); ostemp2.gis.push_back(0);
} }
pvv.push_back(ostemp2); pvv.push_back(ostemp2);
if(do_scrambled) if(do_scrambled)
...@@ -409,14 +422,14 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled) ...@@ -409,14 +422,14 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
string s = scrambled.substr(j,tag_length); string s = scrambled.substr(j,tag_length);
if(is_valid(s)) if(is_valid(s))
{ {
while(s.length()<tag_length) if(s.length() < tag_length){
{ s.append(tag_length - s.length(), '^');
s+="^"; }
}
stringstream ss; stringstream ss;
ss<<i; ss<<i;
string is = ss.str(); string is = ss.str();
s += " "+so+" "+is; s += " "+so+" "+is;
tags_scr.push_back(s); tags_scr.push_back(s);
} }
} }
...@@ -443,11 +456,13 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled) ...@@ -443,11 +456,13 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
} }
} }
quicksort(tags,0,tags.size()-1); std::sort(tags.begin(), tags.end());
if(do_scrambled) if(do_scrambled)
{ {
quicksort(tags_scr,0,tags_scr.size()-1); std:sort(tags_scr.begin(), tags_scr.end());
} }
else else
{ {
...@@ -604,6 +619,11 @@ void proteome_v2::generate_scrambled_tags(int ordinal) ...@@ -604,6 +619,11 @@ void proteome_v2::generate_scrambled_tags(int ordinal)