Commit f7316f49 authored by David Trudgian's avatar David Trudgian

Initial bug fixes and optimizations so working on BioHPC

parent 38ed0730
......@@ -13,12 +13,16 @@ class amino_acids
amino_acids();
char get_aa(int i);
int get_aa(char c);
static const string valid_aas;
};
const string amino_acids::valid_aas ="ACDEFGHIKLMNPQRSTVWY";
amino_acids::amino_acids()
{
}
char amino_acids::get_aa(int i)
{
if(i==0)
......
......@@ -80,7 +80,6 @@ class filter
void read_in_files();
void open_tag_files();
void read_in_tags(vector<string>& tags, char current_amino_acid, int& num_refs);
void quicksort(vector<string>& tags, int left, int right);
bool compare_tags(string s1, string s2);
void form_clusters(vector<string>& tags);//, string s);
void view_clusters2(vector<cluster2>& clusters2,vector<string>& tags);
......
CPPFLAGS=-O3
all: mif sttag tmerg filt fmerg cm mdist fh dbc mtax
mif: mif.cpp directory_reader.h
g++ mif.cpp -o mif
g++ $(CPPFLAGS) mif.cpp -o mif
sttag: sttag.cpp sttagger.h
g++ sttag.cpp -o sttag
sttag: sttag.cpp sttagger.h org.h util.h amino_acids.h
g++ $(CPPFLAGS) sttag.cpp -fopenmp -o sttag
tmerg: tmerg.cpp tmerg.h
g++ tmerg.cpp -o tmerg
g++ $(CPPFLAGS) tmerg.cpp -o tmerg
filt: filter.cpp filter.h
g++ filter.cpp -o filt
g++ $(CPPFLAGS) filter.cpp -o filt
fmerg: fmerg.cpp util.h
g++ fmerg.cpp -o fmerg
g++ $(CPPFLAGS) fmerg.cpp -o fmerg
cm: cm.cpp util.h
g++ cm.cpp -fopenmp -o cm
# g++ cm.cpp -o cm
g++ $(CPPFLAGS) cm.cpp -fopenmp -o cm
# g++ $(CPPFLAGS) cm.cpp -o cm
dbc: dbc.cpp
g++ dbc.cpp -fopenmp -o dbc
g++ $(CPPFLAGS) dbc.cpp -fopenmp -o dbc
mdist: mdist.cpp mdist.h
g++ mdist.cpp -fopenmp -o mdist
# g++ mdist.cpp -o mdist
g++ $(CPPFLAGS) mdist.cpp -fopenmp -o mdist
# g++ $(CPPFLAGS) mdist.cpp -o mdist
fh: fix_hgt.cpp mdist.h mctr.h
g++ fix_hgt.cpp -fopenmp -o fh
# g++ fix_hgt.cpp -o fh
g++ $(CPPFLAGS) fix_hgt.cpp -fopenmp -o fh
# g++ $(CPPFLAGS) fix_hgt.cpp -o fh
mtax: mtax.cpp
g++ mtax.cpp -o mtax
g++ $(CPPFLAGS) mtax.cpp -o mtax
clean:
\rm mif
......
......@@ -6,6 +6,7 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <map>
#include <sstream>
#include <math.h>
......@@ -41,7 +42,6 @@ struct krow
};
//int find_top_string(vector<file_aa_pair> v);
void sort_problem_tags(vector<string>& tags, int left, int right);
class mctr
{
......@@ -2831,7 +2831,10 @@ void mctr::setup_it1(ifstream& infostream)
get_problem_tags(files1,"0");
get_problem_tags(files2,"1");
sort_problem_tags(problem_tags,0,problem_tags.size()-1);
std::sort(problem_tags.begin(), problem_tags.end());
if(MCTR_DEBUG)
{
......@@ -2935,7 +2938,7 @@ void mctr::setup_it2(ifstream& infostream)
get_problem_tags(files2,"1",hgt_genes1);
//cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl;
//cout<<"size of problem_tags = "<<problem_tags.size()<<endl;
sort_problem_tags(problem_tags,0,problem_tags.size()-1);
std::sort(problem_tags.begin(), problem_tags.end());
//cout<<"problem_tags at 0 = "<<problem_tags.at(0)<<endl;
/* for(int i=0; i<10; i++)
{
......@@ -3523,38 +3526,7 @@ bool mctr::check_composition(string s)
return true;
}
void sort_problem_tags(vector<string>& tags, int left, int right)
{
int i=left;
int j=right;
string pivot=tags.at((left+right)/2);
while(i<=j)
{
while(pivot>tags.at(i))
{
i++;
}
while(pivot<tags.at(j))
{
j--;
}
if(i<=j)
{
string temp = tags.at(i);
tags.at(i) = tags.at(j);
tags.at(j) = temp;
i++;
j--;
}
};
if(left<j)
sort_problem_tags(tags,left,j);
if(i<right)
sort_problem_tags(tags,i,right);
}
int mctr::get_sos_cutoff()
{
......
......@@ -1825,6 +1825,8 @@ void mdist::read_in_orgs()
ifstream instream;
string orgfile = path+"/"+bfn+"/"+bfn+"_saved_orgs.txt";
u.open_ifile(instream,orgfile,"orgfile");
cout << orgfile << endl;
string eat;
int ord;
......@@ -1840,6 +1842,8 @@ void mdist::read_in_orgs()
//Set the ordinal.
new_org.set_ordinal(ord);
cout << "Organism: " << ord << endl;
//Set the path.
instream>>eat>>s;
new_org.set_path(s);
......@@ -1857,13 +1861,17 @@ void mdist::read_in_orgs()
instream>>eat>>i;
new_org.set_tag_length(i);
//Set bool conserved.
bool b;
instream>>eat>>b;
instream>>eat>>b;
new_org.set_conserved(b);
//Set the files.
instream>>eat>>i;
cout << "numfiles: " << i << endl;
for(int q=0; q<i; q++)
{
instream>>s;
......@@ -1871,7 +1879,10 @@ void mdist::read_in_orgs()
}
//Set the file sizes.
instream>>eat>>i;
instream>>eat>>i;
cout << "numfilesizes: " << i << endl;
for(int q=0; q<i; q++)
{
int fs;
......@@ -1879,8 +1890,13 @@ void mdist::read_in_orgs()
new_org.push_back_file_size(fs);
}
//Set the amino acid counts.
instream>>eat>>i;
cout << "numaacounts: " << i << endl;
for(int q=0; q<i; q++)
{
int aacount;
......@@ -1895,7 +1911,10 @@ void mdist::read_in_orgs()
double d;
instream>>eat>>d; //"discrepancy <double>"
new_org.setp_discrepancy(d);
cout << "discrepancy: " << d << endl;
instream>>eat>>i; //"conserved <int>"
new_org.setp_conserved(i);
......@@ -1906,13 +1925,22 @@ void mdist::read_in_orgs()
new_org.set_ref(b);
instream>>eat>>i; //"proteins.size() <integer which should be 0>"
cout << "proteins.size: " << i << endl;
instream>>eat>>i; //"total_num_proteins <integer which should be greater than 0>"
new_org.set_total_num_proteins(i);
cout << "total_num_proteins: " << i << endl;
new_org.set_total_num_proteins(i);
instream>>eat>>i; //"pv.size() <integer which should not be 0>"
new_org.setp_set_pv_size(i);
for(int q=0; q<i; q++)
cout << "ppv_size: " << i << endl;
for(int q=0; q<i; q++)
{
int keep;
instream>>eat>>keep;
......@@ -1922,10 +1950,15 @@ void mdist::read_in_orgs()
int fs; //filtering_steps
instream>>eat>>fs;
cout << "filtering_steps: " << fs << endl;
int pvv_size;
instream>>eat>>pvv_size; //"pvv.size() <integer which should match the size above>"
new_org.setp_set_pvv_size(pvv_size);
cout << "pvvsize: " << pvv_size << endl;
for(int q=0; q<pvv_size; q++)
{
instream>>eat>>i; //keeper
......@@ -1943,6 +1976,9 @@ void mdist::read_in_orgs()
}
instream>>eat>>i; //"sosv.size() <int like 401>"
cout << "sosv_size: " << pvv_size << endl;
new_org.setp_set_sosv_size(i);
vector<int> sosv_vals;
......
......@@ -7,6 +7,7 @@ Identifies redundancy in the input (i.e. exact same proteome present in both ref
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <cstdlib>
#include <getopt.h>
......@@ -127,8 +128,8 @@ int main(int argc, char* argv[])
logstream<<"Number of directories in FAA/ : directories.size() = "<<directories.size()<<endl;
logstream<<"Number of directories in FAA_ref/ : directoreis_ref.size() = "<<directories_ref.size()<<endl;
sort_directories(directories);
sort_directories(directories_ref);
std::sort(directories.begin(), directories.end());
std::sort(directories_ref.begin(), directories_ref.end());
if(MIF_DEBUG || verbose)
......@@ -234,19 +235,4 @@ int main(int argc, char* argv[])
repstream.close();
}
//Alphabetically sort the directories.
void sort_directories(vector<string>& directories)
{
for(int i=0; i<directories.size(); i++)
{
for(int j=i+1; j<directories.size(); j++)
{
if(directories.at(i)>directories.at(j))
{
string temp = directories.at(i);
directories.at(i) = directories.at(j);
directories.at(j) = temp;
}
}
}
}
......@@ -133,6 +133,11 @@ org::org()
array.push_back(0);
}
total_num_proteins=0;
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
conserved = false;
}
org::~org()
......
......@@ -4,9 +4,11 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <set>
#include <cstdlib>
#include <sstream>
#include <iomanip>
#include <cmath>
#include "protein.h"
......@@ -19,6 +21,7 @@ const double KEEPERS_CUTOFF=0.6;
const bool SUMOFSQUARES=true;
const double CONSERVATION_CUTOFF=1.3;
const bool PROTEOME_DEBUG=false;
const int MAXORDDIGITS=5;
using namespace std;
......@@ -71,7 +74,6 @@ class proteome_v2
void populate_proteins(string file);
void populate_kmers();
void quicksort(vector<string>& tv, int left, int right);
bool is_valid(string s);
amino_acids aas;
double get_ratio(int pindex);
......@@ -156,9 +158,15 @@ proteome_v2::proteome_v2()
// {
// sosv.push_back(0);
// }
discrepancy=0;
conserved=0;
filtering_steps=10.0; //Default.
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
chosen_column = 0;
ref = false;
}
int proteome_v2::get_tags_size()
......@@ -242,6 +250,7 @@ void proteome_v2::set_sosv_size(int i)
{
cout<<"proteome_v2::set_sosv_size(int)"<<endl;
cout<<"i = "<<i<<endl;
cout<<"Current sosv.size: " << sosv.size() << endl;
cout<<"Error condition"<<endl;
exit(1);
}
......@@ -353,30 +362,32 @@ void proteome_v2::set_pvv_gnums_val(int index,vector<int> vals)
void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
{
stringstream ss1;
ss1<<ordinal;
string so=ss1.str();
while(so.length()<MAXORDDIGITS)
{
so='0'+so;
}
stringstream sso;
sso << setfill('0') << setw(MAXORDDIGITS) << ordinal;
string so = sso.str();
for(int i=0; i<proteins.size(); i++)
{
stringstream ss;
ss<<i;
string is = ss.str();
for(int j=0; j<proteins.at(i).get_sequence().length(); j++)
{
string s = proteins.at(i).get_sequence().substr(j,tag_length);
if(is_valid(s))
{
while(s.length()<tag_length)
{
s+="^";
}
stringstream ss;
ss<<i;
string is = ss.str();
if(s.length() < tag_length){
s.append(tag_length - s.length(), '^');
}
s += " "+so+" "+is;
tags.push_back(s);
}
}
......@@ -386,6 +397,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
ostemp.keeper=false;
ostemp.deleter=false;
ostemp.internal_repeats=0;
pv.push_back(ostemp);
osv ostemp2;
......@@ -399,6 +411,7 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
ostemp2.gnums.push_back(0);
ostemp2.gis.push_back(0);
}
pvv.push_back(ostemp2);
if(do_scrambled)
......@@ -409,14 +422,14 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
string s = scrambled.substr(j,tag_length);
if(is_valid(s))
{
while(s.length()<tag_length)
{
s+="^";
}
if(s.length() < tag_length){
s.append(tag_length - s.length(), '^');
}
stringstream ss;
ss<<i;
string is = ss.str();
s += " "+so+" "+is;
tags_scr.push_back(s);
}
}
......@@ -443,11 +456,13 @@ void proteome_v2::generate_tags(int ordinal,bool do_scrambled)
}
}
quicksort(tags,0,tags.size()-1);
std::sort(tags.begin(), tags.end());
if(do_scrambled)
{
quicksort(tags_scr,0,tags_scr.size()-1);
std:sort(tags_scr.begin(), tags_scr.end());
}
else
{
......@@ -604,6 +619,11 @@ void proteome_v2::generate_scrambled_tags(int ordinal)
for(int i=0; i<proteins.size(); i++)
{
stringstream ss;
ss<<i;
string is = ss.str();
string q = proteins.at(i).get_sequence();
scramble_v2(q);
......@@ -612,13 +632,10 @@ void proteome_v2::generate_scrambled_tags(int ordinal)
string s = q.substr(j,tag_length);
if(is_valid(s))
{
while(s.length()<tag_length)
{
s+="^";
}
stringstream ss;
ss<<i;
string is = ss.str();
if(s.length() < tag_length){
s.append(tag_length - s.length() ,'^');
}
s += " "+so+" "+is;
//cout<<"s = "<<s<<endl;
tags.push_back(s);
......@@ -630,7 +647,7 @@ void proteome_v2::generate_scrambled_tags(int ordinal)
cout<<"Size of size_real_tags = "<<size_real_tags<<endl;
exit(1);
cout<<"Sorting tags: "<<endl;
quicksort(tags,0,tags.size()-1);
std::sort(tags.begin(), tags.end());
cout<<"Done sorting. First elements = "<<tags.at(0)<<endl<<tags.at(1)<<endl<<tags.at(2)<<endl;
}
......@@ -657,12 +674,12 @@ vector<string> proteome_v2::get_scr_tags()
/*
void proteome_v2::read_proteins(string infile)
{
populate_proteins(infile);
populate_proteins(infile);
for(int i=0; i<proteins.size(); i++)
{
proteins.at(i).set_id(i);
}
populate_kmers();
cout<<"size of kmers = "<<kmers.size()<<endl;
}
......@@ -681,12 +698,12 @@ void proteome_v2::add_protein(string info_line, string aaseq)
void proteome_v2::mark_gene(string s, int hit_genome_id)
{
cout<<"In mark_gene s="<<s<<" hit_genome_id="<<hit_genome_id<<endl;
int bottom=0;
int top=kmers.size();
int middle=top/2;
int former_middle=-1;
bool found=false;
while(!found && former_middle!=middle)
{
......@@ -704,7 +721,7 @@ void proteome_v2::add_protein(string info_line, string aaseq)
}
else if(kmers.at(middle).s==s)
{
found=true;
found=true;
proteins.at(kmers.at(middle).source_protein_index).add_hit(hit_genome_id);
cout<<"protein info: "<<proteins.at(kmers.at(middle).source_protein_index).get_genome_hits_size()<<endl;
cout<<"The code is making it in here. kmer = "<<s<<endl;
......@@ -1564,38 +1581,7 @@ void proteome_v2::write_out_gene(int index)
}
}
void proteome_v2::quicksort(vector<string>& tv,int left, int right)
{
int i=left;
int j=right;
string pivot=tv.at((left+right)/2);
while(i<=j)
{
while(pivot>tv.at(i))
{
i++;
}
while(pivot<tv.at(j))
{
j--;
}
if(i<=j)
{
string temp = tv.at(i);
tv.at(i) = tv.at(j);
tv.at(j) = temp;
i++;
j--;
}
};
if(left<j)
quicksort(tv,left,j);
if(i<right)
quicksort(tv,i,right);
}
/*string proteome_v2::scramble(string s)
{
......@@ -1627,13 +1613,10 @@ void proteome_v2::quicksort(vector<string>& tv,int left, int right)
bool proteome_v2::is_valid(string s)
{
for(int i=0; i<s.length(); i++)
{
int index=aas.get_aa(s[i]);
if(index<0 || index>19)
{
return false;
}
std::size_t found = s.find_first_not_of(aas.valid_aas);
if(found != std::string::npos ){
return false;
}
if(SUMOFSQUARES)
......@@ -1647,20 +1630,12 @@ bool proteome_v2::is_valid(string s)
bool proteome_v2::check_composition(string s)
{
vector<int> array;
//int array[20];
for(int i=0; i<20; i++)
{
//array[i]=0;
array.push_back(0);
}
for(int i=0; i<s.length(); i++)
for(int i=0; i<20; i++)
{
int index=aas.get_aa(s[i]);
if(index>=0 && index<20)
{
array[index]++;
}
char aa = aas.get_aa(i);
int aa_count = std::count(s.begin(),s.end(),aa);
array.push_back( aa_count );
}
int result=0;
......
This diff is collapsed.
......@@ -9,6 +9,7 @@ into a final sorted list.
#include <iostream>
#include <vector>
#include <algorithm>
#include <sstream>
#include <sys/stat.h>
#include <sys/types.h>
......@@ -47,7 +48,6 @@ class tmerg
void read_in_tags_um(vector<string>& tags, char current_amino_acid, int& num_refs);
void write_out_tags(vector<string>& tags,int ordinal, int num_refs);
void write_out_tags_um(vector<string>& tags,int ordinal, int num_refs);
void quicksort(vector<string>& tags, int left, int right);
bool compare_tags(string s1, string s2);
public:
......@@ -359,7 +359,7 @@ void tmerg::write_out_tags(vector<string>& tags,int ordinal, int num_refs)
oFile_ref = fopen( (path + "/"+bfn+"/REF_MERGED_TAGS"+dir+"/"+out_file_name).c_str(), "w");
}
quicksort(tags,0,tags.size()-1); //they should already be sorted; should be using a merge routine earlier.
std::sort(tags.begin(), tags.end());
if(TMERG_DEBUG)
{
cout<<"tmerg::write_out_tags(vector<string>&, int, int) : Successfully completed quicksort..."<<endl;
......@@ -464,38 +464,6 @@ void tmerg::write_out_tags_um(vector<string>& tags,int ordinal, int num_refs)
}
}
void tmerg::quicksort(vector<string>& tags, int left, int right)
{
int i=left;
int j=right;
string pivot=tags.at((left+right)/2);
while(i<=j)
{
while(