Commit 85b8053f authored by Raquel Bromberg
Browse files

Code cleanup, mostly commented code deleted

parent 1cadca46
......@@ -234,7 +234,7 @@ int main(int argc, char* argv[])
exit(1);
}
ofstream outstream3;
/* ofstream outstream3;
string outfile3 = path+"/"+bfn+"/bin_corrections_unlikelies.txt";
outstream3.open(outfile3.c_str(), ios::out | ios::app);
if(outstream3.fail())
......@@ -242,6 +242,7 @@ int main(int argc, char* argv[])
cout<<"Failed to open outstream3 to "<<outfile3<<endl;
exit(1);
}
*/
outstream<<num_bins<<endl<<endl;
......@@ -330,8 +331,8 @@ int main(int argc, char* argv[])
}
outstream<<endl;
outstream3<<i<<" to "<<j<<endl;
outstream3<<uscore<<endl<<most_unlikely<<endl<<endl;
// outstream3<<i<<" to "<<j<<endl;
// outstream3<<uscore<<endl<<most_unlikely<<endl<<endl;
}
}
}
......
This diff is collapsed.
......@@ -26,7 +26,7 @@ class org
int total_num_aa; //total number of amino acids in the proteome
int total_num_proteins; //total number of proteins in the proteome
int tag_length;
bool conserved; //what the hell is this?
bool conserved;
int filtering_steps;
vector<string> files; //e.g. NC_021726.faa
......@@ -93,14 +93,12 @@ class org
void fill_fields(string dir_par);
void generate_tags(bool do_scrambled, bool partial_run);
// void generate_scrambled_tags();
void clear_tags();
void clear_scr_tags();
void write_out_aa_array(ofstream& outstream);
int mark_keepers(vector<string> keepers);
void write_out_gene(int index);
void clear_proteins();
// void increment_ggs(int gene_id,int gnums_size,int vector_size);
void increment_ggs(int gene_id,int gnums_size,int vector_size,double d);
void write_out_new_proteome(int index);
void add_file(string s);
......@@ -115,14 +113,6 @@ class org
void write_filtered_set(ofstream& outstream, int filt_option);
//GETTERS AND SETTERS
//void add_file(string f);
//void set_uid(string u);
//vector<string> get_files();
//string get_uid();
//string get_path();
//string get_bfn();
int get_ordinal();
};
......@@ -161,11 +151,6 @@ void org::generate_tags(bool do_scrambled, bool partial_run)
p.generate_tags(ordinal,do_scrambled,partial_run);
}
/*void org::generate_scrambled_tags()
{
p.generate_scrambled_tags(ordinal);
}
*/
void org::set_ordinal(int i)
{
ordinal=i;
......@@ -173,7 +158,6 @@ void org::set_ordinal(int i)
// Thin forwarding wrapper: passes both arguments unchanged to p.set_keeper().
// No validation or other work is performed in this function.
// NOTE(review): p is presumably a data member of org and the meaning of
// index / i is defined by p.set_keeper -- neither declaration is visible
// here, so confirm against the class definition.
void org::setp_set_keeper(int index,int i)
{
// Disabled debug trace that would print both arguments:
// cout<<"In setp_set_keeper(int,int): "<<index<<" "<<i<<endl;
p.set_keeper(index,i);
}
......@@ -202,12 +186,6 @@ void org::setp_size_real_tags(int i)
p.set_size_real_tags(i);
}
/*void org::setp_set_pv_val(int index,int i1, int i2, int i3, int i4, int i5)
{
p.set_pv_val(index,i1,i2,i3,i4,i5);
}
*/
void org::setp_set_pvv_gnums_val(int index, vector<int> vals)
{
p.set_pvv_gnums_val(index,vals);
......@@ -232,11 +210,6 @@ void org::setp_chosen_column(int i)
p.set_chosen_column(i);
}
/*void org::setp_set_pv_size(int i)
{
p.set_pv_size(i);
}*/
void org::setp_set_pvv_size(int i)
{
p.set_pvv_size(i);
......@@ -339,11 +312,7 @@ int org::read_faa_file(int i,int& total_aa)
instream>>eater;
}
//we now have both the info_line and the protein sequence.
//cout<<"Adding a protein"<<endl;
//cout<<info_line<<endl;
p.add_protein(info_line,aaseq);
//cout<<"info_line = "<<info_line<<endl;
//cout<<"aaseq = "<<aaseq<<endl;
add_aa_counts(aaseq);
result+=aaseq.length();
aaseq="";
......@@ -376,19 +345,6 @@ int org::get_tags_size()
return p.get_tags_size();
}
/*
//GETTERS AND SETTERS
void org::add_file(string f)
{
files.push_back(f);
}
vector<string> org::get_files()
{
return files;
}
*/
string org::get_file(int i)
{
if(i>=0 && i<files.size())
......@@ -421,18 +377,7 @@ string org::get_directory()
{
return directory;
}
/*
void org::set_uid(string u)
{
uid = u;
}
string org::get_uid()
{
return uid;
}
*/
void org::set_path(string p)
{
path = p;
......@@ -448,15 +393,6 @@ void org::set_dir(string s)
dir=s;
}
/*
string org::get_path()
{
return path;
}
*/
void org::add_aa_counts(string s)
{
for(int i=0; i<s.length(); i++)
......@@ -489,13 +425,6 @@ void org::write_out_aa_array(ofstream& outstream)
}
}
/*
string org::get_bfn()
{
return bfn;
}
*/
string org::get_dir()
{
return dir;
......@@ -520,7 +449,7 @@ void org::mark_gene_internal_repeats(int index)
// Forwards index unchanged to p.print_protein_info(); does nothing else.
// NOTE(review): p is presumably a data member of org, and what
// print_protein_info emits is not visible here -- confirm in its definition.
void org::write_out_gene(int index)
{
p.print_protein_info(index);
// NOTE(review): the next line is a commented-out copy of the call above.
// This text looks like a rendered diff, so one of the two lines is probably
// the pre-change version; confirm whether the call is meant to stay enabled.
// p.print_protein_info(index);
}
void org::clear_proteins()
......@@ -528,11 +457,6 @@ void org::clear_proteins()
p.clear_proteins();
}
/*void org::increment_ggs(int gene_id,int gnums_size,int vector_size)
{
p.increment_ggs(gene_id,gnums_size,vector_size);
}*/
void org::increment_ggs(int gene_id,int gnums_size,int vector_size,double d)
{
p.increment_ggs(gene_id,gnums_size,vector_size,d);
......@@ -546,24 +470,14 @@ void org::write_out_new_proteome(int index)
cout<<"dir = "<<dir<<endl;
cout<<"directory = "<<directory<<endl;
/*
string outcommand = "rm -r "+path+"/FAA_new/"+directory;
int system_return = system(outcommand.c_str());
outcommand = "mkdir "+path+"/FAA_new/"+directory;
system_return = system(outcommand.c_str());
*/
stringstream ss;
ss<<index;
string col;
ss>>col;
//string s = path+bfn+"/FAA_new_col"+col+"/"+directory;
//cout<<"s = "<<s<<endl;
vector<pair<int,int> > mscv;
populate_mscv(mscv);
p.write_out_new_proteome_v2(path,directory,files,ordinal,mscv,index);
//instream.open( (path+"/"+dir+"/"+directory+"/"+files.at(i)).c_str() );
}
void org::add_file(string s)
......@@ -636,7 +550,6 @@ void org::populate_mscv(vector<pair<int,int> >& mscv)
instream.open(infile.c_str());
if(instream.fail())
{
//Just leave it alone. File doesn't exist. Leave mscv empty.
}
else
{
......@@ -654,7 +567,6 @@ void org::populate_mscv(vector<pair<int,int> >& mscv)
foo.first=gene_id;
foo.second=count;
mscv.push_back(foo);
//mscv[gene_id]=count;
instream>>gnum>>gene_id>>count;
}
}
......
......@@ -35,8 +35,8 @@ class plot_v2
double wi; //weight increment.
exp_grid grid; //to calculate the double exponential fit
vector<float> x_fit_quad; //
vector<float> y_fit_quad; //
vector<float> x_fit_quad;
vector<float> y_fit_quad;
vector<float> x_fit_exp;
vector<float> y_fit_exp;
vector<float> w_fit_exp;
......@@ -109,8 +109,6 @@ class plot_v2
void add_value_scr(int index, int count);
void set_ids(int id1_par, int id2_par);
void print_plots();
//void set_statistics(string mean_par, string rms_par, vector<int> counts_par, int end_index_par, double avg_for_bc_par, double entropy_par, double avg_for_bc_exp_par);//, string min_kmax_bitscore_par);
//void set_statistics(string mean_par, string rms_par, string min_kmax_bitscore_par, double entropy_par);
void calculate_slope(double a_avg, double a_uncertainty, double wi_par, string sout);
void write_out_sheared_plots();
void write_out_hgt_sheared_plot();
......@@ -185,16 +183,8 @@ plot_v2::plot_v2(string bfn_par, string path_par, int bins_par)
void plot_v2::calculate_slope_from_grid(double min_slope_par, double max_slope_par)
{
// cout<<"In plot_v2::calculate_slope_from_grid. id1="<<id1<<" id2="<<id2<<endl;
// cout<<"min_slope_par="<<min_slope_par<<endl;
// cout<<"max_slope_par="<<max_slope_par<<endl;
grid.get_exps(min_slope_par,max_slope_par,x_fit_exp,y_fit_exp,w_fit_exp,exp_slope_b1,exp_slope_b2,match_score_cutoff);
// cout<<"exp_slope_b1="<<exp_slope_b1<<endl;
// cout<<"exp_slope_b2="<<exp_slope_b2<<endl;
// cout<<"match_score_cutoff="<<match_score_cutoff<<endl;
#pragma omp critical
{
ofstream outstream;
......@@ -268,11 +258,6 @@ void plot_v2::shear_data()
{
shear_left();
shear_right();
// do_bin_correction();
//August 21, 2014 moved.
//setxyw(vector<float>& x, vector<float>& y, vector<float>& w)
//set x and y here. the only thing that needs to be calculated each time is the w because the weights change.
}
/***************************************************************************
......@@ -292,8 +277,6 @@ void plot_v2::shear_left()
for(int i=0; i<bins; i++)
{
//cout<<i<<" "<<datascr.at(i)<<" "<<data_sheared.at(i)<<" ";
if(datascr.at(i)>0.25*data.at(i)) //corresponds to nit-scores with a high-level of random counts even over the randomized data, i.e. the uninformative range.
{
data_sheared.at(i)=0;
......@@ -302,7 +285,6 @@ void plot_v2::shear_left()
{
data_sheared.at(i) = data.at(i)-datascr.at(i);
}
//cout<<data_sheared.at(i)<<endl;
}
//2) Find the value for the maximum number of counts, and the index for it in data_sheared. Set all counts with nit-scores lower than this max to 0.
......@@ -331,11 +313,8 @@ void plot_v2::shear_left()
data_sheared.at(i)=0;
}
}
// left_bound=max_index;
left_bound=max_index+1; //being cautious! june 12
//cout<<"left_bound = "<<left_bound<<endl;
left_bound=max_index+1;
}
void plot_v2::shear_right()
......@@ -395,16 +374,7 @@ void plot_v2::shear_right()
}
}
}
// cout<<"end_index = "<<end_index<<endl;
// cout<<"right_bound = "<<right_bound<<endl;
}
/*void plot_v2::do_bin_correction()
{
}
*/
void plot_v2::print_plots()
{
......@@ -415,69 +385,6 @@ void plot_v2::print_plots()
}
}
/*
void plot_v2::set_statistics(string mean_par, string rms_par, vector<int> counts_par, int end_index_par, double avg_for_bc_par, double entropy_par, double avg_for_bc_exp_par)
{
bin_correct=true;
entropy = entropy_par;
if(mean_par=="-nan" || rms_par=="-nan")
{
mean = 0;
rms = 0;
mean_rms=false;
end_index = end_index_par;
avg_for_bc = avg_for_bc_par;
avg_for_bc_exp = avg_for_bc_exp_par;
cout<<"avg_for_bc = "<<avg_for_bc<<endl;
counts=counts_par;
}
else
{
mean = atof(mean_par.c_str());
rms = atof(rms_par.c_str());
mean_rms=true;
end_index = end_index_par;
avg_for_bc = avg_for_bc_par;
avg_for_bc_exp = avg_for_bc_exp_par;
cout<<"avg_for_bc = "<<avg_for_bc<<endl;
counts=counts_par;
}
}
void plot_v2::set_statistics(string mean_par, string rms_par, string min_kmax_bitscore_par,double entropy_par)
{
bin_correct=false;
entropy = entropy_par;
if(mean_par=="-nan" || rms_par=="-nan")
{
mean = 0;
rms = 0;
min_kmax_bitscore=0;
mean_rms=false;
end_index = -1;
avg_for_bc = -1;
}
else
{
mean = atof(mean_par.c_str());
rms = atof(rms_par.c_str());
min_kmax_bitscore=atof(min_kmax_bitscore_par.c_str());
mean_rms=true;
end_index = -1;
avg_for_bc = -1;
}
/* cout<<"In set_statistics"<<endl;
cout<<"mean="<<mean<<endl;
cout<<"rms="<<rms<<endl;
cout<<"min_kmax_bitscore="<<min_kmax_bitscore<<endl;
*/ /*
}
*/
void plot_v2::calculate_slope(double a_avg, double a_uncertainty, double wi_par,string sout)
{
if(a_uncertainty==0)
......@@ -486,7 +393,7 @@ void plot_v2::calculate_slope(double a_avg, double a_uncertainty, double wi_par,
}
wi = wi_par;
//This shouldn't exit. Should just simply be flagged and pruned.
if((right_bound - left_bound) < 3)
{
cout<<"Too few points to calculate a slope: right_bound-left_bound = "<<right_bound-left_bound<<endl;
......@@ -516,9 +423,6 @@ void plot_v2::calculate_slope(double a_avg, double a_uncertainty, double wi_par,
cout<<"rmsd_quadratic = "<<rmsd_quadratic<<endl;
cout<<"wt_rmsd_quadratic = "<<wt_rmsd_quadratic<<endl;
cout<<"a_uncertainty = "<<a_uncertainty<<endl;
int staller;
cout<<"staller: ";
cin>>staller;
}
vector<float> x;
......@@ -571,8 +475,6 @@ void plot_v2::calculate_slope(double a_avg, double a_uncertainty, double wi_par,
y_intercept_linear = a_linear*0 + b_linear;
outstream<<id1<<" "<<id2<<" "<<a_quadratic<<" "<<b_quadratic<<" "<<c_quadratic<<" "<<(int)pow(exp(1),c_quadratic)<< " "<<a_linear<<" "<<b_linear<<" "<<rmsd_quadratic<<" "<<wt_rmsd_quadratic<<" "<<r_squared<<" "<<rmsd_linear<<" "<<r_squared_linear<<" "<<a_avg<<" "<<a_uncertainty<<" "<<end_index<<" "<<avg_for_bc<<endl;
//cout<<id1<<" "<<id2<<" "<<a_quadratic<<" "<<b_quadratic<<" "<<c_quadratic<<" "<<(int)pow(exp(1),c_quadratic)<< " "<<a_linear<<" "<<b_linear<<" "<<rmsd_quadratic<<" "<<wt_rmsd_quadratic<<" "<<r_squared<<" "<<rmsd_linear<<" "<<r_squared_linear<<" "<<a_avg<<" "<<a_uncertainty<<" "<<end_index<<" "<<avg_for_bc<<endl;
}
}
......@@ -603,37 +505,14 @@ void plot_v2::write_out_sheared_plots()
outstream<<id1<<" "<<id2<<endl;
int x_index=0;
// if(left_bound==x_fit_quad.at(0) && right_bound==x_fit_quad.at(x_fit_quad.size()-1))
// {
for(int i=left_bound; i<=right_bound; i++)
{
outstream<<i<<" "<<data_sheared.at(i)<<" ";//<<log(data_sheared.at(i))<<" ";
/*if(bin_correct)
{
outstream<<y_fit_quad.at(x_index);
}*/
// x_index++;
outstream<<i<<" "<<data_sheared.at(i)<<" ";
outstream<<endl;
}
outstream<<endl;
//}
/*else
{
cout<<"mismatch in the dimensions of data_sheared and y"<<endl;
cout<<"left_bound = "<<left_bound<<" right_bound = "<<right_bound<<endl;
cout<<id1<<" "<<id2<<endl;
outstream<<"Cannot write out sheared plot for "<<id1<<" "<<id2<<endl;
}
*/
/* for(int i=0; i<x_fit_quad.size(); i++)
{
outstream<<x_fit_quad.at(i)<<" "<<y_fit_quad.at(i)<<endl;
}
*/ //What is this for loop even for??
outstream.close();
}
......@@ -651,37 +530,14 @@ void plot_v2::write_out_hgt_sheared_plot()
outstream<<id1<<" "<<id2<<endl;
int x_index=0;
// if(left_bound==x_fit_quad.at(0) && right_bound==x_fit_quad.at(x_fit_quad.size()-1))
// {
for(int i=left_bound; i<=right_bound; i++)
{
outstream<<i<<" "<<data_sheared.at(i)<<" ";//<<log(data_sheared.at(i))<<" ";
/*if(bin_correct)
{
outstream<<y_fit_quad.at(x_index);
}*/
// x_index++;
outstream<<i<<" "<<data_sheared.at(i)<<" ";
outstream<<endl;
}
outstream<<endl;
//}
/*else
{
cout<<"mismatch in the dimensions of data_sheared and y"<<endl;
cout<<"left_bound = "<<left_bound<<" right_bound = "<<right_bound<<endl;
cout<<id1<<" "<<id2<<endl;
outstream<<"Cannot write out sheared plot for "<<id1<<" "<<id2<<endl;
}
*/
/* for(int i=0; i<x_fit_quad.size(); i++)
{
outstream<<x_fit_quad.at(i)<<" "<<y_fit_quad.at(i)<<endl;
}
*/ //What is this for loop even for??
outstream.close();
}
......@@ -764,10 +620,6 @@ double plot_v2::get_genomic_distance()
double plot_v2::get_genomic_distance_v2()
{
//cout<<"a = "<<a<<endl;
//cout<<"b = "<<b<<endl;
//cout<<"a_linear = "<<a_linear<<endl;
if(a_quadratic>=0)
{
return b_quadratic*(-1);
......@@ -794,7 +646,6 @@ void plot_v2::setxyw(vector<float>& x, vector<float>& y, vector<float>& w)
y_fit_exp.clear();
x_fit_exp.clear();
w_fit_exp.clear();
//vector<float> w2;
//most of this only has to be done once. Other than the wi part.
for(int i=left_bound; i<=right_bound; i++)
......@@ -804,68 +655,21 @@ void plot_v2::setxyw(vector<float>& x, vector<float>& y, vector<float>& w)
x.push_back( (float)i);
x_fit_quad.push_back((float)i);
x_fit_exp.push_back((float)i);
/*
if( left_bound<20 && i>20)
{
x_fit_exp.push_back((float)i);
}
else if(left_bound>=20 && left_bound<=25 && i>=25)
{
x_fit_exp.push_back((float)i);
}
else if(left_bound>25 && left_bound<=30 && i>=30)
{
x_fit_exp.push_back((float)i);
}
else if(left_bound>30 && left_bound<=35 && i>=35)
{
x_fit_exp.push_back((float)i);
}
*/
y.push_back(log((float)data_sheared.at(i)));
y_fit_quad.push_back(log((float)data_sheared.at(i)));
y_fit_exp.push_back(data_sheared.at(i));//next 2 lines testing raw dat through exp fit
w_fit_exp.push_back(1.0/(data.at(i)+1+0.01*data.at(i)*data.at(i))); //exponential fit weights.
/*
if( left_bound<20 && i>20)
{
y_fit_exp.push_back(data.at(i));
w_fit_exp.push_back(1.0/(data.at(i)+1+0.01*data.at(i)*data.at(i))); //exponential fit weights.
}
else if(left_bound>=20 && left_bound<=25 && i>=25)
{
y_fit_exp.push_back(data.at(i));
w_fit_exp.push_back(1.0/(data.at(i)+1+0.01*data.at(i)*data.at(i))); //exponential fit weights.
}
else if(left_bound>25 && left_bound<=30 && i>=30)
{
y_fit_exp.push_back(data.at(i));
w_fit_exp.push_back(1.0/(data.at(i)+1+0.01*data.at(i)*data.at(i))); //exponential fit weights.
}
else if(left_bound>30 && left_bound<=35 && i>=35)
{
y_fit_exp.push_back(data.at(i));
w_fit_exp.push_back(1.0/(data.at(i)+1+0.01*data.at(i)*data.at(i))); //exponential fit weights.
}
*/
w.push_back( ((float)data.at(i)*(float)data.at(i))/ ( (float)data.at(i)*(float)data.at(i)+wi*wi) );
//w2.push_back( ( (float)data_sheared.at(i) - (float)datascr.at(i))/(float)data_sheared.at(i));
}
else
{
// cout<<"Value of y=0 being thrown away."<<endl;
}
}
//New weights as of August 12, 2014
//Not clear if we want to keep them.
for(int i=0; i<w.size(); i++)
{
//w.at(i)*=w2.at(i);
w_fit_exp_rmsd.push_back(w.at(i));
}
......@@ -880,19 +684,10 @@ void plot_v2::setxyw(vector<float>& x, vector<float>& y, vector<float>& w)
for(int i=0; i<y.size(); i++)
{
// cout<<"i = "<<i<<endl;
// cout<<"y = "<<y.at(i)<<endl;
if(x.at(i)>=0 && x.at(i)<counts.size())
{
y.at(i)-=(log(counts.at(x.at(i)))-avg_for_bc);
y_fit_quad.at(i)=y.at(i);
//double q = counts.at(x.at(i))-avg_for_bc;
//cout<<"q in if(bin_correct) block = "<<q<<endl;
//cout<<"y_fit_exp at i="<<i<<" = "<<y_fit_exp.at(i)<<endl;
//y_fit_exp.at(i)=y_fit_exp.at(i)-q;
//y_fit_exp.at(i)-=(counts.at(x.at(i))-avg_for_bc_exp);
//cout<<"after bin subtraction y_fit_exp.at(i) = "<<i<<" "<<y_fit_exp.at(i)<<endl;
//cout<<i<<" "<<x.at(i)<<" "<<y.at(i)<<" "<<y_fit_exp.at(i)<<" "<<counts.at(i)<<endl;
}
else
{
......@@ -903,17 +698,7 @@ void plot_v2::setxyw(vector<float>& x, vector<float>& y, vector<float>& w)
}
else
{
//just do nothing.
}
/* cout<<"x and y after bin-correction"<<endl;
for(int i=0; i<x.size(); i++)
{
cout<<i<<" "<<x.at(i)<<" "<<y.at(i)<<endl;
}
*/
// cout<<"At the bottom of setxyw"<<endl;
}
void plot_v2::calc_quadratic_rsqr(vector<float> y, vector<float> x, float a_par, float b_par, float c_par)
......@@ -990,11 +775,6 @@ void plot_v2::calc_quadratic_rmsd(vector<float> y, vector<float> x, vector<float
double val = a_par*pow(x.at(i),2)+b_par*x.at(i)+c_par;
val = pow(val-y.at(i),2);
rmsd_quadratic = rmsd_quadratic+val;
/* if(data_sheared.at(i)>=50)
{
wt_rmsd_quadratic += w.at(i)*val;
divisor+=w.at(i);
} */
wt_rmsd_quadratic += w.at(i)*val;
divisor+=w.at(i);
}
......@@ -1091,11 +871,8 @@ bool plot_v2::is_it_stable()
void plot_v2::hgt_reset()
{
//data.clear();
x_fit_quad.clear();
y_fit_quad.clear();
//vector<float> x; //eventually these should jsut be private membe