Commit 85b8053f authored by Raquel Bromberg's avatar Raquel Bromberg

Code cleanup, mostly commented code deleted

parent 1cadca46
......@@ -234,7 +234,7 @@ int main(int argc, char* argv[])
exit(1);
}
ofstream outstream3;
/* ofstream outstream3;
string outfile3 = path+"/"+bfn+"/bin_corrections_unlikelies.txt";
outstream3.open(outfile3.c_str(), ios::out | ios::app);
if(outstream3.fail())
......@@ -242,6 +242,7 @@ int main(int argc, char* argv[])
cout<<"Failed to open outstream3 to "<<outfile3<<endl;
exit(1);
}
*/
outstream<<num_bins<<endl<<endl;
......@@ -330,8 +331,8 @@ int main(int argc, char* argv[])
}
outstream<<endl;
outstream3<<i<<" to "<<j<<endl;
outstream3<<uscore<<endl<<most_unlikely<<endl<<endl;
// outstream3<<i<<" to "<<j<<endl;
// outstream3<<uscore<<endl<<most_unlikely<<endl<<endl;
}
}
}
......
......@@ -8,8 +8,6 @@
#include "histogram.h"
#include "org.h"
//const double RMSD_RATIO_CUTOFF=0.095; //[0,RMSD_RATIO_CUTOFF] = The range for the ratio rmsd_exp/rmsd_quadratic that indicates a problem (HGT) between a pair.
//const int OKAY_CUTOFF=5.0; //TOTALLY ARBITRARY AND MUST BE REMOVED/REIMPLEMENTED.
const bool MDIST_DEBUG=true;
const double P = 15.0; //the val of x at which we take the derivative of the polynomial function.
const int WEIGHT=100; //Weights the counts with the lower nit-scores more than those with the higher nit-scores which are noisier.
......@@ -110,15 +108,11 @@ Public Variables/Functions.
void do_main(); //Calls all the main parts of the run.
void read_in_data(); //id=0 real data, id=1 scrambled data.
void shear_data(); //Calls the plot_v2 functions that select the data cutoffs.
//void print_plots();
void calculate_all(); //Calls the main fit functions from plot_v2
void write_out_sheared_plots();
void write_out_distance_matrix(string details);
double calc_ncertainty_of_a();
void write_out_exp_dm(bool hgt_fixing);
// void analyze_exps(); //
// bool ae_v1(int i,double& val);
// void write_out_ref_set(int size);
void write_out_problem_set();
int wops_v1(int j);
void edit_distances();
......@@ -354,16 +348,6 @@ void mdist::shear_data()
}
}
/*void mdist::print_plots()
{
for(int i=0; i<plots.size(); i++)
{
plots.at(i).print_plots();
}
}
*/
void mdist::calculate_all()
{
for(int i=0; i<plots.size(); i++)
......@@ -391,32 +375,17 @@ void mdist::calculate_all()
plots.at(i).calculate_slope(0, 1, WEIGHT,"raw");
}
// write_out_distance_matrix("raw");
// cout<<"finished writing out distance matrices 1"<<endl;
//cout<<"here 1.1"<<endl;
// write_out_sorted_as("raw");
//cout<<"here 1.2"<<endl;
calc_uncertainty_of_a();
//cout<<"here 1.3"<<endl;
a_uncertainties.push_back(make_pair(WEIGHT,a_uncert_v1)); //for edit_distances.
//cout<<"here 1.4"<<endl;
ofstream outstream2;
//cout<<"here 1.5"<<endl;
prep_plot_statistics(outstream2,"plot_statistics_restrained_"+dts);
//cout<<"here 1.6"<<endl;
/* cout<<"Stalling 1 : ";
int staller1;
cin>>staller1;
*/
count=0;
cout<<"Calculating restrained slopes..."<<endl;
#pragma omp parallel for
for(int i=0; i<plots.size(); i++)
{
//cout<<"Calculating slope for "<<i<<"/"<<plots.size()<<endl;
if(count%1000==0)
{
#pragma omp critical
......@@ -428,21 +397,15 @@ void mdist::calculate_all()
plots.at(i).calculate_slope(a_avg_v1,a_uncert_v1,WEIGHT,"restrained");
}
cout<<"Attempting to write out distance matrix: restrained"<<endl;
/* cout<<"Staller:";
int staller;
cin>>staller;
*/
cout<<"Writing out distance matrix: restrained"<<endl;
write_out_distance_matrix("restrained");
// write_out_sorted_as("restrained");
cout<<"Calculating double exponential fits..."<<endl;
count=0;
#pragma omp parallel for
for(int i=0; i<plots.size(); i++)
{
//cout<<i<<"/"<<plots.size()<<endl;
if(count%1000==0)
{
#pragma omp critical
......@@ -456,7 +419,6 @@ void mdist::calculate_all()
cout<<"Completed double exponential fit calculations."<<endl;
write_out_exp_dm(false);
write_out_fits_vector(false);
// write_out_msc(false);
}
void mdist::write_out_fits_vector(bool hgt_fixing)
......@@ -488,38 +450,9 @@ void mdist::write_out_fits_vector(bool hgt_fixing)
outstream.close();
}
/*void mdist::write_out_msc(bool hgt_fixing)
{
ofstream outstream;
string outfile;
if(!hgt_fixing)
{
outfile = path+"/"+bfn+"/match_score_cutoffs.txt";
}
else
{
outfile = path+"/"+bfn+"/match_score_cutoffs_after_hgt_fix.txt";
}
outstream.open( outfile.c_str());
if(outstream.fail())
{
cout<<"Failed to open outfile = "<<outfile<<" for msc."<<endl;
exit(1);
}
outstream<<"ord1 ord2 cutoff"<<endl;
for(int i=0; i<fits_vector.size(); i++)
{
outstream<<fits_vector.at(i).ord1<<" "<<fits_vector.at(i).ord2<<" "<<fits_vector.at(i).msc<<" "<<fits_vector.at(i).ef_rmsdwt/fits_vector.at(i).qf_rmsdwt<<endl;
}
outstream.close();
}
*/
void mdist::write_out_sheared_plots()
{
cout<<"Attempting to write out sheared_plots"<<endl;
cout<<"Writing out sheared_plots"<<endl;
for(int i=0; i<plots.size(); i++)
{
......@@ -737,45 +670,17 @@ void mdist::do_slope_diff_plot(string details)
void mdist::write_out_exp_dm(bool hgt_fixing)
{
/* ofstream outstream;
string outfile;
if(!hgt_fixing)
{
outfile = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_exponential.txt";
}
else
{
outfile = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_exponential_after_hgt_fix.txt";
}
outstream.open(outfile.c_str());
if(outstream.fail())
{
cout<<"Failed to open distance matrix file to "<<outfile<<endl;
exit(1);
}
outstream<<num_orgs<<endl;*/
for(int i=0; i<organisms.size(); i++)
{
/* outstream<<organisms.at(i).get_directory();
if(i<refsetsize)
{
outstream<<"_ref";
}
outstream<<" \t";
*/
for(int j=0; j<organisms.size(); j++)
{
int index = get_index(i,j);
if(index==-1)
{
//outstream<<0<<" ";
}
else
{
double d1 = plots.at(index).get_exp_distance();
// outstream<<d1<<" ";
fits_vector.at(index).b1 = plots.at(index).get_exp_b1();
fits_vector.at(index).b2 = plots.at(index).get_exp_b2();
fits_vector.at(index).F = plots.at(index).get_exp_F();
......@@ -785,9 +690,7 @@ void mdist::write_out_exp_dm(bool hgt_fixing)
fits_vector.at(index).stable=true;
}
}
// outstream<<endl;
}
// outstream.close();
}
void mdist::write_out_distance_matrix(string details)
......@@ -803,34 +706,14 @@ void mdist::write_out_distance_matrix(string details)
ofstream outstream, entoutstream, linstream, elinstream, enlstream, polystream, epolystream, enlpolystream, enlpolystreamr;
details += "_wt"+u.int_to_string((int)WEIGHT);
//string dmfile = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_linpolycomb_"+details+".txt";
//string edmfile = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_linpolycomb_"+details+"_entropy.txt";
//string dmfile2 = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_linear_"+details+".txt";
//string edmfile2 = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_linear_"+details+"_entropy.txt";
// string dmfile3 = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_polynomial_"+details+".txt";
// string edmfile3 = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_polynomial_"+details+"_entropy.txt";
string enlfile = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_polynomial_"+details+"_entropy_nonlinearity.txt";
string enlfiler = path+"/"+bfn+"/DISTANCE_MATRICES"+addon+"/dm_polynomial_"+details+"_entropy_nonlinearity_taxonomy.txt";
//open_file(outstream,dmfile);
//open_file(entoutstream,edmfile);
//open_file(linstream,dmfile2);
//open_file(elinstream,edmfile2);
// open_file(polystream,dmfile3);
// cout<<"dmfile3 = "<<dmfile3<<endl;
// open_file(epolystream,edmfile3);
open_file(enlpolystream,enlfile);
open_file(enlpolystreamr,enlfiler);
logstream<<"Successfully opened streams for 2 distance matrices:"<<endl;
//outstream<<num_orgs<<endl;
//linstream<<num_orgs<<endl;
//entoutstream<<num_orgs<<endl;
//elinstream<<num_orgs<<endl;
// polystream<<num_orgs<<endl;
// cout<<"just wrote num_orgs to polystream - please check. num_orgs = "<<num_orgs<<endl;
// epolystream<<num_orgs<<endl;
enlpolystream<<num_orgs<<endl;
enlpolystreamr<<num_orgs<<endl;
......@@ -842,34 +725,15 @@ void mdist::write_out_distance_matrix(string details)
cout<<"Attempting to get into the for loop"<<endl;
for(int i=0; i<organisms.size(); i++)
{
//outstream<<organisms.at(i).get_directory();
//linstream<<organisms.at(i).get_directory();
//entoutstream<<organisms.at(i).get_directory();
//elinstream<<organisms.at(i).get_directory();
// polystream<<organisms.at(i).get_directory();
// epolystream<<organisms.at(i).get_directory();
enlpolystream<<organisms.at(i).get_directory();
enlpolystreamr<<genome_mapped.at(i);
if(i<refsetsize)
{
//outstream<<"_ref";
//linstream<<"_ref";
//entoutstream<<"_ref";
//elinstream<<"_ref";
// polystream<<"_ref";
// epolystream<<"_ref";
enlpolystream<<"_ref";
enlpolystreamr<<"_ref";
}
//outstream<<" \t";
//linstream<<" \t";
//entoutstream<<" \t";
//elinstream<<" \t";
// polystream<<" \t";
// epolystream<<" \t";
enlpolystream<<" \t";
enlpolystreamr<<" \t";
......@@ -878,19 +742,12 @@ void mdist::write_out_distance_matrix(string details)
int index = get_index(i,j);
if(index==-1)
{
//outstream<<0<<" ";
//linstream<<0<<" ";
//entoutstream<<0<<" ";
//elinstream<<0<<" ";
// polystream<<0<<" ";
// epolystream<<0<<" ";
enlpolystream<<0<<" ";
enlpolystreamr<<0<<" ";
}
else
{
double d1 = plots.at(index).get_genomic_distance_v2();
//outstream<<d1<<" ";
double d2 = plots.at(index).get_linear_distance();
fits_vector.at(index).ord1=i;
fits_vector.at(index).ord2=j;
......@@ -898,70 +755,41 @@ void mdist::write_out_distance_matrix(string details)
fits_vector.at(index).org2=organisms.at(j).get_directory();
fits_vector.at(index).linear_fit=d2;
fits_vector.at(index).lf_rmsdwt=plots.at(index).get_linear_rmsd();
//linstream<<d2<<" ";
double d3 = plots.at(index).get_polynomial_distance();
//cout<<"d3 = "<<d3<<endl;
fits_vector.at(index).quadratic_fit_b=d3;
fits_vector.at(index).quadratic_fit_a=plots.at(index).get_a();
fits_vector.at(index).qf_rmsdwt=plots.at(index).get_wtrmsd_quadratic();
// polystream<<d3<<" ";
if(min_slope==-1 || min_slope>d3)
{
min_slope=d3; //This is a waste of time. The min_slope will always be 0. But I can't stop myself from coding this in.
min_slope=d3;
}
if(max_slope==-1 || max_slope<d3)
{
max_slope=d3;
}
//entoutstream<<d1*plots.at(index).get_entropy()<<" ";
//elinstream<<d2*plots.at(index).get_entropy()<<" ";
// epolystream<<d3*plots.at(index).get_entropy()<<" ";
double d4=-1.0*(2.0*P*plots.at(index).get_a()+(-1.0*d3));//slope of the quadratic function at P
enlpolystream<<-w*log( (w-d4*plots.at(index).get_entropy())/w)<<" ";
enlpolystreamr<<-w*log( (w-d4*plots.at(index).get_entropy())/w)<<" ";
}
}
//outstream<<endl;
//linstream<<endl;
//entoutstream<<endl;
//elinstream<<endl;
// polystream<<endl;
// epolystream<<endl;
enlpolystream<<endl;
enlpolystreamr<<endl;
}
//outstream.close();
//linstream.close();
//entoutstream.close();
//elinstream.close();
// polystream.close();
// epolystream.close();
enlpolystream.close();
enlpolystreamr.close();
logstream<<"2 distance matrices successfully written and streams closed."<<endl<<endl;
/*
int system_return = system ( ("./rapidnj "+dmfile+" > "+dmfile+"_tree").c_str());
system_return = system ( ("./rapidnj "+dmfile2+" > "+dmfile2+"_tree").c_str());
system_return = system ( ("./rapidnj "+dmfile3+" > "+dmfile3+"_tree").c_str());
system_return = system ( ("./rapidnj "+edmfile+" > "+edmfile+"_tree").c_str());
system_return = system ( ("./rapidnj "+edmfile2+" > "+edmfile2+"_tree").c_str());
system_return = system ( ("./rapidnj "+edmfile3+" > "+edmfile3+"_tree").c_str());
system_return = system ( ("./rapidnj "+enlfiler+" > "+enlfiler+"_tree").c_str());
system_return = system ( ("./rapidnj "+enlfile+" > "+enlfile+"_tree").c_str());
*/
}
int mdist::get_index(int i1, int i2)
{
if(i1==i2)
{
return -1; //this is very dangerous, but what can I do? Make sure to include bound checks wherever this function is called!
return -1; //Include bound checks wherever this function is called
}
else if(i1<i2)
......@@ -979,8 +807,6 @@ int mdist::get_index(int i1, int i2)
}
}
//something needs to be done; if the run is on 2 organisms, a lot of these statistics just don't make sense.
//This is very important because we will be calculating pair distances for incremental additions to larger sets.
void mdist::calc_uncertainty_of_a()
{
ofstream outstream;
......@@ -1065,7 +891,6 @@ void mdist::calc_uncertainty_of_a()
a_avg_v1=a_avg_v1+plots.at(i).get_a();
}
a_avg_v1=a_avg_v1/(double)plots.size();
//cout<<"a_avg_v1="<<a_avg_v1<<endl; //This is exp(alpha)
a_uncert_v1 = 0.0;
for(int i=0; i<plots.size(); i++)
......@@ -1078,7 +903,6 @@ void mdist::calc_uncertainty_of_a()
a_uncert_v1 = sqrt(a_uncert_v1);
outstream<<a_uncert_v1<<" "<<a_avg_v1<<endl;
//cout<<"a_uncert_v1 = "<<a_uncert_v1<<endl;
}
outstream.close();
......@@ -1145,362 +969,6 @@ void open_file(ifstream& instream, string infile)
}
}
/*void mdist::analyze_exps() //are we done or not
{
ofstream outstream;
string outfile = path+"/"+bfn+"/exp_problems.txt";
u.open_ofile_app(outstream,outfile,"outfile");
outstream<<"Using mdist::ae_v1(int,double&). RMSD_RATIO_CUTOFF="<<RMSD_RATIO_CUTOFF<<endl;
outstream<<"Writing out all pairs with 0<=(rmsd_exp/rmsd_quadratic)<RMSD_RATIO_CUTOFF"<<endl;
int hits=0; //Number of pairs requiring further correction for HGT (e.g. single copy phages)
for(int i=0; i<fits_vector.size(); i++)
{
double val;
if(ae_v1(i,val))
{
hits++;
outstream<<fits_vector.at(i).ord1<<" "<<fits_vector.at(i).ord2<<" "<<val<<endl;
}
}
outstream<<endl<<"hits = "<<hits<<endl;
outstream.close();
}
*/
/*
bool mdist::ae_v1(int i,double& val)
{
val = fits_vector.at(i).ef_rmsdwt/fits_vector.at(i).qf_rmsdwt; //Checking whether the rmsd of the double-exponential fit is significantly smaller than the rmsd of the quadratic fit.
if(val>=0 && val<RMSD_RATIO_CUTOFF) //~[0,0.95]
{
return true;
}
else
{
return false;
}
}
*/
/*void mdist::write_out_ref_set(int size)
{
srand(time(NULL));
vector<viewer> fvtemp;
fvtemp = fits_vector;
cout<<"Size of fvtemp = "<<fvtemp.size()<<endl;
//0 make new vector of ordinals.
vector<int> ords;
vector<int> sizes;
for(int i=0; i<num_orgs; i++)
{
ords.push_back(i);
}
string infile2 = path+"/"+bfn+"/"+bfn+"_hist.txt";
ifstream instream2;
instream2.open(infile2.c_str());
if(instream2.fail()) //This should never happen.
{
cout<<"Failed to open infile2="<<infile2<<" in mdist::write_out_ref_set(int)"<<endl;
exit(1);
}
else
{
int aanum;
string eater;
instream2>>eater>>aanum;
while(!instream2.eof())
{
sizes.push_back(aanum);
for(int i=0; i<40; i++)
{
instream2>>eater;
}
instream2>>eater>>aanum;
}
}
//1 check problematic inputs to eliminate
ifstream instream;
string infile = path+"/"+bfn+"/problematic_inputs.txt";
instream.open(infile.c_str());
if(instream.fail())
{
cout<<"No problematic inputs to eliminate when specifying a reference set in mdist::write_out_ref_set(int size)"<<endl;
}
else
{
int ordinal;
instream>>ordinal;
while(!instream.eof())
{
cout<<"Size of ords before deletion = "<<ords.size()<<endl;
for(int i=0; i<ords.size(); i++)
{
if(ords.at(i)==ordinal)
{
cout<<"Made it here."<<endl;
ords.erase(ords.begin()+i);
sizes.erase(sizes.begin()+i);
cout<<"size of fvtemp = "<<fvtemp.size()<<endl;
for(int j=fvtemp.size()-1; j>=0; j--)
{
if(fvtemp.at(j).ord1==ordinal || fvtemp.at(j).ord2==ordinal)
{
fvtemp.erase(fvtemp.begin()+j);
}
}
}
}
cout<<"Size of ords after deletion = "<<ords.size()<<endl;
char buffer[2000];
instream.getline(buffer,2000);
instream>>ordinal;
}
}
//2 reduce vector until vector.size=size.
while(ords.size()>size)
{
cout<<"Top of while loop. ords.size()="<<ords.size()<<endl;
double min_distance=-1;
int ord1=-1;
int ord2=-1;
int ord1_okay=0;
int ord2_okay=0;
int fvtemp_index=-1;
for(int i=0; i<fvtemp.size(); i++)
{
int itemp1,itemp2;
itemp1 = fvtemp.at(i).ord1;
itemp2 = fvtemp.at(i).ord2;
cout<<"itemp1, itemp2 = "<<itemp1<<" "<<itemp2<<endl;
if( (min_distance==-1 || fvtemp.at(i).quadratic_fit_b<min_distance) && itemp1!=itemp2)
{
cout<<"in the if statement. itemp1!=itemp2"<<endl;
ord1=itemp1;
ord2=itemp2;
fvtemp_index=i;
cout<<"ord1="<<ord1<<" ord2="<<ord2<<" "<<endl;
min_distance=fvtemp.at(i).quadratic_fit_b;
}
}
cout<<"min_distance = "<<min_distance<<endl;
for(int i=0; i<fvtemp.size(); i++)
{
int ord1temp,ord2temp;
ord1temp=fvtemp.at(i).ord1;
ord2temp=fvtemp.at(i).ord2;
if( (ord1==ord1temp || ord1==ord2temp) && !fvtemp.at(i).stable)
{
ord1_okay++;
}
if( (ord2==ord1temp || ord2==ord2temp) && !fvtemp.at(i).stable)
{
ord2_okay++;
}
}
if(ord1_okay<ord2_okay && ord1_okay<OKAY_CUTOFF)
{
for(int i=0; i<ords.size(); i++)
{
if(ords.at(i)==ord2)
{
ords.erase(ords.begin()+i);
sizes.erase(sizes.begin()+i);
for(int j=fvtemp.size()-1; j>=0; j--)
{
if(fvtemp.at(j).ord1==ord2 || fvtemp.at(j).ord2==ord2)
{
fvtemp.erase(fvtemp.begin()+j);
}
}
}
}
}
else if(ord2_okay<ord1_okay && ord2_okay<OKAY_CUTOFF)
{
for(int i=0; i<ords.size(); i++)
{
if(ords.at(i)==ord1)
{
ords.erase(ords.begin()+i);
sizes.erase(sizes.begin()+i);
for(int j=fvtemp.size()-1; j>=0; j--)
{
if(fvtemp.at(j).ord1==ord1 || fvtemp.at(j).ord2==ord1)
{
fvtemp.erase(fvtemp.begin()+j);
}
}
}
}
}
//Neither is okay.
else if(ord1_okay==ord2_okay && ord1_okay>OKAY_CUTOFF)
{
for(int i=0; i<ords.size(); i++)
{
if(ords.at(i)==ord1)
{
ords.erase(ords.begin()+i);
sizes.erase(sizes.begin()+i);
for(int j=fvtemp.size()-1; j>=0; j--)
{
if(fvtemp.at(j).ord1==ord1 || fvtemp.at(j).ord2==ord1)
{
fvtemp.erase(fvtemp.begin()+j);
}
}
}
}
for(int i=0; i<ords.size(); i++)
{
if(ords.at(i)==ord2)
{
ords.erase(ords.begin()+i);
sizes.erase(sizes.begin()+i);
for(int j=fvtemp.size()-1; j>=0; j--)
{
if(fvtemp.at(j).ord1==ord1 || fvtemp.at(j).ord2==ord1)
{
fvtemp.erase(fvtemp.begin()+j);
}
}
}
}
}
else if(ord1_okay<OKAY_CUTOFF && ord2_okay<OKAY_CUTOFF)
{
int index1=-1;
int index2=-1;
cout<<"Available ords: "<<endl;
for(int i=0; i<ords.size(); i++)
{
cout<<i<<" "<<ords.at(i)<<endl;
if(ords.at(i)==ord1)
{
index1=i;
}
else if(ords.at(i)==ord2)
{
index2=i;
}
}
if(index1==-1 || index2==-1)
{
cout<<"In mdist::write_out_ref_set(...). Problem. index1="<<index1<<" index2="<<index2<<endl;
exit(1);
}