Commit 4456a07b authored by Raquel Bromberg's avatar Raquel Bromberg

readme.txt --> README. Deleted commented out code and also code that wasn't...

readme.txt --> README. Deleted commented out code and also code that wasn't necessary for the class's main functions (e.g. all_matches and all_all_matches)
parent b0131d06
#ifndef UTIL_H
#define UTIL_H
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdlib>
#include <sys/stat.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <sstream>
#include <time.h>
using namespace std;
class util
{
private:
public:
util();
vector<string> load_names(string dir);
void extract_paths(string full_path, string& p, string& b);
int rm_mkdir(string path, string dirname);
int rm(string path, string dirname);
int rm(string full_path);
void open_ifile(ifstream& instream, string infile, string var_name);
void open_ofile(ofstream& outstream, string outfile, string var_name);
void open_ofile_ne(ofstream& outstream, string outfile, string var_name);
void open_ofile_app(ofstream& outstream, string outfile, string var_name);
void get_tl(string infile,int& tl);
void get_rss(string infile,int& rss);
string int_to_string(int i);
void write_out_time(ofstream& logstream);
};
util::util()
{
}
void util::extract_paths(string full_path, string& p, string& b)
{
if(full_path[full_path.length()-1]=='/')
{
full_path = full_path.substr(0,full_path.length()-1);
}
p=b="";
bool slash_seen=false;
for(int i=full_path.length()-1; i>=0; i--)
{
if(!slash_seen && full_path[i]!='/')
{
b=full_path[i]+b;
}
else if(full_path[i]=='/' || slash_seen)
{
slash_seen=true;
p=full_path[i]+p;
}
}
}
int util::rm_mkdir(string path, string dirname)
{
string full_path = path+"/"+dirname;
string outcommand = "rm -r "+full_path;
if(system(NULL)) puts ("Ok");
else
{
exit(EXIT_FAILURE);
}
int system_return = system(outcommand.c_str());
//cout<<"rm -r "<<path<<" - system_return="<<system_return<<endl;
outcommand = "mkdir "+full_path;
system_return = system(outcommand.c_str());
//cout<<"mkdir "<<path<<" - system_return="<<system_return<<endl;
return system_return;
}
int util::rm(string path, string dirname)
{
string full_path = path+"/"+dirname;
string outcommand = "rm -r "+full_path;
if(system(NULL)) puts ("Ok");
else
{
exit(EXIT_FAILURE);
}
int system_return = system(outcommand.c_str());
return system_return;
}
int util::rm(string full_path)
{
string outcommand = "rm "+full_path;
if(system(NULL)) puts ("Ok");
else
{
exit(EXIT_FAILURE);
}
int system_return = system(outcommand.c_str());
return system_return;
}
vector<string> util::load_names(string dir)
{
vector<string> folders;
DIR *dp;
struct dirent *dirp;
if((dp = opendir(dir.c_str())) == NULL)
{
cout<<"Failed to open path to dir="<<dir<<endl;
return folders;
}
while ((dirp = readdir(dp)) != NULL)
{
string temp = string(dirp->d_name);
if(temp.length()>5)
{
folders.push_back(string(dirp->d_name));
}
else
{
//cout<<"Rejecting directory named "<<temp<<endl;
}
}
closedir(dp);
return folders;
}
void util::open_ifile(ifstream& instream, string infile, string var_name)
{
instream.open(infile.c_str());
if(instream.fail())
{
cout<<"Failed to open ifstream to "<<var_name<<" = "<<infile<<endl;
exit(1);
}
}
void util::open_ofile(ofstream& outstream, string outfile, string var_name)
{
outstream.open(outfile.c_str());
if(outstream.fail())
{
cout<<"Failed to open ofstream to "<<var_name<<" = "<<outfile<<endl;
exit(1);
}
}
void util::open_ofile_ne(ofstream& outstream, string outfile, string var_name)
{
outstream.open(outfile.c_str());
while(outstream.fail())
{
cout<<"util::open_ofile_ne(ofstream&,string,string)"<<endl;
cout<<"Failed to open stream to outfile = "<<outfile<<endl;
cout<<"Does the directory exist? Please create it otherwise and then enter any int to continue: ";
int wfd;
cin>>wfd;
outstream.open(outfile.c_str());
}
cout<<"util::open_ofile_ne(ofstream&,string,string): Successfully opened ofstream to "<<outfile<<endl;
}
void util::open_ofile_app(ofstream& outstream, string outfile, string var_name)
{
outstream.open(outfile.c_str(), ios::out | ios::app);
if(outstream.fail())
{
cout<<"Failed to open ofstream to "<<var_name<<" = "<<outfile<<endl;
exit(1);
}
}
void util::get_tl(string infile,int& tl)
{
ifstream instream;
open_ifile(instream,infile,"infile");
int i;
instream>>i>>i>>tl;
instream.close();
}
void util::get_rss(string infile,int& rss)
{
ifstream instream;
open_ifile(instream,infile,"infile");
instream>>rss;
instream.close();
}
string util::int_to_string(int i)
{
stringstream ss;
ss<<i;
string result;
ss>>result;
return result;
}
void util::write_out_time(ofstream& logstream)
{
time_t current_time;
current_time=time(NULL);
struct tm* timeinfo;
timeinfo=localtime(&current_time);
logstream<<"Date of run: "<<asctime(timeinfo)<<endl;
}
#endif /*UTIL_H*/
......@@ -24,3 +24,5 @@ script_do_tmerg.sh
buildconcat_output.txt
rzs.cpp
script_filter.sh
#util.h#
readme.txt
......@@ -7,12 +7,11 @@ For all commands listed below, the character "$" indicates the commandline, and
**************************************************************************************************************
CONTENTS OF README:
1. COMPILING SLOPETREE
2. RUNNING SLOPETREE WITHOUT FILTERING OR A REFERENCE SET (most basic run)
3. ST-TREES WITH MOBILE ELEMENTS REMOVED
1) COMPILING SLOPETREE
2) RUNNING SLOPETREE WITHOUT FILTERING OR A REFERENCE SET (most basic run)
3) ST-TREES WITH MOBILE ELEMENTS REMOVED
4) ST-TREES FILTERED BY CONSERVATION
5) USING THE PAIR-WISE CORRECTION FOR HORIZONTAL GENE TRANSFER
6) DESCRIPTION OF EACH SLOPETREE PROGRAM
**************************************************************************************************************
1. COMPILING SLOPETREE
......
/******************************************************************************
Takes sorted lists of kmers and counts matches from lengths 1-k.
Writes the histograms for each pair to DATA/
*****************************************************************************/
#include <cstdlib>
......@@ -43,8 +44,6 @@ struct krow
vector<bool> r;
};
//int find_top_string(vector<file_aa_pair> v);
class mctr
{
private:
......@@ -58,18 +57,15 @@ class mctr
int filtering_option; //=-1 when no filtering was used (same as above)
string merged_directory;
string data_directory;
string am_directory;
string aam_directory;
int sos_cutoff; //default = 6.5*tag_length
double max_single_bit_value;
int max_bit_sum; //e.g. tag_length=20, highest bit store=4, 4*20 = 80
int mbkl; //minimum kmer length for bitter to store it, default=tag_length
vector<string> files; //
int number_of_genomes; //
int mbkl; //minimum kmer length for bitter to store it, default=tag_length
vector<string> files;
int number_of_genomes;
int number_of_genomes_ref;
int offset;
int recursion_counter;
vector<string> all_matches;
ofstream logstream;
vector<string> array;
vector<int> gnums;
......@@ -95,9 +91,6 @@ class mctr
bool msc2;
int genes_found;
double min_msc; //the minimum msc among all the org pairs.
// void get_msc();
//void get_msc2();
//ofstream msc_outstream;
map<std::pair<int,int>,int> msc_map; //delete this when subsequent maps are implemented.
map<std::pair<int,int>,int> con_map;
vector< set<int> > con_map_set0;
......@@ -112,7 +105,6 @@ class mctr
bool check_composition(string s);
void get_problem_tags(vector<string> files,string ordinal);
void get_problem_tags(vector<string> files,string ordinal, set<int> hgt_genes);
double msc_cutoff;
int ppv_index; //what line of the problem pair index vector are we on.
int ref_index; //what line of the reference vector are we on.
vector<std::pair<int, int> > keys;
......@@ -125,11 +117,9 @@ class mctr
void read_in_probfiles(ifstream& infostream,vector<string>& files1, vector<string>& files2);
void set_array_to_0();
int calculate_max_bit_sum();
void open_info_file(ifstream& infostream);
void create_directories();
void open_str(string path_frag, int i, ofstream& outstream);
char get_aa_char(int i);
void set_bitsv();
......@@ -137,18 +127,17 @@ class mctr
void set_array_it1(int i);
void set_array_it2(int i);
void find_matches_it0(int col, int first_row, int last_row, string seq, ofstream& alalstream, ofstream& alstream);
void find_matches_it1(int col, int first_row, int last_row, string seq, ofstream& alalstream, ofstream& alstream);
void find_matches_it2(int col, int first_row, int last_row, string seq, ofstream& alalstream, ofstream& alstream);
void increment_bits_it0(int first_row, int last_row,string seq,ofstream& alalstream, ofstream& alstream);
void increment_bits_it1(int first_row, int last_row,string seq,ofstream& alalstream, ofstream& alstream);
void increment_bits_it2(int first_row, int last_row,string seq,ofstream& alalstream, ofstream& alstream);
void find_matches_it0(int col, int first_row, int last_row, string seq);//, ofstream& alalstream, ofstream& alstream);
void find_matches_it1(int col, int first_row, int last_row, string seq);//, ofstream& alalstream, ofstream& alstream);
void find_matches_it2(int col, int first_row, int last_row, string seq);//, ofstream& alalstream, ofstream& alstream);
void increment_bits_it0(int first_row, int last_row,string seq);//,ofstream& alalstream, ofstream& alstream);
void increment_bits_it1(int first_row, int last_row,string seq);//,ofstream& alalstream, ofstream& alstream);
void increment_bits_it2(int first_row, int last_row,string seq);//,ofstream& alalstream, ofstream& alstream);
int get_aa(char aa);
void write_out_data_it0();
void write_out_data_it1();
void write_out_data_it2();
// void write_out_data();//vector<organism_v2>& organisms);
void read_in_files();
double get_score(int i1,int i2, string seq);
double dither(double d);
......@@ -169,7 +158,6 @@ class mctr
void run_it2(); //run pair against each other with phage/hgt proteins removed.
void set_problem_pair(int prob1_par,int prob2_par);
bool multiple_organisms(int first_row,int last_row);
};
//Constructor 1. Main run.
......@@ -193,7 +181,7 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
mbkl=int(MBKL*(double)tag_length);
filtering_steps=filtering_steps_par;
filtering_option=filtering_option_par;
dir=dir_par; //dir="" when running on the main set.
dir=dir_par; //dir="" when running on the main set; dir="scr" when running on scrambled set.
sos_cutoff=get_sos_cutoff();
u.open_ofile_app(logstream,path+"/"+bfn+"/logfile_cm1.txt","logfile_cm1.txt");
......@@ -208,9 +196,7 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
merged_directory = "MERGED_TAGS"+dir+addon;
data_directory = "DATA"+dir+addon;
am_directory = "ALL_MATCHES"+dir+addon;
aam_directory = "ALL_ALL_MATCHES"+dir+addon;
ifstream infostream;
open_info_file(infostream); //sets tag length and number of genomes.
infostream.close();
......@@ -257,7 +243,6 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
//Set all pair run variables to false.
ppv_index=0;
ref_index=0;
msc_cutoff=-1;//need this?
sos_cutoff=get_sos_cutoff();
u.open_ofile_app(logstream,path+"/"+bfn+"/logfile_cm2.txt","logfile_cm2.txt");
......@@ -270,8 +255,6 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
merged_directory = "MERGED_TAGS"+dir+addon;
data_directory = "DATA"+dir+addon;
am_directory = "ALL_MATCHES"+dir+addon;
aam_directory = "ALL_ALL_MATCHES"+dir+addon;
ifstream infostream;
open_info_file(infostream); //sets tag length and number of genomes.
......@@ -313,39 +296,6 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
populate_bitterv();
recursion_counter=0;
/*
for(int f=0; f<proteins1.size(); f++)
{
krow temp_row;
for(int g=0; g<proteins0.size(); g++)
{
temp_row.r.push_back(false);
}
rows.push_back(temp_row);
}
rows_rows=proteins1.size();
rows_cols=proteins0.size();
for(int f=0; f<proteins0.size(); f++)
{
set<int> temp_set;
con_map_set0.push_back(temp_set);
}
for(int f=0; f<proteins1.size(); f++)
{
set<int> temp_set;
con_map_set1.push_back(temp_set);
}
cout<<"con_map_set0.size() = "<<con_map_set0.size()<<endl;
cout<<"proteins0.size() = "<<proteins0.size()<<endl;
cout<<"con_map_set1.size() = "<<con_map_set1.size()<<endl;
cout<<"proteins1.size() = "<<proteins1.size()<<endl;
*/
}
//Constructor 3. Pair run against just itself.
......@@ -372,7 +322,6 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
prob2=prob2_par;
ppv_index=0;
ref_index=0;
msc_cutoff=-1;
sos_cutoff=get_sos_cutoff();
u.open_ofile_app(logstream,path+"/"+bfn+"/logfile_cm3.txt","logfile_cm3.txt");
......@@ -385,8 +334,6 @@ mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int fi
merged_directory = "MERGED_TAGS"+dir+addon;
data_directory = "DATA"+dir+addon;
am_directory = "ALL_MATCHES"+dir+addon;
aam_directory = "ALL_ALL_MATCHES"+dir+addon;
ifstream infostream;
open_info_file(infostream); //sets tag length and number of genomes.
......@@ -413,24 +360,10 @@ void mctr::run_it0()
cout<<"mctr::run_it0()"<<endl;
}
//parallelize command.
if(!partial_run)
{
for(int i=0; i<20; i++)
{
//Coptim1_3Dmat* corr_array_temp; //when it's parallelized, this will be set to zero, filled by each thread, then added into the real one at the end with a lock on it
all_matches.clear();
ofstream aastr;
open_str(aam_directory,i,aastr);
ofstream astr;
open_str(am_directory,i,astr);
//for(int n=0; n<all_matches.size(); n++)
//{
//all_matches_stream<<all_matches.at(n)<<endl;
//}
//all_matches.clear();
array.clear();
gnums.clear();
gene_ids.clear();
......@@ -446,26 +379,12 @@ void mctr::run_it0()
cout<<"Last row of array: "<<endl;
cout<<array.at(array.size()-1)<<endl;
cout<<"Entering recursive function find_matches_it0(...)..."<<endl;
find_matches_it0(0,0,array.size()-1,"", aastr, astr);
find_matches_it0(0,0,array.size()-1,"");//, aastr, astr);
cout<<"Escaped recursive function find_matches(...)!"<<endl;
astr.close();
}
}
else if(partial_run)
{
//Coptim1_3Dmat* corr_array_temp; //when it's parallelized, this will be set to zero, filled by each thread, then added into the real one at the end with a lock on it
all_matches.clear();
ofstream aastr;
open_str(aam_directory,aas.get_aa(partial_run_lead),aastr);
ofstream astr;
open_str(am_directory,aas.get_aa(partial_run_lead),astr);
//for(int n=0; n<all_matches.size(); n++)
//{
//all_matches_stream<<all_matches.at(n)<<endl;
//}
//all_matches.clear();
array.clear();
gnums.clear();
gene_ids.clear();
......@@ -481,9 +400,8 @@ void mctr::run_it0()
cout<<"Last row of array: "<<endl;
cout<<array.at(array.size()-1)<<endl;
cout<<"Entering recursive function find_matches_it0(...)..."<<endl;
find_matches_it0(0,0,array.size()-1,"", aastr, astr);
find_matches_it0(0,0,array.size()-1,"");//, aastr, astr);
cout<<"Escaped recursive function find_matches(...)!"<<endl;
astr.close();
}
cout<<"Done with mctr main algorithm."<<endl;
......@@ -527,34 +445,12 @@ int mctr::run_it1()
cout<<i<<"/"<<20<<endl;
}
all_matches.clear();
ofstream aastr;
open_str(aam_directory,i,aastr);
ofstream astr;
open_str(am_directory,i,astr);
array.clear();
gnums.clear();
gene_ids.clear();
set_array_it1(i);
/* cout<<"First few rows: "<<endl;
for(int i=0; i<2; i++)
{
cout<<array.at(i)<<" "<<gnums.at(i)<<" "<<gene_ids.at(i)<<endl;
}
*/
// cout<<"Last row of array: "<<endl;
// cout<<array.at(array.size()-1)<<" "<<gnums.at(gnums.size()-1)<<" "<<gene_ids.at(gene_ids.size()-1)<<endl;
// cout<<"Second to last row of array: "<<endl;
//cout<<array.at(array.size()-2)<<" "<<gnums.at(gnums.size()-2)<<" "<<gene_ids.at(gene_ids.size()-2)<<endl;
//cout<<"msc="<<msc<<endl;
find_matches_it1(0,0,array.size()-1,"", aastr, astr);
astr.close();
// cout<<"con_map.size() = "<<con_map.size()<<endl;
// cout<<"hgt_map.size() = "<<hgt_map.size()<<endl;
find_matches_it1(0,0,array.size()-1,"");
}
cout<<"Finished match-counting for "<<prob1<<" "<<prob2<<endl;
......@@ -580,12 +476,6 @@ void mctr::run_it2()
for(int i=0; i<20; i++)
{
all_matches.clear();
ofstream aastr;
open_str(aam_directory,i,aastr);
ofstream astr;
open_str(am_directory,i,astr);
array.clear();
gnums.clear();
gene_ids.clear();
......@@ -601,16 +491,16 @@ void mctr::run_it2()
cout<<prob1<<" "<<prob2<<" "<<msc<<endl;
}
find_matches_it2(0,0,array.size()-1,"", aastr, astr);
find_matches_it2(0,0,array.size()-1,"");
if(MCTR_DEBUG)
{
cout<<prob1<<" "<<prob2<<" Done in find_matches"<<endl;
}
astr.close();
}
cout<<"Finished match-counting (it2) for "<<prob1<<" "<<prob2<<endl;
write_out_data_it2();
}
......@@ -661,33 +551,6 @@ void mctr::set_bitsv()
bitsv.push_back(bh_temp);
histstream.close();
}
/* ifstream instream;
u.open_ifile(instream,path+"/"+bfn+"/"+bfn+"_hist.txt","instream");
double max=-1;
for(int i=0; i<number_of_genomes; i++)
{
bits_holder bh_temp;
int gnum;
instream>>gnum;
int total_residues;
instream>>total_residues;
for(int j=0; j<20; j++)
{
int count;
double frequency;
instream>>count>>frequency;
bh_temp.d[j]=(double)count/(double)total_residues;
bh_temp.d[j]=( (log(bh_temp.d[j])) * (-1.0)) / 2.0; //NOTE: using log, which in C++ uses base e.
//NOTE: Divide by 2 here to speed up calculating scores in get_score(...)
if(max==-1 || max<bh_temp.d[j])
{
max = bh_temp.d[j];
}
}
bitsv.push_back(bh_temp);*/
max_single_bit_value=max;
} //end of normal case run_type==0
......@@ -732,9 +595,6 @@ void mctr::set_bitsv()
outstream.close();
// ifstream probstream;
// u.open_ifile(probstream,path+"/"+bfn+"/"+bfn+"_hist.txt","probstream");
//get the bits for the 2 problem proteomes.
double max=-1;
for(int i=0; i<number_of_genomes; i++) //number_of_genomes, NOT number_of_genomes_ref, because the code has to go through the entire set to find a) all the reference genomes and b) the two problem genomes.
......@@ -852,103 +712,6 @@ void mctr::set_bitsv()
} //end of set_bitsv()
/*****************************************************
set_bits()
*****************************************************/
/*void mctr::set_bits()
{
cout<<"Setting bits array."<<endl;
int max=-1;
for(int i=0; i<20; i++)
{
bits[i]=0.0;
}
ifstream instream;
instream.open( (path+"/"+bfn+"/"+bfn+"_hist.txt").c_str());
if(instream.fail())
{
cout<<"Failed to open file for histograms: ";
exit(1);
}
else
{
cout<<"Opened histogram file."<<endl;
}
int total_residues=0;
for(int i=0; i<number_of_genomes; i++)
{
int gnum;
instream>>gnum;
cout<<"read in gnum="<<gnum<<endl;
//if(gnum!=i)
//{
// cout<<"Error reading in gnums."<<endl;
// exit(1);
// }
int temp_tr;
instream>>temp_tr;
cout<<"temp_tr="<<temp_tr<<endl;
total_residues += temp_tr;
for(int j=0; j<20; j++)
{
int int_count;
double double_count;
instream>>int_count>>double_count;
//cout<<"int_count="<<int_count<<" double_count="<<double_count<<endl;
bits[j]+=int_count;
}
}
for(int i=0; i<20; i++)
{
bits[i]=bits[i]/(double)total_residues;
}
for(int i=0; i<20; i++)
{
bits[i]=log(bits[i]);
}
for(int i=0; i<20; i++)
{
bits[i]=bits[i]*(-1);
}
cout<<"Done computing bits[]. Bits: "<<endl;
for(int i=0; i<20; i++)
{
if(bits[i]>max)
{
max = bits[i];
}
cout<<i<<" "<<bits[i]<<endl;
}
cout<<endl;