Commit 262e8216 authored by Raquel Bromberg
Browse files

Minor code changes (mostly code cleanup)

parent 25aca03f
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <omp.h>
//#include <omp.h>
#include <getopt.h>
#include "mctr.h"
......@@ -58,9 +58,11 @@ int main(int argc, char* argv[])
// cout<<"p="<<full_path<<endl;
// cout<<"fs="<<fs<<endl;
cout<<"Calling run on real data"<<endl;
mctr m(full_path,"",fs,fo,partial_run);
cout<<"Calling run()"<<endl;
m.run_it0();
cout<<"Calling run on scrambled data"<<endl;
mctr mscr(full_path,"scr",fs,fo,partial_run);//scrambled
mscr.run_it0();
......
......@@ -9,10 +9,18 @@ using namespace std;
int main(int argc, char* argv[])
{
string full_path = "";
if(argc<2)
{
cout<<"Wrong use of filter.cpp: ./filt <path to run>"<<endl;
exit(1);
}
string full_path = argv[1];
int mismatch = 13;
int filtering_steps = 10;
char c;
bool verbose=false;
bool very_similar_set=false; //e.g. input of all E.coli
cout<<"Values stored in argv:"<<endl;
for(int i=0; i<argc; i++)
......@@ -20,7 +28,7 @@ int main(int argc, char* argv[])
cout<<argv[i]<<endl;
}
while((c = getopt(argc,argv,"m:p:f:")) != -1)
while((c = getopt(argc,argv,"m:p:f:vs")) != -1)
{
switch(c)
{
......@@ -33,14 +41,19 @@ int main(int argc, char* argv[])
case 'f':
filtering_steps=atoi(optarg);
break;
case 'v':
verbose=true;
break;
case 's':
very_similar_set=true;
break;
default:
abort();
}
}
filter f(full_path,mismatch,filtering_steps);
filter f(full_path,mismatch,filtering_steps,verbose,very_similar_set);
f.write_out_filt_logs();
f.print_final_hist();
f.mark_discrepancies();
return 0;
......
This diff is collapsed.
......@@ -10,6 +10,12 @@ using namespace std;
int main(int argc, char* argv[])
{
if(argc<2)
{
cout<<"Improper input. ./fmerg <full path to run, e.g. ../my_runs/300_bacteria/>"<<endl;
exit(1);
}
string full_path = "";
int filtering_steps = -1;
int filt_choice=-1;
......
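/**************************************************************************
fpwrite.cpp
INPUT:  <path to run> as the first command-line argument
        (e.g. ../my_runs/medium_run/), plus optional flags:
          -f <filtering steps, default 10>
          -o <filtering choice, default 0>
          -v (verbose)   -h (help)
        Reads the matching filtering log (filtlog_<f>_<o>.txt) and the
        run's info file.
OUTPUT: Per-organism .faa files of conserved and non-conserved proteins,
        written under the FAA_csv_<f>_<o>/ and FAA_ncsv_<f>_<o>/
        directories (and their FAA_ref_* counterparts for the reference set).
*************************************************************************/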
#include <iostream>
#include <fstream>
#include <vector>
......@@ -18,12 +24,23 @@ struct h
int main(int argc, char* argv[])
{
cout<<"Usage: ./fpw -p <path> -f <filtering_steps, default=10> -o <stringency level>"<<endl;
if(argc<2)
{
cout<<"Wrong input."<<endl;
cout<<"./fpwrite <path to run, e.g. ../my_runs/medium_run/> "<<endl;
cout<<"verbose: -v"<<endl;
cout<<"help: -h"<<endl;
cout<<"filtering steps (default 10): -f"<<endl;
cout<<"filtering choice (default 0): -o"<<endl;
exit(1);
}
string full_path = "";
int filtering_steps = -1;
int filt_choice=-1;
string full_path =argv[1];
int filtering_steps = 10;
int filt_choice=0;
char c;
bool verbose=false;
bool help=false;
cout<<"Values stored in argv:"<<endl;
for(int i=0; i<argc; i++)
......@@ -31,7 +48,7 @@ int main(int argc, char* argv[])
cout<<argv[i]<<endl;
}
while((c = getopt(argc,argv,"o:p:f:")) != -1)
while((c = getopt(argc,argv,"o:p:f:hv")) != -1)
{
switch(c)
{
......@@ -44,22 +61,45 @@ int main(int argc, char* argv[])
case 'f':
filtering_steps=atoi(optarg);
break;
case 'v':
verbose=true;
break;
case 'h':
help=true;
break;
default:
abort();
}
}
//open the right filtering log, e.g. filtlog_15_8.txt
if(help)
{
cout<<"./fpwrite <path to run, e.g. ../my_runs/medium_run/> "<<endl;
cout<<"verbose: -v"<<endl;
cout<<"help: -h"<<endl;
cout<<"filtering steps (default 10): -f"<<endl;
cout<<"filtering choice (default 0): -o"<<endl;
exit(1);
}
//open the right filtering log file, e.g. filtlog_15_8.txt
util u;
string path,bfn;
u.extract_paths(full_path,path,bfn);
string fss,fcs;
fss=u.int_to_string(filtering_steps);
fcs=u.int_to_string(filt_choice);
cout<<fss<<" "<<fcs<<endl;
if(verbose)
{
cout<<"path="<<path<<endl;
cout<<"bfn="<<bfn<<endl;
cout<<"filtering_steps=fss="<<fss<<endl;
cout<<"filt_choice=fcs="<<fcs<<endl;
}
ifstream instream;
string infile = path+"/"+bfn+"/filt_logs/filtlog_"+fss+"_"+fcs+".txt";
u.open_ifile(instream,infile,"infile");
string filt_infile = path+"/"+bfn+"/FILT_LOGS/filtlog_"+fss+"_"+fcs+".txt";
u.open_ifile(instream,filt_infile,"filt_infile");
//read all the acceptable pairs into a map
set<pair<int,int> > g2k; //genes to keep
......@@ -76,7 +116,10 @@ int main(int argc, char* argv[])
instream>>gnum>>gene_id;
}
cout<<"g2k.size() = "<<g2k.size()<<endl;
if(verbose)
{
cout<<"g2k.size() = "<<g2k.size()<<endl;
}
//Read in the info file.
ifstream infostream;
......@@ -121,12 +164,18 @@ int main(int argc, char* argv[])
infostream>>o;
}
cout<<"hv.size() = "<<hv.size()<<endl;
if(verbose)
{
cout<<"hv.size() = "<<hv.size()<<endl;
}
cout<<"Deleting previous *FAA_csv* and *FAA_ncsv* directories, if they exist..."<<endl;
u.rm_mkdir(path+"/"+bfn+"/","FAA_csv_"+fss+"_"+fcs);
u.rm_mkdir(path+"/"+bfn+"/","FAA_mob_"+fss+"_"+fcs);
u.rm_mkdir(path+"/"+bfn+"/","FAA_ncsv_"+fss+"_"+fcs);
u.rm_mkdir(path+"/"+bfn+"/","FAA_ref_csv_"+fss+"_"+fcs);
u.rm_mkdir(path+"/"+bfn+"/","FAA_ref_mob_"+fss+"_"+fcs);
u.rm_mkdir(path+"/"+bfn+"/","FAA_ref_ncsv_"+fss+"_"+fcs);
cout<<"Creating new *FAA_csv* and *FAA_ncsv* directories..."<<endl;
for(int i=0; i<hv.size(); i++)
{
......@@ -135,7 +184,7 @@ int main(int argc, char* argv[])
string outdirectory = path+"/"+bfn+"/FAA_csv_"+fss+"_"+fcs+"/"+hv.at(i).dir;
int system_return = system ( ("rm -r "+outdirectory).c_str());
system_return = system ( ("mkdir "+outdirectory).c_str());
string outdirectory2 = path+"/"+bfn+"/FAA_mob_"+fss+"_"+fcs+"/"+hv.at(i).dir;
string outdirectory2 = path+"/"+bfn+"/FAA_ncsv_"+fss+"_"+fcs+"/"+hv.at(i).dir;
int system_return2 = system ( ("rm -r "+outdirectory2).c_str());
system_return2 = system ( ("mkdir "+outdirectory2).c_str());
}
......@@ -145,7 +194,7 @@ int main(int argc, char* argv[])
string outdirectory = path+"/"+bfn+"/FAA_ref_csv_"+fss+"_"+fcs+"/"+hv.at(i).dir;
int system_return = system ( ("rm -r "+outdirectory).c_str());
system_return = system ( ("mkdir "+outdirectory).c_str());
string outdirectory2 = path+"/"+bfn+"/FAA_ref_mob_"+fss+"_"+fcs+"/"+hv.at(i).dir;
string outdirectory2 = path+"/"+bfn+"/FAA_ref_ncsv_"+fss+"_"+fcs+"/"+hv.at(i).dir;
int system_return2 = system ( ("rm -r "+outdirectory2).c_str());
system_return2 = system ( ("mkdir "+outdirectory2).c_str());
}
......@@ -203,9 +252,13 @@ int main(int argc, char* argv[])
}
instream.close();
} // Done going through files.
cout<<"p2k.size()="<<p2k.size()<<endl;
cout<<"First protein:"<<endl;
cout<<p2k.at(0).first<<endl<<p2k.at(0).second<<endl;
if(verbose)
{
cout<<"p2k.size()="<<p2k.size()<<endl;
cout<<"First protein:"<<endl;
cout<<p2k.at(0).first<<endl<<p2k.at(0).second<<endl;
}
//Open the 2 output files. (new .faa files)
//conserved write out
......@@ -224,16 +277,16 @@ int main(int argc, char* argv[])
cstream<<p2k.at(j).first<<endl<<p2k.at(j).second<<endl;
}
//mobile write out
//non-conserved proteins write out
ofstream mstream;
string mobfile=path+"/"+bfn+"/FAA";
string ncsv_file=path+"/"+bfn+"/FAA";
if(i<rss)
{
mobfile+="_ref";
ncsv_file+="_ref";
}
mobfile+="_mob_"+fss+"_"+fcs+"/"+hv.at(i).dir+"/mob_proteins.faa";
cout<<"mobfile = "<<mobfile<<endl;
u.open_ofile(mstream,mobfile,"mobfile");
ncsv_file+="_ncsv_"+fss+"_"+fcs+"/"+hv.at(i).dir+"/ncsv_proteins.faa";
cout<<"ncsv_file = "<<ncsv_file<<endl;
u.open_ofile(mstream,ncsv_file,"ncsv_file");
for(int j=0; j<pn2k.size(); j++)
{
......
/*
make_info_file.cpp:
mif.cpp (Make Info File):
Creates the information file that all subsequent programs use to identify the input.
Identifies redundancy in the input (i.e. exact same proteome present in both ref set and main set): if an organism exists in both the reference and main set, it is not included in the file, and therefore will be excluded in the run. These redundant organisms are written to <run_name>_reference_repeats.txt
Identifies redundancy in the input (i.e. exact same proteome present in both ref set and main set): if an organism exists in both the reference and main set, it is only included as a reference organism and excluded from the main set. These redundancies are written to <run_name>_reference_repeats.txt
*/
#include <iostream>
......@@ -19,36 +19,49 @@ void sort_directories(vector<string>& directories);
int main(int argc, char* argv[])
{
if(argc<3)
if(argc<2) //path parameter is required.
{
cout<<"wrong use of fn. ./mif <path to dir containing FAA and FAA_ref> <tag length>"<<endl;
cout<<"Wrong use of mif program: ./mif <path to dir containing FAA and FAA_ref> <k-mer length>"<<endl;
cout<<"For more help: ./mif -h"<<endl;
cout<<"Options: \n\t-v for verbose. \n\t-t for max k-mer length (default=20)"<<endl;
exit(1);
}
if(argc==2)
{
cout<<"No value for k-mer length entered. Using default 20-mers"<<endl;
}
util u;
string path_all = argv[1]; //Entire path to directory containing FAA/ directory.
int tag_length=20;
int k=20;
bool verbose=false;
bool help=false;
char c;
while((c = getopt(argc,argv,"p:t:v")) != -1)
while((c = getopt(argc,argv,"p:k:vh")) != -1)
{
switch(c)
{
case 'p':
path_all = optarg;
break;
case 't':
tag_length=atoi(optarg);
if(tag_length<=0)
case 'k':
k=atoi(optarg);
if(k<=0)
{
cout<<"Tag length invalid. value="<<tag_length<<endl;
cout<<"Tag length must be greater than 0. Default val = 20"<<endl;
cout<<"Invalid k-mer length. k="<<k<<endl;
cout<<"K-mer length must be greater than 0. Default val = 20"<<endl;
exit(1);
}
break;
case 'v':
verbose=true;
cout<<"verbose=true"<<endl;
break;
case 'h':
help=true;
break;
default:
abort();
}
......@@ -63,27 +76,40 @@ int main(int argc, char* argv[])
}
}
if(help)
{
cout<<"mif.cpp help:"<<endl;
//What should go here?
exit(1);
}
string path,rn;
u.extract_paths(path_all,path,rn);
//Open logfile.
ofstream logstream;
u.open_ofile(logstream,path_all+"/logfile_mif.txt","logstream");
u.write_out_time(logstream);
logstream<<"logfile for make_info_file.cpp:"<<endl;
logstream<<"path_all="<<path_all<<endl;
logstream<<"tag_length="<<tag_length<<endl;
logstream<<"verbose="<<verbose<<endl;
logstream<<"path="<<path<<endl;
logstream<<"rn="<<rn<<endl;
logstream<<"logfile for make_info_file.cpp"<<endl<<endl;
logstream<<"Starting parameters: "<<endl;
logstream<<"\tpath_all="<<path_all<<endl;
logstream<<"\tk="<<k<<endl;
logstream<<"\tverbose="<<verbose<<endl;
logstream<<"\thelp="<<help<<endl;
logstream<<"\tpath="<<path<<endl;
logstream<<"\trn="<<rn<<endl<<endl;
cout<<"Details of mif run being written to "<<path_all+"/logfile_mif.txt"<<endl;
if(verbose)
{
cout<<"Details of mif (make_info_file.cpp) run being written to "<<path_all+"/logfile_mif.txt"<<endl;
cout<<"path_all="<<path_all<<endl;
cout<<"tag_length="<<tag_length<<endl;
cout<<"verbose="<<verbose<<endl;
cout<<"path="<<path<<endl;
cout<<"rn="<<rn<<endl;
cout<<"Starting parameters: "<<endl;
cout<<"\tpath_all="<<path_all<<endl;
cout<<"\tk="<<k<<endl;
cout<<"\tverbose="<<verbose<<endl;
cout<<"\thelp="<<help<<endl;
cout<<"\tpath="<<path<<endl;
cout<<"\trn="<<rn<<endl<<endl;
}
//Main outstream to info file
......@@ -102,7 +128,7 @@ int main(int argc, char* argv[])
{
cout<<"Successfully opened ofstream outstream to string info_path="<<info_path<<endl;
}
logstream<<"Successfully opened ofstream outstream to string info_path="<<info_path<<endl;
logstream<<"Successfully opened ofstream outstream to string info_path="<<info_path<<endl<<endl;
}
//Checks if members of reference set exist in the main set (duplicates)
......@@ -117,7 +143,11 @@ int main(int argc, char* argv[])
}
else
{
cout<<"Successfully opened ofstream repstream to string rr_str="<<rr_str<<endl;
if(verbose)
{
cout<<"Successfully opened ofstream repstream to string rr_str="<<rr_str<<endl<<endl;
}
logstream<<"Successfully opened ofstream repstream to string rr_str="<<rr_str<<endl<<endl;
}
vector<string> directories;
......@@ -125,112 +155,160 @@ int main(int argc, char* argv[])
directories = u.load_names(path+"/"+rn+"/FAA/");
directories_ref = u.load_names(path+"/"+rn+"/FAA_ref/");
logstream<<"Number of directories in FAA/ : directories.size() = "<<directories.size()<<endl;
logstream<<"Number of directories in FAA_ref/ : directoreis_ref.size() = "<<directories_ref.size()<<endl;
logstream<<"Number of directories:"<<endl;
logstream<<"\tNumber of directories in FAA/ : directories.size() = "<<directories.size()<<endl;
for(int i=0; i<directories.size(); i++)
{
logstream<<"\t\t"<<i<<" "<<directories.at(i)<<endl;
}
logstream<<"\n\tNumber of directories in FAA_ref/ : directories_ref.size() = "<<directories_ref.size()<<endl;
for(int i=0; i<directories_ref.size(); i++)
{
logstream<<"\t\t"<<i<<" "<<directories_ref.at(i)<<endl;
}
std::sort(directories.begin(), directories.end());
std::sort(directories_ref.begin(), directories_ref.end());
if(MIF_DEBUG || verbose)
logstream<<"Cleaning up vectors: 1) Checking for redundant inputs."<<endl;
if(verbose)
{
cout<<"Size of directories = "<<directories.size()<<endl;
cout<<"Size of directories_ref = "<<directories_ref.size()<<endl;
cout<<"Cleaning up vectors: 1) Checking for redundant inputs."<<endl;
}
cout<<"Size of directories = "<<directories.size()<<endl;
cout<<"Size of directories_ref = "<<directories_ref.size()<<endl;
//Check for the same organism being present in both the reference set and main set.
int repeats_correction=0;
for(int i=0; i<directories.size(); i++)
for(int i=directories.size()-1; i>=0; i--)
{
for(int j=0; j<directories_ref.size(); j++)
{
if(directories.at(i)==directories_ref.at(j))
{
repeats_correction++;
cout<<"Repeat at index="<<i<<" "<<directories.at(i)<<" "<<directories_ref.at(j)<<endl;
logstream<<"Repeat at index="<<i<<" "<<directories.at(i)<<" "<<directories_ref.at(j)<<endl;
logstream<<"\tRepeat at index="<<i<<" "<<directories.at(i)<<" "<<directories_ref.at(j)<<endl;
logstream<<"\tErasing "<<directories.at(i)<<" from vector."<<endl;
if(verbose)
{
cout<<"Repeat at index="<<i<<" "<<directories.at(i)<<" "<<directories_ref.at(j)<<endl;
cout<<"Erasing "<<directories.at(i)<<" from vector."<<endl;
}
repstream<<directories.at(i)<<endl;
directories.erase(directories.begin()+i);
}
}
}
logstream<<"After checking for organisms present in both the main set and reference set: "<<endl;
logstream<<"\tNumber of repeats (present in both FAA and FAA_ref) = "<<repeats_correction<<endl;
logstream<<"\tdirectories.size() = "<<directories.size()<<endl;
logstream<<"\tdirectories_ref.size() = "<<directories_ref.size()<<endl<<endl;
if(verbose)
{
cout<<"Number of repeats (present in both FAA and FAA_ref) = "<<repeats_correction<<endl;
cout<<"Size of directories = "<<directories.size()<<endl;
cout<<"Size of directories_ref = "<<directories_ref.size()<<endl;
}
logstream<<"Cleaning up vectors: 2) Checking for empty inputs."<<endl;
for(int i=directories_ref.size()-1; i>=0; i--)
{
vector<string> files = u.load_names(path+"/"+rn+"/FAA_ref/"+directories_ref.at(i));
if(files.size()==0)
{
logstream<<"\tEmpty reference directory for "<<directories_ref.at(i)<<endl;
logstream<<"\tDeleting organism"<<endl;
if(MIF_DEBUG || verbose)
{
cout<<"Empty reference directory for "<<directories_ref.at(i)<<endl;
cout<<"Deleting organism"<<endl;
}
directories_ref.erase(directories_ref.begin()+i);
}
}
logstream<<"Number of repeats (present in both FAA and FAA_ref) = "<<repeats_correction<<endl;
for(int i=directories.size()-1; i>=0; i--)
{
vector<string> files = u.load_names(path+"/"+rn+"/FAA/"+directories.at(i));
if(files.size()==0)
{
logstream<<"\tEmpty directory for "<<directories.at(i)<<endl;
logstream<<"\tDeleting organism"<<endl;
if(MIF_DEBUG || verbose)
{
cout<<"Empty directory for "<<directories.at(i)<<endl;
cout<<"Deleting organism"<<endl;
}
directories.erase(directories.begin()+i);
}
}
outstream<<directories_ref.size()<<endl<<directories.size()-repeats_correction<<endl<<tag_length<<endl<<endl;
logstream<<"After deleting empty directories: "<<endl;
logstream<<"\tdirectories.size() = "<<directories.size()<<endl;
logstream<<"\tdirectories_ref.size() = "<<directories_ref.size()<<endl;
//First write out the reference organisms, sorted alphabetically.
int transition_ordinal=0; //The integer that keeps track of which ordinal we are on, between the ref set and the main set. Accounts for repeats.
if(MIF_DEBUG || verbose)
{
cout<<"Size of directories = "<<directories.size()<<endl;
cout<<"Size of directories_ref = "<<directories_ref.size()<<endl;
}
logstream<<endl<<"Writing out info file: 1) reference set"<<endl;
outstream<<directories_ref.size()<<endl<<directories.size()-repeats_correction<<endl<<k<<endl<<endl;
//Write out reference organisms to the info file first, sorted alphabetically.
for(int i=0; i<directories_ref.size(); i++)
{
vector<string> files = u.load_names(path+"/"+rn+"/FAA_ref/"+directories_ref.at(i));
if(MIF_DEBUG)
logstream<<"\ti = "<<i<<" "<<directories_ref.at(i)<<" , files.size() = "<<files.size()<<endl;
if(MIF_DEBUG || verbose)
{
cout<<"index = "<<i<<" "<<directories_ref.at(i)<<" , files.size() = "<<files.size()<<endl;
}
for(int j=0; j<files.size(); j++)
{
// outstream<<"Genome ordinal = "<<i<<endl<<"Genome name: "<<directories_ref.at(i)<<endl<<"Genome file: "<<path+"/"+rn+"/FAA_ref/"+directories_ref.at(i)+"/"+files.at(j)<<endl<<endl;
string temp_path = path+"/"+rn+"/FAA_ref/"+directories_ref.at(i)+"/"+files.at(j);
outstream<<i<<endl<<directories_ref.at(i)<<endl<<temp_path<<endl<<endl;
//outstream<<directories_ref.at(i)<<endl;
}
outstream<<endl;
transition_ordinal=i+1;
}
//Second, write out the main organisms, sorted alphabetically.
//Don't write out the redundant proteomes.
int transition_ordinal=directories_ref.size(); //The integer that keeps track of which ordinal we are on, between the ref set and the main set.
logstream<<endl<<"Writing out info file: 2) main set"<<endl;
for(int i=0; i<directories.size(); i++)
{
bool repeat=false;
int ordinal=-1;
for(int j=0; j<directories_ref.size(); j++)
{
if(directories.at(i)==directories_ref.at(j))
{
repeat=true;
ordinal=j;
}
}
vector<string> files = u.load_names(path+"/"+rn+"/FAA/"+directories.at(i));
if(!repeat)
logstream<<"\ti = "<<i<<"("<<transition_ordinal+i<<")"<<directories.at(i)<<" , files.size() = "<<files.size()<<endl;
if(MIF_DEBUG || verbose)
{
vector<string> files = u.load_names(path+"/"+rn+"/FAA/"+directories.at(i));
if(MIF_DEBUG)
{
cout<<"index = "<<i<<" "<<directories.at(i)<<" , files.size() = "<<files.size()<<endl;
}
for(int j=0; j<files.size(); j++)
{
// outstream<<"Genome ordinal = "<<transition_ordinal<<endl<<"Genome name: "<<directories.at(i)<<endl<<"Genome file: "<<path+"/"+rn+"/FAA/"+directories.at(i)+"/"+files.at(j)<<endl<<endl;
string temp_path = path+"/"+rn+"/FAA/"+directories.at(i)+"/"+files.at(j);
outstream<<transition_ordinal<<endl<<directories.at(i)<<endl<<temp_path<<endl<<endl;
//outstream<<directories.at(i)<<endl;
}
outstream<<endl;
transition_ordinal++;
cout<<"index = "<<i<<" "<<directories.at(i)<<" , files.size() = "<<files.size()<<endl;
}
else
int ordinal=transition_ordinal+i;
for(int j=0; j<files.size(); j++)
{
if(MIF_DEBUG)
{
cout<<"Omitting redundant organism at i="<<i<<" for "<<directories.at(i)<<endl;
}
repstream<<ordinal<<" "