Commit 262e8216 authored by Raquel Bromberg's avatar Raquel Bromberg

Minor code changes (mostly code cleanup)

parent 25aca03f
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <omp.h>
//#include <omp.h>
#include <getopt.h>
#include "mctr.h"
......@@ -58,9 +58,11 @@ int main(int argc, char* argv[])
// cout<<"p="<<full_path<<endl;
// cout<<"fs="<<fs<<endl;
cout<<"Calling run on real data"<<endl;
mctr m(full_path,"",fs,fo,partial_run);
cout<<"Calling run()"<<endl;
m.run_it0();
cout<<"Calling run on scrambled data"<<endl;
mctr mscr(full_path,"scr",fs,fo,partial_run);//scrambled
mscr.run_it0();
......
......@@ -9,10 +9,18 @@ using namespace std;
int main(int argc, char* argv[])
{
string full_path = "";
if(argc<2)
{
cout<<"Wrong use of filter.cpp: ./filt <path to run>"<<endl;
exit(1);
}
string full_path = argv[1];
int mismatch = 13;
int filtering_steps = 10;
char c;
bool verbose=false;
bool very_similar_set=false; //e.g. input of all E.coli
cout<<"Values stored in argv:"<<endl;
for(int i=0; i<argc; i++)
......@@ -20,7 +28,7 @@ int main(int argc, char* argv[])
cout<<argv[i]<<endl;
}
while((c = getopt(argc,argv,"m:p:f:")) != -1)
while((c = getopt(argc,argv,"m:p:f:vs")) != -1)
{
switch(c)
{
......@@ -33,14 +41,19 @@ int main(int argc, char* argv[])
case 'f':
filtering_steps=atoi(optarg);
break;
case 'v':
verbose=true;
break;
case 's':
very_similar_set=true;
break;
default:
abort();
}
}
filter f(full_path,mismatch,filtering_steps);
filter f(full_path,mismatch,filtering_steps,verbose,very_similar_set);
f.write_out_filt_logs();
f.print_final_hist();
f.mark_discrepancies();
return 0;
......
/*
SlopeTree CONSERVATION FILTER.
Filter:
Reads in the merged list of kmers (produced by tmerg)
and creates a filtered set of proteins in which proteins
Input: The merged list of kmers (produced by tmerg)
Output: A filtered set of proteins in which proteins
with kmers exhibiting unusual copy number patterns are
removed.
*/
......@@ -27,16 +27,9 @@ removed.
using namespace std;
const double GNUMS_TO_GENEIDS_RATIO=1.3; //GNUMS_TO_GENEIDS_RATIO:
const double CLPAR4=0.9;
const double FLAGGING_CUTOFF=0.8;
const bool FILTER_DEBUG=false;
/*struct cluster
{
vector<string> seqs;
};
*/
//can be replaced by pair<int,int>
struct gnum_gi
{
......@@ -51,7 +44,8 @@ struct range
int end; //inclusive
};
struct cluster2
//can also probably be replaced.
struct cluster
{
vector<range> ranges;
};
......@@ -64,47 +58,56 @@ class filter
int tag_length;
ofstream logstream;
int ref_set_size;
int cluster_id; //this really shouldn't be a member variable
int main_set_size;
int mismatch_cutoff;
vector<string> files;
int final_hist[20]; //Histogram of amino acid frequencies over all the sequences added to any cluster.
int CLUSTERING_CUTOFF; //~13 FOR 20MERS
int CLUSTERING_CUTOFF; //DEFAULT = 13 MATCHES FOR 20MERS
int filtering_steps;
bool VERBOSE;
amino_acids aas;
vector<org> organisms;
vector<string> tags;
util u;
bool VERY_SIMILAR_SET; //like an input of all E.coli.
bool VERY_SIMILAR_SET; //like an input of all E.coli. User has to specify this case.
void read_in_files();
void open_tag_files();
void read_in_tags(vector<string>& tags, char current_amino_acid, int& num_refs);
bool compare_tags(string s1, string s2);
void form_clusters(vector<string>& tags);//, string s);
void view_clusters2(vector<cluster2>& clusters2,vector<string>& tags);
int get_ref_gnums(set<int>& gnums2);
int get_ref_genes(vector<gnum_gi>& v2);
void form_clusters(vector<string>& tags);
void update_orgs_with_clusters(vector<cluster>& clusters);
void filter_against_ref(set<int>& gnums, vector<gnum_gi>& ggv);
void filter_against_self(set<int>& gnums, vector<gnum_gi>& ggv);
void flag_proteomes(set<int>& gnums, vector<gnum_gi>& ggv);
void view_clusters(vector<cluster>& clusters,vector<string>& tags);
int get_ref_gnums(set<int>& gnums);
int get_ref_genes(vector<gnum_gi>& ggv);
void set_organisms_v1();
// void form_clusters(vector<string>& tags, string s);
public:
//constructors
filter(string full_path_par, int mismatch_par, int filtering_steps_par);
filter(string full_path_par, int mismatch_par, int filtering_steps_par, bool verbose_par, bool VERY_SIMILAR_par);
~filter();
void merge();
void read_in_tfile(int i);
void write_out_filt_logs();
void print_final_hist();
//vector<org> return_organisms();
void mark_discrepancies();
void write_out_filt_logs();
void merge();
void read_in_tfile(int i);
};
filter::filter(string path_par, int mismatch_par, int filtering_steps_par)
/*************************************************************************
Constructor.
************************************************************************/
filter::filter(string path_par, int mismatch_par, int filtering_steps_par, bool verbose_par, bool VERY_SIMILAR_par)
{
if(FILTER_DEBUG)
VERBOSE=verbose_par;
VERY_SIMILAR_SET=VERY_SIMILAR_par;
if(FILTER_DEBUG || VERBOSE)
{
cout<<"filter::filter(string,int)"<<endl;
}
......@@ -115,32 +118,365 @@ filter::filter(string path_par, int mismatch_par, int filtering_steps_par)
}
u.extract_paths(path_par,path,bfn);
string logfile = path+"/"+bfn+"/log.txt";
u.open_ofile_app(logstream,logfile,"logfile");
logstream<<"filter::filter(string,int,int,bool)"<<endl;
cout<<"log for filter run being written to logfile="<<logfile<<endl;
u.get_tl(path+"/"+bfn+"/"+bfn+"_info.txt",tag_length);
u.get_rss(path+"/"+bfn+"/"+bfn+"_info.txt",ref_set_size);
u.get_mss(path+"/"+bfn+"/"+bfn+"_info.txt",main_set_size);
CLUSTERING_CUTOFF=mismatch_par;
filtering_steps=filtering_steps_par;
set_organisms_v1(); //read in all information from <bfn>_orgs_saved.txt
string logfile = path+"/"+bfn+"/log.txt";
u.open_ofile_app(logstream,logfile,"logfile");
logstream<<"filter::filter(string,int)"<<endl;
read_in_files();
cluster_id=0;
for(int i=0; i<files.size(); i++)
for(int i=0; i<files.size(); i++) //For all tag files...
{
read_in_tfile(i);
form_clusters(tags);//, s);
form_clusters(tags);
tags.clear();
}
}
/***************
Destructor
***************/
filter::~filter()
{
cout<<"Through filter destructor"<<endl;
if(VERBOSE)
{
cout<<"You may now use the fwrite program to extract proteomes filtered by conservation."<<endl;
}
}
/**********************************************
Note: No information regarding these clusters is
preserved.
The organisms object holds all the counts and
can be used at the end either to
a) write out the reduced proteomes or
b) write out the reduced MERGED_TAGS.
***********************************************/
void filter::form_clusters(vector<string>& tags)
{
if(FILTER_DEBUG || VERBOSE)
{
cout<<"filter::form_clusters(vector<string>&, string)"<<endl;
}
vector<cluster> clusters;
int pivot=0; //marks the index of a sequence not necessarily part of a cluster. When a cluster forms, pivot points to the index of the first sequence in the cluster.
for(int i=0; i<tags.size()-1; i++) //The -1 is because we always compare the tag that we are ON to the NEXT one (+1).
//We could also start at 1 and compare the tag that we are on to the previous one. We just don't do it this way.
{
if(i%100000==0 && (FILTER_DEBUG || VERBOSE) )
{
cout<<"i="<<i<<" "<<tags.at(i)<<endl;
}
/*1. Mark genes with internal kmer repeats for deletion.
// if(tags.at(i)==tags.at(i+1)) //Basically, if there is the same sequence in the same proteome/gene combination.
{
stringstream ss;
ss<<tags.at(i);
string tagseq;
int gnum,gene_id;
ss>>tagseq>>gnum>>gene_id;
organisms.at(gnum).mark_gene_for_deletion(gene_id); //It's 2 tags, but they're identical, i.e. for the same gene.
//So you only have to do one deletion-marking.
//organisms.at(gnum).write_out_gene(gene_id);
}
//End of removing genes with internal kmer repeats.
*/ //this is now taken care of in the mefilter.cpp code totally independently of this module.
int num_aa_matches=0;
//2. Count the number of matching amino acids between the current adjacent
//CLUSTERING_CUTOFF~=13 for diverse inputs, ~=20 for very similar inputs (e.g. all E.coli)
for(int k=0; k<tag_length && num_aa_matches<CLUSTERING_CUTOFF; k++)
{
if(tags.at(i)[k]==tags.at(i+1)[k] && tags.at(i)[k]!='^')
{
num_aa_matches++;
}
}
if(num_aa_matches<CLUSTERING_CUTOFF) //new sequence at i+1 not in the current cluster.
{
if( (i-pivot) > 3 ) //If the preceding already are a cluster, which means at least 4 tags.
{
cluster temp_cluster;
range temp_range;
temp_range.start=pivot; //bounds are inclusive
temp_range.end=i; //bounds are inclusive
temp_cluster.ranges.push_back(temp_range);
clusters.push_back(temp_cluster);
}
else if(clusters.size()>0) //If it's not a cluster, check if it could belong to the previous cluster.
{
int nam=0;
for(int k=0; k<tag_length && nam<CLUSTERING_CUTOFF; k++)
{
int index=clusters.at(clusters.size()-1).ranges.at(clusters.at(clusters.size()-1).ranges.size()-1).end;
if(tags.at(i)[k]==tags.at(index)[k])
{
nam++;
}
}
if(nam>=CLUSTERING_CUTOFF) //if there is enough overlap between the two sequences.
{
range temp_range;
temp_range.start=i;
temp_range.end=i;
clusters.at(clusters.size()-1).ranges.push_back(temp_range);
}
}
pivot=i+1; //Current sequence at index i not part of a cluster with the sequence at i+1. Therefore, move pivot to i+1, which MIGHT form a cluster with the sequence at i+2, etc.
}
}
//Done forming the clusters.
if(FILTER_DEBUG)
{
view_clusters(clusters,tags);
}
int count_merged_clusters=0;
if(VERBOSE || FILTER_DEBUG)
{
cout<<"Number of clusters before merge = "<<clusters.size()<<endl;
cout<<"Merging the clusters..."<<endl;
}
for(int i=clusters.size()-1; i>0; i--) //merge the clusters starting from the end of the vector.
{
int num_aa_matches=0;
for(int k=0; k<tag_length && num_aa_matches<CLUSTERING_CUTOFF; k++)
{
//Compare the first tag in the upper cluster to the last tag in the lower cluster.
int index1=clusters.at(i).ranges.at(0).start;
int index2=clusters.at(i-1).ranges.at(clusters.at(i-1).ranges.size()-1).end;
// if(tags.at(clusters.at(i).ranges.at(0).start)[k]==tags.at(clusters.at(i-1).ranges.at(clusters.at(i-1).ranges.size()-1).end)[k])
if(tags.at(index1)[k]==tags.at(index2)[k])
{
num_aa_matches++;
}
}
//If there's a match, then push all the tags onto the lower cluster and clear the upper one.
if(num_aa_matches>=CLUSTERING_CUTOFF)
{
for(int k=0; k<clusters.at(i).ranges.size(); k++)
{
clusters.at(i-1).ranges.push_back(clusters.at(i).ranges.at(k));
}
clusters.at(i).ranges.clear();
clusters.erase(clusters.begin()+i); // new line of code.
count_merged_clusters++;
}
}
if(VERBOSE || FILTER_DEBUG)
{
cout<<"Number of clusters after merge = "<<clusters.size()<<endl;
}
update_orgs_with_clusters(clusters);
cout<<"Bottom of form_clusters(...)"<<endl;
}
/****************************************************************
***************************************************************/
void filter::update_orgs_with_clusters(vector<cluster>& clusters)
{
for(int i=0; i<clusters.size(); i++) //For all the clusters.
{
set<int> gnums;
vector<gnum_gi> ggv;
//For all the sequence ranges in the cluster
for(int f=0; f<clusters.at(i).ranges.size(); f++)
{
//For all the tags in the range.
for(int g=clusters.at(i).ranges.at(f).start; g<=clusters.at(i).ranges.at(f).end; g++)
{
stringstream ss;
ss<<tags.at(g);
string tagseq;
int gnum,gene_id;
ss>>tagseq>>gnum>>gene_id; //e.g. AKLMNPLSKLRTIIKMHKLA 3 2983
gnums.insert(gnum); //can only hold unique genome ordinals, so max size = input size (like 300)
gnum_gi gg;
gg.gnum=gnum;
gg.gene_id=gene_id;
ggv.push_back(gg); //holds all gnums and gis, including redundant ones, e.g. could have 2 identical instances: 3 2983, 3 2983
}
}
//At this point, gnums has all the UNIQUE gnums in the current cluster,
//and ggv has all the gnum/gi pairs in the cluster (NON-UNIQUE & UNIQUE).
if(ref_set_size>0)
{
filter_against_ref(gnums,ggv);
}
else
{
filter_against_self(gnums,ggv);
}
flag_proteomes(gnums,ggv);
}
if(VERBOSE || FILTER_DEBUG)
{
cout<<"Finished another batch of clusters."<<endl;
cout<<"Size of clusters = "<<clusters.size()<<endl;
}
}
/*********************************************************************
********************************************************************/
void filter::filter_against_ref(set<int>& gnums, vector<gnum_gi>& ggv)
{
// int replacement_for_gnumssize = get_ref_gnums(gnums); //number of reference genomes in cluster.
int ref_gnums_s = get_ref_gnums(gnums); //number of reference genomes in cluster.
// int replacement_for_ggvsize = get_ref_genes(ggv); //number of genes (not necessarily unique) in reference in cluster.
int ref_ggv_s = get_ref_genes(ggv); //number of genes (not necessarily unique) in reference in cluster.
//For all the genes (not necessarily unique) with hits.
//What if the same gene is here more than once? It will get incremented more than once.
for(int f=0; f<ggv.size(); f++)
{
//If there was at least one hit in the reference set.
// if(replacement_for_gnumssize>0)
if(ref_gnums_s>0)
{
// organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,replacement_for_gnumssize,replacement_for_ggvsize);
// organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,replacement_for_gnumssize,replacement_for_ggvsize, (double)replacement_for_gnumssize/(double)ref_set_size);
// organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,ref_gnums_s,replacement_for_ggvsize, (double)ref_gnums_s/(double)ref_set_size);
organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,ref_gnums_s,ref_ggv_s, (double)ref_gnums_s/(double)ref_set_size);
}
}
}
/**********************************************************************
*********************************************************************/
void filter::filter_against_self(set<int>& gnums, vector<gnum_gi>& ggv)
{
for(int f=0; f<ggv.size(); f++)
{
// organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,gnums.size(),ggv.size());
organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,gnums.size(),ggv.size(), (double)gnums.size()/(double)organisms.size());
}
}
/*****************************************************************
Flag proteomes that have an over-representation of conserved kmers.
This code is ONLY for the purpose of flagging proteomes.
****************************************************************/
void filter::flag_proteomes(set<int>& gnums, vector<gnum_gi>& ggv)
{
// int replacement_for_gnumssize = get_ref_gnums(gnums); //number of reference genomes in cluster.
int ref_gnums_s = get_ref_gnums(gnums); //number of reference genomes in cluster.
// int replacement_for_ggvsize = get_ref_genes(ggv); //number of genes (not necessarily unique) in reference in cluster.
int ref_ggv_s = get_ref_genes(ggv); //number of genes (not necessarily unique) in reference in cluster.
double left;
double right;
double avg_representation;
if(ref_set_size)
{
// left=replacement_for_gnumssize;
left=ref_gnums_s;
right=FLAGGING_CUTOFF*(double)ref_set_size; //i.e. If it's a conserved group.
// avg_representation=(double)replacement_for_gnumssize/(double)replacement_for_ggvsize;
// avg_representation=(double)ref_gnums_s/(double)replacement_for_ggvsize;
avg_representation=(double)ref_gnums_s/(double)ref_ggv_s;
}
else if(!ref_set_size)
{
left=gnums.size();
right=FLAGGING_CUTOFF*organisms.size();
avg_representation=(double)gnums.size()/(double)ggv.size();
}
if(VERY_SIMILAR_SET)
{
if(left>right)
{
for(set<int>::iterator f=gnums.begin(); f!=gnums.end(); ++f)
{
int current_gnum=*f;
int instances=0;
for(int k=0; k<ggv.size(); k++)
{
if(ggv.at(k).gnum==current_gnum)
{
instances++;
}
}
organisms.at(*f).add_discr(instances,avg_representation);
}
for(set<int>::iterator f = gnums.begin(); f!=gnums.end(); ++f) //Flag proteomes that are lacking conserved kmers.
{
organisms.at(*f).increment_conserved();
}
}
}
else if(!VERY_SIMILAR_SET)
{
//This block is for the purpose of identifying proteomes that either are lacking many very conserved genes, or have an overrepresentation of them.
//If the number of reference genomes represented in the cluster is above the value set by FLAGGING_CUTOFF (e.g. 95 reference genomes hit out of 100 total with a FLAGGING_CUTOFF of .9 would be above the cutoff)
if(left>=right)
{
for(set<int>::iterator f=gnums.begin(); f!=gnums.end(); ++f)
{
int current_gnum=*f;
int instances=0;
for(int k=0; k<ggv.size(); k++)
{
if(ggv.at(k).gnum==current_gnum)
{
instances++;
}
}
organisms.at(*f).add_discr(instances,avg_representation);
}
//2 flag proteomes that are lacking conserved kmers.
for(set<int>::iterator f = gnums.begin(); f!=gnums.end(); ++f)
{
organisms.at(*f).increment_conserved();
}
}
}
/*Very important final possibility.
//If the "cluster" consists of one gnum, mark the proteins for deletion.
// if(gnums.size()==1)
// {
//proteomes.at(ggv.at(0).gnum).mark_gene_for_deletion(ggv.at(0).gene_id);
// for(int f=0; f<ggv.size(); f++)
// {
// //organisms.at(ggv.at(f).gnum).increment_ggs(ggv.at(f).gene_id,gnums.size(),ggv.size());
// organisms.at(ggv.at(f).gnum).mark_gene_for_deletion(ggv.at(f).gene_id);
// }
// } */ //this was for parallel identification of mobile elements which now has its own module in mefilter.
}
void filter::write_out_filt_logs()
{
u.rm_mkdir(path+"/"+bfn,"filt_logs");
u.rm_mkdir(path+"/"+bfn,"FILT_LOGS");
u.rm(path+"/"+bfn+"/proteome_sizes.txt");
u.rm(path+"/"+bfn+"/pv.txt");
u.rm(path+"/"+bfn+"/pvv.txt");
......@@ -213,7 +549,7 @@ void filter::write_out_filt_logs()
for(int filt_option = 0; filt_option<=filtering_steps; filt_option++)
{
//Assuming that the value produced a unique set
if(filt_option==0 || (num_proteins.at(filt_option)!=num_proteins.at(filt_option-1) ) )
// if(filt_option==0 || (num_proteins.at(filt_option)!=num_proteins.at(filt_option-1) ) )
{
//Open a file that will serve as a log for which proteins to keep and throw away.
stringstream ss;
......@@ -224,7 +560,7 @@ void filter::write_out_filt_logs()
ss.clear();
ss<<filtering_steps;
ss>>fss;
string filtfile=path+"/"+bfn+"/filt_logs/filtlog_"+fss+"_"+col+".txt";
string filtfile=path+"/"+bfn+"/FILT_LOGS/filtlog_"+fss+"_"+col+".txt";
ofstream filtstream;
u.open_ofile_app(filtstream,filtfile,"filtfile");
......@@ -251,7 +587,7 @@ void filter::write_out_filt_logs()
organisms.at(i).set_ref(true);
}
organisms.at(i).write_out_new_proteome(column);
organisms.at(i).write_org_info_v2(savestream);
organisms.at(i).write_org_info_ggv(savestream);
}
}
......@@ -338,383 +674,13 @@ void filter::read_in_tfile(int i)
cout<<"Done reading in tags. tags.size() = "<<tags.size()<<endl;
}
/*
Note: No information regarding these clusters is preserved.
The organisms object holds all the counts and can be used at the end
either to a) write out the reduced proteomes or b) write out the
reduced MERGED_TAGS.
*/
void filter::form_clusters(vector<string>& tags)//, string s)
{
if(FILTER_DEBUG)
{
cout<<"filter::form_clusters(vector<string>&, string)"<<endl;
}
vector<cluster2> clusters2;
int piv1=0;
for(int i=0; i<tags.size()-1; i++) //The -1 is because we always compare the tag that we are ON to the NEXT one (+1).
//We could also start at 1 and compare the tag that we are on to the previous one. We just don't do it this way.
{
if(i%100000==0 && FILTER_DEBUG)
{
cout<<"i="<<i<<" "<<tags.at(i)<<endl;
}
//1. Mark genes with internal kmer repeats for deletion.
if(tags.at(i)==tags.at(i+1)) //Basically, if there is the same sequence in the same proteome/gene combination.
{
stringstream ss;
ss<<tags.at(i);
string tagseq;
int gnum,gene_id;
ss>>tagseq>>gnum>>gene_id;
organisms.at(gnum).mark_gene_for_deletion(gene_id); //It's 2 tags, but they're identical, i.e. for the same gene.
//So you only have to do one deletion-marking.
//organisms.at(gnum).write_out_gene(gene_id);
}
//End of removing genes with internal kmer repeats.
int matches2=0;
//2. Count the number of matching amino acids between the current adjacent
//CLUSTERING_CUTOFF~=13 for diverse inputs, ~=20 for very similar inputs (e.g. all E.coli)
for(int k=0; k<tag_length && matches2<CLUSTERING_CUTOFF; k++)
{
if(tags.at(i)[k]==tags.at(i+1)[k] && tags.at(i)[k]!='^')
{
matches2++;
}
}
if(matches2<CLUSTERING_CUTOFF) //new sequence not in the current cluster.
{
if( (i-piv1) > 3 ) //If the preceding already are a cluster, which means at least 4 tags.
{
cluster2 tempcl2;
range temprange;
temprange.start=piv1;
temprange.end=i;
tempcl2.ranges.push_back(temprange);
clusters2.push_back(tempcl2);
}
else if(clusters2.size()>0) //If it's not a cluster, check if it could belong to the previous cluster.
{
//cout<<"In the else loop. Checking to see if it's part of the previous cluster."<<endl;
int matches3=0;
for(int k=0; k<tag_length && matches3<CLUSTERING_CUTOFF; k++)
{
if(tags.at(i)[k]==tags.at(clusters2.at(clusters2.size()-1).ranges.at(clusters2.at