Commit f84d0ba9 authored by Raquel Bromberg's avatar Raquel Bromberg

wip mefilter.cpp added

parent b06e34d3
......@@ -18,6 +18,7 @@ int main(int argc, char* argv[])
string full_path = argv[1];
char c;
bool partial_run=false;
int fs=-1;
int fo=-1;
......@@ -32,7 +33,7 @@ int main(int argc, char* argv[])
cout<<argv[i]<<endl;
}
while((c = getopt(argc,argv,"p:o:f:")) != -1)
while((c = getopt(argc,argv,"p:o:f:C")) != -1)
{
switch(c)
{
......@@ -45,6 +46,9 @@ int main(int argc, char* argv[])
case 'f':
fs=atoi(optarg);
break;
case 'C':
partial_run=true;
break;
default:
abort();
}
......@@ -54,10 +58,10 @@ int main(int argc, char* argv[])
// cout<<"p="<<full_path<<endl;
// cout<<"fs="<<fs<<endl;
mctr m(full_path,"",fs,fo);
mctr m(full_path,"",fs,fo,partial_run);
cout<<"Calling run()"<<endl;
m.run_it0();
mctr mscr(full_path,"scr",fs,fo);//scrambled
mctr mscr(full_path,"scr",fs,fo,partial_run);//scrambled
mscr.run_it0();
return 0;
......
......@@ -4,6 +4,7 @@
#include <cstdlib>
#include <cmath>
#include <omp.h>
#include <getopt.h>
#include "org.h"
#include "amino_acids.h"
......@@ -45,11 +46,12 @@ struct v
int uscore;
};
void do_counts(Mproteome p1, Mproteome p2, int tag_length, vector<int>& array, int& max, int& min,string& most_unlikely, int& uscore,int SOS_CUTOFF);
void do_counts(Mproteome p1, Mproteome p2, int tag_length, vector<int>& array, int& max, int& min,string& most_unlikely, int& uscore,int SOS_CUTOFF,bool partial_run,char partial_run_char);
avg calc_running_avgs(int array[], int max, int min);
bool check_composition(string s,int tag_length,amino_acids& aas,int SOSCUTOFF);
int get_num_bins(string path,string bfn);
int calc_sos_cutoff(string path,string bfn);
char get_pr_lead(string path, string bfn);
int main(int argc, char* argv[])
{
......@@ -69,6 +71,27 @@ int main(int argc, char* argv[])
cout<<"num_bins = "<<num_bins<<endl;
string PS = "P";
int SOS_CUTOFF = calc_sos_cutoff(path,bfn);
bool partial_run=false;
char partial_run_lead;
char c;
cout<<"Values stored in argv:"<<endl;
for(int i=0; i<argc; i++)
{
cout<<argv[i]<<endl;
}
while((c = getopt(argc,argv,":C")) != -1)
{
switch(c)
{
case 'C':
partial_run=true;
break;
default:
abort();
}
}
//string FM = argv[3]; //always set to true
// int num_bins=atoi(argv[4]); //set it from within
......@@ -76,6 +99,10 @@ int main(int argc, char* argv[])
//int SOS_CUTOFF=atoi(argv[6]); //set it from within
amino_acids aas;
if(partial_run)
{
partial_run_lead=get_pr_lead(path,bfn);
}
//1. read in info_file
ifstream instream;
......@@ -265,8 +292,8 @@ int main(int argc, char* argv[])
int max(0),min(0);
string most_unlikely="";
int uscore=0;
do_counts(proteomes.at(i),proteomes.at(j),tag_length,array,max,min,most_unlikely,uscore,SOS_CUTOFF);
do_counts(proteomes.at(j),proteomes.at(i),tag_length,array,max,min,most_unlikely,uscore,SOS_CUTOFF);
do_counts(proteomes.at(i),proteomes.at(j),tag_length,array,max,min,most_unlikely,uscore,SOS_CUTOFF,partial_run,partial_run_lead);
do_counts(proteomes.at(j),proteomes.at(i),tag_length,array,max,min,most_unlikely,uscore,SOS_CUTOFF,partial_run,partial_run_lead);
//avg result;
//result = calc_running_avgs(array,max,min);
......@@ -357,7 +384,7 @@ int main(int argc, char* argv[])
}
//void do_counts(Mproteome p1, Mproteome p2, int tag_length, int array[],int& max, int& min,string& most_unlikely,int& uscore)
void do_counts(Mproteome p1, Mproteome p2, int tag_length, vector<int>& array,int& max, int& min,string& most_unlikely,int& uscore,int SOS_CUTOFF)
void do_counts(Mproteome p1, Mproteome p2, int tag_length, vector<int>& array,int& max, int& min,string& most_unlikely,int& uscore,int SOS_CUTOFF,bool partial_run,char partial_run_lead)
{
int sum=0;
for(int i=0; i<array.size(); i++)
......@@ -371,6 +398,8 @@ void do_counts(Mproteome p1, Mproteome p2, int tag_length, vector<int>& array,in
}
amino_acids aas;
if(!partial_run)
{
for(int i=0; i<p1.proteins.size(); i++)
{
for(int j=0; j<p1.proteins.at(i).length(); j++)
......@@ -416,6 +445,58 @@ void do_counts(Mproteome p1, Mproteome p2, int tag_length, vector<int>& array,in
}
}
}
}
else if(partial_run)
{
for(int i=0; i<p1.proteins.size(); i++) // for all proteins in proteome p1
{
for(int j=0; j<p1.proteins.at(i).length(); j++) // for all positions in given protein
{
if(p1.proteins.at(i)[j]==partial_run_lead)
{
string s="";
double score=0.0;
for(int k=0; k<tag_length && (j+k)<p1.proteins.at(i).length(); k++)
{
// bool composition_ok=check_composition(p1.proteins.at(i).substr(k,tag_length),tag_length,aas,SOS_CUTOFF);
// if(composition_ok)
// {
s+=p1.proteins.at(i)[j+k];
int index=aas.get_aa(p1.proteins.at(i)[j+k]);
if(index>=0 && index<20)
{
score+=p1.bit_scores[index]+p2.bit_scores[index];
array[(int)(score+.5)]+=1;
if( (score+.5)<min || min==0)
{
min=score+.5;
//cout<<"min score = "<<min<<" sequence = "<<s<<endl;
}
if( (score+.5)>max)
{
max=score+.5;
//cout<<"max score = "<<max<<" sequence = "<<s<<endl;
uscore=max;
most_unlikely=s;
}
}
else
{
k=tag_length;
j+=k-1;
}
// }
// else
// {
// k=tag_length;
// j+=k-1;
// }
}
}
}
}
}
}
bool check_composition(string s,int tag_length,amino_acids& aas,int SOSCUTOFF)
......@@ -541,3 +622,21 @@ int calc_sos_cutoff(string path,string bfn)
instream.close();
return result;
}
char get_pr_lead(string path, string bfn)
{
ifstream instream;
string infile = path+"/"+bfn+"/TAGS/TAGS0.txt";
instream.open(infile.c_str());
if(instream.fail())
{
cout<<"In function tmerg::get_pr_lead(): Failed to open infile to "<<infile<<endl;
exit(1);
}
int num_tags;
instream>>num_tags;
string first_tag;
instream>>first_tag;
instream.close();
return first_tag[0];
}
#ifndef FILTER_H
#define FILTER_H
/*
A filter for mobile elements.
filter:
Reads in the merged list of kmers (produced by tmerg) and creates a filtered set of proteins
in which proteins with kmers exhibiting unusual copy number patterns are removed.
SlopeTree CONSERVATION FILTER.
Filter:
Reads in the merged list of kmers (produced by tmerg)
and creates a filtered set of proteins in which proteins
with kmers exhibiting unusual copy number patterns are
removed.
*/
#ifndef FILTER_H
#define FILTER_H
#include <iostream>
#include <vector>
#include <sstream>
......
......@@ -78,6 +78,8 @@ class mctr
amino_acids aas;
util u;
int run_type; //0=normal run, 1=ref_set_run, 2=final pair run
bool partial_run;
char partial_run_lead;
//Variables necessary for HGT correction on subsequent passes through mctr.
int prob1; //ordinal of one of the 2 problematic proteomes in the whole run.
......@@ -151,10 +153,11 @@ class mctr
void populate_bitterv();
int get_bitterv_index(int i1, int i2);
int get_sos_cutoff();
char get_pr_lead();
public:
//constructors
mctr(string full_path_par,string dir_par, int filtering_steps_par, int filtering_option_par); //Use for the main run when doing entire set.
mctr(string full_path_par,string dir_par, int filtering_steps_par, int filtering_option_par,bool partial_run_par); //Use for the main run when doing entire set.
mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_options_par, int prob1_par, int prob2_par,vector<string>& ref_tags_par); //Use when testing a pair against a reference set.
mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_option_par, int prob1_par, int prob2_par); //Use when running two proteomes, HGT-genes removed, against each other. Final step.
......@@ -171,13 +174,18 @@ class mctr
//Count all matches between all pairs present in a MERGED_TAGS directory (and also the corresponding MERGED_TAGSscr directory, but has to be called on the data separately using a different object).
//This could be for a set without filtering (e.g. MERGED_TAGS plain) or with filering (e.g. MERGED_TAGS_15_8).
//Without filtering: filtering_steps_par=-1, filtering_option_par=-1
mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_option_par)
mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_option_par,bool partial_run_par)
{
cout<<"mctr::mctr(string,string,int,int)"<<endl;
run_type=0;
partial_run=partial_run_par;
//main variables.
u.extract_paths(full_path_par,path,bfn);
if(partial_run)
{
partial_run_lead=get_pr_lead();
}
u.get_rss(path+"/"+bfn+"/"+bfn+"_info.txt",refsetsize);
u.get_tl(path+"/"+bfn+"/"+bfn+"_info.txt",tag_length);
mbkl=int(MBKL*(double)tag_length);
......@@ -404,6 +412,8 @@ void mctr::run_it0()
}
//parallelize command.
if(!partial_run)
{
for(int i=0; i<20; i++)
{
//Coptim1_3Dmat* corr_array_temp; //when it's parallelized, this will be set to zero, filled by each thread, then added into the real one at the end with a lock on it
......@@ -438,6 +448,41 @@ void mctr::run_it0()
cout<<"Escaped recursive function find_matches(...)!"<<endl;
astr.close();
}
}
else if(partial_run)
{
//Coptim1_3Dmat* corr_array_temp; //when it's parallelized, this will be set to zero, filled by each thread, then added into the real one at the end with a lock on it
all_matches.clear();
ofstream aastr;
open_str(aam_directory,aas.get_aa(partial_run_lead),aastr);
ofstream astr;
open_str(am_directory,aas.get_aa(partial_run_lead),astr);
//for(int n=0; n<all_matches.size(); n++)
//{
//all_matches_stream<<all_matches.at(n)<<endl;
//}
//all_matches.clear();
array.clear();
gnums.clear();
gene_ids.clear();
set_array_it0(aas.get_aa(partial_run_lead));
cout<<"First 3 rows of array: "<<endl;
cout<<array.at(0)<<" "<<gnums.at(0)<<" "<<gene_ids.at(0)<<endl;
cout<<array.at(1)<<" "<<gnums.at(1)<<" "<<gene_ids.at(1)<<endl;
cout<<array.at(2)<<" "<<gnums.at(2)<<" "<<gene_ids.at(2)<<endl;
cout<<"Last row of array: "<<endl;
cout<<array.at(array.size()-1)<<" "<<gnums.at(gnums.size()-1)<<" "<<gene_ids.at(gene_ids.size()-1)<<endl;
cout<<"Second to last row of array: "<<endl;
cout<<array.at(array.size()-2)<<endl;
cout<<"Last row of array: "<<endl;
cout<<array.at(array.size()-1)<<endl;
cout<<"Entering recursive function find_matches_it0(...)..."<<endl;
find_matches_it0(0,0,array.size()-1,"", aastr, astr);
cout<<"Escaped recursive function find_matches(...)!"<<endl;
astr.close();
}
cout<<"Done with mctr main algorithm."<<endl;
......@@ -3533,6 +3578,24 @@ int mctr::get_sos_cutoff()
return 6.5*tag_length;
}
char mctr::get_pr_lead()
{
ifstream instream;
string infile = path+"/"+bfn+"/TAGS/TAGS0.txt";
instream.open(infile.c_str());
if(instream.fail())
{
cout<<"In function tmerg::get_pr_lead(): Failed to open infile to "<<infile<<endl;
exit(1);
}
int num_tags;
instream>>num_tags;
string first_tag;
instream>>first_tag;
return first_tag[0];
instream.close();
}
/*void mctr::fill_genesOI()
{
ifstream instream;
......
......@@ -12,6 +12,7 @@ int main(int argc, char* argv[])
if(argc<2)
{
cout<<"Wrong input to mdist.cpp: ./mdist -p <full path> -f <filtering steps, =0 for unfiltered (e.g. all) data> -o <filtering option, =0 for unfiltered (e.g. all) data>"<<endl;
exit(1);
}
string full_path = argv[1];
......
#include <iostream>
#include <fstream>
//#include "mefilter.h"
#include "util.h"
using namespace std;
void populate(vector<string> files,vector<string>& proteins,vector<string>& info_lines);
int get_gene_id(string s);
void read_in_conserved_kmers(string consdirectory,vector<string>& conserved_kmers);
int main(int argc, char* argv[])
{
if(argc<2)
{
cout<<"Wrong use of mefilter.cpp: ./mef <full_path_to_run>"<<endl;
exit(1);
}
string full_path=argv[1];
string consdirectory=argv[2];
cout<<"full_path="<<full_path;
util u;
string path,bfn;
cout<<"extracting paths..."<<endl;
u.extract_paths(full_path,path,bfn);
cout<<"path="<<path<<endl;
cout<<"bfn="<<bfn<<endl;
ofstream logstream;
cout<<"Opening mefilter.cpp log file..."<<endl;
string logfile = path+"/"+bfn+"/logfile_mefilter.txt";
logstream.open(logfile.c_str());
if(logstream.fail())
{
cout<<"Failed to open logstream to logfile="<<logfile<<endl;
exit(1);
}
else
{
cout<<"Successfully opened logstream to logfile="<<logfile<<endl;
logstream<<"full_path="<<full_path<<endl;
logstream<<"path="<<path<<endl;
logstream<<"bfn="<<bfn<<endl;
}
int tag_length;
u.get_tl2(full_path,tag_length);
logstream<<"tag_length="<<tag_length<<endl;
int total_size;
total_size = u.get_ts(full_path); //total number of organisms.
logstream<<"total_size="<<total_size<<endl;
cout<<"total_size="<<total_size<<endl;
vector<string> conserved_kmers();
read_in_conserved_kmers(consfile,conserved_kmers);
logstream<<"Starting loop through all genome ordinals..."<<endl;
cout<<"mefilter.cpp: Starting loop through all genome ordinals..."<<endl;
for(int i=0; i<total_size; i++)
{
//ordinal=i.
logstream<<"i="<<i<<endl;
cout<<"Doing ordinal i="<<i<<" out of "<<total_size<<endl;
vector<string> files = u.get_files(full_path,i);
cout<<"Size of files = "<<files.size()<<endl;
vector<string> proteins;
vector<string> info_lines;
populate(files,proteins,info_lines);
// vector<string> tags = u.get_tags(full_path,i);
vector<string> tags1;
vector<int> tags2;
vector<int> tags3;
u.get_tags(full_path,i,tags1,tags2,tags3);
cout<<"size of files = "<<files.size()<<endl;
cout<<"size of tags1 = "<<tags1.size()<<endl;
cout<<"size of tags2 = "<<tags2.size()<<endl;
cout<<"size of tags3 = "<<tags3.size()<<endl;
vector<int> gids;
for(int i=0; i<proteins.size(); i++)
{
gids.push_back(0);
}
//At this stage, all proteins and their info lines have been read in and are stored in the info_lines and proteins vectors. Similarly, all tags have been read in and are stored in tags.
int lindex=0;
int rindex=1;
while(lindex<tags1.size() && rindex<tags1.size())
{
if(tags1.at(lindex)[tag_length-1]!='^')
{
while(rindex<tags1.size() && tags1.at(lindex)==tags1.at(rindex))
{
rindex++;
}
}
int findex=rindex-1;
if(findex>lindex) //no cluster.
{
cout<<"Matches: "<<endl;
for(int j=lindex; j<=findex; j++)
{
int gid=tags3.at(j);
gids.at(gid)++;
cout<<tags1.at(j)<<endl;
cout<<info_lines.at(gid)<<endl;
}
cout<<endl;
}
lindex=rindex;
rindex=lindex+1;
}
cout<<"Gids vector: "<<endl;
for(int i=0; i<gids.size(); i++)
{
cout<<i<<" "<<gids.at(i)<<endl;
}
int ME_count=0;
for(int i=0; i<gids.size(); i++)
{
if(gids.at(i)>0)// && gids.at(i)>=(double)proteins.at(i).length()/2.0)
{
ME_count++;
cout<<i<<" "<<gids.at(i)<<" "<<info_lines.at(i)<<endl;
}
}
cout<<"ME_count="<<ME_count<<endl;
cout<<"total="<<gids.size()<<endl;
cout<<"fraction = "<<(double)ME_count/(double)gids.size()<<endl;
exit(1);
} // for all organisms.
logstream.close();
return 0;
}
void populate(vector<string> files,vector<string>& proteins,vector<string>& info_lines)
{
for(int i=0; i<files.size(); i++)
{
ifstream instream;
instream.open(files.at(i).c_str());
if(instream.fail())
{
cout<<"In me_filter.cpp::populate(vector<string>,vector<string>&,vector<string>&): Failed to open instream to "<<files.at(i)<<endl;
exit(1);
}
string eater;
instream>>eater;
while(!instream.eof())
{
if(eater[0]=='>')
{
char buffer[5000];
instream.getline(buffer,5000);
string info_line = eater+" "+buffer;
eater="";
string p;
instream>>eater;
while(eater[0]!='>' && !instream.eof())
{
p=p+eater;
instream>>eater;
}
info_lines.push_back(info_line);
proteins.push_back(p);
}
}
cout<<"Size of info_lines="<<info_lines.size()<<endl;
cout<<"Size of proteins="<<proteins.size()<<endl;
instream.close();
}
}
int get_gene_id(string s)
{
istringstream iss(s);
string temp1,temp2;
int temp3;
iss>>temp1>>temp2>>temp3;
return temp3;
}
void read_in_conserved_kmers(string consdirectory,vector<string>& conserved_kmers)
{
vector<string> files;
util u;
u.get_file();
ifstream instream;
instream.open(consfile.c_str());
if(instream.fail())
{
cout<<"Failed to open consfile "<<consfile<<endl;
exit(1);
}
}
......@@ -92,7 +92,7 @@ class org
int get_num_proteins();
void fill_fields(string dir_par);
void generate_tags(bool do_scrambled);
void generate_tags(bool do_scrambled, bool partial_run);
void generate_scrambled_tags();
void clear_tags();
void clear_scr_tags();
......@@ -136,8 +136,6 @@ org::org()
// DCT - Wasn't initialized. Causes error later in mdist reading org file.
conserved = false;
}
org::~org()
......@@ -156,11 +154,11 @@ int org::mark_keepers(vector<string> keepers)
p.mark_keepers(keepers);
}
void org::generate_tags(bool do_scrambled)
void org::generate_tags(bool do_scrambled, bool partial_run)
{
total_num_aa=0;
read_in_proteins();
p.generate_tags(ordinal,do_scrambled);
p.generate_tags(ordinal,do_scrambled,partial_run);
}
void org::generate_scrambled_tags()
......
......@@ -22,7 +22,7 @@ const double KEEPERS_CUTOFF=0.6;
const bool SUMOFSQUARES=true;
const double CONSERVATION_CUTOFF=1.3;
const bool PROTEOME_DEBUG=false;
const char PARTIAL_RUN_LEAD='A'; //cysteine.
const int MAXORDDIGITS=5;
using namespace std;
......@@ -37,9 +37,9 @@ struct orthscorer
{
int gnums;
int gis;
int internal_repeats; //This may as well be an integer and not a bool.
bool keeper; //keeper > deleter (if keeper=true and deleter=true, keep it)
bool deleter;
int internal_repeats; //This may as well be an integer and not a bool.
};
struct osv
......@@ -72,6 +72,8 @@ class proteome_v2
vector<orthscorer> pv;
vector<osv> pvv;
vector<int> sosv;
//vector<bool> keeper;
//vector<bool> deleter;
void populate_proteins(string file);
void populate_kmers();
......@@ -113,7 +115,7 @@ class proteome_v2
int get_total_num_proteins();
void add_protein(string info_line, string aastr);
void generate_tags(int ordinal,bool do_scrambled);
void generate_tags(int ordinal,bool do_scrambled, bool partial_run);
void generate_scrambled_tags(int ordinal);
void clear_tags();
void clear_scr_tags();
......@@ -194,14 +196,14 @@ void proteome_v2::mark_keepers(vector<string> keepers)
// if(keepers.at(f)==proteins.at(i).get_sequence().substr(j,keepers.at(f).length()))
if(keeper_present(keepers.at(f), prot_seq.substr(j,keepers.at(f).length())))
{
cout<<"Marking keeper: i="<<i<<endl<<keepers.at(f)<<endl<<proteins.at(i).get_info_line()<<endl<<proteins.at(i).get_sequence()<<endl;
cout<<"Marking keeper: i="<<i<<endl<<keepers.at(f)<<endl<<proteins.at(i).get_info_line()<<endl;//<<proteins.at(i).get_sequence()<<endl;
proteins.at(i).mark_keeper();
pv.at(i).keeper=true;
cout<<"pv.at(i)="<<pv.at(i).keeper<<endl;
//cout<<"pv.at(i)="<<pv.at(i).keeper<<endl;
//cout<<"pv.at(i+1)="<<pv.at(i+1).keeper<<endl;
//cout<<"no"<<endl;
pvv.at(i).keeper=true;
cout<<"Marked keeper."<<endl;
//cout<<"Marked keeper."<<endl;
}