/****************************************************************************** Takes sorted lists of kmers and counts matches from lengths 1-k. Writes the histograms for each pair to DATA/ *****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include "Coptim1_3Dmat.h" #include "bitter.h" #include "amino_acids.h" #include "util.h" using namespace std; const int TAGLENGTH=20; const bool MCTR_DEBUG=false; const bool GENEIDRUN=true; const int CON_MAP_CUTOFF=3; //"cmc" variable. //For a protein with a hit between an HGT-pair to be spared deletion (due to possible HGT), it has to have at least cmc hits among the reference set. Therefore, a cmc=15 means at least 15 reference orgs have to have hits for the protein to be considered "conserved" (not delete-able). NOTE: Actually this is wrong! This is not what cmc is! It's the number of hits for a reference set organism, but over all the kmers. So this value can be very high. const int CON_MAP_SET_CUTOFF=3; const int HGT_MIN_LENGTH=12; const int MBKL=1.0; //Default=1.0. If tag_length=20, and MBKL=.75, then private variable mbkl will be set to .75*20=15. 15 will then be the minimum length of a match for its score to be passed to the bitter object, which uses these scores to select a right-hand cutoff for the data in the mdist function. const bool AM=false; const bool AAM=false; struct bits_holder { double d[20]; }; struct krow { vector r; }; class mctr { private: //Variables necessary for all 3 types of run: string bfn; //base file name: directory holding all the data (e.g. 4ecoli_run1) string path; //full path until base file name int tag_length; //default=20 int refsetsize; //0 means no reference genomes. string dir; //either "" empty string or "scr" int filtering_steps; //=-1 when no filtering used (e.g. using MERGED_TAGS rather than MERGED_TAGS_15_8) int filtering_option; //=-1 when no filtering was used (same as above) string merged_directory; string data_directory; int sos_cutoff; //default = 6.5*tag_length double max_single_bit_value; int max_bit_sum; //e.g. tag_length=20, highest bit store=4, 4*20 = 80 int mbkl; //minimum kmer length for bitter to store it, default=tag_length vector files; int number_of_genomes; int number_of_genomes_ref; int offset; int recursion_counter; ofstream logstream; char array_c[][TAGLENGTH]; //20 is the tag length. int *gnums_arr; int *gene_ids_arr; vector array; vector gnums; vector gene_ids; Coptim1_3Dmat* corr_array; vector bitterv; vector bitsv; amino_acids aas; util u; int run_type; //0=normal run, 1=ref_set_run, 2=final pair run bool partial_run; char partial_run_lead; int get_aa_int[30]; char get_aa_char[20]; //Variables necessary for HGT correction on subsequent passes through mctr. int prob1; //ordinal of one of the 2 problematic proteomes in the whole run. int prob2; //ordinal of one of the 2 problematic proteomes in the whole run. vector rows; //provides transitive information on HGT matches; if the pair shares a long length match, but one member of the pair has matches with the reference set, this applies to the other, so neither protein will be deleted. int rows_cols; int rows_rows; bool pair_run; //true if run is for just a pair with HGT, no reference set (3rd iteration) bool msc; //match_score_cutoff; true if the file "match_score_cutoffs.txt" exists, false otherwise. bool msc2; int genes_found; double min_msc; //the minimum msc among all the org pairs. map,int> msc_map; //delete this when subsequent maps are implemented. map,int> con_map; vector< set > con_map_set0; vector< set > con_map_set1; map,int> hgt_map; vector msc_ref_gnums; bool distances; vector problem_tags; //vector for the tags from the 2 proteomes that have the HGT problem. merged into the refset later. vector ref_tags; vector problem_tags_gnums; vector problem_tags_gis; bool check_composition(string s); void get_problem_tags(vector files,string ordinal); void get_problem_tags(vector files,string ordinal, set hgt_genes); int ppv_index; //what line of the problem pair index vector are we on. int ref_index; //what line of the reference vector are we on. vector > keys; vector info_lines0; vector info_lines1; vector proteins0; vector proteins1; void setup_it1(ifstream& infostream); void setup_it2(ifstream& infostream); void read_in_probfiles(ifstream& infostream,vector& files1, vector& files2); int calculate_max_bit_sum(); void open_info_file(ifstream& infostream); void create_directories(); char get_aa_char_fn(int i); void set_bitsv(); void set_get_aa_arrays(); void set_array_it0(int i); void set_array_it1(int i); void set_array_it2(int i); int set_gh(bool gh[], vector& gnums, int first_row, int last_row); void set_score_array(double * score_array,string seq,int count,int gen_ord); void find_matches_it0(int col, int first_row, int last_row, string seq); void find_matches_it1(int col, int first_row, int last_row, string seq); void find_matches_it2(int col, int first_row, int last_row, string seq); void increment_bits_it0(int first_row, int last_row,string seq); void increment_bits_it1(int first_row, int last_row,string seq); void increment_bits_it2(int first_row, int last_row,string seq); int get_aa(char aa); void write_out_data_it0(); void write_out_data_it1(); void write_out_data_it2(); void read_in_files(); double get_score(int i1,int i2, string seq); void populate_bitterv(); int get_bitterv_index(int i1, int i2); int get_sos_cutoff(); char get_pr_lead(); public: //constructors mctr(string full_path_par,string dir_par, int filtering_steps_par, int filtering_option_par,bool partial_run_par); //Use for the main run when doing entire set. mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_options_par, int prob1_par, int prob2_par,vector& ref_tags_par); //Use when testing a pair against a reference set. mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_option_par, int prob1_par, int prob2_par); //Use when running two proteomes, HGT-genes removed, against each other. Final step. //run the main algorithm void run_it0(); //main run on whole set. int run_it1(); //run pair against each other and the reference set. void run_it2(); //run pair against each other with phage/hgt proteins removed. void set_problem_pair(int prob1_par,int prob2_par); }; //Constructor 1. Main run. //Count all matches between all pairs present in a MERGED_TAGS directory (and also the corresponding MERGED_TAGSscr directory, but has to be called on the data separately using a different object). //This could be for a set without filtering (e.g. MERGED_TAGS plain) or with filering (e.g. MERGED_TAGS_15_8). //Without filtering: filtering_steps_par=-1, filtering_option_par=-1 mctr::mctr(string full_path_par, string dir_par, int filtering_steps_par, int filtering_option_par,bool partial_run_par) { cout<<"mctr::mctr(string,string,int,int)"<0) { int total=refsetsize; if(prob1>=refsetsize) { total++; } if(prob2>=refsetsize) { total++; } number_of_genomes_ref=total; if(prob1>=bound || prob2>=bound) { cout<<"Invalid genome id: prob1="<& ref_tags)"<=refsetsize) { total++; } if(prob2>=refsetsize) { total++; } number_of_genomes=number_of_genomes_ref=total; if(MCTR_DEBUG) { cout<<"total="<>gnum; if(gnum!=numg) { cout<<"Mismatch in set_bitsv case 0."<>total_residues; for(int j=0; j<20; j++) { int count; double frequency; histstream>>count>>frequency; bh_temp.d[j]=(double)count/(double)total_residues; bh_temp.d[j]=( (log(bh_temp.d[j])) * (-1.0)) / 2.0; //NOTE: using log, which in C++ uses base e. //NOTE: Divide by 2 here to speed up calculating scores in get_score(...) if(max==-1 || maxprob1 && i>prob2) { outstream<prob1) { outstream<>gnum; int total_residues; probstream>>total_residues; bits_holder bh_temp; for(int j=0; j<20; j++) { int count; double frequency; probstream>>count>>frequency; bh_temp.d[j]=(double)count/(double)total_residues; bh_temp.d[j]=( (log(bh_temp.d[j])) * (-1.0)) / 2.0; //divide by 2 here to speed up calculating scores in get_score(...) if(max==-1 || max::iterator it; it=bitsv.begin(); it++; bitsv.insert(it,bh_temp); } else if(gnum>gnum; int total_residues; probstream>>total_residues; for(int j=0; j<20; j++) { int count; double frequency; probstream>>count>>frequency; bh_temp.d[j]=(double)count/(double)total_residues; bh_temp.d[j]=( (log(bh_temp.d[j])) * (-1.0)) / 2.0; //divide by 2 here to speed up calculating scores in get_score(...) if(max==-1 || max::iterator it; it=bitsv.begin(); it++; bitsv.insert(it,bh_temp); } probstream.close(); } max_single_bit_value=max; if(bitsv.size()!=2) { cout<<"Problem in void mctr::set_bitsv()"<>gene_id; gene_ids.push_back(gene_id); array.push_back(sequence); gnums.push_back(gnumber); } instream.close(); cout<<"Closed "<=0 && aa_index<20) { find_matches_it0(col+1,first_row,last_row,seq+array[first_row][col]); increment_bits_it0(first_row,last_row,seq+array[first_row][col]); } } //IF THERE ARE DIFFERENT LEADING CHARACTERS, NEED TO PARTITION ARRAY FURTHER else { int index_start=first_row; int index_end=first_row; while(index_start=0) { find_matches_it0(col+1, index_start, index_end-1,seq+leader); increment_bits_it0(index_start,index_end-1,seq+leader); } index_start=index_end; } //end of while(index_start=0 && aa_index<20) { // find_matches_it2(col+1,first_row,last_row,seq+array.at(first_row)[col]); // increment_bits_it2(first_row,last_row,seq+array.at(first_row)[col]); find_matches_it2(col+1,first_row,last_row,seq+array[first_row][col]); increment_bits_it2(first_row,last_row,seq+array[first_row][col]); } }//end of shortcut //IF THERE ARE DIFFERENT LEADING CHARACTERS, NEED TO PARTITION ARRAY FURTHER else { int index_start=first_row; int index_end=first_row; while(index_start=0) { find_matches_it2(col+1, index_start, index_end-1,seq+leader); increment_bits_it2(index_start,index_end-1,seq+leader); } index_start=index_end; } //end of while(index_start gnum_holder; //using a set to deal with repeats within the same organism (sets are non-redundant) //bool store_gene_ids=false; //Used for storing gene numbers in bitterv because their score >= match_score_cutoff. // bool full=false; //set gnum_diag_hit; //pair::iterator,bool> ret; //Store all ordinal IDs in a set. /* for(int i=first_row; i<=last_row && !full; i++) { // int gnum=gnums.at(i); int gnum=gnums[i]; ret = gnum_holder.insert(gnum); if(gnum_holder.size()>=number_of_genomes && gnum_diag_hit.size()>=number_of_genomes) { full=true; } if(ret.second==false) //This means gnum was already in the set. Since the sequence is present twice in the same organism, that means the organism has a match within itself; therefore add the number to the diagonal set. { gnum_diag_hit.insert(gnum); } } */ if(MCTR_DEBUG) { cout<<"In increment_bits_it0(...)"<=1) { set::iterator it3; for(it3=gnum_diag_hit.begin(); it3!=gnum_diag_hit.end(); it3++) { int it3_int = *it3; double score = get_score(it3_int,it3_int,seq); corr_array->increment_element(it3_int,it3_int,(int)(score+.5)); } }*/ //Case 3: A match between ALL organisms // else if(gnum_holder.size()>=number_of_genomes) //shortcut else if(num_uniq_genomes==number_of_genomes) { // cout<<"full sized"<increment_element(i,j,(int)(score+.5)); if(seq.length()>=mbkl) { bitterv[get_bitterv_index(i,j)].add_new_score(score); } } } /* for(int i=0; iincrement_element(i,j,(int)(score+.5)); if(seq.length()>=mbkl) { // bitterv.at(get_bitterv_index(i,j)).add_new_score(score); bitterv[get_bitterv_index(i,j)].add_new_score(score); } } } */ /* if(gnum_diag_hit.size()>0) { set::iterator it1; for(it1=gnum_diag_hit.begin(); it1!=gnum_diag_hit.end(); it1++) { int it1_int = *it1; double score = get_score(it1_int,it1_int,seq); corr_array->increment_element(it1_int,it1_int,(int)(score+.5)); } }*/ delete[] score_array; } //Case 4: A match between SOME non-identical organisms else { /* cout<<"Case 4: Some but not all have matches."<=mbkl) { bitterv[get_bitterv_index(i,j)].add_new_score(score); } // cout<<"got through the if block..."<increment_element(i,j,(int)(score+.5)); if(seq.length()>=mbkl) { bitterv[get_bitterv_index(i,j)].add_new_score(score); } } } } } */ /* set::iterator it1; set::iterator it2; for(it1=gnum_holder.begin(); it1!=gnum_holder.end(); it1++) { for(it2=gnum_holder.begin(); it2!=gnum_holder.end(); it2++) { int ig=*it1; //ig is a gnum int jg=*it2; //jg is a gnum if(igincrement_element(ig,jg,(int)(score+.5)); if(seq.length()>=mbkl) { bitterv[get_bitterv_index(ig,jg)].add_new_score(score); } } } } */ //fill in the diagonal. /* set::iterator it3; for(it3=gnum_diag_hit.begin(); it3!=gnum_diag_hit.end(); it3++) { int it3_int = *it3; double score = get_score(it3_int, it3_int,seq); corr_array->increment_element(it3_int,it3_int,(int)(score+.5)); }*/ delete[] score_array; } delete[] gh; if(MCTR_DEBUG) { cout<<"End of increment_bits_it0(...)"<& gnums, int first_row, int last_row) { int count=0; for(int i=first_row; i<=last_row; i++) { if(!gh[gnums[i]]) { gh[gnums[i]]=true; count++; } } return count; } void mctr::set_score_array(double * score_array,string seq,int count,int gen_ord) { for(int j=0; j gnum_holder; //using a set to deal with repeats within the same organism (sets are non-redundant) bool store_gene_ids=false; //Used for storing gene numbers in bitterv because their score >= match_score_cutoff. Even when msc is true, bool full=false; set gnum_diag_hit; pair::iterator,bool> ret; //Store all ordinal IDs in a set. for(int i=first_row; i<=last_row && !full; i++) { // int gnum=gnums.at(i); int gnum=gnums[i]; ret = gnum_holder.insert(gnum); if(gnum_holder.size()>=number_of_genomes && gnum_diag_hit.size()>=number_of_genomes) { full=true; } if(ret.second==false) //This means gnum was already in the set. Since the sequence is present twice in the same organism, that means the organism has a match within itself; therefore add the number to the diagonal set. { gnum_diag_hit.insert(gnum); } } //Case 1: No matches between organisms or within organisms. if(gnum_holder.size()<=1 && gnum_diag_hit.size()==0) { //do nothing } //Case 2: A match within an organism and not between organisms else if(gnum_holder.size()<=1 && gnum_diag_hit.size()>=1) { set::iterator it3; for(it3=gnum_diag_hit.begin(); it3!=gnum_diag_hit.end(); it3++) { int it3_int = *it3; double score = get_score(it3_int,it3_int,seq); corr_array->increment_element(it3_int,it3_int,(int)(score+.5)); } } //Case 3: A match between ALL organisms else if(gnum_holder.size()>=number_of_genomes) //shortcut { for(int i=0; iincrement_element(i,j,(int)(score+.5)); if(seq.length()>=mbkl) { bitterv[get_bitterv_index(i,j)].add_new_score(score); } if( (i<=1 || j<=1) ) //if either problem org is represented. { if(seq.length()>=HGT_MIN_LENGTH) { store_gene_ids=true; } } } } if(gnum_diag_hit.size()>0) { set::iterator it1; for(it1=gnum_diag_hit.begin(); it1!=gnum_diag_hit.end(); it1++) { int it1_int = *it1; double score = get_score(it1_int,it1_int,seq); corr_array->increment_element(it1_int,it1_int,(int)(score+.5)); } } } //Case 4: A match between SOME non-identical organisms else { set::iterator it1; set::iterator it2; for(it1=gnum_holder.begin(); it1!=gnum_holder.end(); it1++) { for(it2=gnum_holder.begin(); it2!=gnum_holder.end(); it2++) { int ig=*it1; //ig is a gnum int jg=*it2; //jg is a gnum if(igincrement_element(ig,jg,(int)(score+.5)); if(seq.length()>=mbkl) { bitterv[get_bitterv_index(ig,jg)].add_new_score(score); } if(ig<=1 || jg<=1) { if(seq.length()>=HGT_MIN_LENGTH) { store_gene_ids=true; } } } } } set::iterator it3; for(it3=gnum_diag_hit.begin(); it3!=gnum_diag_hit.end(); it3++) { int it3_int = *it3; double score = get_score(it3_int, it3_int,seq); corr_array->increment_element(it3_int,it3_int,(int)(score+.5)); } } if(store_gene_ids) { set::iterator it1; set::iterator it2; int bottom=-1; //Don't let repeats get counted; once a gnum has been handled, ignore all the rest of the same gnum. Fortunately, the vector is sorted. bool hgt_match=true; for(int i=first_row; i=bitterv[index].get_msc() || hgt_match) { std::pair hit1; std::pair hit2; hit1=std::make_pair(ig,gig); //proteome 1 ids hit2=std::make_pair(jg,gjg); //proteome 2 ids if( (ig==0 && jg==1) || (ig==1 && jg==0) ) //An entry for the HGT map. { if(hgt_map.find(std::pair(ig,gig))!=hgt_map.end()) { std::map,int>::iterator it; it = hgt_map.find(std::pair(ig,gig)); (it->second)++; } //entry doesn't exist. else { std::pair,int> foo; foo = std::make_pair(hit1,1); //The 1 stands for the starting score, i.e. 1 hit. hgt_map.insert(foo); } //second entry exists in the hgt map if(hgt_map.find(std::pair(jg,gjg))!=hgt_map.end()) { std::map,int>::iterator it; it = hgt_map.find(std::pair(jg,gjg)); (it->second)++; } else { std::pair,int> foo; foo = std::make_pair(hit2,1); hgt_map.insert(foo); } } //end of ig==0 and jg==1 //Not a match between the problems; a match with a ref. else if(ig==0 || ig==1) { if(con_map.find(std::pair(ig,gig))!=con_map.end()) { std::map,int>::iterator it; it = con_map.find(std::pair(ig,gig)); (it->second)++; } //entry doesn't exist. else { std::pair,int> foo; foo = std::make_pair(hit1,1); //The 1 stands for the starting score, i.e. 1 hit. con_map.insert(foo); } } else if(jg==0 || jg==1) { //second entry exists in the con map if(con_map.find(std::pair(jg,gjg))!=con_map.end()) { std::map,int>::iterator it; it = con_map.find(std::pair(jg,gjg)); (it->second)++; } else { std::pair,int> foo; foo = std::make_pair(hit2,1); con_map.insert(foo); } } //end of jg==0 or jg==1 } //end of score > msc } //end of igbottom } //end of first for } //end of second for } //end of if(msc) } /******************************************* increment_bits_it2(...) - When running just the HGT pair, HGT-genes removed. *********************************************/ void mctr::increment_bits_it2(int first_row, int last_row,string seq)//,ofstream& alalstream, ofstream& alstream) { set gnum_holder; //using a set to deal with repeats within the same organism (sets are non-redundant) bool store_gene_ids=false; //Used for storing gene numbers in bitterv because their score >= match_score_cutoff. Even when msc is true, bool full=false; set gnum_diag_hit; pair::iterator,bool> ret; //Store all ordinal IDs in a set. for(int i=first_row; i<=last_row && !full; i++) { int gnum=gnums[i]; ret = gnum_holder.insert(gnum); if(gnum_holder.size()>=number_of_genomes && gnum_diag_hit.size()>=number_of_genomes) { full=true; } if(ret.second==false) //This means gnum was already in the set. Since the sequence is present twice in the same organism, that means the organism has a match within itself; therefore add the number to the diagonal set. { gnum_diag_hit.insert(gnum); } } if(MCTR_DEBUG) { cout<<"In increment_bits_it2(...)"<=1) { set::iterator it3; for(it3=gnum_diag_hit.begin(); it3!=gnum_diag_hit.end(); it3++) { int it3_int = *it3; double score = get_score(it3_int,it3_int,seq); corr_array->increment_element(it3_int,it3_int,(int)(score+.5)); } } //Case 3: A match between ALL organisms else if(gnum_holder.size()>=number_of_genomes) //shortcut { for(int i=0; iincrement_element(i,j,(int)(score+.5)); if(seq.length()>=mbkl) { // bitterv.at(get_bitterv_index(i,j)).add_new_score(score); bitterv[get_bitterv_index(i,j)].add_new_score(score); } // if(msc && (i<=1 || j<=1) && score>=bitterv.at(get_bitterv_index(i,j)).get_msc() && bitterv.at(get_bitterv_index(i,j)).get_msc()>-1) if(msc && (i<=1 || j<=1) && score>=bitterv[get_bitterv_index(i,j)].get_msc() && bitterv[get_bitterv_index(i,j)].get_msc()>-1) { if(seq.length()>=6) { store_gene_ids=true; } } } } if(gnum_diag_hit.size()>0) { set::iterator it1; for(it1=gnum_diag_hit.begin(); it1!=gnum_diag_hit.end(); it1++) { int it1_int = *it1; double score = get_score(it1_int,it1_int,seq); corr_array->increment_element(it1_int,it1_int,(int)(score+.5)); } } } //Case 4: A match between SOME non-identical organisms else { set::iterator it1; set::iterator it2; for(it1=gnum_holder.begin(); it1!=gnum_holder.end(); it1++) { for(it2=gnum_holder.begin(); it2!=gnum_holder.end(); it2++) { int ig=*it1; //ig is a gnum int jg=*it2; //jg is a gnum if(igincrement_element(ig,jg,(int)(score+.5)); if(seq.length()>=mbkl) { // bitterv.at(get_bitterv_index(ig,jg)).add_new_score(score); bitterv[get_bitterv_index(ig,jg)].add_new_score(score); } if(msc && (ig<=1 || jg<=1) && score>=bitterv[get_bitterv_index(ig,jg)].get_msc() && bitterv[get_bitterv_index(ig,jg)].get_msc()>-1) { if(seq.length()>=6) { store_gene_ids=true; } } } } } set::iterator it3; for(it3=gnum_diag_hit.begin(); it3!=gnum_diag_hit.end(); it3++) { int it3_int = *it3; double score = get_score(it3_int, it3_int,seq); corr_array->increment_element(it3_int,it3_int,(int)(score+.5)); } } } // end of increment_bits_it2(...) int mctr::get_aa(char aa) { if(aa=='A') return 0; else if(aa=='C') return 1; else if(aa=='D') return 2; else if(aa=='E') return 3; else if(aa=='F') return 4; else if(aa=='G') return 5; else if(aa=='H') return 6; else if(aa=='I') return 7; else if(aa=='K') return 8; else if(aa=='L') return 9; else if(aa=='M') return 10; else if(aa=='N') return 11; else if(aa=='P') return 12; else if(aa=='Q') return 13; else if(aa=='R') return 14; else if(aa=='S') return 15; else if(aa=='T') return 16; else if(aa=='V') return 17; else if(aa=='W') return 18; else if(aa=='Y') return 19; else if(aa=='^') return 20; else { cout<<"Invalid character: "<=20) { cout<<"Bad value for int i in fn get_aa_char_fn(int): i = "<get_element(i,j,k); } for(int k=0; kget_element(i,i,j)<, int>::iterator it=con_map.begin(); it!=con_map.end(); ++it) { con_stream<<(it->first).first<<" "<<(it->first).second<<" "<second<, int>::iterator it=hgt_map.begin(); it!=hgt_map.end(); ++it) { hgt_stream<<(it->first).first<<" "<<(it->first).second<<" "<second<, int>::iterator hgt_it=hgt_map.begin(); hgt_it!=hgt_map.end(); ++hgt_it) { bool in_there=false; int in_there_count=0; for(std::map, int>::iterator it=con_map.begin(); it!=con_map.end(); ++it) { if(hgt_it->first.first==it->first.first && hgt_it->first.second==it->first.second) { in_there=true; in_there_count=it->second; } } int ref_hits=0; int ref_hits_other=0; if(!in_there || in_there_count<=CON_MAP_CUTOFF) { cout<<"/nMismatch between conserved set and hgt pair for "<first.first<<" "<first.second<<" "<second<first.second<<" "<second<first.first==0) { cout<<"Info line = "<first.second]<>staller; outstream2.open( (path+"/"+bfn+"/"+data_directory+"/data_after_hgt_cleanup.txt").c_str()); } cout<<"Successfully opened outfile to data.txt"<d_name); if(temp.length()>5) { files.push_back(string(dirp->d_name)); } else { cout<<"Rejecting file named "<>nog_ref>>nog>>tag_length; number_of_genomes=nog+nog_ref; number_of_genomes_ref=nog_ref; } /************************************************************************************************ For a second or third iteration run, when a pair has been selected that may haveHGT and is either being run against a reference set or just run against each other with the suspicious genes marked for exclusion, THIS FUNCTION READS THE INFO FILE AND ADDS ALL .FAA FILES ASSOCIATED WITH EACH OF THE PROBLEMATIC ORGANISMS ONTO A PAIR OF STRING VECTORS. ************************************************************************************************/ void mctr::read_in_probfiles(ifstream& infostream,vector& files1, vector& files2) { if(MCTR_DEBUG) { cout<<"In mctr::read_in_probfiles(...)"< files1; vector files2; read_in_probfiles(infostream,files1,files2); set hgt_genes0; set hgt_genes1; string eater; ifstream instream; instream.open( (path+"/"+bfn+"/hgt_msc_for_filtering.txt").c_str()); if(instream.fail()) { cout<<"Failed to open hgt_msc_for_filtering.txt"<>eater; } instream>>e1>>e2; eater=""; } while(eater!="HGT_instances:" && !instream.eof() ) { instream>>eater; } //at this point, instream should be poised at hgt genes. instream>>eater; //should eat a 0 or 1 OR "End_of_hgt_genes" if there are no hgt genes!!! while(eater!="End_of_hgt_genes") { int gene_id,count; instream>>gene_id>>count; char buffer[2000]; string info_line,p; instream>>info_line; instream.getline(buffer,5000); info_line+=buffer; instream>>p; if(p=="" || info_line=="") { cout<<"p is empty in mctr."<>eater; } instream.close(); get_problem_tags(files1,"0",hgt_genes0); get_problem_tags(files2,"1",hgt_genes1); std::sort(problem_tags.begin(), problem_tags.end()); } void mctr::create_directories() { int system_return = system ( ("rm -r "+path+"/"+bfn+"/"+data_directory).c_str()); system_return = system ( ("mkdir "+path+"/"+bfn+"/"+data_directory).c_str()); } double mctr::get_score(int i1,int i2, string seq) { double result=0; for(int i=0; i=0 && index<20) { // result=result+bitsv.at(i1).d[get_aa(seq[i])]+bitsv.at(i2).d[get_aa(seq[i])]; //result=result+bitsv.at(i1).d[get_aa_int[(int)seq[i]-65]]+bitsv.at(i2).d[get_aa_int[(int)seq[i]-65]]; result=result+bitsv[i1].d[get_aa_int[(int)seq[i]-65]]+bitsv[i2].d[get_aa_int[(int)seq[i]-65]]; } else { cout<<"index out of bounds in get_score(...). seq = "<=refsetsize) total++; if(prob2>=refsetsize) total++; for(int i=0; i files,string ordinal) { int gene_id=0; for(int i=0; i)"<)"<>eater; while(!instream.eof()) { if(eater[0]=='>') { char buffer[2000]; instream.getline(buffer,2000); string info_line=eater+" "+buffer; if(ordinal=="0") { info_lines0.push_back(info_line); } else { info_lines1.push_back(info_line); } instream>>eater; } string p; while(!instream.eof() && eater[0]!='>') { p+=eater; instream>>eater; } //at this point we have the protein. if(ordinal=="0") { proteins0.push_back(p); } else { proteins1.push_back(p); } for(int q=0; q files,string ordinal, set hgt_genes) { cout<<"In get_problem_tags for pair_run."<)"<)"<>eater; while(!instream.eof()) { if(eater[0]=='>') { char buffer[2000]; instream.getline(buffer,2000); string info_line=eater+" "+buffer; if(ordinal=="0") { info_lines0.push_back(info_line); } else { info_lines1.push_back(info_line); } instream>>eater; } string p; while(!instream.eof() && eater[0]!='>') { p+=eater; instream>>eater; } //at this point we have the protein. if(ordinal=="0") { proteins0.push_back(p); } else { proteins1.push_back(p); } set::iterator it; it = hgt_genes.find(gene_id); if(it==hgt_genes.end()) { for(int q=0; q array; for(int i=0; i<20; i++) { array.push_back(0); } for(int i=0; i=0 && index<20) { array[index]++; } else { return false; } } int result=0; for(int i=0; isos_cutoff) { return false; } return true; } int mctr::get_sos_cutoff() { return 6.5*tag_length; } char mctr::get_pr_lead() { ifstream instream; string infile = path+"/"+bfn+"/TAGS/TAGS0.txt"; instream.open(infile.c_str()); if(instream.fail()) { cout<<"In function tmerg::get_pr_lead(): Failed to open infile to "<>num_tags; string first_tag; instream>>first_tag; return first_tag[0]; instream.close(); } void mctr::set_get_aa_arrays() { get_aa_int[0]=0; //A get_aa_int[1]=-1; //B get_aa_int[2]=1; //C get_aa_int[3]=2; //D get_aa_int[4]=3; //E get_aa_int[5]=4; //F get_aa_int[6]=5; //G get_aa_int[7]=6; //H get_aa_int[8]=7; //I get_aa_int[9]=-1; //J get_aa_int[10]=8; //K get_aa_int[11]=9; //L get_aa_int[12]=10;//M get_aa_int[13]=11;//N get_aa_int[14]=-1;//O get_aa_int[15]=12;//P get_aa_int[16]=13;//Q get_aa_int[17]=14;//R get_aa_int[18]=15;//S get_aa_int[19]=16;//T get_aa_int[20]=-1;//U get_aa_int[21]=17;//V get_aa_int[22]=18;//W get_aa_int[23]=-1;//X get_aa_int[24]=19;//Y get_aa_int[25]=-1;//blank get_aa_int[26]=-1;//blank get_aa_int[27]=-1;//blank get_aa_int[28]=-1;//blank get_aa_int[29]=-1;//^ get_aa_char[0]='A'; get_aa_char[1]='C'; get_aa_char[2]='D'; get_aa_char[3]='E'; get_aa_char[4]='F'; get_aa_char[5]='G'; get_aa_char[6]='H'; get_aa_char[7]='I'; get_aa_char[8]='K'; get_aa_char[9]='L'; get_aa_char[10]='M'; get_aa_char[11]='N'; get_aa_char[12]='P'; get_aa_char[13]='Q'; get_aa_char[14]='R'; get_aa_char[15]='S'; get_aa_char[16]='T'; get_aa_char[17]='V'; get_aa_char[18]='W'; get_aa_char[19]='Y'; }