Commit f27a7156 authored by Raquel Bromberg's avatar Raquel Bromberg

minor code changes

parent 9db65c61
......@@ -624,7 +624,42 @@ void mctr::set_bitsv()
if(run_type==0) //the normal case.
{
ifstream instream;
double max=-1;
for(int numg=0; numg<number_of_genomes; numg++)
{
ifstream histstream;
u.open_ifile(histstream,path+"/"+bfn+"/HIST/"+bfn+"_"+u.int_to_string(numg)+"_hist.txt","histstream");
bits_holder bh_temp;
int gnum;
histstream>>gnum;
if(gnum!=numg)
{
cout<<"Mismatch in set_bitsv case 0."<<endl;
exit(1);
}
int total_residues;
histstream>>total_residues;
for(int j=0; j<20; j++)
{
int count;
double frequency;
histstream>>count>>frequency;
bh_temp.d[j]=(double)count/(double)total_residues;
bh_temp.d[j]=( (log(bh_temp.d[j])) * (-1.0)) / 2.0; //NOTE: using log, which in C++ uses base e.
//NOTE: Divide by 2 here to speed up calculating scores in get_score(...)
if(max==-1 || max<bh_temp.d[j])
{
max = bh_temp.d[j];
}
}
bitsv.push_back(bh_temp);
histstream.close();
}
/* ifstream instream;
u.open_ifile(instream,path+"/"+bfn+"/"+bfn+"_hist.txt","instream");
double max=-1;
......@@ -647,11 +682,10 @@ void mctr::set_bitsv()
//NOTE: Divide by 2 here to speed up calculating scores in get_score(...)
if(max==-1 || max<bh_temp.d[j])
{
max = bh_temp.d[j];
max = bh_temp.d[j];
}
}
bitsv.push_back(bh_temp);
}
}
bitsv.push_back(bh_temp);*/
max_single_bit_value=max;
} //end of normal case run_type==0
......@@ -696,13 +730,23 @@ void mctr::set_bitsv()
outstream.close();
ifstream probstream;
u.open_ifile(probstream,path+"/"+bfn+"/"+bfn+"_hist.txt","probstream");
// ifstream probstream;
// u.open_ifile(probstream,path+"/"+bfn+"/"+bfn+"_hist.txt","probstream");
//get the bits for the 2 problem proteomes.
double max=-1;
for(int i=0; i<number_of_genomes; i++) //number_of_genomes, NOT number_of_genomes_ref, because the code has to go through the entire set to find a) all the reference genomes and b) the two problem genomes.
{
ifstream probstream;
string probfile=path+"/"+bfn+"/HIST/"+bfn+"_"+u.int_to_string(i)+"_hist.txt";
probstream.open(probfile.c_str());
if(probstream.fail())
{
cout<<"Failed to open probstream to probfile="<<probfile<<endl;
cout<<"run_type=="<<run_type<<endl;
exit(1);
}
int gnum;
probstream>>gnum;
int total_residues;
......@@ -737,21 +781,28 @@ void mctr::set_bitsv()
{
bitsv.push_back(bh_temp);
}
probstream.close();
}
probstream.close();
max_single_bit_value=max;
}
} //run_type==1
else if(run_type==2) //Running a pair against itself, with the suspicious genes already flagged by the previous run against a reference set.
{
ifstream probstream;
u.open_ifile(probstream,path+"/"+bfn+"/"+bfn+"_hist.txt","probstream");
//get the bits for the 2 problem proteomes.
double max=-1;
for(int i=0; i<number_of_genomes; i++) //number_of_genomes, NOT number_of_genomes_ref, because the code has to go through the entire set to find a) all the reference genomes and b) the two problem genomes.
{
ifstream probstream;
string probfile=path+"/"+bfn+"/HIST/"+bfn+"_"+u.int_to_string(i)+"_hist.txt";
probstream.open(probfile.c_str());
if(probstream.fail())
{
cout<<"Failed to open probstream to probfile="<<probfile<<endl;
cout<<"run_type=="<<run_type<<endl;
exit(1);
}
bits_holder bh_temp;
int gnum;
probstream>>gnum;
......@@ -782,11 +833,11 @@ void mctr::set_bitsv()
it++;
bitsv.insert(it,bh_temp);
}
probstream.close();
}
probstream.close();
max_single_bit_value=max;
if(bitsv.size()!=2)
{
cout<<"Problem in void mctr::set_bitsv()"<<endl;
......@@ -795,7 +846,8 @@ void mctr::set_bitsv()
cout<<"prob2 = "<<prob2<<endl;
exit(1);
}
}
} //run_type==2
} //end of set_bitsv()
/*****************************************************
......
/***********************************************************************
This program takes proteomes as input and writes these same proteomes out minus
the mobile elements, which it identifies by means of copy number.
Example:
g++ mefilter.cpp -o mef
./mef ../RUNS/MEF_10_7/ ../RUNS/MEF_ref_10_7/MERGED_TAGS/
Input:
1) Path to a directory where SlopeTree has been run up to the ./tmerg module.
2) Path to a directory (most likely MERGED_TAGS) containing sorted, merged k-mers
from the highly conserved proteins of a reference set.
Output:
1) A directory is written out, FAA_mobelim, containing all the proteomes of the
input minus the mobile elements. If a reference set (FAA_ref) was used, then
a directory, FAA_ref_mobelim, is also written out, with this reference set also
filtered of mobile elements.
NOTE 1: There are some proteomes where the vast majority or even ALL proteins are
flagged as mobile elements. However, writing out an empty proteome can cause
issues for subsequent ST modules. Therefore, these proteomes are written out
to FAA_mobelim or FAA_ref_mobelim without any changes made (filtering is simply
not performed on them). Their ordinals are written to logfile_mefilter.txt and
also to mefilter_discrepancies.txt. When using mefilter, users should check
either of these output files to see if any organisms in the input had this problem.
NOTE 2: Information in SAVED_ORGS is used to identify "keepers". These are proteins with
at least 1 k-mer that sttagger identified as belonging to a highly conserved
protein with unusual copy number (e.g. EF-Tu). Keepers are never deleted and are
always written out to the new, ME-reduced proteomes.
**********************************************************************/
#include <iostream>
#include <fstream>
#include <map>
#include <omp.h>
#include <getopt.h>
//#include "mefilter.h"
#include "util.h"
using namespace std;
void populate(vector<string> files,vector<string>& proteins,vector<string>& info_lines);
int get_gene_id(string s);
void read_in_conserved_kmers(string consdirectory,vector<string>& conserved_kmers);
void read_in_conserved_kmers(string consdirectory,map<string,int>& conserved_kmers);
bool match(string s1, string s2);
const int ALLOWED_MISMATCHES=1;
const bool MEF_DEBUG=false;
const int CHECK_ORG=-1; //if there is an organism for which user wants to see the output.
const double CONSERVE=1.0;
const double THRESHOLD=8.0;
const double THRESHOLD_GRAY=5.0;
int main(int argc, char* argv[])
{
if(argc<2)
if(argc<3)
{
cout<<"Wrong use of mefilter.cpp: ./mef <full_path_to_run>"<<endl;
cout<<"./mef <path to main directory containing FAA and optionally FAA_ref> <path to directory containing sorted, merged tags from conserved proteins of a reference set>"<<endl;
cout<<"Example: ./mef ../RUNS/MEF_10_7 ../RUNS/MEF_ref_10_7/MERGED_TAGS/"<<endl;
cout<<"Options:"<<endl;
cout<<"-v : verbose"<<endl;
cout<<"-h : help"<<endl;
cout<<"-m : for every proteome, write a file to MEFILTERED/ listing the proteins that were identified as mobile elements"<<endl;
exit(1);
}
string full_path=argv[1];
string consdirectory=argv[2];
cout<<"full_path="<<full_path;
bool write_out_mefiltered=false; //Use the -m option if you want to inspect the genes that are thrown away.
bool verbose=false;
bool help=false;
char c;
while((c = getopt(argc,argv,"p:c:mvh")) != -1)
{
switch(c)
{
case 'p':
full_path = optarg;
break;
case 'c':
consdirectory = optarg;
break;
case 'm':
write_out_mefiltered=true;
break;
case 'v':
verbose=true;
break;
case 'h':
help=true;
break;
default:
abort();
}
}
if(help)
{
cout<<"./mef <path to main directory containing FAA and optionally FAA_ref> <path to directory containing sorted, merged tags from conserved proteins of a reference set>"<<endl;
cout<<"Example: ./mef ../RUNS/MEF_10_7 ../RUNS/MEF_ref_10_7/MERGED_TAGS/"<<endl;
cout<<"Options:"<<endl;
cout<<"-v : verbose"<<endl;
cout<<"-h : help"<<endl;
cout<<"-m : for every proteome, write a file to MEFILTERED/ listing the proteins that were identified as mobile elements"<<endl;
exit(1);
}
if(verbose)
{
cout<<"full_path="<<full_path;
cout<<"extracting paths..."<<endl;
}
util u;
string path,bfn;
cout<<"extracting paths..."<<endl;
u.extract_paths(full_path,path,bfn);
cout<<"path="<<path<<endl;
cout<<"bfn="<<bfn<<endl;
if(verbose)
{
cout<<"path="<<path<<endl;
cout<<"bfn="<<bfn<<endl;
}
cout<<"Deleting previous FAA_mobelim and FAA_ref_mobelim, if they exist..."<<endl;
u.rm_mkdir(path+"/"+bfn+"/"+"/","FAA_mobelim");
u.rm_mkdir(path+"/"+bfn+"/"+"/","FAA_ref_mobelim");
// u.rm_mkdir(path+"/"+bfn+"/"+"/","FAA_mobkept");
// u.rm_mkdir(path+"/"+bfn+"/"+"/","FAA_ref_mobkept");
if(write_out_mefiltered)
{
u.rm_mkdir(path+"/"+bfn+"/"+"/","MEFILTERED");
}
cout<<"Created FAA_mobelim and FAA_ref_mobelim directories for reduced (mobile elements eliminated) proteomes."<<endl;
if(write_out_mefiltered)
{
cout<<"Deleted and created new MEFILTERED directory"<<endl;
}
ofstream logstream;
cout<<"Opening mefilter.cpp log file..."<<endl;
......@@ -43,26 +154,68 @@ int main(int argc, char* argv[])
logstream<<"full_path="<<full_path<<endl;
logstream<<"path="<<path<<endl;
logstream<<"bfn="<<bfn<<endl;
logstream<<"consdirectory="<<consdirectory<<endl;
logstream<<"CONSERVE="<<CONSERVE<<endl;
logstream<<"THRESHOLD="<<THRESHOLD<<endl;
logstream<<"THRESHOLD_GRAY="<<THRESHOLD_GRAY<<endl;
}
int tag_length;
u.get_tl2(full_path,tag_length);
logstream<<"tag_length="<<tag_length<<endl;
int total_size;
total_size = u.get_ts(full_path); //total number of organisms.
int ref_set_size;
u.get_rss(path+"/"+bfn+"/"+bfn+"_info.txt",ref_set_size);
logstream<<"total_size="<<total_size<<endl;
cout<<"total_size="<<total_size<<endl;
vector<string> conserved_kmers();
read_in_conserved_kmers(consfile,conserved_kmers);
map<string,int> conserved_kmers;
read_in_conserved_kmers(consdirectory,conserved_kmers);
logstream<<"Starting loop through all genome ordinals..."<<endl;
cout<<"mefilter.cpp: Starting loop through all genome ordinals..."<<endl;
vector<int> original_size;
vector<int> ME_removed;
for(int i=0; i<total_size; i++)
{
//ordinal=i.
original_size.push_back(0);
ME_removed.push_back(0);
}
vector<int> flag_discrepancies;
#pragma omp parallel for
for(int i=0; i<total_size; i++) //For all organisms.
{
string orgdir=u.get_oname(full_path,i);
ofstream mefstream;
if(write_out_mefiltered)
{
string meffile=path+"/"+bfn+"/MEFILTERED/mef_"+u.int_to_string(i)+".txt";
mefstream.open(meffile.c_str());
if(mefstream.fail())
{
cout<<"Failed to open mefstream to meffile="<<meffile<<endl;
exit(1);
}
}
if(i<ref_set_size) //if current organism is a reference org
{
u.rm_mkdir(path+"/"+bfn+"/"+"/FAA_ref_mobelim/",orgdir);
// u.rm_mkdir(path+"/"+bfn+"/"+"/FAA_ref_mobkept/",orgdir);
}
else
{
u.rm_mkdir(path+"/"+bfn+"/"+"/FAA_mobelim/",orgdir);
// u.rm_mkdir(path+"/"+bfn+"/"+"/FAA_mobkept/",orgdir);
}
logstream<<"i="<<i<<endl;
cout<<"Doing ordinal i="<<i<<" out of "<<total_size<<endl;
vector<string> files = u.get_files(full_path,i);
......@@ -71,79 +224,359 @@ int main(int argc, char* argv[])
vector<string> info_lines;
populate(files,proteins,info_lines);
// vector<string> tags = u.get_tags(full_path,i);
vector<string> tags1;
vector<int> tags2;
vector<int> tags3;
u.get_tags(full_path,i,tags1,tags2,tags3);
cout<<"size of files = "<<files.size()<<endl;
cout<<"size of tags1 = "<<tags1.size()<<endl;
cout<<"size of tags2 = "<<tags2.size()<<endl;
cout<<"size of tags3 = "<<tags3.size()<<endl;
vector<int> gids; //for each gene, counts the number of times that gene has a k-mer that repeats elsewhere in the genome (including elsewhere in the same gene)
vector<int> chits; //conserved hits.
vector<bool> gdels;
vector<bool> keepers;
//Note: keepers > gdels.
//If a protein is marked for deletion in gdels but marked to be kept in keepers, the protein is kept.
vector<int> gids;
for(int i=0; i<proteins.size(); i++)
for(int j=0; j<proteins.size(); j++)
{
gids.push_back(0);
chits.push_back(0);
gdels.push_back(false);
original_size.at(i)=proteins.size();
keepers.push_back(false);
}
//At this stage, all proteins and their info lines have been read in and are stored in the info_lines and proteins vectors. Similarly, all tags have been read in and are stored in tags.
//populate keepers
ifstream keepstream;
string keepfile=path+"/"+bfn+"/SAVEDORGS/"+bfn+"_"+u.int_to_string(i)+"_saved_orgs.txt";
keepstream.open(keepfile.c_str());
if(keepstream.fail())
{
cout<<"Failed to open keepstream to keepfile="<<keepfile<<endl;
exit(1);
}
string eater;
while(eater!="pvv.size()")
{
keepstream>>eater;
}
keepstream>>eater; //eat the size
if(u.int_to_string(proteins.size())!=eater)
{
cout<<"Size mismatch between pvv.size() from saved_orgs file and current proteins vector."<<endl;
cout<<"pvv.size() = "<<eater<<endl;
cout<<"proteins.size() = "<<proteins.size()<<endl;
exit(1);
}
int lindex=0;
int rindex=1;
for(int j=0; j<proteins.size(); j++)
{
int pnum;
bool keep_p;
keepstream>>pnum>>keep_p;
if(keep_p)
{
keepers.at(j)=true;
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"keeper at j="<<j<<endl;
}
}
}
//For each gene, count the number of times it has a k-mer in common with a k-mer in the reference set.
map<string,int>::iterator it;
for(int j=0; j<tags1.size(); j++)
{
it=conserved_kmers.find(tags1.at(j));
if(it!=conserved_kmers.end()) //if the current tag exists in the conserved tags map
{
chits.at(tags3.at(j))+=conserved_kmers[tags1.at(j)]; //increment conserved hits for that gene by the number of times that sequence exists in the reference set.
}
}
while(lindex<tags1.size() && rindex<tags1.size())
//At this stage, all proteins and their info lines have been read in
//and are stored in the info_lines and proteins vectors. Similarly,
//all tags have been read in and are stored in tags
int lindex=0; //left index
int rindex=1; //right index
while(lindex<tags1.size() && rindex<tags1.size()) //Go through all the tags and mark genes in gids vector that have k-mers repeating elsewhere.
{
if(tags1.at(lindex)[tag_length-1]!='^')
{
while(rindex<tags1.size() && tags1.at(lindex)==tags1.at(rindex))
while(rindex<tags1.size() && match(tags1.at(lindex),tags1.at(rindex)))
{
rindex++;
}
}
int findex=rindex-1;
if(findex>lindex) //no cluster.
if(findex>lindex)
{
cout<<"Matches: "<<endl;
for(int j=lindex; j<=findex; j++)
for(int tag_index=lindex; tag_index<=findex; tag_index++)
{
int gid=tags3.at(j);
gids.at(gid)++;
cout<<tags1.at(j)<<endl;
cout<<info_lines.at(gid)<<endl;
int gid_index=tags3.at(tag_index); //Passing through tags, not genes.
//Therefore, lindex/findex/rindex correspond to index of the list of tags.
//gindex = index of the gene from which the repeating k-mer comes.
gids.at(gid_index)++;
}
cout<<endl;
}
lindex=rindex;
rindex=lindex+1;
}
cout<<"Gids vector: "<<endl;
for(int i=0; i<gids.size(); i++)
if(write_out_mefiltered)
{
cout<<i<<" "<<gids.at(i)<<endl;
//write out gids vector
mefstream<<"gids vector: "<<endl;
mefstream<<"[gene id] [gid value] [info line]"<<endl;
for(int j=0; j<gids.size(); j++)
{
mefstream<<"gene="<<j<<" "<<gids.at(j)<<" "<<info_lines.at(j)<<endl;
}
//write out chits vector
mefstream<<endl<<"chits vector: "<<endl;
mefstream<<"[gene id] [chits value] [info line]"<<endl;
for(int j=0; j<chits.size(); j++)
{
mefstream<<"gene="<<j<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
mefstream<<endl;
}
int ME_count=0;
for(int i=0; i<gids.size(); i++)
//GENES WITH GIDS>0 TO THROW AWAY
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"DEFINITELY THROW AWAY:"<<endl;
cout<<"[gene number] [gids value] [chits value] [info line]"<<endl;
}
if(write_out_mefiltered)
{
mefstream<<endl<<"DEFINITELY THROW AWAY:"<<endl;
mefstream<<"[gene number] [gids value] [chits value] [info line]"<<endl;
}
for(int j=0; j<gids.size(); j++)
{
if(gids.at(i)>0)// && gids.at(i)>=(double)proteins.at(i).length()/2.0)
if(gids.at(j)>=(chits.at(j)*CONSERVE+THRESHOLD))
{
gdels.at(j)=true;
ME_count++;
cout<<i<<" "<<gids.at(i)<<" "<<info_lines.at(i)<<endl;
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"gene="<<j<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
if(write_out_mefiltered)
{
mefstream<<"gene="<<j<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
}
else if(gids.at(j)>=(chits.at(j)*CONSERVE+THRESHOLD-THRESHOLD_GRAY))
{
gdels.at(j)=true;
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"GRAY ZONE: gene="<<j<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
if(write_out_mefiltered)
{
mefstream<<"GRAY ZONE: gene="<<j<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
}
}
ME_removed.at(i)=ME_count;
cout<<"ME_count="<<ME_count<<endl;
cout<<"total="<<gids.size()<<endl;
cout<<"fraction = "<<(double)ME_count/(double)gids.size()<<endl;
exit(1);
//GENES WITH GIDS>0 TO NOT THROW AWAY
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"DESPITE HAVING HITS IN GIDS (I.E. EXHIBITS SOME HINTS OF UNUSUAL COPY NUMBER) DON'T THROW AWAY:"<<endl;
cout<<"[gene number] [gids value] [chits value] [info line]"<<endl;
}
if(write_out_mefiltered)
{
mefstream<<endl<<"DESPITE HAVING HITS IN GIDS (I.E. EXHIBITS SOME HINTS OF UNUSUAL COPY NUMBER) DON'T THROW AWAY:"<<endl;
mefstream<<"[gene number] [gids value] [chits value] [info line]"<<endl;
}
for(int j=0; j<gids.size(); j++)
{
if(gids.at(j)>0 && gids.at(j)<(chits.at(j)*CONSERVE+THRESHOLD-THRESHOLD_GRAY))
{
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"GRAY: gene="<<i<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
if(write_out_mefiltered)
{
mefstream<<"GRAY: gene="<<i<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
}
else if(gids.at(j)>0 && gids.at(j)<(chits.at(j)*CONSERVE+THRESHOLD))
{
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"gene="<<j<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
if(write_out_mefiltered)
{
mefstream<<"gene="<<j<<": "<<gids.at(j)<<" "<<chits.at(j)<<" "<<info_lines.at(j)<<endl;
}
}
}
bool del_all=false;
if(ME_count>=0.95*(double)gids.size())
{
del_all=true;
flag_discrepancies.push_back(i);
}
if(MEF_DEBUG || i==CHECK_ORG)
{
cout<<"ME_count="<<ME_count<<endl;
cout<<"total="<<gids.size()<<endl;
cout<<"fraction = "<<(double)ME_count/(double)gids.size()<<endl;
}
logstream<<"i="<<i<<endl;
logstream<<"ME_count="<<ME_count<<endl;
logstream<<"total="<<gids.size()<<endl;
logstream<<"fraction = "<<(double)ME_count/(double)gids.size()<<endl;
logstream<<"del_all = "<<del_all<<endl;
cout<<"Writing out reduced (mobile elements removed) proteome for i="<<i<<endl;
string faa_out;
string faa_out2;
if(i<ref_set_size) //if current organism is a reference org
{
faa_out=path+"/"+bfn+"/"+"/FAA_ref_mobelim/"+orgdir+"/mobelim.faa";
// faa_out2=path+"/"+bfn+"/"+"/FAA_ref_mobkept/"+orgdir+"/mobkept.faa";
faa_out2="/sbg/zo_mega_d1/rbromb/Project1/RUNS/20160216_Ecoli/mobs_unc3/FAA_ncsv_10_3/"+orgdir+"/mobkept.faa";
}
else
{
faa_out=path+"/"+bfn+"/"+"/FAA_mobelim/"+orgdir+"/mobelim.faa";
// faa_out2=path+"/"+bfn+"/"+"/FAA_mobkept/"+orgdir+"/mobkept.faa";
faa_out2="/sbg/zo_mega_d1/rbromb/Project1/RUNS/20160216_Ecoli/mobs_unc3/FAA_ncsv_10_3/"+orgdir+"/mobkept.faa";
}
if(write_out_mefiltered)
{
mefstream<<endl;
}