Commit b0131d06 authored by Raquel Bromberg

readme.txt updated. Examples directory modified. Output statements from...

readme.txt updated. Examples directory modified. Output statements from programs reduced. All programs and files not involved in the core SlopeTree routines removed. Some minor code cleanup.
parent a107015b
...@@ -9,4 +9,18 @@ mtax ...@@ -9,4 +9,18 @@ mtax
rapidnj rapidnj
sttag sttag
tmerg tmerg
archaea_run_no_filtering.sh
gc.prt
script_cm.sh
script_stmif.sh
buildconcat.cpp
genome_mapped.txt
readme_conflict.txt
script_do_sttagg.sh
script_strun.sh
buildconcat_output_aligned.txt
merge_hgt_pairs_files.cpp
script_do_tmerg.sh
buildconcat_output.txt
rzs.cpp
script_filter.sh
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
>gi|189485766|ref|YP_001956706.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIIIELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYEPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKAQLIKKTKFTGNPYFKNPAEFTKFAKETTKKEIEISIGQKVMTSMPTKYGAKD
VITFSILNNKMFAYPNDKRKNAKVKMDIRFSNDTVKTFDLYRGQLETRGKGYGQLTTTHAKILLAVIHIW
QEQGCKFADEGQACYVNTTVREIAKKLGYKSFSGADHERLLQKTKDLADLPMVIADRKEAHSFTILYDVS
THVIEDSEGNDTNRRTISIVFNPFIAKQLYDRKVILRKPQCYKIKNPTAIKFLLCYDKRIIKGFNLKMNI
SEVANDLEIETNKTSNIITNIKNAFQELNGYELNDSYSLHVELVKEGKEWIVIAERISKEKQQPLKALLA
>gi|189485767|ref|YP_001956707.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKLICFVMLGCFVLTFVSCGKRSAMLQDVASVKIEEVKLENGENKTPPPDDPAPEDPETITKADRRVGI
IAGMACMAIAFTGLAIYWMCCGSRKPFEFQVPQDEQNEKPDSEIPKLDNEVPEKPLVDANQDKSELNA
>gi|189485768|ref|YP_001956708.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKVICVYLTLVLFLTGCENLKSQIRGNQNTDASADQQTGNIIKSDNNGIIEKEETKKLDDKVIGLVV
GDEKEKTSSKEPELRWQDKAKKYAGDAVHFLYEHKEQVYFVGIAVIYLLWKERLFQQQNHQDQMQWHLAQ
IQQLQQAQDQIQQMPVQDQIQYLQQLQNHFLQAQFNLQQLNQDPNLIQQFQQVQDQIQNGIGQLQAQVQQ
QIQPQLAQLQLG
>gi|189485769|ref|YP_001956709.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485770|ref|YP_001956710.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKALCVYLTIVLFLTGCENLKSRIRGNQNTDASADQQTGNIIKSGNNGIIEKEETKELDGKSFGLDA
NGNGSSLQGENTEKETPSEEPELMWQDKAKKYAGDAGHFLYEHKGKIAFAVVVVAGVYWFGIRPINGLPA
QPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPLLPV
QPLLHVRPEVLQLRQEQALQQYQQQQHEIEELFVPDAVDLFAALGADMTVLDVMLSELRSQITQLYRLQV
RQLQEQLRLQEPQAQEQLLQRQLQERLILAQERQDLLQLRINNLRRHFRQYQAMPFLQIEYLEYQHRLTQ
QLLQQAQQQVQQRQEQQLAELAEQAQQVQQRQEQQLAELAEQAQQLAEQEQQQEQQLAELAEKL
>gi|189485771|ref|YP_001956711.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MRDILMIKIKDKYKLVENIFPINNPFRSSRTVQKALKIINKNFVCNIIVLSLLLCSCGKKQALLNTAITE
KSEVVSELKSELAKIQKELKEKKEKLIEVQEELEKAIEVQEELEKENWQNLWELYDNRMGFFLLGLLTAD
LIVLGCICCTMCSCC
>gi|189485772|ref|YP_001956712.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485773|ref|YP_001956713.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIIILLVVFLMFFGCYKKPITNIPVPEEIEEIKIPDVYYFSDMFTACNCFLAISTLFFLATVMFAGYS
FLGKDFDINDYRRPVELGNRIVPQRGNNI
>gi|189485774|ref|YP_001956714.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIFAVILMLSVFLSGCHRKVITGISKNENDVPTSTNTPAPEPTSTPMPTSTPAFMDCFPKYKAPSSTP
APLDADDYYDNWTKCVLLYVDDDGNWVNGKHKHSEAHKRLLLKDL
>gi|189485776|ref|YP_001956715.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIITELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYKPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKTQLMKKTKFNGKPYFENPAEFVSIAKAASQVKETDIKNAELTSMPNKYGITNV
ASFSLLDSKFFTYPNDKRKKTKVEMPVRFSNGVVKVYNLYRGQFAINDNGYGQLTTTHAKIFFAIIHIWQ
KQNSRYADNKGFYAVVDISMRELAKQLGYQKVSGADYRRLLQRVKELVDFPMILSDGRVAHTFTFLNSAI
GRTVHQSGKNKLMLRLTLNPFISKQFYERNVILRNPQCYKIKNPTAFKFLLCYDKRVVKGNNLKLNIYEV
ANDLELNISKLCHGVSVLKVAFRELNGYELNDSYRLYVELVKEEREWIVIAERISKEKQQPLKALLA
>gi|189485777|ref|YP_001956716.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKLKKALCVYLTIVLFLTGCENLKSQIRGNQNTVASADQPTGDIIKSDNSGIIEKQKEKESDLKKIENNV
DNPNGDNGGDNNGDNPNGDNGGDNPNGDNGGDNNGDSSQHKSSSEANGKDLRGWLWDHKGSISGIVLASA
VVIGAIGFWGGQPNVCAVNYFYVPQAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQV
QVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQGNQN
PNLVYFDIHSGRYIPGNQNPNLVYFDIHSGRYIPIVRGNPNPILVVQGNLYQVQGNESPFPVLLVEENQV
QVPAVQELPSQQQLQRIQNRLDQLQGQLRNLESHAIFQFQQIQVEAEALQLHNEQYIQLIGSIVDDPLYP
QLNLQFQQICQILRHSHSLQLYDIGFHIFLMREILDHHLHLTARNLPDVYGPPPRWEALRVQEQLERHNQ
QLNQLYQQIEGRLVIEQSVLGARPDSQQRQQELREFNELAIQLQEHQRRIEALLKWLI
>gi|189485778|ref|YP_001956717.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MTAVKFKKIFAVILMLSVFLSGCQRQVITGVSKNENDVSTSTNTPVPEPTPTPTPAPEPTPTPTPVPEPT
PTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPAFMDCFPKYEAPLSTPAPLDEDDYYDNRV
KWALSHVDDDGNWVNGKHSEAPKCLLVDYYNKF
>gi|189485780|ref|YP_001956718.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIKTELTEKIPTIWLEPVKEESFKDDILMLNVPNRYYAEKYKTDFKELIQSVIKTKIGKDIGLQCQI
ELLPEPITKDKKPKTTSKTPLIKETKFNGKPYFESPAELTEISKEISQAKEIDVDNTKLTSMPVKYSIKD
VVSFSLLNSKMFTYPNDKRKKTKVEINIRFNNGTIKPLDLYRGQLDFNDEGYGQLTTTHAKIFLAITHIW
QKQGCKFANNSYLAVVDISIRELAKQLGYQKFSGADYKRLLRKTKELADFPMILADMYEAHTFTLLYDVS
NHKLKKSRNNKNMLRILINPFIAKQLYERKVILRNPQCYKIKNPTAIKFLICYDKRIIKGNNLRLNIFEI
ANDLEVNINNITSVAENLKNAFQELNGYELNDSYSLHVELIKENKEWIVVADRVLKEKQQSLKVNCRTDI
ECEA
>gi|189485781|ref|YP_001956719.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MLSLKDNKKSFFSLMCLFFLVVCGCHSAKEFRDGYITGLADGYALSVNEFKYNAELAKIKDEFDWSKVDF
KSEMKMYLDENKDIEKKVYDDYVKNGRKV
>gi|189485782|ref|YP_001956720.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MPTPTPPLTPASPEPASQDKLKKIALFCPPLTQLMFIYGLGLVEGAIQASLPYLFSSPTIISTLSIMFAN
MPIVGAMWFALFAYRSEVSTLSNVLDWSLPALLVFGEFLYFCGVGAVSPFSHWLWSIDPV
>gi|189485766|ref|YP_001956706.1| hypothetical protein TGRD_P1-1 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIIIELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYEPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKAQLIKKTKFTGNPYFKNPAEFTKFAKETTKKEIEISIGQKVMTSMPTKYGAKD
VITFSILNNKMFAYPNDKRKNAKVKMDIRFSNDTVKTFDLYRGQLETRGKGYGQLTTTHAKILLAVIHIW
QEQGCKFADEGQACYVNTTVREIAKKLGYKSFSGADHERLLQKTKDLADLPMVIADRKEAHSFTILYDVS
THVIEDSEGNDTNRRTISIVFNPFIAKQLYDRKVILRKPQCYKIKNPTAIKFLLCYDKRIIKGFNLKMNI
SEVANDLEIETNKTSNIITNIKNAFQELNGYELNDSYSLHVELVKEGKEWIVIAERISKEKQQPLKALLA
>gi|189485767|ref|YP_001956707.1| hypothetical protein TGRD_P1-2 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKLICFVMLGCFVLTFVSCGKRSAMLQDVASVKIEEVKLENGENKTPPPDDPAPEDPETITKADRRVGI
IAGMACMAIAFTGLAIYWMCCGSRKPFEFQVPQDEQNEKPDSEIPKLDNEVPEKPLVDANQDKSELNA
>gi|189485768|ref|YP_001956708.1| hypothetical protein TGRD_P1-3 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKVICVYLTLVLFLTGCENLKSQIRGNQNTDASADQQTGNIIKSDNNGIIEKEETKKLDDKVIGLVV
GDEKEKTSSKEPELRWQDKAKKYAGDAVHFLYEHKEQVYFVGIAVIYLLWKERLFQQQNHQDQMQWHLAQ
IQQLQQAQDQIQQMPVQDQIQYLQQLQNHFLQAQFNLQQLNQDPNLIQQFQQVQDQIQNGIGQLQAQVQQ
QIQPQLAQLQLG
>gi|189485769|ref|YP_001956709.1| hypothetical protein TGRD_P1-4 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485770|ref|YP_001956710.1| hypothetical protein TGRD_P1-5 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKALCVYLTIVLFLTGCENLKSRIRGNQNTDASADQQTGNIIKSGNNGIIEKEETKELDGKSFGLDA
NGNGSSLQGENTEKETPSEEPELMWQDKAKKYAGDAGHFLYEHKGKIAFAVVVVAGVYWFGIRPINGLPA
QPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPLLPV
QPLLHVRPEVLQLRQEQALQQYQQQQHEIEELFVPDAVDLFAALGADMTVLDVMLSELRSQITQLYRLQV
RQLQEQLRLQEPQAQEQLLQRQLQERLILAQERQDLLQLRINNLRRHFRQYQAMPFLQIEYLEYQHRLTQ
QLLQQAQQQVQQRQEQQLAELAEQAQQVQQRQEQQLAELAEQAQQLAEQEQQQEQQLAELAEKL
>gi|189485771|ref|YP_001956711.1| hypothetical protein TGRD_P1-6 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MRDILMIKIKDKYKLVENIFPINNPFRSSRTVQKALKIINKNFVCNIIVLSLLLCSCGKKQALLNTAITE
KSEVVSELKSELAKIQKELKEKKEKLIEVQEELEKAIEVQEELEKENWQNLWELYDNRMGFFLLGLLTAD
LIVLGCICCTMCSCC
>gi|189485772|ref|YP_001956712.1| hypothetical protein TGRD_P1-7 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485773|ref|YP_001956713.1| hypothetical protein TGRD_P1-8 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIIILLVVFLMFFGCYKKPITNIPVPEEIEEIKIPDVYYFSDMFTACNCFLAISTLFFLATVMFAGYS
FLGKDFDINDYRRPVELGNRIVPQRGNNI
>gi|189485774|ref|YP_001956714.1| hypothetical protein TGRD_P1-9 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIFAVILMLSVFLSGCHRKVITGISKNENDVPTSTNTPAPEPTSTPMPTSTPAFMDCFPKYKAPSSTP
APLDADDYYDNWTKCVLLYVDDDGNWVNGKHKHSEAHKRLLLKDL
>gi|189485776|ref|YP_001956715.1| hypothetical protein TGRD_P2-1 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIITELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYKPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKTQLMKKTKFNGKPYFENPAEFVSIAKAASQVKETDIKNAELTSMPNKYGITNV
ASFSLLDSKFFTYPNDKRKKTKVEMPVRFSNGVVKVYNLYRGQFAINDNGYGQLTTTHAKIFFAIIHIWQ
KQNSRYADNKGFYAVVDISMRELAKQLGYQKVSGADYRRLLQRVKELVDFPMILSDGRVAHTFTFLNSAI
GRTVHQSGKNKLMLRLTLNPFISKQFYERNVILRNPQCYKIKNPTAFKFLLCYDKRVVKGNNLKLNIYEV
ANDLELNISKLCHGVSVLKVAFRELNGYELNDSYRLYVELVKEEREWIVIAERISKEKQQPLKALLA
>gi|189485777|ref|YP_001956716.1| hypothetical protein TGRD_P2-2 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKLKKALCVYLTIVLFLTGCENLKSQIRGNQNTVASADQPTGDIIKSDNSGIIEKQKEKESDLKKIENNV
DNPNGDNGGDNNGDNPNGDNGGDNPNGDNGGDNNGDSSQHKSSSEANGKDLRGWLWDHKGSISGIVLASA
VVIGAIGFWGGQPNVCAVNYFYVPQAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQV
QVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQGNQN
PNLVYFDIHSGRYIPGNQNPNLVYFDIHSGRYIPIVRGNPNPILVVQGNLYQVQGNESPFPVLLVEENQV
QVPAVQELPSQQQLQRIQNRLDQLQGQLRNLESHAIFQFQQIQVEAEALQLHNEQYIQLIGSIVDDPLYP
QLNLQFQQICQILRHSHSLQLYDIGFHIFLMREILDHHLHLTARNLPDVYGPPPRWEALRVQEQLERHNQ
QLNQLYQQIEGRLVIEQSVLGARPDSQQRQQELREFNELAIQLQEHQRRIEALLKWLI
>gi|189485778|ref|YP_001956717.1| hypothetical protein TGRD_P2-3 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MTAVKFKKIFAVILMLSVFLSGCQRQVITGVSKNENDVSTSTNTPVPEPTPTPTPAPEPTPTPTPVPEPT
PTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPAFMDCFPKYEAPLSTPAPLDEDDYYDNRV
KWALSHVDDDGNWVNGKHSEAPKCLLVDYYNKF
>gi|189485780|ref|YP_001956718.1| hypothetical protein TGRD_P3-1 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIKTELTEKIPTIWLEPVKEESFKDDILMLNVPNRYYAEKYKTDFKELIQSVIKTKIGKDIGLQCQI
ELLPEPITKDKKPKTTSKTPLIKETKFNGKPYFESPAELTEISKEISQAKEIDVDNTKLTSMPVKYSIKD
VVSFSLLNSKMFTYPNDKRKKTKVEINIRFNNGTIKPLDLYRGQLDFNDEGYGQLTTTHAKIFLAITHIW
QKQGCKFANNSYLAVVDISIRELAKQLGYQKFSGADYKRLLRKTKELADFPMILADMYEAHTFTLLYDVS
NHKLKKSRNNKNMLRILINPFIAKQLYERKVILRNPQCYKIKNPTAIKFLICYDKRIIKGNNLRLNIFEI
ANDLEVNINNITSVAENLKNAFQELNGYELNDSYSLHVELIKENKEWIVVADRVLKEKQQSLKVNCRTDI
ECEA
>gi|189485781|ref|YP_001956719.1| hypothetical protein TGRD_P3-2 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MLSLKDNKKSFFSLMCLFFLVVCGCHSAKEFRDGYITGLADGYALSVNEFKYNAELAKIKDEFDWSKVDF
KSEMKMYLDENKDIEKKVYDDYVKNGRKV
>gi|189485782|ref|YP_001956720.1| hypothetical protein TGRD_P3-3 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MPTPTPPLTPASPEPASQDKLKKIALFCPPLTQLMFIYGLGLVEGAIQASLPYLFSSPTIISTLSIMFAN
MPIVGAMWFALFAYRSEVSTLSNVLDWSLPALLVFGEFLYFCGVGAVSPFSHWLWSIDPV
#!/bin/sh
# Runs ./mif on one input directory. The later steps listed at the bottom are
# left commented out, exactly as before.
# $1 = full path to directory containing FAA directory
# $2 = tag length
#
# The arguments are quoted so that a path containing spaces reaches ./mif as a
# single word. The ${n:+...} form passes an argument only when it is set and
# non-empty, so the number of arguments ./mif receives is the same as with the
# previous unquoted expansion.
./mif ${1:+"$1"} ${2:+"$2"}
#./sttag -p $1 -s B -f 1
#./tmerg $1 $2
#./filt -p $1 -f 10
#./cm -p $1 -f -1 -o -1
#./mdist $1
#./fh $1
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <omp.h>
#include "util.h"
using namespace std;
// Width, in characters, of the sliding window that find_max_overlaps() takes
// from each of the two sequences it compares.
const int MATCH_LENGTH=10;
// Minimum number of identical positions within a pair of windows for
// find_max_overlaps() to count that pair as a match.
const int MATCH_CUTOFF=8;
// Protein set of one genome directory. main() builds one cset per
// csv_proteins.faa file it reads.
struct cset
{
// One entry per '>' record of the file, in file order; each entry is the
// record's sequence text with all whitespace removed.
vector<string> cpv;
};
int find_max_overlaps(vector<string> cpv, string s);
int main(int argc, char* argv[])
{
if(argc<2)
{
cout<<"Wrong use of function. ./bc <path to FAA_csv_x_y>"<<endl;
exit(1);
}
string path=argv[1];
util u;
vector<cset> cprots;
vector<string> directories;
directories = u.load_names(path.c_str());
for(int i=0; i<directories.size(); i++)
{
cset cset_temp;
ifstream instream;
instream.open( (path+"/"+directories.at(i)+"/csv_proteins.faa").c_str());
if(instream.fail())
{
cout<<"Failed to open instream to csv_proteins.faa"<<endl;
exit(1);
}
string eater;
instream>>eater;
while(!instream.eof())
{
if(eater[0]=='>')
{
char buffer[1000];
instream.getline(buffer,1000);
string info_line = eater+" "+buffer;
instream>>eater; //first part of protein
string ps;
while(!instream.eof() && eater[0]!='>')
{
ps += eater;
instream>>eater;
}
cset_temp.cpv.push_back(ps);
}
}
cprots.push_back(cset_temp);
instream.close();
}
cout<<"Size of cprots = "<<cprots.size()<<endl;
vector<string> concatenated;
string s1;
for(int i=0; i<cprots.at(0).cpv.size(); i++)
{
s1+=cprots.at(0).cpv.at(i);
}
cout<<s1<<endl;
#pragma omp parallel for
for(int i=1; i<cprots.size(); i++)
{
string s2;
for(int j=0; j<cprots.at(0).cpv.size(); j++)
{
int index = find_max_overlaps(cprots.at(i).cpv,cprots.at(0).cpv.at(j));
s2+=cprots.at(i).cpv.at(index);
}
#pragma omp critical
concatenated.push_back(s2);
cout<<"done with i="<<i<<"/"<<cprots.size()<<endl;
}
ofstream outstream;
outstream.open("buildconcat_output.txt");
for(int i=0; i<concatenated.size(); i++)
{
cout<<">gi| "<<directories.at(i)<<endl<<concatenated.at(i)<<endl;
outstream<<">gi| "<<directories.at(i)<<endl<<concatenated.at(i)<<endl;
}
outstream.close();
return 0;
}
/*
 * Returns the index of the string in `cpv` that shares the most
 * near-identical windows with `s`.
 *
 * For every start position j in a candidate and every start position k in
 * `s`, the windows of up to MATCH_LENGTH characters beginning there are
 * compared position by position; the pair counts as a match when at least
 * MATCH_CUTOFF positions are equal. The candidate with the highest number of
 * matching pairs wins, and ties go to the lowest index.
 *
 * Windows near the end of either string are shorter than MATCH_LENGTH. Only
 * the positions present in both windows are compared; the previous version
 * indexed past the end of the shorter window. Windows too short to reach
 * MATCH_CUTOFF are skipped, since they can never count as a match.
 *
 * Throws std::out_of_range when `cpv` is empty, as before.
 */
int find_max_overlaps(vector<string> cpv, string s)
{
    const size_t window = static_cast<size_t>(MATCH_LENGTH);
    // A non-positive cutoff means every window pair matches; clamp it to 0 so
    // the early exits below never fire in that case.
    const size_t cutoff = MATCH_CUTOFF>0 ? static_cast<size_t>(MATCH_CUTOFF) : 0;

    vector<int> mv(cpv.size(), 0);
    for(size_t i=0; i<cpv.size(); i++)
    {
        const string& candidate = cpv[i];
        for(size_t j=0; j<candidate.length(); j++)
        {
            const size_t cand_left = candidate.length()-j;
            const size_t cand_len = cand_left<window ? cand_left : window;
            if(cand_len<cutoff)
            {
                break; // windows only get shorter from here on
            }
            for(size_t k=0; k<s.length(); k++)
            {
                const size_t query_left = s.length()-k;
                const size_t query_len = query_left<window ? query_left : window;
                const size_t overlap = query_len<cand_len ? query_len : cand_len;
                if(overlap<cutoff)
                {
                    break; // later query windows are no longer than this one
                }
                int matches=0;
                for(size_t f=0; f<overlap; f++)
                {
                    if(s[k+f]==candidate[j+f])
                    {
                        matches++;
                    }
                }
                if(matches>=MATCH_CUTOFF)
                {
                    mv[i]++;
                }
            }
        }
    }

    int result_index=0;
    int max=mv.at(0); // throws std::out_of_range when cpv is empty
    for(size_t i=1; i<mv.size(); i++)
    {
        if(mv[i]>max)
        {
            max=mv[i];
            result_index=static_cast<int>(i);
        }
    }
    return result_index;
}
This diff is collapsed.
This diff is collapsed.
#!/bin/sh #!/bin/sh
# $1 = full path to directory containing FAA directory # $1 = full path to directory containing FAA directory
# $2 = tag length # $2 = tag length
# $3 = 'B' for bacteria, 'A' for archaea, 'O' for other # $3 = 'B' for bacteria, 'A' for archaea, 'O' (capital o, not zero) for Other
# $4 = path to directory containing names.dmp and nodes.dmp # $4 = path to directory containing names.dmp and nodes.dmp
./mif $1 $2 echo "Number of parameters = "$#
if [ "$#" -ne 4 ]; then
echo "Illegal number of parameters"
exit
fi
echo "Input okay. Bash script proceeding..."
./mif $1 -k $2
./sttag $1 -f 1 -s $3 ./sttag $1 -f 1 -s $3
wait wait
./tmerg $1 ./tmerg $1
......
...@@ -6,48 +6,86 @@ ...@@ -6,48 +6,86 @@
using namespace std; using namespace std;
const int TAGLENGTHCHECKER=20; const int TAGLENGTHCHECKER=20;
const bool DEBUG_FI=true;
class file_info class file_info
{ {
private: private:
bool verbose;
public: public:
file_info(); file_info();
~file_info(); ~file_info();
void read_line(); void read_line();
void set_file_name(string file_name_par); void set_file_name(string file_name_par);
char get_caa(); // char get_caa();
void delete_pFile(); // void delete_pFile();
string get_current_line(); string get_current_line();
void open(string path); // void open(string path_par);
string file_name; void set_path(string path_par);
void open();
void set_position();
// string file_name;
string path;
int total_tags; int total_tags;
int strings_read_in; int strings_read_in;
string s; string s;
string last_read;
int gnum; int gnum;
int gene_id; int gene_id;
FILE * pFile; FILE * pFile;
fpos_t position; //so that the files can be closed and then reopened
}; };
file_info::file_info() file_info::file_info()
{ {
cout<<"In constructor"<<endl; if(DEBUG_FI || verbose)
{
cout<<"In file_info::file_info()"<<endl;
}
verbose=false;
} }
void file_info::open(string path) file_info::~file_info()
{ {
cout<<"In open. path = "<<path<<endl; cout<<"Top of file_info destructor."<<endl;
// fclose(pFile);
cout<<"Closed the file."<<endl;
//delete pFile;
cout<<"Bottom of file_info destructor."<<endl;
}
/********************************
Open a TAGS file, e.g. TAGS0.txt,
files found in TAGS/ or TAGSscr/.
Read in the number of k-mers (actually 3-tuples) in the file, which
is the first value in any tags file.
Point position to the first sequence in the file.
Then close the file.
*******************************/
void file_info::set_path(string path_par)
{
path = path_par;
if(DEBUG_FI || verbose)
{
cout<<"In file_info::set_path(string)"<<endl;
cout<<"path="<<path<<endl;
}
pFile = fopen( path.c_str(), "r"); pFile = fopen( path.c_str(), "r");
if(pFile==NULL) if(pFile==NULL)
{ {
perror("Error opening file"); perror("Error opening file");
exit(1);
} }
else else
{ {
char buffer[200]; char buffer[200];
if(fgets(buffer,200,pFile) != NULL) if(fgets(buffer,200,pFile) != NULL) //Read in the total size of the file (first value in a TAGS*.txt file
{ {
total_tags = atoi(buffer); total_tags = atoi(buffer);
cout<<"total_tags = "<<total_tags<<endl;
} }
else else
{ {
...@@ -55,26 +93,74 @@ void file_info::open(string path) ...@@ -55,26 +93,74 @@ void file_info::open(string path)
exit(1); exit(1);
} }
} }
strings_read_in=0; strings_read_in=0;
fgetpos(pFile,&position); //point to the first sequence in the file.
// read_line();
if(DEBUG_FI || verbose)
{
cout<<"total_tags in current TAGS file = "<<total_tags<<endl;
cout<<"End of file_info::set_path(string)"<<endl;
}
fclose(pFile);
}
void file_info::open()
{
if(DEBUG_FI || verbose)
{
cout<<"In file_info::open(string)"<<endl;
}
pFile = fopen( path.c_str(), "r");
if(pFile==NULL)
{
perror("Error opening file");
exit(1);
}
else
{
fsetpos(pFile,&position); //set the position to the last place we read from.
}
// strings_read_in=0;
// fgetpos(pFile,&position); //point to the first sequence in the file.
read_line(); read_line();
cout<<"end of open"<<endl;
if(s==last_read)
{
strings_read_in--;
}
if(DEBUG_FI || verbose)
{
cout<<"total_tags in current TAGS file = "<<total_tags<<endl;
cout<<"End of file_info::open(string)"<<endl;
}
} }
file_info::~file_info() void file_info::set_position()
{ {
cout<<"Top of file_info destructor."<<endl; fgetpos(pFile,&position);
fclose(pFile); fclose(pFile);
cout<<"Closed the file."<<endl;
//delete pFile;
cout<<"Bottom of file_info destructor."<<endl;
} }
/**************************
Literally, just read the next line into variable s.
Strings shorter than TAGLENGTHCHECKER (default=20)
cause the program to exit.
***************************/
void file_info::read_line() void file_info::read_line()
{ {
char buffer[200]; char buffer[200];
if(pFile==NULL) if(pFile==NULL)
{ {
perror("Error opening file"); perror("perror in file_info::read_line(): Error opening file - pFile==NULL");
exit(1);
} }
else else
{ {
...@@ -90,6 +176,8 @@ void file_info::read_line() ...@@ -90,6 +176,8 @@ void file_info::read_line()
cout<<"Big problem in little read_line"<<endl; cout<<"Big problem in little read_line"<<endl;
exit(1); exit(1);
} }
last_read=s;
} }
string file_info::get_current_line() string file_info::get_current_line()
...@@ -97,12 +185,12 @@ string file_info::get_current_line() ...@@ -97,12 +185,12 @@ string file_info::get_current_line()
return s; return s;
} }