Commit b0131d06 authored by Raquel Bromberg's avatar Raquel Bromberg

readme.txt updated. Examples directory modified. Output statements from...

readme.txt updated. Examples directory modified. Output statements from programs reduced. All programs and files not involved in the core SlopeTree routines removed. Some minor code cleanup.
parent a107015b
......@@ -9,4 +9,18 @@ mtax
rapidnj
sttag
tmerg
archaea_run_no_filtering.sh
gc.prt
script_cm.sh
script_stmif.sh
buildconcat.cpp
genome_mapped.txt
readme_conflict.txt
script_do_sttagg.sh
script_strun.sh
buildconcat_output_aligned.txt
merge_hgt_pairs_files.cpp
script_do_tmerg.sh
buildconcat_output.txt
rzs.cpp
script_filter.sh
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
>gi|189485766|ref|YP_001956706.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIIIELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYEPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKAQLIKKTKFTGNPYFKNPAEFTKFAKETTKKEIEISIGQKVMTSMPTKYGAKD
VITFSILNNKMFAYPNDKRKNAKVKMDIRFSNDTVKTFDLYRGQLETRGKGYGQLTTTHAKILLAVIHIW
QEQGCKFADEGQACYVNTTVREIAKKLGYKSFSGADHERLLQKTKDLADLPMVIADRKEAHSFTILYDVS
THVIEDSEGNDTNRRTISIVFNPFIAKQLYDRKVILRKPQCYKIKNPTAIKFLLCYDKRIIKGFNLKMNI
SEVANDLEIETNKTSNIITNIKNAFQELNGYELNDSYSLHVELVKEGKEWIVIAERISKEKQQPLKALLA
>gi|189485767|ref|YP_001956707.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKLICFVMLGCFVLTFVSCGKRSAMLQDVASVKIEEVKLENGENKTPPPDDPAPEDPETITKADRRVGI
IAGMACMAIAFTGLAIYWMCCGSRKPFEFQVPQDEQNEKPDSEIPKLDNEVPEKPLVDANQDKSELNA
>gi|189485768|ref|YP_001956708.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKVICVYLTLVLFLTGCENLKSQIRGNQNTDASADQQTGNIIKSDNNGIIEKEETKKLDDKVIGLVV
GDEKEKTSSKEPELRWQDKAKKYAGDAVHFLYEHKEQVYFVGIAVIYLLWKERLFQQQNHQDQMQWHLAQ
IQQLQQAQDQIQQMPVQDQIQYLQQLQNHFLQAQFNLQQLNQDPNLIQQFQQVQDQIQNGIGQLQAQVQQ
QIQPQLAQLQLG
>gi|189485769|ref|YP_001956709.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485770|ref|YP_001956710.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKALCVYLTIVLFLTGCENLKSRIRGNQNTDASADQQTGNIIKSGNNGIIEKEETKELDGKSFGLDA
NGNGSSLQGENTEKETPSEEPELMWQDKAKKYAGDAGHFLYEHKGKIAFAVVVVAGVYWFGIRPINGLPA
QPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPLLPV
QPLLHVRPEVLQLRQEQALQQYQQQQHEIEELFVPDAVDLFAALGADMTVLDVMLSELRSQITQLYRLQV
RQLQEQLRLQEPQAQEQLLQRQLQERLILAQERQDLLQLRINNLRRHFRQYQAMPFLQIEYLEYQHRLTQ
QLLQQAQQQVQQRQEQQLAELAEQAQQVQQRQEQQLAELAEQAQQLAEQEQQQEQQLAELAEKL
>gi|189485771|ref|YP_001956711.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MRDILMIKIKDKYKLVENIFPINNPFRSSRTVQKALKIINKNFVCNIIVLSLLLCSCGKKQALLNTAITE
KSEVVSELKSELAKIQKELKEKKEKLIEVQEELEKAIEVQEELEKENWQNLWELYDNRMGFFLLGLLTAD
LIVLGCICCTMCSCC
>gi|189485772|ref|YP_001956712.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485773|ref|YP_001956713.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIIILLVVFLMFFGCYKKPITNIPVPEEIEEIKIPDVYYFSDMFTACNCFLAISTLFFLATVMFAGYS
FLGKDFDINDYRRPVELGNRIVPQRGNNI
>gi|189485774|ref|YP_001956714.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIFAVILMLSVFLSGCHRKVITGISKNENDVPTSTNTPAPEPTSTPMPTSTPAFMDCFPKYKAPSSTP
APLDADDYYDNWTKCVLLYVDDDGNWVNGKHKHSEAHKRLLLKDL
>gi|189485776|ref|YP_001956715.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIITELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYKPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKTQLMKKTKFNGKPYFENPAEFVSIAKAASQVKETDIKNAELTSMPNKYGITNV
ASFSLLDSKFFTYPNDKRKKTKVEMPVRFSNGVVKVYNLYRGQFAINDNGYGQLTTTHAKIFFAIIHIWQ
KQNSRYADNKGFYAVVDISMRELAKQLGYQKVSGADYRRLLQRVKELVDFPMILSDGRVAHTFTFLNSAI
GRTVHQSGKNKLMLRLTLNPFISKQFYERNVILRNPQCYKIKNPTAFKFLLCYDKRVVKGNNLKLNIYEV
ANDLELNISKLCHGVSVLKVAFRELNGYELNDSYRLYVELVKEEREWIVIAERISKEKQQPLKALLA
>gi|189485777|ref|YP_001956716.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKLKKALCVYLTIVLFLTGCENLKSQIRGNQNTVASADQPTGDIIKSDNSGIIEKQKEKESDLKKIENNV
DNPNGDNGGDNNGDNPNGDNGGDNPNGDNGGDNNGDSSQHKSSSEANGKDLRGWLWDHKGSISGIVLASA
VVIGAIGFWGGQPNVCAVNYFYVPQAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQV
QVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQGNQN
PNLVYFDIHSGRYIPGNQNPNLVYFDIHSGRYIPIVRGNPNPILVVQGNLYQVQGNESPFPVLLVEENQV
QVPAVQELPSQQQLQRIQNRLDQLQGQLRNLESHAIFQFQQIQVEAEALQLHNEQYIQLIGSIVDDPLYP
QLNLQFQQICQILRHSHSLQLYDIGFHIFLMREILDHHLHLTARNLPDVYGPPPRWEALRVQEQLERHNQ
QLNQLYQQIEGRLVIEQSVLGARPDSQQRQQELREFNELAIQLQEHQRRIEALLKWLI
>gi|189485778|ref|YP_001956717.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MTAVKFKKIFAVILMLSVFLSGCQRQVITGVSKNENDVSTSTNTPVPEPTPTPTPAPEPTPTPTPVPEPT
PTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPAFMDCFPKYEAPLSTPAPLDEDDYYDNRV
KWALSHVDDDGNWVNGKHSEAPKCLLVDYYNKF
>gi|189485780|ref|YP_001956718.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIKTELTEKIPTIWLEPVKEESFKDDILMLNVPNRYYAEKYKTDFKELIQSVIKTKIGKDIGLQCQI
ELLPEPITKDKKPKTTSKTPLIKETKFNGKPYFESPAELTEISKEISQAKEIDVDNTKLTSMPVKYSIKD
VVSFSLLNSKMFTYPNDKRKKTKVEINIRFNNGTIKPLDLYRGQLDFNDEGYGQLTTTHAKIFLAITHIW
QKQGCKFANNSYLAVVDISIRELAKQLGYQKFSGADYKRLLRKTKELADFPMILADMYEAHTFTLLYDVS
NHKLKKSRNNKNMLRILINPFIAKQLYERKVILRNPQCYKIKNPTAIKFLICYDKRIIKGNNLRLNIFEI
ANDLEVNINNITSVAENLKNAFQELNGYELNDSYSLHVELIKENKEWIVVADRVLKEKQQSLKVNCRTDI
ECEA
>gi|189485781|ref|YP_001956719.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MLSLKDNKKSFFSLMCLFFLVVCGCHSAKEFRDGYITGLADGYALSVNEFKYNAELAKIKDEFDWSKVDF
KSEMKMYLDENKDIEKKVYDDYVKNGRKV
>gi|189485782|ref|YP_001956720.1| hypothetical protein [uncultured Termite group 1 bacterium phylotype Rs-D17]
MPTPTPPLTPASPEPASQDKLKKIALFCPPLTQLMFIYGLGLVEGAIQASLPYLFSSPTIISTLSIMFAN
MPIVGAMWFALFAYRSEVSTLSNVLDWSLPALLVFGEFLYFCGVGAVSPFSHWLWSIDPV
>gi|189485766|ref|YP_001956706.1| hypothetical protein TGRD_P1-1 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIIIELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYEPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKAQLIKKTKFTGNPYFKNPAEFTKFAKETTKKEIEISIGQKVMTSMPTKYGAKD
VITFSILNNKMFAYPNDKRKNAKVKMDIRFSNDTVKTFDLYRGQLETRGKGYGQLTTTHAKILLAVIHIW
QEQGCKFADEGQACYVNTTVREIAKKLGYKSFSGADHERLLQKTKDLADLPMVIADRKEAHSFTILYDVS
THVIEDSEGNDTNRRTISIVFNPFIAKQLYDRKVILRKPQCYKIKNPTAIKFLLCYDKRIIKGFNLKMNI
SEVANDLEIETNKTSNIITNIKNAFQELNGYELNDSYSLHVELVKEGKEWIVIAERISKEKQQPLKALLA
>gi|189485767|ref|YP_001956707.1| hypothetical protein TGRD_P1-2 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKLICFVMLGCFVLTFVSCGKRSAMLQDVASVKIEEVKLENGENKTPPPDDPAPEDPETITKADRRVGI
IAGMACMAIAFTGLAIYWMCCGSRKPFEFQVPQDEQNEKPDSEIPKLDNEVPEKPLVDANQDKSELNA
>gi|189485768|ref|YP_001956708.1| hypothetical protein TGRD_P1-3 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKVICVYLTLVLFLTGCENLKSQIRGNQNTDASADQQTGNIIKSDNNGIIEKEETKKLDDKVIGLVV
GDEKEKTSSKEPELRWQDKAKKYAGDAVHFLYEHKEQVYFVGIAVIYLLWKERLFQQQNHQDQMQWHLAQ
IQQLQQAQDQIQQMPVQDQIQYLQQLQNHFLQAQFNLQQLNQDPNLIQQFQQVQDQIQNGIGQLQAQVQQ
QIQPQLAQLQLG
>gi|189485769|ref|YP_001956709.1| hypothetical protein TGRD_P1-4 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485770|ref|YP_001956710.1| hypothetical protein TGRD_P1-5 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKFKKALCVYLTIVLFLTGCENLKSRIRGNQNTDASADQQTGNIIKSGNNGIIEKEETKELDGKSFGLDA
NGNGSSLQGENTEKETPSEEPELMWQDKAKKYAGDAGHFLYEHKGKIAFAVVVVAGVYWFGIRPINGLPA
QPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPPSQPPAQPLLPV
QPLLHVRPEVLQLRQEQALQQYQQQQHEIEELFVPDAVDLFAALGADMTVLDVMLSELRSQITQLYRLQV
RQLQEQLRLQEPQAQEQLLQRQLQERLILAQERQDLLQLRINNLRRHFRQYQAMPFLQIEYLEYQHRLTQ
QLLQQAQQQVQQRQEQQLAELAEQAQQVQQRQEQQLAELAEQAQQLAEQEQQQEQQLAELAEKL
>gi|189485771|ref|YP_001956711.1| hypothetical protein TGRD_P1-6 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MRDILMIKIKDKYKLVENIFPINNPFRSSRTVQKALKIINKNFVCNIIVLSLLLCSCGKKQALLNTAITE
KSEVVSELKSELAKIQKELKEKKEKLIEVQEELEKAIEVQEELEKENWQNLWELYDNRMGFFLLGLLTAD
LIVLGCICCTMCSCC
>gi|189485772|ref|YP_001956712.1| hypothetical protein TGRD_P1-7 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MEYKKETDYYENDMRHTADAPKLEKRLRKSLNVFGLILLVLAVYLFGSFLLCSCYRRGVVGGVVADNNAV
LPDAVTELKSDKKLINGAGLSLSERSVVAEPFRFGVIVPVVNNDG
>gi|189485773|ref|YP_001956713.1| hypothetical protein TGRD_P1-8 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIIILLVVFLMFFGCYKKPITNIPVPEEIEEIKIPDVYYFSDMFTACNCFLAISTLFFLATVMFAGYS
FLGKDFDINDYRRPVELGNRIVPQRGNNI
>gi|189485774|ref|YP_001956714.1| hypothetical protein TGRD_P1-9 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKKIFAVILMLSVFLSGCHRKVITGISKNENDVPTSTNTPAPEPTSTPMPTSTPAFMDCFPKYKAPSSTP
APLDADDYYDNWTKCVLLYVDDDGNWVNGKHKHSEAHKRLLLKDL
>gi|189485776|ref|YP_001956715.1| hypothetical protein TGRD_P2-1 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIITELKEKIPTEWLEPIKEESFKDDILTLAVPDTYYAQKYETDYKPLIQNILKAKTGKAVGLQFQI
VASKLLAEPVIKDEKPKTQLMKKTKFNGKPYFENPAEFVSIAKAASQVKETDIKNAELTSMPNKYGITNV
ASFSLLDSKFFTYPNDKRKKTKVEMPVRFSNGVVKVYNLYRGQFAINDNGYGQLTTTHAKIFFAIIHIWQ
KQNSRYADNKGFYAVVDISMRELAKQLGYQKVSGADYRRLLQRVKELVDFPMILSDGRVAHTFTFLNSAI
GRTVHQSGKNKLMLRLTLNPFISKQFYERNVILRNPQCYKIKNPTAFKFLLCYDKRVVKGNNLKLNIYEV
ANDLELNISKLCHGVSVLKVAFRELNGYELNDSYRLYVELVKEEREWIVIAERISKEKQQPLKALLA
>gi|189485777|ref|YP_001956716.1| hypothetical protein TGRD_P2-2 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MKLKKALCVYLTIVLFLTGCENLKSQIRGNQNTVASADQPTGDIIKSDNSGIIEKQKEKESDLKKIENNV
DNPNGDNGGDNNGDNPNGDNGGDNPNGDNGGDNNGDSSQHKSSSEANGKDLRGWLWDHKGSISGIVLASA
VVIGAIGFWGGQPNVCAVNYFYVPQAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQV
QVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQENQAQVQVPAVQGNQN
PNLVYFDIHSGRYIPGNQNPNLVYFDIHSGRYIPIVRGNPNPILVVQGNLYQVQGNESPFPVLLVEENQV
QVPAVQELPSQQQLQRIQNRLDQLQGQLRNLESHAIFQFQQIQVEAEALQLHNEQYIQLIGSIVDDPLYP
QLNLQFQQICQILRHSHSLQLYDIGFHIFLMREILDHHLHLTARNLPDVYGPPPRWEALRVQEQLERHNQ
QLNQLYQQIEGRLVIEQSVLGARPDSQQRQQELREFNELAIQLQEHQRRIEALLKWLI
>gi|189485778|ref|YP_001956717.1| hypothetical protein TGRD_P2-3 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MTAVKFKKIFAVILMLSVFLSGCQRQVITGVSKNENDVSTSTNTPVPEPTPTPTPAPEPTPTPTPVPEPT
PTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPTPTPVPEPTPAFMDCFPKYEAPLSTPAPLDEDDYYDNRV
KWALSHVDDDGNWVNGKHSEAPKCLLVDYYNKF
>gi|189485780|ref|YP_001956718.1| hypothetical protein TGRD_P3-1 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MWNEIKTELTEKIPTIWLEPVKEESFKDDILMLNVPNRYYAEKYKTDFKELIQSVIKTKIGKDIGLQCQI
ELLPEPITKDKKPKTTSKTPLIKETKFNGKPYFESPAELTEISKEISQAKEIDVDNTKLTSMPVKYSIKD
VVSFSLLNSKMFTYPNDKRKKTKVEINIRFNNGTIKPLDLYRGQLDFNDEGYGQLTTTHAKIFLAITHIW
QKQGCKFANNSYLAVVDISIRELAKQLGYQKFSGADYKRLLRKTKELADFPMILADMYEAHTFTLLYDVS
NHKLKKSRNNKNMLRILINPFIAKQLYERKVILRNPQCYKIKNPTAIKFLICYDKRIIKGNNLRLNIFEI
ANDLEVNINNITSVAENLKNAFQELNGYELNDSYSLHVELIKENKEWIVVADRVLKEKQQSLKVNCRTDI
ECEA
>gi|189485781|ref|YP_001956719.1| hypothetical protein TGRD_P3-2 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MLSLKDNKKSFFSLMCLFFLVVCGCHSAKEFRDGYITGLADGYALSVNEFKYNAELAKIKDEFDWSKVDF
KSEMKMYLDENKDIEKKVYDDYVKNGRKV
>gi|189485782|ref|YP_001956720.1| hypothetical protein TGRD_P3-3 [uncultured Termite group 1 bacterium phylotype Rs-D17]
MPTPTPPLTPASPEPASQDKLKKIALFCPPLTQLMFIYGLGLVEGAIQASLPYLFSSPTIISTLSIMFAN
MPIVGAMWFALFAYRSEVSTLSNVLDWSLPALLVFGEFLYFCGVGAVSPFSHWLWSIDPV
#!/bin/sh
# Runs ./mif on the given directory; the remaining commands below are kept
# commented out, as in the original script.
# $1 = full path to directory containing FAA directory
# $2 = tag length
if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <path to directory containing FAA directory> <tag length>" >&2
    exit 1
fi
# Quote the arguments so a path containing spaces reaches mif as a single word.
./mif "$1" "$2"
#./sttag -p $1 -s B -f 1
#./tmerg $1 $2
#./filt -p $1 -f 10
#./cm -p $1 -f -1 -o -1
#./mdist $1
#./fh $1
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <omp.h>
#include "util.h"
using namespace std;
// Maximum length of the windows that find_max_overlaps() cuts out of each sequence.
const int MATCH_LENGTH=10;
// Minimum number of identical positions for a pair of windows to be counted as an overlap.
const int MATCH_CUTOFF=8;
// Holds the protein sequences loaded from one csv_proteins.faa file.
struct cset
{
    // One entry per protein record; main() stores only the sequence text here,
    // not the '>' header line.
    std::vector<std::string> cpv;
};
// Returns the index of the entry in cpv that has the most near-matching windows against s.
int find_max_overlaps(vector<string> cpv, string s);
int main(int argc, char* argv[])
{
if(argc<2)
{
cout<<"Wrong use of function. ./bc <path to FAA_csv_x_y>"<<endl;
exit(1);
}
string path=argv[1];
util u;
vector<cset> cprots;
vector<string> directories;
directories = u.load_names(path.c_str());
for(int i=0; i<directories.size(); i++)
{
cset cset_temp;
ifstream instream;
instream.open( (path+"/"+directories.at(i)+"/csv_proteins.faa").c_str());
if(instream.fail())
{
cout<<"Failed to open instream to csv_proteins.faa"<<endl;
exit(1);
}
string eater;
instream>>eater;
while(!instream.eof())
{
if(eater[0]=='>')
{
char buffer[1000];
instream.getline(buffer,1000);
string info_line = eater+" "+buffer;
instream>>eater; //first part of protein
string ps;
while(!instream.eof() && eater[0]!='>')
{
ps += eater;
instream>>eater;
}
cset_temp.cpv.push_back(ps);
}
}
cprots.push_back(cset_temp);
instream.close();
}
cout<<"Size of cprots = "<<cprots.size()<<endl;
vector<string> concatenated;
string s1;
for(int i=0; i<cprots.at(0).cpv.size(); i++)
{
s1+=cprots.at(0).cpv.at(i);
}
cout<<s1<<endl;
#pragma omp parallel for
for(int i=1; i<cprots.size(); i++)
{
string s2;
for(int j=0; j<cprots.at(0).cpv.size(); j++)
{
int index = find_max_overlaps(cprots.at(i).cpv,cprots.at(0).cpv.at(j));
s2+=cprots.at(i).cpv.at(index);
}
#pragma omp critical
concatenated.push_back(s2);
cout<<"done with i="<<i<<"/"<<cprots.size()<<endl;
}
ofstream outstream;
outstream.open("buildconcat_output.txt");
for(int i=0; i<concatenated.size(); i++)
{
cout<<">gi| "<<directories.at(i)<<endl<<concatenated.at(i)<<endl;
outstream<<">gi| "<<directories.at(i)<<endl<<concatenated.at(i)<<endl;
}
outstream.close();
return 0;
}
/**
 * Picks the entry of cpv that shares the most near-identical windows with s.
 *
 * Every window of up to MATCH_LENGTH characters of each candidate is compared,
 * position by position, with every window of up to MATCH_LENGTH characters of
 * s. A pair of windows scores one point for the candidate when at least
 * MATCH_CUTOFF positions hold the same character.
 *
 * Parameters are taken by value so the definition keeps matching the forward
 * declaration earlier in the file.
 *
 * @param cpv candidate sequences
 * @param s   query sequence
 * @return index into cpv of the highest-scoring candidate; the first one wins ties.
 * @throws std::out_of_range if cpv is empty.
 */
int find_max_overlaps(vector<string> cpv, string s)
{
    vector<int> mv(cpv.size(),0);
    const int query_length=static_cast<int>(s.length());

    for(size_t i=0; i<cpv.size(); i++)
    {
        const string& candidate=cpv[i];
        const int candidate_length=static_cast<int>(candidate.length());
        for(int j=0; j<candidate_length; j++)
        {
            // Windows are truncated at the end of the sequence.
            int candidate_window=candidate_length-j;
            if(candidate_window>MATCH_LENGTH)
            {
                candidate_window=MATCH_LENGTH;
            }
            for(int k=0; k<query_length; k++)
            {
                int window=query_length-k;
                if(window>MATCH_LENGTH)
                {
                    window=MATCH_LENGTH;
                }
                // Compare only the positions both windows actually have; the
                // shorter window must never be indexed past its end.
                if(window>candidate_window)
                {
                    window=candidate_window;
                }
                if(window<MATCH_CUTOFF)
                {
                    continue; // too short to ever reach the cutoff
                }
                int matches=0;
                for(int f=0; f<window; f++)
                {
                    if(s[k+f]==candidate[j+f])
                    {
                        matches++;
                    }
                }
                if(matches>=MATCH_CUTOFF)
                {
                    mv[i]++;
                }
            }
        }
    }

    int result_index=0;
    int max=mv.at(0); // throws std::out_of_range when there are no candidates
    for(size_t i=1; i<mv.size(); i++)
    {
        if(mv[i]>max)
        {
            max=mv[i];
            result_index=static_cast<int>(i);
        }
    }
    return result_index;
}
This diff is collapsed.
This diff is collapsed.
#!/bin/sh
# $1 = full path to directory containing FAA directory
# $2 = tag length
# $3 = 'B' for bacteria, 'A' for archaea, 'O' for other
# $3 = 'B' for bacteria, 'A' for archaea, 'O' (capital o, not zero) for Other
# $4 = path to directory containing names.dmp and nodes.dmp
./mif $1 $2
echo "Number of parameters = "$#
if [ "$#" -ne 4 ]; then
echo "Illegal number of parameters"
exit
fi
echo "Input okay. Bash script proceeding..."
./mif $1 -k $2
./sttag $1 -f 1 -s $3
wait
./tmerg $1
......
......@@ -6,48 +6,86 @@
using namespace std;
const int TAGLENGTHCHECKER=20;
const bool DEBUG_FI=true;
class file_info
{
private:
bool verbose;
public:
file_info();
~file_info();
void read_line();
void set_file_name(string file_name_par);
char get_caa();
void delete_pFile();
// char get_caa();
// void delete_pFile();
string get_current_line();
void open(string path);
string file_name;
// void open(string path_par);
void set_path(string path_par);
void open();
void set_position();
// string file_name;
string path;
int total_tags;
int strings_read_in;
string s;
string last_read;
int gnum;
int gene_id;
FILE * pFile;
fpos_t position; //so that the files can be closed and then reopened
};
file_info::file_info()
{
cout<<"In constructor"<<endl;
if(DEBUG_FI || verbose)
{
cout<<"In file_info::file_info()"<<endl;
}
verbose=false;
}
void file_info::open(string path)
file_info::~file_info()
{
cout<<"In open. path = "<<path<<endl;
cout<<"Top of file_info destructor."<<endl;
// fclose(pFile);
cout<<"Closed the file."<<endl;
//delete pFile;
cout<<"Bottom of file_info destructor."<<endl;
}
/********************************
Open a TAGS file, e.g. TAGS0.txt,
files found in TAGS/ or TAGSscr/.
Read in the number of k-mers (actually 3-tuples) in the file, which
is the first value in any tags file.
Point position to the first sequence in the file.
Then close the file.
*******************************/
void file_info::set_path(string path_par)
{
path = path_par;
if(DEBUG_FI || verbose)
{
cout<<"In file_info::set_path(string)"<<endl;
cout<<"path="<<path<<endl;
}
pFile = fopen( path.c_str(), "r");
if(pFile==NULL)
{
perror("Error opening file");
exit(1);
}
else
{
char buffer[200];
if(fgets(buffer,200,pFile) != NULL)
if(fgets(buffer,200,pFile) != NULL) //Read in the total size of the file (first value in a TAGS*.txt file
{
total_tags = atoi(buffer);
cout<<"total_tags = "<<total_tags<<endl;
}
else
{
......@@ -55,26 +93,74 @@ void file_info::open(string path)
exit(1);
}
}
strings_read_in=0;
fgetpos(pFile,&position); //point to the first sequence in the file.
// read_line();
if(DEBUG_FI || verbose)
{
cout<<"total_tags in current TAGS file = "<<total_tags<<endl;
cout<<"End of file_info::set_path(string)"<<endl;
}
fclose(pFile);
}
void file_info::open()
{
if(DEBUG_FI || verbose)
{
cout<<"In file_info::open(string)"<<endl;
}
pFile = fopen( path.c_str(), "r");
if(pFile==NULL)
{
perror("Error opening file");
exit(1);
}
else
{
fsetpos(pFile,&position); //set the position to the last place we read from.
}
// strings_read_in=0;
// fgetpos(pFile,&position); //point to the first sequence in the file.
read_line();
cout<<"end of open"<<endl;
if(s==last_read)
{
strings_read_in--;
}
if(DEBUG_FI || verbose)
{
cout<<"total_tags in current TAGS file = "<<total_tags<<endl;
cout<<"End of file_info::open(string)"<<endl;
}
}
file_info::~file_info()
void file_info::set_position()
{
cout<<"Top of file_info destructor."<<endl;
fgetpos(pFile,&position);
fclose(pFile);
cout<<"Closed the file."<<endl;
//delete pFile;
cout<<"Bottom of file_info destructor."<<endl;
}
/**************************
Literally, just read the next line into variable s.
Strings shorter than TAGLENGTHCHECKER (default=20)
cause the program to exit.
***************************/
void file_info::read_line()
{
char buffer[200];
if(pFile==NULL)
{
perror("Error opening file");
perror("perror in file_info::read_line(): Error opening file - pFile==NULL");
exit(1);
}
else
{
......@@ -90,6 +176,8 @@ void file_info::read_line()
cout<<"Big problem in little read_line"<<endl;
exit(1);
}
last_read=s;
}
string file_info::get_current_line()
......@@ -97,12 +185,12 @@ string file_info::get_current_line()
return s;
}
char file_info::get_caa()
/*char file_info::get_caa()
{
cout<<"in get_caa(). last_read_sequence = "<<s<<endl;
return s[0];
}
void file_info::delete_pFile()
{
}
*/
//void file_info::delete_pFile()
//{
//}
......@@ -204,6 +204,8 @@ int main(int argc, char* argv[])
//Write out the new proteins according to the log file.
for(int i=0; i<hv.size(); i++)
{
cout<<"Done with organism "<<i<<endl;
vector<pair<string,string> > p2k; //proteome to keep
vector<pair<string,string> > pn2k; //proteome not to keep
int pc=0; //protein counter.
......@@ -244,8 +246,8 @@ int main(int argc, char* argv[])
pn2k.push_back(make_pair(info_line,prot));
}
cout<<"p2k.size()="<<p2k.size()<<endl;
cout<<"pn2k.size()="<<pn2k.size()<<endl;
// cout<<"p2k.size()="<<p2k.size()<<endl;
// cout<<"pn2k.size()="<<pn2k.size()<<endl;
prot="";
pc++;
}
......@@ -269,7 +271,7 @@ int main(int argc, char* argv[])
confile+="_ref";
}
confile+="_csv_"+fss+"_"+fcs+"/"+hv.at(i).dir+"/csv_proteins.faa";
cout<<"confile = "<<confile<<endl;
// cout<<"confile = "<<confile<<endl;
u.open_ofile(cstream,confile,"confile");
for(int j=0; j<p2k.size(); j++)
......@@ -285,7 +287,7 @@ int main(int argc, char* argv[])
ncsv_file+="_ref";
}
ncsv_file+="_ncsv_"+fss+"_"+fcs+"/"+hv.at(i).dir+"/ncsv_proteins.faa";
cout<<"ncsv_file = "<<ncsv_file<<endl;
// cout<<"ncsv_file = "<<ncsv_file<<endl;
u.open_ofile(mstream,ncsv_file,"ncsv_file");
for(int j=0; j<pn2k.size(); j++)
......