Skip to content
Snippets Groups Projects
Commit 0d025942 authored by Vishruth Mullapudi's avatar Vishruth Mullapudi Committed by Vishruth Mullapudi
Browse files

residue modification abundances completed and comments added to code

parent 5ef05ab9
1 merge request!1Fix unlocalized peptide mods not adding
......@@ -17,16 +17,20 @@ def main():
# Get the desired configuration
configuration: dict = toml.load(config_file)
input_files: List[str] = configuration['input']["input_files"]
use_mod_in_master_prot: bool = configuration['parser_config']['master']['use']
if (use_mod_in_master_prot):
use_mod_in_master_prot: bool = False # configuration['parser_config']['master']['use']
if use_mod_in_master_prot:
master_prot_name: bool = configuration['parser_config']['master']['master_protein_name']
master_protein_fasta_ID = configuration['parser_config']['master']['master_protein_fasta_ID']
protein_fasta_files: List[str] = configuration['input']['prot_seq_fasta']
# todo each abundance col into separate dataframe
# todo implement fileID splitting
# todo go down to one abundance column at a time?
abundance_col_titles = configuration['parser_config']['abundance_col_titles']
# Internalize the file's data
# input_data is a list of tuples (filename, pandas DataFrame of the csv)
input_data: List[FileTuple] = ingestfiledata(files=input_files)
protein_seqrecords: Dict = getproteinsequences(protein_fasta_files)
data = (genrawsequences(ftuple) for ftuple in input_data)
protein_seqrecords: Dict = get_proteinsequences(protein_fasta_files)
data = (gen_rawsequences(ftuple) for ftuple in input_data)
localization_col_titles: Dict[str, str] = dict() # Dict containing {proteinID: column title}
frag_localization_col_titles: Dict[str, str] = dict() # Dict containing {proteinID: column title}
......@@ -52,12 +56,17 @@ def main():
for ftuple in localized_data:
# for residue modification analysis, calculate the amount each residue is modified
calcresiduemodproportions(ftuple, localization_col_titles, frag_localization_col_titles, protein_seqrecords)
for abundance_col_title in abundance_col_titles:
calcresiduemodabundances(ftuple, localization_col_titles, frag_localization_col_titles,
abundance_col_title, protein_seqrecords)
def ingestfiledata(files: List[str]) -> List[FileTuple]:
""" Takes the list of files to ingest, reads them, and returns the data as
"""
Takes the list of files to ingest, reads them, and returns the data as
a list of tuples of the file name and a DataFrame of the file contents
:param files: a list of paths to input files
:return: a list containing 2-tuples of the filename and a dataframe of the file contents
"""
data = []
for input_file_path in files:
......@@ -71,7 +80,12 @@ def ingestfiledata(files: List[str]) -> List[FileTuple]:
return data
def getproteinsequences(fasta_files: List[str]) -> Dict:
def get_proteinsequences(fasta_files: List[str]) -> Dict:
"""
Ingests the protein sequences from a list of fasta files
:param fasta_files: the list of paths to fasta files to use in the alignment and localization
:return: a Dict mapping sequenceIDs to Bio.SeqRecord objects
"""
protein_seqrecords: Dict[str, SeqRecord] = dict()
for file in fasta_files:
with open(file, "r") as handle:
......@@ -80,10 +94,12 @@ def getproteinsequences(fasta_files: List[str]) -> Dict:
return protein_seqrecords
def genrawsequences(ftuple: FileTuple) -> FileTuple:
def gen_rawsequences(ftuple: FileTuple) -> FileTuple:
"""
Adds a column to the DataFrame containing a stripped down peptide without
the cleavage annotations
Adds a column to the DataFrame containing a stripped down peptide without the cleavage annotations
:param ftuple: The filetuple containing the filedata dataframe from which to generate the raw sequence of each fragment
:return: a filetuple containing the fileID and a filedata dataframe containing the raw sequence in the
'stripped_sequence' column
"""
file_data: pd.DataFrame = ftuple.FileData
file_data = file_data.assign(
......@@ -97,6 +113,14 @@ def parsemasterlocalizations(ftuple: FileTuple, master_prot_fasta_id) -> Tuple[
# todo: other PTMs
# todo: file specified regex string
# todo fragment localization
"""
Parses the modification and fragment localizations from a dataframe using the positions in master and modifications
in master proteins columns to obtain localization data instead of aligning the fragment to a given master protein
:param ftuple:
:param master_prot_fasta_id:
:return: A tuple containing the filetuple of localized data, a Dict mapping the proteinID to its modification \
localizations in the dataframe, and a Dict mapping proteinID's to their fragment localizations in the dataframe
"""
# NOTE: parses multiple master proteins, but as of now only the first is used
# matches serine, threonine, tyrosine and 0 or more digits
# this provides support for unlocalized PTM where only the amino acid is present and not the localization
......@@ -145,7 +169,15 @@ def parsemasterlocalizations(ftuple: FileTuple, master_prot_fasta_id) -> Tuple[
def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
Tuple[FileTuple, Dict[str, str], Dict[str, str]]:
# TODO same as master
"""
Parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
the full protein as an offset to calculate the localization's index in the full protein.
:param ftuple: a FileTuple with the data to localize
:param protein_seqrecords: the dict of proteinID mapped to its SequenceRecord
:return: a Tuple containing the FileTuple with localization data, a Dict mapping the proteinID to its modification \
localizations in the dataframe, and a Dict mapping proteinID's to their fragment localizations in the dataframe
"""
# parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
# protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
# the full protein as an offset to calculate the localization's index in the full protein.
......@@ -164,7 +196,7 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
frag_index_in_prot = protein_seq.find(row.stripped_sequence)
# if the fragment is contained in the current protein
if frag_index_in_prot != -1:
positions_in_master.append((frag_index_in_prot + 1, frag_index_in_prot + len(row.stripped_sequence)))
positions_in_master.append([(frag_index_in_prot + 1, frag_index_in_prot + len(row.stripped_sequence))])
mod_string = row.modifications
if not str(mod_string) == "nan":
matches = re.finditer(regex, mod_string)
......@@ -192,18 +224,50 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
# todo
def calcresiduemodproportions(ftuple, localization_col_titles, frag_localization_col_titles, protein_seqrecords):
def calcresiduemodabundances(ftuple, localization_col_titles, frag_localization_col_titles, abundance_col_title,
                             protein_seqrecords):
    """
    Calculates, for a dataframe of localized PTM data, the abundance of each residue and how much of that
    abundance comes from fragments in which the residue is modified.

    :param ftuple: FileTuple containing the file ID and the DataFrame with localized data
    :param localization_col_titles: dict mapping protein ID to the column containing the mod localization
    :param frag_localization_col_titles: dict mapping proteinID to the fragment localization column title
    :param abundance_col_title: title of the abundance column of interest in the dataframe; it is normalized
        before lookup (stripped, lower-cased, spaces to underscores, parentheses removed, '#' to 'num')
    :param protein_seqrecords: dict containing the protein ID mapped to each SeqRecord
    :return: a dict mapping proteinIDs to their residue abundance arrays. Each array has shape
        (protein length + 1, 2) and is indexed by ONE-BASED residue number (row 0 is unused);
        column 0 is the modified abundance and column 1 is the total residue abundance
    """
    fdata = ftuple.FileData

    # The normalized title does not depend on the row or the protein, so compute it once.
    # NOTE(review): presumably this mirrors how the dataframe's column names were cleaned at ingest - confirm.
    abundance_column = abundance_col_title.strip().lower().replace(' ', '_').replace('(', '') \
        .replace(')', '').replace("#", "num")

    prot_abundances: Dict = dict()  # a dict of each protein ID mapped to its respective abundance array
    for proteinID, mod_column_title in localization_col_titles.items():
        frag_column_title = frag_localization_col_titles[proteinID]
        protein_len = len(protein_seqrecords[proteinID])
        # THIS IS ONE INDEXED. RESIDUE 1 IS IN INDEX 1 of the array. index 0 is UNUSED
        res_abundances = np.zeros((protein_len + 1, 2), dtype=float)  # col. 0= mod abundance, col. 1=residue abundance

        for _, fragment in fdata.iterrows():
            mod_localization = fragment[mod_column_title]
            frag_localization = fragment[frag_column_title]
            frag_abundance = fragment[abundance_column]

            # a localization of -1 means the fragment is not in the given protein
            if frag_localization == -1:
                continue

            # only uses the first localization; its (start, end) pair is one-based and inclusive
            first_span = frag_localization[0]
            for residue in range(first_span[0], first_span[1] + 1):
                # add the abundance to the abundance of each residue in the fragment
                res_abundances[residue, 1] += frag_abundance
            for mod in mod_localization:
                # add the abundance to each modified residue contained in the fragment
                res_abundances[mod, 0] += frag_abundance

        prot_abundances[proteinID] = res_abundances
    return prot_abundances
# todo
def calcpeptidemodproportions(ftuple, localization_col_titles, frag_localization_col_titles):
    """
    Placeholder for the peptide-level modification proportion calculation.

    Not implemented yet: the arguments are accepted but ignored, and nothing is computed.

    :param ftuple: FileTuple with the localized data (currently unused)
    :param localization_col_titles: dict of mod localization column titles (currently unused)
    :param frag_localization_col_titles: dict of fragment localization column titles (currently unused)
    :return: None
    """
    return None
......
......@@ -14,7 +14,6 @@ title="Abundance Parser Configuration"
# path to file2
# ]
input_files=["tests/sampleInput.csv"]
#fasta file(s) containing the protein sequence(s) to align against
prot_seq_fasta=["data/2N4R_wt_tau.fasta","data/1N4RP301STau.fasta"]
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment