Skip to content
Snippets Groups Projects
Commit c6cc177e authored by Vishruth Mullapudi's avatar Vishruth Mullapudi Committed by Vishruth Mullapudi
Browse files

initial implementation of non-master protein alignment and modification parsing

parent d8127956
Branches
1 merge request!1Fix unlocalized peptide mods not adding
......@@ -2,10 +2,10 @@
#The TOML specification can be found here: https://github.com/toml-lang/toml
title="Perser Regex Configuration"
title="Parser Regex Configuration"
#IMPORTANT: Use TOML literal strings instead of basic strings so that regex
#backslash sequences (e.g. \d) are stored unmodified rather than being
#interpreted as TOML escape sequences.
#Regex strings should be written with single quotes (e.g. 'foobar') or triple
#single quotes (e.g. '''this is a string literal as well''') so that they are
#stored as string literals.
regex.phosphoregex='r"[STY]([\d]{0,4})"'
\ No newline at end of file
regex.phosphoregex='[STY]([\d]{0,})'
\ No newline at end of file
......@@ -115,6 +115,30 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: list) -> Tuple
# the full protein as an offset to calculate the localization's index in the full protein.
# Returns a filetuple whose file_data contains an additional column holding the localizations against each protein,
# and a list containing tuples of all of the fileIDs and the corresponding column titles.
regex = r'[STY]([\d]{0,})' # specific to serine, threonine, tyrosine phosphorylation
file_data: pd.DataFrame = ftuple.FileData
for protein in protein_seqrecords:
protein_seq: str = protein.seq.upper()
prot_localizations = []
for row in file_data.itertuples():
frag_index_in_prot = protein_seq.find(row.stripped_sequence)
# if the fragment is contained in the current protein
if frag_index_in_prot != -1:
mod_string = row.modifications
if not str(mod_string) == "nan":
matches = re.finditer(regex, mod_string)
matched_strs = []
for match_obj in matches:
matched_strs.extend(match_obj.groups())
prot_localizations.append(matched_strs)
else:
prot_localizations.append([])
else:
assert (prot_localizations == -1)
prot_localizations.append([frag_index_in_prot]) # if index is -1 the fragment isn't in this protein
file_data.insert(file_data.shape[1], protein.id, prot_localizations)
pass
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment