Skip to content
Snippets Groups Projects
Commit 5c86a3dc authored by Vishruth Mullapudi
Browse files

cleaned up masterlocalization parsing to remove unnecessary insertion of...

Cleaned up masterlocalization parsing: matched localizations are now simply appended to the localization list instead of being inserted as unnecessary tuples.
parent 66eb4641
1 merge request: !1 "Fix unlocalized peptide mods not adding"
......@@ -23,16 +23,18 @@ def main():
input_data: List[FileTuple] = ingestfiledata(files=input_files)
protein_seqrecords: List[SeqRecord] = getproteinsequences(protein_fasta_files)
data = [genrawsequences(ftuple) for ftuple in input_data]
localization_col_titles = []
localized_data = []
localization_col_titles: List[Tuple[str, str]] = [] # list of tuples containing (fileID, column title)
localized_data: List[FileTuple] = []
# localized_data += [parsemasterlocalizations(ftuple) for ftuple in data]
# localization_col_titles += ['master_localized_mods']
for ftuple in data:
if use_mod_in_master_prot:
file_headers_tuple = parsemasterlocalizations(ftuple)
file_headers_tuple: Tuple[FileTuple, List[Tuple[str, str]]] = parsemasterlocalizations(ftuple)
else:
file_headers_tuple = parseprotlocalizations(ftuple, protein_seqrecords)
file_headers_tuple: Tuple[FileTuple, List[Tuple[str, str]]] = parseprotlocalizations(ftuple,
protein_seqrecords)
localized_data += file_headers_tuple[0] # add the filetuple to the list of localized filetuples
localization_col_titles += file_headers_tuple[1] # add the list of (fileid, column title) tuples
......@@ -79,14 +81,15 @@ def genrawsequences(ftuple: FileTuple) -> FileTuple:
def parsemasterlocalizations(ftuple: FileTuple) -> Tuple[FileTuple, List[Tuple[str, str]]]:
# todo: other PTMs
# todo: file specified regex string
# NOTE: does not work on files with multiple master proteins
# matches serine, threonine, tyrosine and 0 or more digits
# this provides support for unlocalized PTM where only the amino acid is present and not the localization
# regex = eval(r"r'[STY]([\d]{0,})'") can use this syntax to read in regex string regex
regex = r'[STY]([\d]{0,})'
regex = r'[STY]([\d]{0,})' # specific to serine, threonine, tyrosine phosphorylation
file_data = ftuple.FileData
localizations = []
# matches = re.finditer(regex, test_str, re.MULTILINE)
# print(list(matches))
for row in file_data.itertuples():
mod_string = row.modifications_in_master_proteins
# example modstring "P10636-8 2xPhospho [S383; S]"
......@@ -95,11 +98,11 @@ def parsemasterlocalizations(ftuple: FileTuple) -> Tuple[FileTuple, List[Tuple[s
if not str(mod_string) == "nan":
matches = re.finditer(regex, mod_string)
matched_strs = []
for matchobj in matches:
matched_strs.append(matchobj.groups())
for match_obj in matches:
matched_strs.extend(match_obj.groups())
localizations.append(matched_strs)
else:
localizations.append([()])
localizations.append([])
file_data.insert(file_data.shape[1], "master_localized_mods", localizations)
return FileTuple(ftuple.FileName, file_data), [(file_data['master_protein_accessions'][1], "master_localized_mods")]
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment