cleaned up masterlocalization parsing to remove unnecessary insertion of...

Cleaned up masterlocalization parsing: matched values are now appended directly to the localization list instead of being inserted as unnecessary tuples.

cleaned up masterlocalization parsing to remove unnecessary insertion of...
Cleaned up masterlocalization parsing: matched values are now appended directly to the localization list instead of being inserted as unnecessary tuples.
5c86a3dc · Vishruth Mullapudi · 66eb4641 · 5c86a3dc
Commit 5c86a3dc authored 5 years ago by Vishruth Mullapudi
--- a/main.py
+++ b/main.py
@@ -23,16 +23,18 @@ def main():
    input_data: List[FileTuple] = ingestfiledata(files=input_files)
    protein_seqrecords: List[SeqRecord] = getproteinsequences(protein_fasta_files)
    data = [genrawsequences(ftuple) for ftuple in input_data]
-    localization_col_titles = []
-    localized_data = []
+
+    localization_col_titles: List[Tuple[str, str]] = []  # list of tuples containing (fileID, column title)
+    localized_data: List[FileTuple] = []

    # localized_data += [parsemasterlocalizations(ftuple) for ftuple in data]
    # localization_col_titles += ['master_localized_mods']
    for ftuple in data:
        if use_mod_in_master_prot:
-            file_headers_tuple = parsemasterlocalizations(ftuple)
+            file_headers_tuple: Tuple[FileTuple, List[Tuple[str, str]]] = parsemasterlocalizations(ftuple)
        else:
-            file_headers_tuple = parseprotlocalizations(ftuple, protein_seqrecords)
+            file_headers_tuple: Tuple[FileTuple, List[Tuple[str, str]]] = parseprotlocalizations(ftuple,
+                                                                                                 protein_seqrecords)
        localized_data += file_headers_tuple[0]  # add the filetuple to the list of localized filetuples
        localization_col_titles += file_headers_tuple[1]  # add the list of (fileid, column title) tuples

@@ -79,14 +81,15 @@ def genrawsequences(ftuple: FileTuple) -> FileTuple:
 def parsemasterlocalizations(ftuple: FileTuple) -> Tuple[FileTuple, List[Tuple[str, str]]]:
    # todo: other PTMs
    # todo: file specified regex string
+    # NOTE: does not work on files with multiple master proteins
    # matches serine, threonine, tyrosine and 0 or more digits
    # this provides support for unlocalized PTM where only the amino acid is present and not the localization
    # regex = eval(r"r'[STY]([\d]{0,})'") can use this syntax to read in regex string regex
-    regex = r'[STY]([\d]{0,})'
+
+    regex = r'[STY]([\d]{0,})'  # specific to serine, threonine, tyrosine phosphorylation
    file_data = ftuple.FileData
    localizations = []
-    # matches = re.finditer(regex, test_str, re.MULTILINE)
-    # print(list(matches))
+
    for row in file_data.itertuples():
        mod_string = row.modifications_in_master_proteins
        # example modstring "P10636-8 2xPhospho [S383; S]"
@@ -95,11 +98,11 @@ def parsemasterlocalizations(ftuple: FileTuple) -> Tuple[FileTuple, List[Tuple[s
        if not str(mod_string) == "nan":
            matches = re.finditer(regex, mod_string)
            matched_strs = []
-            for matchobj in matches:
-                matched_strs.append(matchobj.groups())
+            for match_obj in matches:
+                matched_strs.extend(match_obj.groups())
            localizations.append(matched_strs)
        else:
-            localizations.append([()])
+            localizations.append([])

    file_data.insert(file_data.shape[1], "master_localized_mods", localizations)
    return FileTuple(ftuple.FileName, file_data), [(file_data['master_protein_accessions'][1], "master_localized_mods")]