initial implementation of non-master protein alignment and modification parsing

c6cc177e · Vishruth Mullapudi · Vishruth Mullapudi · d8127956 · c6cc177e · c6cc177e
Commit c6cc177e authored 5 years ago by Vishruth Mullapudi Committed by Vishruth Mullapudi 5 years ago
--- a/data/parser_regex.toml
+++ b/data/parser_regex.toml
@@ -2,10 +2,10 @@

 #The TOML specification can be found here: https://github.com/toml-lang/toml

-title="Perser Regex Configuration"
+title="Parser Regex Configuration"
 #IMPORTANT: Use TOML literal strings instead of basic (double-quoted) strings
 #so that regex backslash escapes such as \d are stored unchanged.
 #Regex strings should be written with single quotes (e.g. 'foobar') or triple
 #single quotes (e.g. '''this is a literal string as well''') so that they are
 #stored as literal strings.
-regex.phosphoregex='r"[STY]([\d]{0,4})"'
\ No newline at end of file
+regex.phosphoregex='[STY]([\d]{0,})'
\ No newline at end of file
--- a/main.py
+++ b/main.py
@@ -115,6 +115,30 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: list) -> Tuple
    # the full protein as an offset to calculate the localization's index in the full protein.
    # returns a filetuple where the file_data contains additional columns for the localization against each protein and
    # a list containing tuples all of the fileIDs and the corresponding column titles
+
+    regex = r'[STY]([\d]{0,})'  # specific to serine, threonine, tyrosine phosphorylation
+    file_data: pd.DataFrame = ftuple.FileData
+    for protein in protein_seqrecords:
+        protein_seq: str = protein.seq.upper()
+        prot_localizations = []
+        for row in file_data.itertuples():
+            frag_index_in_prot = protein_seq.find(row.stripped_sequence)
+            # if the fragment is contained in the current protein
+            if frag_index_in_prot != -1:
+                mod_string = row.modifications
+                if not str(mod_string) == "nan":
+                    matches = re.finditer(regex, mod_string)
+                    matched_strs = []
+                    for match_obj in matches:
+                        matched_strs.extend(match_obj.groups())
+                    prot_localizations.append(matched_strs)
+                else:
+                    prot_localizations.append([])
+            else:
+                assert (prot_localizations == -1)
+                prot_localizations.append([frag_index_in_prot])  # if index is -1 the fragment isn't in this protein
+            file_data.insert(file_data.shape[1], protein.id, prot_localizations)
+
    pass