outlined non-master localization parsing

64821ef1 · Vishruth Mullapudi · 68c663db · 64821ef1
Commit 64821ef1 authored 5 years ago by Vishruth Mullapudi
--- a/main.py
+++ b/main.py
 import re
 from collections import namedtuple
-from typing import List
+from typing import List, Tuple

 import pandas as pd
 import toml
@@ -23,12 +23,18 @@ def main():
    input_data: List[FileTuple] = ingestfiledata(files=input_files)
    protein_seqrecords: List[SeqRecord] = getproteinsequences(protein_fasta_files)
    data = [genrawsequences(ftuple) for ftuple in input_data]
+    localization_col_titles = []
+    localized_data = []
    if use_mod_in_master_prot:
-        localized_data = (parsemasterlocalizations(ftuple) for ftuple in data)
+        localized_data += [parsemasterlocalizations(ftuple) for ftuple in data]
+        localization_col_titles += ['master_localized_mods']
+        print("Localized:")
+        print(list(localized_data))
    else:
-        localized_data = (parseprotlocalizations(ftuple, protein_seqrecords) for ftuple in data)
-    print("Localized:")
-    print(list(localized_data))
+        for ftuple in data:
+            file_headers_tuple = parseprotlocalizations(ftuple, protein_seqrecords)
+            localized_data += file_headers_tuple[0]  # add the filetuple to the list of localized filetuples
+            localization_col_titles += file_headers_tuple[1]  # add the list of column titles of the localizations


 def ingestfiledata(files: List[str]) -> List[FileTuple]:
@@ -69,8 +75,10 @@ def genrawsequences(ftuple: FileTuple) -> FileTuple:

 def parsemasterlocalizations(ftuple: FileTuple) -> FileTuple:
    # todo: other PTMs
+    # todo: file specified regex string
    # matches serine, threonine, tyrosine and 0 or more digits
    # this provides support for unlocalized PTM where only the amino acid is present and not the localization
+    # regex = eval(r"r'[STY]([\d]{0,})'")  <- this syntax can be used to read in the regex as a raw string
    regex = r'[STY]([\d]{0,})'
    file_data = ftuple.FileData
    localizations = []
@@ -94,8 +102,12 @@ def parsemasterlocalizations(ftuple: FileTuple) -> FileTuple:
    return FileTuple(ftuple.FileName, file_data)


-def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: list) -> FileTuple:
+def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: list) -> Tuple[FileTuple, List[str]]:
    # TODO
+    # parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
+    # protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
+    # the full protein as an offset to calculate the localization's index in the full protein.
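+    # returns a tuple of (filetuple, list of localization column titles); the filetuple's file_data presumably
+    # carries the parsed localizations (TODO: confirm once implemented)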
    pass