Skip to content
Snippets Groups Projects
Commit 64821ef1 authored by Vishruth Mullapudi's avatar Vishruth Mullapudi
Browse files

outlined non-master localization parsing

parent 68c663db
1 merge request: !1 "Fix unlocalized peptide mods not adding"
import re
from collections import namedtuple
from typing import List
from typing import List, Tuple
import pandas as pd
import toml
......@@ -23,12 +23,18 @@ def main():
input_data: List[FileTuple] = ingestfiledata(files=input_files)
protein_seqrecords: List[SeqRecord] = getproteinsequences(protein_fasta_files)
data = [genrawsequences(ftuple) for ftuple in input_data]
localization_col_titles = []
localized_data = []
if use_mod_in_master_prot:
localized_data = (parsemasterlocalizations(ftuple) for ftuple in data)
localized_data += [parsemasterlocalizations(ftuple) for ftuple in data]
localization_col_titles += ['master_localized_mods']
print("Localized:")
print(list(localized_data))
else:
localized_data = (parseprotlocalizations(ftuple, protein_seqrecords) for ftuple in data)
print("Localized:")
print(list(localized_data))
for ftuple in data:
file_headers_tuple = parseprotlocalizations(ftuple, protein_seqrecords)
localized_data += file_headers_tuple[0] # add the filetuple to the list of localized filetuples
localization_col_titles += file_headers_tuple[1] # add the list of column titles of the localizations
def ingestfiledata(files: List[str]) -> List[FileTuple]:
......@@ -69,8 +75,10 @@ def genrawsequences(ftuple: FileTuple) -> FileTuple:
def parsemasterlocalizations(ftuple: FileTuple) -> FileTuple:
# todo: other PTMs
# todo: file specified regex string
# matches serine, threonine, tyrosine and 0 or more digits
# this provides support for unlocalized PTM where only the amino acid is present and not the localization
# a file-specified regex string could be read in with this syntax: regex = eval(r"r'[STY]([\d]{0,})'")
# (note: eval is unsafe on untrusted input)
regex = r'[STY]([\d]{0,})'
file_data = ftuple.FileData
localizations = []
......@@ -94,8 +102,12 @@ def parsemasterlocalizations(ftuple: FileTuple) -> FileTuple:
return FileTuple(ftuple.FileName, file_data)
def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: list) -> FileTuple:
def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: list) -> Tuple[FileTuple, List[str]]:
# TODO
# parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
# protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
# the full protein as an offset to calculate the localization's index in the full protein.
# returns a tuple of (the localized filetuple, the list of column titles of its localizations)
pass
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment