residue modification abundances completed and comments added to code

0d025942 · Vishruth Mullapudi · Vishruth Mullapudi · 5ef05ab9 · 0d025942 · 0d025942
Commit 0d025942 authored 5 years ago by Vishruth Mullapudi Committed by Vishruth Mullapudi 5 years ago
--- a/main.py
+++ b/main.py
@@ -17,16 +17,20 @@ def main():
    # Get the desired configuration
    configuration: dict = toml.load(config_file)
    input_files: List[str] = configuration['input']["input_files"]
-    use_mod_in_master_prot: bool = configuration['parser_config']['master']['use']
-    if (use_mod_in_master_prot):
+    use_mod_in_master_prot: bool = False  # configuration['parser_config']['master']['use']
+    if use_mod_in_master_prot:
        master_prot_name: bool = configuration['parser_config']['master']['master_protein_name']
        master_protein_fasta_ID = configuration['parser_config']['master']['master_protein_fasta_ID']
    protein_fasta_files: List[str] = configuration['input']['prot_seq_fasta']
+    # todo each abundance col into separate dataframe
+    # todo implement fileID splitting
+    # todo go down to one abundance column at a time?
+    abundance_col_titles = configuration['parser_config']['abundance_col_titles']
    # Internalize the file's data
    # input_data is a list of tuples (filename, pandas DataFrame of the csv)
    input_data: List[FileTuple] = ingestfiledata(files=input_files)
-    protein_seqrecords: Dict = getproteinsequences(protein_fasta_files)
-    data = (genrawsequences(ftuple) for ftuple in input_data)
+    protein_seqrecords: Dict = get_proteinsequences(protein_fasta_files)
+    data = (gen_rawsequences(ftuple) for ftuple in input_data)

    localization_col_titles: Dict[str, str] = dict()  # Dict containing {proteinID: column title}
    frag_localization_col_titles: Dict[str, str] = dict()  # Dict containing {proteinID: column title}
@@ -52,12 +56,17 @@ def main():

    for ftuple in localized_data:
        # for residue modification analysis, calculate the amount each residue is modified
-        calcresiduemodproportions(ftuple, localization_col_titles, frag_localization_col_titles, protein_seqrecords)
+        for abundance_col_title in abundance_col_titles:
+            calcresiduemodabundances(ftuple, localization_col_titles, frag_localization_col_titles,
+                                     abundance_col_title, protein_seqrecords)


 def ingestfiledata(files: List[str]) -> List[FileTuple]:
-    """ Takes the list if files to ingest, reads them, and returns the data as
+    """
+    Takes the list of files to ingest, reads them, and returns the data as
    a list of tuples of the file name and a DataFrame of the file contents
+    :param files: a list of paths to input files
+    :return: a list containing 2-tuples of the filename and a dataframe of the file contents
    """
    data = []
    for input_file_path in files:
@@ -71,7 +80,12 @@ def ingestfiledata(files: List[str]) -> List[FileTuple]:
    return data


-def getproteinsequences(fasta_files: List[str]) -> Dict:
+def get_proteinsequences(fasta_files: List[str]) -> Dict:
+    """
+    Ingests the protein sequences from a list of fasta files
+    :param fasta_files: the list of paths to fasta files to use in the alignment and localization
+    :return: a Dict mapping sequenceIDs to Bio.SeqRecord objects
+    """
    protein_seqrecords: Dict[str, SeqRecord] = dict()
    for file in fasta_files:
        with open(file, "r") as handle:
@@ -80,10 +94,12 @@ def getproteinsequences(fasta_files: List[str]) -> Dict:
    return protein_seqrecords


-def genrawsequences(ftuple: FileTuple) -> FileTuple:
+def gen_rawsequences(ftuple: FileTuple) -> FileTuple:
    """
-    Adds a column to the DataFrame containing a stripped down peptide without
-    the cleavage annotations
+    Adds a column to the DataFrame containing a stripped down peptide without the cleavage annotations
+    :param ftuple: The filetuple containing the filedata dataframe from which to generate the raw sequence of each fragment
+    :return: a filetuple containing the fileID and a filedata dataframe containing the raw sequence in the
+    'stripped_sequence' column
    """
    file_data: pd.DataFrame = ftuple.FileData
    file_data = file_data.assign(
@@ -97,6 +113,14 @@ def parsemasterlocalizations(ftuple: FileTuple, master_prot_fasta_id) -> Tuple[
    # todo: other PTMs
    # todo: file specified regex string
    # todo fragment localization
+    """
+    Parses the modification and fragment localizations from a dataframe using the positions in master and modifications
+    in master proteins columns to obtain localization data instead of aligning the fragment to a given master protein
+    :param ftuple:
+    :param master_prot_fasta_id:
+    :return: A tuple containing the filetuple of localized data, a Dict mapping the proteinID to its modification \
+    localizations in the dataframe, and a Dict mapping proteinIDs to their fragment localizations in the dataframe
+    """
    # NOTE: parses multiple master proteins, but as of now only the first is used
    # matches serine, threonine, tyrosine and 0 or more digits
    # this provides support for unlocalized PTM where only the amino acid is present and not the localization
@@ -145,7 +169,15 @@ def parsemasterlocalizations(ftuple: FileTuple, master_prot_fasta_id) -> Tuple[

 def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
        Tuple[FileTuple, Dict[str, str], Dict[str, str]]:
-    # TODO same as master
+    """
+    Parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
+    protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
+    the full protein as an offset to calculate the localization's index in the full protein.
+    :param ftuple: a FileTuple with the data to localize
+    :param protein_seqrecords: the dict of proteinID mapped to its SequenceRecord
+    :return: a Tuple containing the FileTuple with localization data, a Dict mapping the proteinID to its modification \
+    localizations in the dataframe, and a Dict mapping proteinIDs to their fragment localizations in the dataframe
+    """
    # parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
    # protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
    # the full protein as an offset to calculate the localization's index in the full protein.
@@ -164,7 +196,7 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
            frag_index_in_prot = protein_seq.find(row.stripped_sequence)
            # if the fragment is contained in the current protein
            if frag_index_in_prot != -1:
-                positions_in_master.append((frag_index_in_prot + 1, frag_index_in_prot + len(row.stripped_sequence)))
+                positions_in_master.append([(frag_index_in_prot + 1, frag_index_in_prot + len(row.stripped_sequence))])
                mod_string = row.modifications
                if not str(mod_string) == "nan":
                    matches = re.finditer(regex, mod_string)
@@ -192,18 +224,50 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \


 # todo
-def calcresiduemodproportions(ftuple, localization_col_titles, frag_localization_col_titles, protein_seqrecords):
+def calcresiduemodabundances(ftuple, localization_col_titles, frag_localization_col_titles, abundance_col_title,
+                             protein_seqrecords):
+    """
+    Calculates, for a dataframe of localized ptm data, the abundance of each residue and how much that residue is
+    modified.
+    :param ftuple: FileTuple containing the file ID and the DataFrame with localized data
+    :param localization_col_titles: dict mapping protein ID to the column containing the mod localization
+    :param frag_localization_col_titles: dict mapping proteinID to the fragment localization column title
+    :param abundance_col_title: title of the abundance column of interest in the dataframe
+    :param protein_seqrecords: dict containing the protein ID mapped to each SeqRecord
+    :return: a dict mapping proteinIDs to their residue abundance arrays
+    """
    fdata = ftuple.FileData
+    prot_abundances: Dict = dict()  # a dict of each protein ID mapped to its respective abundance array
+
    for proteinID, mod_column_title in localization_col_titles.items():
        protein_len = len(protein_seqrecords[proteinID])
-        res_abundances = np.zeros(protein_len, dtype=float)
-        res_mod_abdundances = np.zeros(protein_len, dtype=float)
+        # THIS IS ONE INDEXED. RESIDUE 1 IS IN INDEX 1 of the array. index 0 is UNUSED
+        res_abundances = np.zeros((protein_len + 1, 2), dtype=float)  # col. 0= mod abundance, col. 1=residue abundance
+
        for row in fdata.iterrows():
            fragment = row[1]
            mod_localization = fragment[mod_column_title]
            frag_localization = fragment[frag_localization_col_titles[proteinID]]
+            abundance_col_title = abundance_col_title.strip().lower().replace(' ', '_').replace('(', '') \
+                .replace(')', '').replace("#", "num")
+            frag_abundance = fragment[abundance_col_title]
+
+            # if the fragment is in the given protein
+            if frag_localization != -1:
+                # only uses the first localization
+                for i in range(frag_localization[0][0], frag_localization[0][1] + 1, 1):
+                    # add the abundance to the abundance of each residue in the fragment
+                    res_abundances[i][1] += frag_abundance
+                for mod in mod_localization:
+                    # add the abundance to each modified residue contained in the fragment
+                    res_abundances[mod][0] += frag_abundance

+        prot_abundances.update({proteinID: res_abundances})

+    return prot_abundances
+
+
+# todo
 def calcpeptidemodproportions(ftuple, localization_col_titles, frag_localization_col_titles):
    pass


--- a/tests/testConfig.toml
+++ b/tests/testConfig.toml
@@ -14,7 +14,6 @@ title="Abundance Parser Configuration"
    #              path to file2
    #             ]
    input_files=["tests/sampleInput.csv"]
-
    #fasta file(s) containing the protein sequence(s) to align against
    prot_seq_fasta=["data/2N4R_wt_tau.fasta","data/1N4RP301STau.fasta"]