fixed unlocalized modifications not being counted in fragment modification...

fixed unlocalized modifications not being counted in fragment modification analysis by adding a field in the data indicating that the fragment possesses a modification

fixed unlocalized modifications not being counted in fragment modification...
fixed unlocalized modifications not being counted in fragment modification analysis by adding a field in the data indicating that the fragment possesses a modification
70a4b2d6 · Vishruth Mullapudi · Vishruth Mullapudi · 28505a8f · 70a4b2d6 · 70a4b2d6
Commit 70a4b2d6 authored 5 years ago by Vishruth Mullapudi Committed by Vishruth Mullapudi 5 years ago
--- a/.idea/abundanceparser.iml
+++ b/.idea/abundanceparser.iml
@@ -4,11 +4,10 @@
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.7 (venv)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (abundanceparser)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
-    <option name="projectConfiguration" value="Twisted Trial" />
    <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
  </component>
 </module>
\ No newline at end of file
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (venv)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (abundanceparser)" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
--- a/config.toml
+++ b/config.toml
 #This is a TOML Document
-
 #The TOML specification can be found here: https://github.com/toml-lang/toml

 title="Abundance Parser Configuration"

-
 #The input source and parsing parameters
 [input]
    #The CSV input file(s) to read data from
@@ -13,14 +11,14 @@ title="Abundance Parser Configuration"
    #input_files=[ "/input/file1",
    #              path to file2
    #             ]
-    input_files=["data/input/LUM1_705666-peptide.csv","data/input/LUM1_705666-peptidegroups.csv"]
+    input_files=["data/input/PCF-DC-1732--12--2019_peptide.csv"]
    #fasta file(s) containing the protein sequence(s) to align against
    prot_seq_fasta=["data/2N4R_wt_tau.fasta","data/1N4RP301STau.fasta"]


 [output]
    #Relative or absolute path to desired output directory
-    output_directory="output/mouse1yr"
+    output_directory="output/testfix/dailu122019"

    #this stub is prepended by the file name of the input file
    residue_output_name_stub="residueModificationAnalysis"

--- a/main.py
+++ b/main.py
@@ -236,11 +236,13 @@ def parse_prot_localizations(ftuple: FileTuple, protein_seq_records: Dict, mod_r
    """

    file_data: pd.DataFrame = ftuple.FileData
+    file_data.insert(file_data.shape[1], "valid_modification", False)
    mod_loc_column_titles = dict()  # maps Sequence Record ID to modification localization column name in DataFrame
    frag_loc_column_titles = dict()  # maps Sequence Record ID to fragment localization column name in DataFrame

    # iterate through all the input proteins to localize against
    for prot_id, protein in protein_seq_records.items():
+        frag_modification_status = []
        prot_mod_localizations = []
        positions_in_master = []  # master here is the protein currently being aligned against
        # the protein sequence to align against to find the fragment's location in the sequence
@@ -257,17 +259,29 @@ def parse_prot_localizations(ftuple: FileTuple, protein_seq_records: Dict, mod_r
                # PTM modification localization occurs here-------------------------------------------------------------
                mod_string = row.modifications
                if not str(mod_string) == "nan":
+                    is_validloc = False
                    matches = re.finditer(mod_regex, mod_string)
-                    prot_mod_localizations.append([int(mod_localization) + frag_index_in_prot for match_obj in matches
-                                                   for mod_localization in match_obj.groups()
-                                                   if mod_localization is not ''])
+                    localizations = []
+                    for match_obj in matches:
+                        for mod_localization in match_obj.groups():
+                            is_validloc = True
+                            if mod_localization is not '':
+                                localizations.append((int(mod_localization) + frag_index_in_prot))
+
+                    prot_mod_localizations.append(localizations)
+                    frag_modification_status.append(is_validloc)
+                    # prot_mod_localizations.append([int(mod_localization) + frag_index_in_prot for match_obj in matches
+                    #                                for mod_localization in match_obj.groups()
+                    #                                if mod_localization is not ''])
                else:
                    prot_mod_localizations.append([])  # if no modifications add the list containing no modifications
+                    frag_modification_status.append(False)
                # end of PTM/ modification localization-----------------------------------------------------------------

            else:
                prot_mod_localizations.append([frag_index_in_prot])  # if index is -1 the fragment isn't in this protein
                positions_in_master.append([(-1,)])  # indicate that the fragment isn't in this protein
+                frag_modification_status.append(False)

        # clean up the protein ID so we can use it to index things in our DataFrames
        sanitized_protein_id = sanitize_str_for_dataframe_index(prot_id)
@@ -279,6 +293,8 @@ def parse_prot_localizations(ftuple: FileTuple, protein_seq_records: Dict, mod_r
        # columns later
        file_data.insert(file_data.shape[1], prot_mod_column_title, prot_mod_localizations)
        file_data.insert(file_data.shape[1], frag_loc_column_title, positions_in_master)
+        frag_modification_status = [x or y for x, y in zip(frag_modification_status, file_data['valid_modification'])]
+        file_data.loc[:, "valid_modification"] = frag_modification_status
        mod_loc_column_titles.update({protein.id: prot_mod_column_title})
        frag_loc_column_titles.update({protein.id: frag_loc_column_title})

@@ -355,8 +371,8 @@ def calc_peptide_mod_abundances(ftuple, mod_localization_col_titles, frag_locali
                frag_abundance = frags_same_seq[1][abund_col_title].sum()
                # ID all the fragments with modifications
                bools = []
-                for mods in frags_same_seq[1][mod_col_title]:
-                    if mods:
+                for mods, valid_mod in zip(frags_same_seq[1][mod_col_title], frags_same_seq[1]['valid_modification']):
+                    if mods or valid_mod:
                        if -1 not in mods:
                            bools.append(True)
                        else:

--- a/tests/sample2.csv
+++ b/tests/sample2.csv
-Confidence,Annotated Sequence                                  ,Modifications                     ,Modifications in Master Proteins,# Protein Groups,# PSMs,Master Protein Accessions,Positions in Master Proteins        ,# Missed Cleavages,Abundance :F1,Abundance:F2
-High      ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q]            ,2xPhospho [S6; S]                 ,P10636-8 2xPhospho [S383; S]    ,1               ,2     ,P10636-8                 ,P10636-8 [378-409]                  ,0                 ,3770206.25   ,84602312
-High      ,[K].TDHGAEIVYKSPVVSGDTSPR.[H]                       ,2xPhospho [S11; S15]              ,P10636-8 2xPhospho [S367; S371] ,1               ,2     ,P10636-8                 ,P10636-8 [357-377]                  ,1                 ,490543.4063  ,85879545
-High      ,[K].AKTDHGAEIVYKSPVVSGDTSPR.[H]                     ,2xPhospho [S13; S17]              ,P10636-8 2xPhospho [S367; S371] ,1               ,1     ,P10636-8                 ,P10636-8 [355-377]                  ,2                 ,             ,92741293
-High      ,[R].SGYSSPGSPGTPGSR.[S]                             ,2xPhospho [S]                     ,P10636-8 2xPhospho [S]          ,1               ,5     ,P10636-8                 ,P10636-8 [166-180]                  ,0                 ,1425691.75   ,98087441
-High      ,[K].VAVVRTPPKSPSSAK.[S]                             ,1xPhospho [T6]                    ,P10636-8 1xPhospho [T202]       ,1               ,4     ,P10636-8                 ,P10636-8 [197-211]                  ,2                 ,35290570.5   ,36243518
-High      ,[K].KVAVVRTPPKSPSSAK.[S]                            ,1xPhospho [T7]                    ,P10636-8 1xPhospho [T202]       ,1               ,1     ,P10636-8                 ,P10636-8 [196-211]                  ,3                 ,97352696     ,43770115
-High      ,[K].KVAVVRTPPK.[S]                                  ,1xPhospho [T7]                    ,P10636-8 1xPhospho [T202]       ,1               ,1     ,P10636-8                 ,P10636-8 [196-205]                  ,2                 ,1866753.375  ,84341486
-High      ,[R].TPSLPTPPTR.[E]                                  ,1xPhospho [T6]                    ,P10636-8 1xPhospho [T188]       ,1               ,1     ,P10636-8                 ,P10636-8 [183-192]                  ,0                 ,5528542      ,93835341
-High      ,[R].TPSLPTPPTREPK.[K]                               ,1xPhospho [T6]                    ,P10636-8 1xPhospho [T188]       ,1               ,2     ,P10636-8                 ,P10636-8 [183-195]                  ,1                 ,22805599.5   ,67613272
-High      ,[K].TPPAPKTPPSSGEPPKSGDR.[S]                        ,1xPhospho [T7]                    ,P10636-8 1xPhospho [T152]       ,1               ,4     ,P10636-8                 ,P10636-8 [146-165]                  ,2                 ,35066952     ,12591257
-High      ,[K].TPPAPKTPPSSGEPPK.[S]                            ,1xPhospho [T7]                    ,P10636-8 1xPhospho [T152]       ,1               ,4     ,P10636-8                 ,P10636-8 [146-161]                  ,1                 ,147009880    ,94815625
-High      ,[K].TPPSSGEPPK.[S]                                  ,1xPhospho [T1]                    ,P10636-8 1xPhospho [T152]       ,1               ,1     ,P10636-8                 ,P10636-8 [152-161]                  ,0                 ,1141578.25   ,72979440
-High      ,[K].SPVVSGDTSPR.[H]                                 ,1xPhospho [T/S]                   ,P10636-8 1xPhospho [T/S]        ,1               ,5     ,P10636-8                 ,P10636-8 [367-377]                  ,0                 ,291275882    ,76313867
-High      ,[K].TDHGAEIVYKSPVVSGDTSPR.[H]                       ,1xPhospho [S11]                   ,P10636-8 1xPhospho [S367]       ,1               ,1     ,P10636-8                 ,P10636-8 [357-377]                  ,1                 ,3844751.25   ,90047135
-High      ,[K].IGSLDNITHVPGGGNK.[K]                            ,1xPhospho [S3]                    ,P10636-8 1xPhospho [S327]       ,1               ,1     ,P10636-8                 ,P10636-8 [325-340]                  ,0                 ,3098245      ,5332778
-High      ,[R].SRTPSLPTPPTR.[E]                                ,1xPhospho [S5]                    ,P10636-8 1xPhospho [S185]       ,1               ,1     ,P10636-8                 ,P10636-8 [181-192]                  ,1                 ,6525094.5    ,17852668
-High      ,[R].SGYSSPGSPGTPGSR.[S]                             ,1xPhospho [S/T]                   ,P10636-8 1xPhospho [S]          ,1               ,7     ,P10636-8                 ,P10636-8 [166-180]                  ,0                 ,149733789.5  ,22124243
-High      ,[K].STPTAEAEEAGIGDTPSLEDEAAGHVTQAR.[M]              ,1xPhospho [T/S]                   ,P10636-8 1xPhospho [S/T]        ,1               ,3     ,P10636-8                 ,P10636-8 [68-97]                    ,0                 ,7920354.125  ,9185267
-High      ,[R].TPPKSPSSAK.[S]                                  ,1xPhospho [S/T]                   ,P10636-8 1xPhospho [S/T]        ,1               ,3     ,P10636-8                 ,P10636-8 [202-211]                  ,1                 ,2525454.594  ,58090514
-High      ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q]            ,1xPhospho [T/S]; 1xOxidation [M13],P10636-8 1xPhospho [S/T]        ,1               ,4     ,P10636-8                 ,P10636-8 [378-409]                  ,0                 ,37349012     ,66648919
-High      ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q]            ,1xPhospho [S/T]                   ,P10636-8 1xPhospho [S/T]        ,1               ,5     ,P10636-8                 ,P10636-8 [378-409]                  ,0                 ,42549668     ,41174096
-High      ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xPhospho [T/S]; 1xOxidation [M8] ,P10636-8 1xPhospho [S/T]        ,1               ,6     ,P10636-8                 ,P10636-8 [24-67]                    ,2                 ,2833359      ,90435088
-High      ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xPhospho [S/T]                   ,P10636-8 1xPhospho [S/T]        ,1               ,5     ,P10636-8                 ,P10636-8 [24-67]                    ,2                 ,381969       ,33380708
-High      ,[R].LQTAPVPMPDLKNVK.[S]                             ,1xOxidation [M8]                  ,                                ,1               ,2     ,P10636-8                 ,P10636-8 [214-228]                  ,1                 ,11043190     ,59925563
-High      ,[R].LQTAPVPMPDLKNVK.[S]                             ,                                  ,                                ,1               ,2     ,P10636-8                 ,P10636-8 [214-228]                  ,1                 ,3375361.5    ,66934687
-High      ,[R].LQTAPVPMPDLK.[N]                                ,1xOxidation [M8]                  ,                                ,1               ,26    ,P10636-8                 ,P10636-8 [214-225]                  ,0                 ,2938043353   ,55115903
-High      ,[R].LQTAPVPMPDLK.[N]                                ,                                  ,                                ,1               ,10    ,P10636-8                 ,P10636-8 [214-225]                  ,0                 ,2345507657   ,18349423
-High      ,[K].LDLSNVQSK.[C]                                   ,                                  ,                                ,1               ,9     ,P10636-8                 ,P10636-8 [253-261]                  ,0                 ,1056765568   ,24227906
-High      ,[K].KLDLSNVQSK.[C]                                  ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [252-261]                  ,1                 ,1525892608   ,69084694
-High      ,[K].IGSLDNITHVPGGGNKK.[I]                           ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [325-341]                  ,1                 ,963006714.5  ,41478772
-High      ,[K].IGSLDNITHVPGGGNK.[K]                            ,                                  ,                                ,1               ,16    ,P10636-8                 ,P10636-8 [325-340]                  ,0                 ,4624086515   ,19351259
-High      ,[K].IGSTENLKHQPGGGK.[V]                             ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [231-245]                  ,1                 ,148045.0938  ,9514705
-High      ,[R].SRTPSLPTPPTREPK.[K]                             ,                                  ,                                ,1               ,3     ,P10636-8                 ,P10636-8 [181-195]                  ,2                 ,698763648    ,36101344
-High      ,[K].STPTAEAEEAGIGDTPSLEDEAAGHVTQAR.[M]              ,                                  ,                                ,1               ,12    ,P10636-8                 ,P10636-8 [68-97]                    ,0                 ,725873185.9  ,5036078
-High      ,[K].SPVVSGDTSPR.[H]                                 ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [367-377]                  ,0                 ,1303747568   ,71492316
-High      ,[R].TPPKSPSSAK.[S]                                  ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [202-211]                  ,1                 ,66745472     ,41408951
-High      ,[K].TPPSSGEPPK.[S]                                  ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [152-161]                  ,0                 ,257468414.4  ,61879916
-High      ,[K].TPPSSGEPPKSGDR.[S]                              ,                                  ,                                ,1               ,6     ,P10636-8                 ,P10636-8 [152-165]                  ,1                 ,103032414.3  ,38342837
-High      ,[R].TPSLPTPPTREPK.[K]                               ,                                  ,                                ,1               ,8     ,P10636-8                 ,P10636-8 [183-195]                  ,1                 ,1604505144   ,80068010
-High      ,[K].TDHGAEIVYK.[S]                                  ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [357-366]                  ,0                 ,529712534    ,96010574
-High      ,[K].SKDGTGSDDKK.[A]                                 ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [102-112]                  ,2                 ,1939050.375  ,32759389
-High      ,[R].QEFEVMEDHAGTYGLGDRK.[D]                         ,1xOxidation [M6]                  ,                                ,1               ,16    ,P10636-8                 ,P10636-8 [6-24]                     ,1                 ,234584178.3  ,33165088
-High      ,[R].QEFEVMEDHAGTYGLGDRK.[D]                         ,                                  ,                                ,1               ,8     ,P10636-8                 ,P10636-8 [6-24]                     ,1                 ,352983975.5  ,95560066
-High      ,[R].QEFEVMEDHAGTYGLGDR.[K]                          ,1xOxidation [M6]                  ,                                ,1               ,24    ,P10636-8                 ,P10636-8 [6-23]                     ,0                 ,541478609    ,5099972
-High      ,[R].QEFEVMEDHAGTYGLGDR.[K]                          ,                                  ,                                ,1               ,8     ,P10636-8                 ,P10636-8 [6-23]                     ,0                 ,663580316    ,57096692
-High      ,[R].SGYSSPGSPGTPGSR.[S]                             ,                                  ,                                ,1               ,7     ,P10636-8                 ,P10636-8 [166-180]                  ,0                 ,1886451761   ,72762532
-High      ,[K].DQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S] ,1xOxidation [M7]                  ,                                ,1               ,8     ,P10636-8                 ,P10636-8 [25-67]                    ,1                 ,242811709.3  ,34474881
-High      ,[K].DQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S] ,                                  ,                                ,1               ,4     ,P10636-8                 ,P10636-8 [25-67]                    ,1                 ,263117809.9  ,68130546
-High      ,[K].DQGGYTMHQDQEGDTDAGLK.[E]                        ,1xOxidation [M7]                  ,                                ,1               ,15    ,P10636-8                 ,P10636-8 [25-44]                    ,0                 ,148222051.8  ,32337478
-High      ,[K].DQGGYTMHQDQEGDTDAGLK.[E]                        ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [25-44]                    ,0                 ,169405957.9  ,92807240
-High      ,[K].DNIKHVSGGGSVQIVYKPVDLSK.[V]                     ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [266-288]                  ,1                 ,             ,38907207
-High      ,[K].ESPLQTPTEDGSEEPGSETSDAK.[S]                     ,                                  ,                                ,1               ,9     ,P10636-8                 ,P10636-8 [45-67]                    ,0                 ,740036840    ,99535953
-High      ,[K].AKTDHGAEIVYK.[S]                                ,                                  ,                                ,1               ,3     ,P10636-8                 ,P10636-8 [355-366]                  ,1                 ,652942514.6  ,84744142
-High      ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q]            ,1xOxidation [M13]                 ,                                ,1               ,17    ,P10636-8                 ,P10636-8 [378-409]                  ,0                 ,465597371.5  ,29112938
-High      ,[K].HVSGGGSVQIVYKPVDLSK.[V]                         ,                                  ,                                ,1               ,56    ,P10636-8                 ,P10636-8 [270-288]                  ,0                 ,617176649.5  ,33872557
-High      ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q]            ,                                  ,                                ,1               ,13    ,P10636-8                 ,P10636-8 [378-409]                  ,0                 ,549771859.3  ,44853427
-High      ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xOxidation [M8]                  ,                                ,1               ,2     ,P10636-8                 ,P10636-8 [24-67]                    ,2                 ,132546086.5  ,98766340
-High      ,[R].KDQGGYTMHQDQEGDTDAGLK.[E]                       ,1xOxidation [M8]                  ,                                ,1               ,12    ,P10636-8                 ,P10636-8 [24-44]                    ,1                 ,68147217.5   ,6462453
-High      ,[R].KDQGGYTMHQDQEGDTDAGLK.[E]                       ,                                  ,                                ,1               ,2     ,P10636-8                 ,P10636-8 [24-44]                    ,1                 ,47451437.75  ,34182396
-High      ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],                                  ,                                ,1               ,3     ,P10636-8                 ,P10636-8 [24-67]                    ,2                 ,25293893.63  ,63120561
-High      ,[K].SRLQTAPVPMPDLK.[N]                              ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [212-225]                  ,1                 ,5722389      ,33652639
-High      ,[R].IPAKTPPAPK.[T]                                  ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [142-151]                  ,1                 ,2015586.75   ,38656450
-High      ,[K].CGSLGNIHHKPGGGQVEVK.[S]                         ,1xCarbamidomethyl [C1]            ,                                ,1               ,3     ,P10636-8                 ,P10636-8 [293-311]                  ,0                 ,218379137.5  ,63191884
-High      ,[K].IGSTENLK.[H]                                    ,                                  ,                                ,1               ,5     ,P10636-8                 ,P10636-8 [231-238]                  ,0                 ,887492361    ,95150178
-High      ,[R].TPSLPTPPTR.[E]                                  ,                                  ,                                ,1               ,4     ,P10636-8                 ,P10636-8 [183-192]                  ,0                 ,1085564561   ,84114105
-High      ,[R].SRTPSLPTPPTR.[E]                                ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [181-192]                  ,1                 ,308283264    ,78590822
-High      ,[K].SKIGSTENLKHQPGGGK.[V]                           ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [229-245]                  ,2                 ,551345.375   ,38888975
-High      ,[K].GADGKTKIATPR.[G]                                ,                                  ,                                ,1               ,1     ,P10636-8                 ,P10636-8 [115-126]                  ,2                 ,593317.6875  ,45036840
-High      ,[K].SEKLDFKDR.[V]                                   ,                                  ,                                ,1               ,2     ,P10636-8                 ,P10636-8 [312-320]                  ,2                 ,32312068     ,53312265
-High      ,[KR].CGSKDNIK.[H]                                   ,1xCarbamidomethyl [C1]            ,                                ,2               ,1     ,P10636-8; P27546         ,P10636-8 [262-269]; P27546 [981-988],1                 ,4449589      ,82881130