Skip to content
Snippets Groups Projects
Commit 70a4b2d6 authored by Vishruth Mullapudi's avatar Vishruth Mullapudi Committed by Vishruth Mullapudi
Browse files

fixed unlocalized modifications not being counted in fragment modification...

fixed unlocalized modifications not being counted in fragment modification analysis by adding a field in the data indicating that the fragment possesses a modification
parent 28505a8f
1 merge request!1Fix unlocalized peptide mods not adding
......@@ -4,11 +4,10 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.7 (venv)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.7 (abundanceparser)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Twisted Trial" />
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (venv)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (abundanceparser)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
#This is a TOML Document
#The TOML specification can be found here: https://github.com/toml-lang/toml
title="Abundance Parser Configuration"
#The input source and parsing parameters
[input]
#The CSV input file(s) to read data from
......@@ -13,14 +11,14 @@ title="Abundance Parser Configuration"
#input_files=[ "/input/file1",
# path to file2
# ]
input_files=["data/input/LUM1_705666-peptide.csv","data/input/LUM1_705666-peptidegroups.csv"]
input_files=["data/input/PCF-DC-1732--12--2019_peptide.csv"]
#fasta file(s) containing the protein sequence(s) to align against
prot_seq_fasta=["data/2N4R_wt_tau.fasta","data/1N4RP301STau.fasta"]
[output]
#Relative or absolute path to desired output directory
output_directory="output/mouse1yr"
output_directory="output/testfix/dailu122019"
#the name of the input file is prepended to this stub
residue_output_name_stub="residueModificationAnalysis"
......
......@@ -236,11 +236,13 @@ def parse_prot_localizations(ftuple: FileTuple, protein_seq_records: Dict, mod_r
"""
file_data: pd.DataFrame = ftuple.FileData
file_data.insert(file_data.shape[1], "valid_modification", False)
mod_loc_column_titles = dict() # maps Sequence Record ID to modification localization column name in DataFrame
frag_loc_column_titles = dict() # maps Sequence Record ID to fragment localization column name in DataFrame
# iterate through all the input proteins to localize against
for prot_id, protein in protein_seq_records.items():
frag_modification_status = []
prot_mod_localizations = []
positions_in_master = [] # master here is the protein currently being aligned against
# the protein sequence to align against to find the fragment's location in the sequence
......@@ -257,17 +259,29 @@ def parse_prot_localizations(ftuple: FileTuple, protein_seq_records: Dict, mod_r
# PTM modification localization occurs here-------------------------------------------------------------
mod_string = row.modifications
if not str(mod_string) == "nan":
is_validloc = False
matches = re.finditer(mod_regex, mod_string)
prot_mod_localizations.append([int(mod_localization) + frag_index_in_prot for match_obj in matches
for mod_localization in match_obj.groups()
if mod_localization is not ''])
localizations = []
for match_obj in matches:
for mod_localization in match_obj.groups():
is_validloc = True
if mod_localization is not '':
localizations.append((int(mod_localization) + frag_index_in_prot))
prot_mod_localizations.append(localizations)
frag_modification_status.append(is_validloc)
# prot_mod_localizations.append([int(mod_localization) + frag_index_in_prot for match_obj in matches
# for mod_localization in match_obj.groups()
# if mod_localization is not ''])
else:
prot_mod_localizations.append([]) # if no modifications add the list containing no modifications
frag_modification_status.append(False)
# end of PTM/ modification localization-----------------------------------------------------------------
else:
prot_mod_localizations.append([frag_index_in_prot]) # if index is -1 the fragment isn't in this protein
positions_in_master.append([(-1,)]) # indicate that the fragment isn't in this protein
frag_modification_status.append(False)
# clean up the protein ID so we can use it to index things in our DataFrames
sanitized_protein_id = sanitize_str_for_dataframe_index(prot_id)
......@@ -279,6 +293,8 @@ def parse_prot_localizations(ftuple: FileTuple, protein_seq_records: Dict, mod_r
# columns later
file_data.insert(file_data.shape[1], prot_mod_column_title, prot_mod_localizations)
file_data.insert(file_data.shape[1], frag_loc_column_title, positions_in_master)
frag_modification_status = [x or y for x, y in zip(frag_modification_status, file_data['valid_modification'])]
file_data.loc[:, "valid_modification"] = frag_modification_status
mod_loc_column_titles.update({protein.id: prot_mod_column_title})
frag_loc_column_titles.update({protein.id: frag_loc_column_title})
......@@ -355,8 +371,8 @@ def calc_peptide_mod_abundances(ftuple, mod_localization_col_titles, frag_locali
frag_abundance = frags_same_seq[1][abund_col_title].sum()
# ID all the fragments with modifications
bools = []
for mods in frags_same_seq[1][mod_col_title]:
if mods:
for mods, valid_mod in zip(frags_same_seq[1][mod_col_title], frags_same_seq[1]['valid_modification']):
if mods or valid_mod:
if -1 not in mods:
bools.append(True)
else:
......
Confidence,Annotated Sequence ,Modifications ,Modifications in Master Proteins,# Protein Groups,# PSMs,Master Protein Accessions,Positions in Master Proteins ,# Missed Cleavages,Abundance :F1,Abundance:F2
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,2xPhospho [S6; S] ,P10636-8 2xPhospho [S383; S] ,1 ,2 ,P10636-8 ,P10636-8 [378-409] ,0 ,3770206.25 ,84602312
High ,[K].TDHGAEIVYKSPVVSGDTSPR.[H] ,2xPhospho [S11; S15] ,P10636-8 2xPhospho [S367; S371] ,1 ,2 ,P10636-8 ,P10636-8 [357-377] ,1 ,490543.4063 ,85879545
High ,[K].AKTDHGAEIVYKSPVVSGDTSPR.[H] ,2xPhospho [S13; S17] ,P10636-8 2xPhospho [S367; S371] ,1 ,1 ,P10636-8 ,P10636-8 [355-377] ,2 , ,92741293
High ,[R].SGYSSPGSPGTPGSR.[S] ,2xPhospho [S] ,P10636-8 2xPhospho [S] ,1 ,5 ,P10636-8 ,P10636-8 [166-180] ,0 ,1425691.75 ,98087441
High ,[K].VAVVRTPPKSPSSAK.[S] ,1xPhospho [T6] ,P10636-8 1xPhospho [T202] ,1 ,4 ,P10636-8 ,P10636-8 [197-211] ,2 ,35290570.5 ,36243518
High ,[K].KVAVVRTPPKSPSSAK.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T202] ,1 ,1 ,P10636-8 ,P10636-8 [196-211] ,3 ,97352696 ,43770115
High ,[K].KVAVVRTPPK.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T202] ,1 ,1 ,P10636-8 ,P10636-8 [196-205] ,2 ,1866753.375 ,84341486
High ,[R].TPSLPTPPTR.[E] ,1xPhospho [T6] ,P10636-8 1xPhospho [T188] ,1 ,1 ,P10636-8 ,P10636-8 [183-192] ,0 ,5528542 ,93835341
High ,[R].TPSLPTPPTREPK.[K] ,1xPhospho [T6] ,P10636-8 1xPhospho [T188] ,1 ,2 ,P10636-8 ,P10636-8 [183-195] ,1 ,22805599.5 ,67613272
High ,[K].TPPAPKTPPSSGEPPKSGDR.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T152] ,1 ,4 ,P10636-8 ,P10636-8 [146-165] ,2 ,35066952 ,12591257
High ,[K].TPPAPKTPPSSGEPPK.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T152] ,1 ,4 ,P10636-8 ,P10636-8 [146-161] ,1 ,147009880 ,94815625
High ,[K].TPPSSGEPPK.[S] ,1xPhospho [T1] ,P10636-8 1xPhospho [T152] ,1 ,1 ,P10636-8 ,P10636-8 [152-161] ,0 ,1141578.25 ,72979440
High ,[K].SPVVSGDTSPR.[H] ,1xPhospho [T/S] ,P10636-8 1xPhospho [T/S] ,1 ,5 ,P10636-8 ,P10636-8 [367-377] ,0 ,291275882 ,76313867
High ,[K].TDHGAEIVYKSPVVSGDTSPR.[H] ,1xPhospho [S11] ,P10636-8 1xPhospho [S367] ,1 ,1 ,P10636-8 ,P10636-8 [357-377] ,1 ,3844751.25 ,90047135
High ,[K].IGSLDNITHVPGGGNK.[K] ,1xPhospho [S3] ,P10636-8 1xPhospho [S327] ,1 ,1 ,P10636-8 ,P10636-8 [325-340] ,0 ,3098245 ,5332778
High ,[R].SRTPSLPTPPTR.[E] ,1xPhospho [S5] ,P10636-8 1xPhospho [S185] ,1 ,1 ,P10636-8 ,P10636-8 [181-192] ,1 ,6525094.5 ,17852668
High ,[R].SGYSSPGSPGTPGSR.[S] ,1xPhospho [S/T] ,P10636-8 1xPhospho [S] ,1 ,7 ,P10636-8 ,P10636-8 [166-180] ,0 ,149733789.5 ,22124243
High ,[K].STPTAEAEEAGIGDTPSLEDEAAGHVTQAR.[M] ,1xPhospho [T/S] ,P10636-8 1xPhospho [S/T] ,1 ,3 ,P10636-8 ,P10636-8 [68-97] ,0 ,7920354.125 ,9185267
High ,[R].TPPKSPSSAK.[S] ,1xPhospho [S/T] ,P10636-8 1xPhospho [S/T] ,1 ,3 ,P10636-8 ,P10636-8 [202-211] ,1 ,2525454.594 ,58090514
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,1xPhospho [T/S]; 1xOxidation [M13],P10636-8 1xPhospho [S/T] ,1 ,4 ,P10636-8 ,P10636-8 [378-409] ,0 ,37349012 ,66648919
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,1xPhospho [S/T] ,P10636-8 1xPhospho [S/T] ,1 ,5 ,P10636-8 ,P10636-8 [378-409] ,0 ,42549668 ,41174096
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xPhospho [T/S]; 1xOxidation [M8] ,P10636-8 1xPhospho [S/T] ,1 ,6 ,P10636-8 ,P10636-8 [24-67] ,2 ,2833359 ,90435088
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xPhospho [S/T] ,P10636-8 1xPhospho [S/T] ,1 ,5 ,P10636-8 ,P10636-8 [24-67] ,2 ,381969 ,33380708
High ,[R].LQTAPVPMPDLKNVK.[S] ,1xOxidation [M8] , ,1 ,2 ,P10636-8 ,P10636-8 [214-228] ,1 ,11043190 ,59925563
High ,[R].LQTAPVPMPDLKNVK.[S] , , ,1 ,2 ,P10636-8 ,P10636-8 [214-228] ,1 ,3375361.5 ,66934687
High ,[R].LQTAPVPMPDLK.[N] ,1xOxidation [M8] , ,1 ,26 ,P10636-8 ,P10636-8 [214-225] ,0 ,2938043353 ,55115903
High ,[R].LQTAPVPMPDLK.[N] , , ,1 ,10 ,P10636-8 ,P10636-8 [214-225] ,0 ,2345507657 ,18349423
High ,[K].LDLSNVQSK.[C] , , ,1 ,9 ,P10636-8 ,P10636-8 [253-261] ,0 ,1056765568 ,24227906
High ,[K].KLDLSNVQSK.[C] , , ,1 ,5 ,P10636-8 ,P10636-8 [252-261] ,1 ,1525892608 ,69084694
High ,[K].IGSLDNITHVPGGGNKK.[I] , , ,1 ,5 ,P10636-8 ,P10636-8 [325-341] ,1 ,963006714.5 ,41478772
High ,[K].IGSLDNITHVPGGGNK.[K] , , ,1 ,16 ,P10636-8 ,P10636-8 [325-340] ,0 ,4624086515 ,19351259
High ,[K].IGSTENLKHQPGGGK.[V] , , ,1 ,1 ,P10636-8 ,P10636-8 [231-245] ,1 ,148045.0938 ,9514705
High ,[R].SRTPSLPTPPTREPK.[K] , , ,1 ,3 ,P10636-8 ,P10636-8 [181-195] ,2 ,698763648 ,36101344
High ,[K].STPTAEAEEAGIGDTPSLEDEAAGHVTQAR.[M] , , ,1 ,12 ,P10636-8 ,P10636-8 [68-97] ,0 ,725873185.9 ,5036078
High ,[K].SPVVSGDTSPR.[H] , , ,1 ,5 ,P10636-8 ,P10636-8 [367-377] ,0 ,1303747568 ,71492316
High ,[R].TPPKSPSSAK.[S] , , ,1 ,5 ,P10636-8 ,P10636-8 [202-211] ,1 ,66745472 ,41408951
High ,[K].TPPSSGEPPK.[S] , , ,1 ,5 ,P10636-8 ,P10636-8 [152-161] ,0 ,257468414.4 ,61879916
High ,[K].TPPSSGEPPKSGDR.[S] , , ,1 ,6 ,P10636-8 ,P10636-8 [152-165] ,1 ,103032414.3 ,38342837
High ,[R].TPSLPTPPTREPK.[K] , , ,1 ,8 ,P10636-8 ,P10636-8 [183-195] ,1 ,1604505144 ,80068010
High ,[K].TDHGAEIVYK.[S] , , ,1 ,5 ,P10636-8 ,P10636-8 [357-366] ,0 ,529712534 ,96010574
High ,[K].SKDGTGSDDKK.[A] , , ,1 ,1 ,P10636-8 ,P10636-8 [102-112] ,2 ,1939050.375 ,32759389
High ,[R].QEFEVMEDHAGTYGLGDRK.[D] ,1xOxidation [M6] , ,1 ,16 ,P10636-8 ,P10636-8 [6-24] ,1 ,234584178.3 ,33165088
High ,[R].QEFEVMEDHAGTYGLGDRK.[D] , , ,1 ,8 ,P10636-8 ,P10636-8 [6-24] ,1 ,352983975.5 ,95560066
High ,[R].QEFEVMEDHAGTYGLGDR.[K] ,1xOxidation [M6] , ,1 ,24 ,P10636-8 ,P10636-8 [6-23] ,0 ,541478609 ,5099972
High ,[R].QEFEVMEDHAGTYGLGDR.[K] , , ,1 ,8 ,P10636-8 ,P10636-8 [6-23] ,0 ,663580316 ,57096692
High ,[R].SGYSSPGSPGTPGSR.[S] , , ,1 ,7 ,P10636-8 ,P10636-8 [166-180] ,0 ,1886451761 ,72762532
High ,[K].DQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S] ,1xOxidation [M7] , ,1 ,8 ,P10636-8 ,P10636-8 [25-67] ,1 ,242811709.3 ,34474881
High ,[K].DQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S] , , ,1 ,4 ,P10636-8 ,P10636-8 [25-67] ,1 ,263117809.9 ,68130546
High ,[K].DQGGYTMHQDQEGDTDAGLK.[E] ,1xOxidation [M7] , ,1 ,15 ,P10636-8 ,P10636-8 [25-44] ,0 ,148222051.8 ,32337478
High ,[K].DQGGYTMHQDQEGDTDAGLK.[E] , , ,1 ,5 ,P10636-8 ,P10636-8 [25-44] ,0 ,169405957.9 ,92807240
High ,[K].DNIKHVSGGGSVQIVYKPVDLSK.[V] , , ,1 ,1 ,P10636-8 ,P10636-8 [266-288] ,1 , ,38907207
High ,[K].ESPLQTPTEDGSEEPGSETSDAK.[S] , , ,1 ,9 ,P10636-8 ,P10636-8 [45-67] ,0 ,740036840 ,99535953
High ,[K].AKTDHGAEIVYK.[S] , , ,1 ,3 ,P10636-8 ,P10636-8 [355-366] ,1 ,652942514.6 ,84744142
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,1xOxidation [M13] , ,1 ,17 ,P10636-8 ,P10636-8 [378-409] ,0 ,465597371.5 ,29112938
High ,[K].HVSGGGSVQIVYKPVDLSK.[V] , , ,1 ,56 ,P10636-8 ,P10636-8 [270-288] ,0 ,617176649.5 ,33872557
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] , , ,1 ,13 ,P10636-8 ,P10636-8 [378-409] ,0 ,549771859.3 ,44853427
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xOxidation [M8] , ,1 ,2 ,P10636-8 ,P10636-8 [24-67] ,2 ,132546086.5 ,98766340
High ,[R].KDQGGYTMHQDQEGDTDAGLK.[E] ,1xOxidation [M8] , ,1 ,12 ,P10636-8 ,P10636-8 [24-44] ,1 ,68147217.5 ,6462453
High ,[R].KDQGGYTMHQDQEGDTDAGLK.[E] , , ,1 ,2 ,P10636-8 ,P10636-8 [24-44] ,1 ,47451437.75 ,34182396
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S], , ,1 ,3 ,P10636-8 ,P10636-8 [24-67] ,2 ,25293893.63 ,63120561
High ,[K].SRLQTAPVPMPDLK.[N] , , ,1 ,1 ,P10636-8 ,P10636-8 [212-225] ,1 ,5722389 ,33652639
High ,[R].IPAKTPPAPK.[T] , , ,1 ,1 ,P10636-8 ,P10636-8 [142-151] ,1 ,2015586.75 ,38656450
High ,[K].CGSLGNIHHKPGGGQVEVK.[S] ,1xCarbamidomethyl [C1] , ,1 ,3 ,P10636-8 ,P10636-8 [293-311] ,0 ,218379137.5 ,63191884
High ,[K].IGSTENLK.[H] , , ,1 ,5 ,P10636-8 ,P10636-8 [231-238] ,0 ,887492361 ,95150178
High ,[R].TPSLPTPPTR.[E] , , ,1 ,4 ,P10636-8 ,P10636-8 [183-192] ,0 ,1085564561 ,84114105
High ,[R].SRTPSLPTPPTR.[E] , , ,1 ,1 ,P10636-8 ,P10636-8 [181-192] ,1 ,308283264 ,78590822
High ,[K].SKIGSTENLKHQPGGGK.[V] , , ,1 ,1 ,P10636-8 ,P10636-8 [229-245] ,2 ,551345.375 ,38888975
High ,[K].GADGKTKIATPR.[G] , , ,1 ,1 ,P10636-8 ,P10636-8 [115-126] ,2 ,593317.6875 ,45036840
High ,[K].SEKLDFKDR.[V] , , ,1 ,2 ,P10636-8 ,P10636-8 [312-320] ,2 ,32312068 ,53312265
High ,[KR].CGSKDNIK.[H] ,1xCarbamidomethyl [C1] , ,2 ,1 ,P10636-8; P27546 ,P10636-8 [262-269]; P27546 [981-988],1 ,4449589 ,82881130
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment