Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
abundanceparser
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
vish_joachimiak_lab
abundanceparser
Commits
0d025942
Commit
0d025942
authored
5 years ago
by
Vishruth Mullapudi
Committed by
Vishruth Mullapudi
5 years ago
Browse files
Options
Downloads
Patches
Plain Diff
residue modification abundances completed and comments added to code
parent
5ef05ab9
1 merge request
!1
Fix unlocalized peptide mods not adding
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
main.py
+79
-15
79 additions, 15 deletions
main.py
tests/testConfig.toml
+0
-1
0 additions, 1 deletion
tests/testConfig.toml
with
79 additions
and
16 deletions
main.py
+
79
−
15
View file @
0d025942
...
...
@@ -17,16 +17,20 @@ def main():
# Get the desired configuration
configuration
:
dict
=
toml
.
load
(
config_file
)
input_files
:
List
[
str
]
=
configuration
[
'
input
'
][
"
input_files
"
]
use_mod_in_master_prot
:
bool
=
configuration
[
'
parser_config
'
][
'
master
'
][
'
use
'
]
if
(
use_mod_in_master_prot
)
:
use_mod_in_master_prot
:
bool
=
False
#
configuration['parser_config']['master']['use']
if
use_mod_in_master_prot
:
master_prot_name
:
bool
=
configuration
[
'
parser_config
'
][
'
master
'
][
'
master_protein_name
'
]
master_protein_fasta_ID
=
configuration
[
'
parser_config
'
][
'
master
'
][
'
master_protein_fasta_ID
'
]
protein_fasta_files
:
List
[
str
]
=
configuration
[
'
input
'
][
'
prot_seq_fasta
'
]
# todo each abundance col into separate dataframe
# todo implement fileID splitting
# todo go down to one abundance column at a time?
abundance_col_titles
=
configuration
[
'
parser_config
'
][
'
abundance_col_titles
'
]
# Internalize the file's data
# input_data is a list of tuples (filename, pandas DataFrame of the csv)
input_data
:
List
[
FileTuple
]
=
ingestfiledata
(
files
=
input_files
)
protein_seqrecords
:
Dict
=
getproteinsequences
(
protein_fasta_files
)
data
=
(
genrawsequences
(
ftuple
)
for
ftuple
in
input_data
)
protein_seqrecords
:
Dict
=
get
_
proteinsequences
(
protein_fasta_files
)
data
=
(
gen
_
rawsequences
(
ftuple
)
for
ftuple
in
input_data
)
localization_col_titles
:
Dict
[
str
,
str
]
=
dict
()
# Dict containing {proteinID: column title}
frag_localization_col_titles
:
Dict
[
str
,
str
]
=
dict
()
# Dict containing {proteinID: column title}
...
...
@@ -52,12 +56,17 @@ def main():
for
ftuple
in
localized_data
:
# for residue modification analysis, calculate the amount each residue is modified
calcresiduemodproportions
(
ftuple
,
localization_col_titles
,
frag_localization_col_titles
,
protein_seqrecords
)
for
abundance_col_title
in
abundance_col_titles
:
calcresiduemodabundances
(
ftuple
,
localization_col_titles
,
frag_localization_col_titles
,
abundance_col_title
,
protein_seqrecords
)
def
ingestfiledata
(
files
:
List
[
str
])
->
List
[
FileTuple
]:
"""
Takes the list of files to ingest, reads them, and returns the data as
"""
Takes the list of files to ingest, reads them, and returns the data as
a list of tuples of the file name and a DataFrame of the file contents
:param files: a list of paths to input files
:return: a list containing 2-tuples of the filename and a dataframe of the file contents
"""
data
=
[]
for
input_file_path
in
files
:
...
...
@@ -71,7 +80,12 @@ def ingestfiledata(files: List[str]) -> List[FileTuple]:
return
data
def
getproteinsequences
(
fasta_files
:
List
[
str
])
->
Dict
:
def
get_proteinsequences
(
fasta_files
:
List
[
str
])
->
Dict
:
"""
Ingests the protein sequences from a list of fasta files
:param fasta_files: the list of paths to fasta files to use in the alignment and localization
:return: a Dict mapping sequenceIDs to Bio.SeqRecord objects
"""
protein_seqrecords
:
Dict
[
str
,
SeqRecord
]
=
dict
()
for
file
in
fasta_files
:
with
open
(
file
,
"
r
"
)
as
handle
:
...
...
@@ -80,10 +94,12 @@ def getproteinsequences(fasta_files: List[str]) -> Dict:
return
protein_seqrecords
def
genrawsequences
(
ftuple
:
FileTuple
)
->
FileTuple
:
def
gen
_
rawsequences
(
ftuple
:
FileTuple
)
->
FileTuple
:
"""
Adds a column to the DataFrame containing a stripped down peptide without
the cleavage annotations
Adds a column to the DataFrame containing a stripped down peptide without the cleavage annotations
:param ftuple: The filetuple containing the filedata dataframe used to generate the raw sequence of each fragment
:return: a filetuple containing the fileID and a filedata dataframe containing the raw sequence in the
'
stripped_sequence
'
column
"""
file_data
:
pd
.
DataFrame
=
ftuple
.
FileData
file_data
=
file_data
.
assign
(
...
...
@@ -97,6 +113,14 @@ def parsemasterlocalizations(ftuple: FileTuple, master_prot_fasta_id) -> Tuple[
# todo: other PTMs
# todo: file specified regex string
# todo fragment localization
"""
Parses the modification and fragment localizations from a dataframe using the positions in master and modifications
in master proteins columns to obtain localization data instead of aligning the fragment to a given master protein
:param ftuple:
:param master_prot_fasta_id:
:return: A tuple containing the filetuple of localized data, a Dict mapping the proteinID to its modification
\
localizations in the dataframe, and a Dict mapping proteinID
'
s to their fragment localizations in the dataframe
"""
# NOTE: parses multiple master proteins, but as of now only the first is used
# matches serine, threonine, tyrosine and 0 or more digits
# this provides support for unlocalized PTM where only the amino acid is present and not the localization
...
...
@@ -145,7 +169,15 @@ def parsemasterlocalizations(ftuple: FileTuple, master_prot_fasta_id) -> Tuple[
def
parseprotlocalizations
(
ftuple
:
FileTuple
,
protein_seqrecords
:
Dict
)
->
\
Tuple
[
FileTuple
,
Dict
[
str
,
str
],
Dict
[
str
,
str
]]:
# TODO same as master
"""
Parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment
'
s index in
the full protein as an offset to calculate the localization
'
s index in the full protein.
:param ftuple: a FileTuple with the data to localize
:param protein_seqrecords: the dict of proteinID mapped to its SequenceRecord
:return: a Tuple containing the FileTuple with localization data, a Dict mapping the proteinID to its modification
\
localizations in the dataframe, and a Dict mapping proteinID
'
s to their fragment localizations in the dataframe
"""
# parses out the localizations of the PTM by aligning each modified fragment to the protein sequences given in
# protein_seqrecords, parsing out the modification index in each protein fragment, and using the fragment's index in
# the full protein as an offset to calculate the localization's index in the full protein.
...
...
@@ -164,7 +196,7 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
frag_index_in_prot
=
protein_seq
.
find
(
row
.
stripped_sequence
)
# if the fragment is contained in the current protein
if
frag_index_in_prot
!=
-
1
:
positions_in_master
.
append
((
frag_index_in_prot
+
1
,
frag_index_in_prot
+
len
(
row
.
stripped_sequence
)))
positions_in_master
.
append
(
[
(
frag_index_in_prot
+
1
,
frag_index_in_prot
+
len
(
row
.
stripped_sequence
))
]
)
mod_string
=
row
.
modifications
if
not
str
(
mod_string
)
==
"
nan
"
:
matches
=
re
.
finditer
(
regex
,
mod_string
)
...
...
@@ -192,18 +224,50 @@ def parseprotlocalizations(ftuple: FileTuple, protein_seqrecords: Dict) -> \
# todo
def
calcresiduemodproportions
(
ftuple
,
localization_col_titles
,
frag_localization_col_titles
,
protein_seqrecords
):
def
calcresiduemodabundances
(
ftuple
,
localization_col_titles
,
frag_localization_col_titles
,
abundance_col_title
,
protein_seqrecords
):
"""
Calculates, for a dataframe of localized ptm data, the abundance of each residue and how much that residue is
modified.
:param ftuple: FileTuple containing the file ID and the DataFrame with localized data
:param localization_col_titles: dict mapping protein ID to the column containing the mod localization
:param frag_localization_col_titles: dict mapping proteinID to the fragment localization column title
:param abundance_col_title: title of the abundance column of interest in the dataframe
:param protein_seqrecords: dict containing the protein ID mapped to each SeqRecord
:return: a dict mapping proteinIDs to their residue abundance arrays
"""
fdata
=
ftuple
.
FileData
prot_abundances
:
Dict
=
dict
()
# a dict of each protein ID mapped to its respective abundance array
for
proteinID
,
mod_column_title
in
localization_col_titles
.
items
():
protein_len
=
len
(
protein_seqrecords
[
proteinID
])
res_abundances
=
np
.
zeros
(
protein_len
,
dtype
=
float
)
res_mod_abdundances
=
np
.
zeros
(
protein_len
,
dtype
=
float
)
# THIS IS ONE INDEXED. RESIDUE 1 IS IN INDEX 1 of the array. index 0 is UNUSED
res_abundances
=
np
.
zeros
((
protein_len
+
1
,
2
),
dtype
=
float
)
# col. 0= mod abundance, col. 1=residue abundance
for
row
in
fdata
.
iterrows
():
fragment
=
row
[
1
]
mod_localization
=
fragment
[
mod_column_title
]
frag_localization
=
fragment
[
frag_localization_col_titles
[
proteinID
]]
abundance_col_title
=
abundance_col_title
.
strip
().
lower
().
replace
(
'
'
,
'
_
'
).
replace
(
'
(
'
,
''
)
\
.
replace
(
'
)
'
,
''
).
replace
(
"
#
"
,
"
num
"
)
frag_abundance
=
fragment
[
abundance_col_title
]
# if the fragment is in the given protein
if
frag_localization
!=
-
1
:
# only uses the first localization
for
i
in
range
(
frag_localization
[
0
][
0
],
frag_localization
[
0
][
1
]
+
1
,
1
):
# add the abundance to the abundance of each residue in the fragment
res_abundances
[
i
][
1
]
+=
frag_abundance
for
mod
in
mod_localization
:
# add the abundance to each modified residue contained in the fragment
res_abundances
[
mod
][
0
]
+=
frag_abundance
prot_abundances
.
update
({
proteinID
:
res_abundances
})
return
prot_abundances
# todo
def
calcpeptidemodproportions
(
ftuple
,
localization_col_titles
,
frag_localization_col_titles
):
pass
...
...
This diff is collapsed.
Click to expand it.
tests/testConfig.toml
+
0
−
1
View file @
0d025942
...
...
@@ -14,7 +14,6 @@ title="Abundance Parser Configuration"
# path to file2
# ]
input_files
=[
"tests/sampleInput.csv"
]
#fasta file(s) containing the protein sequence(s) to align against
prot_seq_fasta
=[
"data/2N4R_wt_tau.fasta"
,
"data/1N4RP301STau.fasta"
]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment