Skip to content
Snippets Groups Projects
Commit c8b64eee authored by Vishruth Mullapudi's avatar Vishruth Mullapudi
Browse files

rearrangement and file input


Signed-off-by: default avatarVishruth Mullapudi <vmullapudi1@gmail.com>
parents
Branches
No related merge requests found
# Default ignored files
/workspace.xml
\ No newline at end of file
<component name="ProjectCodeStyleConfiguration">
<state>
<option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
</state>
</component>
\ No newline at end of file
<component name="ProjectDictionaryState">
<dictionary name="vmull" />
</component>
\ No newline at end of file
#This is a TOML Document
#The TOML specification can be found here: https://github.com/toml-lang/toml
title="Abundance Parser Configuration"
#The input source and parsing parameters
[input]
#The CSV input file(s) to read data from
#The files should all be of the same format (i.e. have the same column titles)
#Example:
#input_files=[ "/input/file1",
#              "/input/file2",
#            ]
input_files=["/input/test.csv"]
#fasta file(s) containing the protein sequence(s) to align against
prot_seq_fasta=["/data/1N4RP301STau.fasta","/data/2N4R_wt_tau.fasta"]
[output]
#Relative or absolute path to desired output directory
output_directory="output/"
#The name of the input file is prepended to this stub to form the output name
output_name_stub="residueModificationAnalysis"
#Configuration of parser settings
[parser_config]
#If false, files will be specified via abundance column
using_fileID_column=false
#Title of column containing abundance. If using_fileID_column is set to
#true, only the first entry will be used
abundance_col_titles=["Abundance","Abundance: F1"]
[parser_config.master]
#Settings for using the pre-localized modifications reported for the master
#protein during parsing
#NOTE: keys in this table are already scoped to parser_config.master; a
#"master." prefix would nest them one level deeper (parser_config.master.master)
use=false
#Header (title) of the column containing the localized modifications
modification_header="Modifications in Master Proteins"
>1N4R P301S Tau
MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTPTAEAEEAGIGDTPSLEDEAAGHVTQARMVSKSKDGTGSDDKKAKGADGKTKIATPRGAAPPGQKGQANATRIPAKTPPAPKTPPSSGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKKVAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKDNIKHVSGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPGGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSASLAKQGL
\ No newline at end of file
>sp|P10636-8|TAU_HUMAN Isoform Tau-F of Microtubule-associated protein tau OS=Homo sapiens OX=9606 GN=MAPT
MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPG
SETSDAKSTPTAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAG
HVTQARMVSKSKDGTGSDDKKAKGADGKTKIATPRGAAPPGQKGQANATRIPAKTPPAPK
TPPSSGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKKVAVVRTPPKSPSSAK
SRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKDNIKHV
PGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNI
THVPGGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMV
DSPQLATLADEVSASLAKQGL
\ No newline at end of file
#This is a TOML Document
#The TOML specification can be found here: https://github.com/toml-lang/toml
title="Parser Regex Configuration"
#IMPORTANT: Use TOML literal strings instead of basic strings so that
#backslashes in a regex are not interpreted as TOML escape sequences.
#Regex strings should be written with single quotes (e.g. 'foobar') or triple
#single quotes (e.g. '''this is a literal string as well''') to store them as
#literal strings
regex.phosphoregex=''
\ No newline at end of file
MIT License
Copyright (c) 2019 Vishruth Mullapudi
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
main.py 0 → 100644
import csv
import toml
def main():
    """Load the TOML configuration and analyze the input files it lists.

    Reads the hard-coded test configuration, takes ``input_files`` from its
    ``[input]`` table and passes both the file list and the parsed
    configuration on to ``analyze_files``.
    """
    config_file = "data/tests/testConfig.toml"
    configuration = toml.load(config_file)
    input_files = configuration['input']["input_files"]
    # analyze_files expects the configuration as its second argument; calling
    # it with only the file list raised a TypeError.
    analyze_files(input_files, configuration)
def analyze_files(files, config=None):
    """Print the column headers and every row of each CSV file in ``files``.

    Args:
        files: iterable of paths to comma-delimited input files.
        config: parsed configuration. Currently unused; optional so that
            callers which pass only the file list keep working.

    Raises:
        OSError: if an input file cannot be opened.
        csv.Error: on malformed CSV input (the reader runs in strict mode).
    """
    for input_file_path in files:
        # newline='' leaves line-ending handling to the csv module, as the
        # csv documentation recommends for files passed to a reader.
        with open(input_file_path, mode='r', newline='') as infile:
            file_reader = csv.DictReader(infile,
                                         delimiter=',',
                                         skipinitialspace=True,
                                         strict=True)
            headers = file_reader.fieldnames
            print(headers)
            for row in file_reader:
                print(row)
if __name__ == '__main__':
    # Run the analysis only when this file is executed as a script, not when
    # it is imported as a module.
    main()
Confidence,Annotated Sequence ,Modifications ,Modifications in Master Proteins,# Protein Groups,# PSMs,Master Protein Accessions,Positions in Master Proteins ,# Missed Cleavages,Abundance: F1
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,2xPhospho [S6; S] ,P10636-8 2xPhospho [S383; S] ,1 ,2 ,P10636-8 ,P10636-8 [378-409] ,0 ,3770206.25
High ,[K].TDHGAEIVYKSPVVSGDTSPR.[H] ,2xPhospho [S11; S15] ,P10636-8 2xPhospho [S367; S371] ,1 ,2 ,P10636-8 ,P10636-8 [357-377] ,1 ,490543.4063
High ,[K].AKTDHGAEIVYKSPVVSGDTSPR.[H] ,2xPhospho [S13; S17] ,P10636-8 2xPhospho [S367; S371] ,1 ,1 ,P10636-8 ,P10636-8 [355-377] ,2 ,
High ,[R].SGYSSPGSPGTPGSR.[S] ,2xPhospho [S] ,P10636-8 2xPhospho [S] ,1 ,5 ,P10636-8 ,P10636-8 [166-180] ,0 ,1425691.75
High ,[K].VAVVRTPPKSPSSAK.[S] ,1xPhospho [T6] ,P10636-8 1xPhospho [T202] ,1 ,4 ,P10636-8 ,P10636-8 [197-211] ,2 ,35290570.5
High ,[K].KVAVVRTPPKSPSSAK.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T202] ,1 ,1 ,P10636-8 ,P10636-8 [196-211] ,3 ,97352696
High ,[K].KVAVVRTPPK.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T202] ,1 ,1 ,P10636-8 ,P10636-8 [196-205] ,2 ,1866753.375
High ,[R].TPSLPTPPTR.[E] ,1xPhospho [T6] ,P10636-8 1xPhospho [T188] ,1 ,1 ,P10636-8 ,P10636-8 [183-192] ,0 ,5528542
High ,[R].TPSLPTPPTREPK.[K] ,1xPhospho [T6] ,P10636-8 1xPhospho [T188] ,1 ,2 ,P10636-8 ,P10636-8 [183-195] ,1 ,22805599.5
High ,[K].TPPAPKTPPSSGEPPKSGDR.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T152] ,1 ,4 ,P10636-8 ,P10636-8 [146-165] ,2 ,35066952
High ,[K].TPPAPKTPPSSGEPPK.[S] ,1xPhospho [T7] ,P10636-8 1xPhospho [T152] ,1 ,4 ,P10636-8 ,P10636-8 [146-161] ,1 ,147009880
High ,[K].TPPSSGEPPK.[S] ,1xPhospho [T1] ,P10636-8 1xPhospho [T152] ,1 ,1 ,P10636-8 ,P10636-8 [152-161] ,0 ,1141578.25
High ,[K].SPVVSGDTSPR.[H] ,1xPhospho [T/S] ,P10636-8 1xPhospho [T/S] ,1 ,5 ,P10636-8 ,P10636-8 [367-377] ,0 ,291275882
High ,[K].TDHGAEIVYKSPVVSGDTSPR.[H] ,1xPhospho [S11] ,P10636-8 1xPhospho [S367] ,1 ,1 ,P10636-8 ,P10636-8 [357-377] ,1 ,3844751.25
High ,[K].IGSLDNITHVPGGGNK.[K] ,1xPhospho [S3] ,P10636-8 1xPhospho [S327] ,1 ,1 ,P10636-8 ,P10636-8 [325-340] ,0 ,3098245
High ,[R].SRTPSLPTPPTR.[E] ,1xPhospho [S5] ,P10636-8 1xPhospho [S185] ,1 ,1 ,P10636-8 ,P10636-8 [181-192] ,1 ,6525094.5
High ,[R].SGYSSPGSPGTPGSR.[S] ,1xPhospho [S/T] ,P10636-8 1xPhospho [S] ,1 ,7 ,P10636-8 ,P10636-8 [166-180] ,0 ,149733789.5
High ,[K].STPTAEAEEAGIGDTPSLEDEAAGHVTQAR.[M] ,1xPhospho [T/S] ,P10636-8 1xPhospho [S/T] ,1 ,3 ,P10636-8 ,P10636-8 [68-97] ,0 ,7920354.125
High ,[R].TPPKSPSSAK.[S] ,1xPhospho [S/T] ,P10636-8 1xPhospho [S/T] ,1 ,3 ,P10636-8 ,P10636-8 [202-211] ,1 ,2525454.594
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,1xPhospho [T/S]; 1xOxidation [M13],P10636-8 1xPhospho [S/T] ,1 ,4 ,P10636-8 ,P10636-8 [378-409] ,0 ,37349012
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,1xPhospho [S/T] ,P10636-8 1xPhospho [S/T] ,1 ,5 ,P10636-8 ,P10636-8 [378-409] ,0 ,42549668
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xPhospho [T/S]; 1xOxidation [M8] ,P10636-8 1xPhospho [S/T] ,1 ,6 ,P10636-8 ,P10636-8 [24-67] ,2 ,2833359
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xPhospho [S/T] ,P10636-8 1xPhospho [S/T] ,1 ,5 ,P10636-8 ,P10636-8 [24-67] ,2 ,381969
High ,[R].LQTAPVPMPDLKNVK.[S] ,1xOxidation [M8] , ,1 ,2 ,P10636-8 ,P10636-8 [214-228] ,1 ,11043190
High ,[R].LQTAPVPMPDLKNVK.[S] , , ,1 ,2 ,P10636-8 ,P10636-8 [214-228] ,1 ,3375361.5
High ,[R].LQTAPVPMPDLK.[N] ,1xOxidation [M8] , ,1 ,26 ,P10636-8 ,P10636-8 [214-225] ,0 ,2938043353
High ,[R].LQTAPVPMPDLK.[N] , , ,1 ,10 ,P10636-8 ,P10636-8 [214-225] ,0 ,2345507657
High ,[K].LDLSNVQSK.[C] , , ,1 ,9 ,P10636-8 ,P10636-8 [253-261] ,0 ,1056765568
High ,[K].KLDLSNVQSK.[C] , , ,1 ,5 ,P10636-8 ,P10636-8 [252-261] ,1 ,1525892608
High ,[K].IGSLDNITHVPGGGNKK.[I] , , ,1 ,5 ,P10636-8 ,P10636-8 [325-341] ,1 ,963006714.5
High ,[K].IGSLDNITHVPGGGNK.[K] , , ,1 ,16 ,P10636-8 ,P10636-8 [325-340] ,0 ,4624086515
High ,[K].IGSTENLKHQPGGGK.[V] , , ,1 ,1 ,P10636-8 ,P10636-8 [231-245] ,1 ,148045.0938
High ,[R].SRTPSLPTPPTREPK.[K] , , ,1 ,3 ,P10636-8 ,P10636-8 [181-195] ,2 ,698763648
High ,[K].STPTAEAEEAGIGDTPSLEDEAAGHVTQAR.[M] , , ,1 ,12 ,P10636-8 ,P10636-8 [68-97] ,0 ,725873185.9
High ,[K].SPVVSGDTSPR.[H] , , ,1 ,5 ,P10636-8 ,P10636-8 [367-377] ,0 ,1303747568
High ,[R].TPPKSPSSAK.[S] , , ,1 ,5 ,P10636-8 ,P10636-8 [202-211] ,1 ,66745472
High ,[K].TPPSSGEPPK.[S] , , ,1 ,5 ,P10636-8 ,P10636-8 [152-161] ,0 ,257468414.4
High ,[K].TPPSSGEPPKSGDR.[S] , , ,1 ,6 ,P10636-8 ,P10636-8 [152-165] ,1 ,103032414.3
High ,[R].TPSLPTPPTREPK.[K] , , ,1 ,8 ,P10636-8 ,P10636-8 [183-195] ,1 ,1604505144
High ,[K].TDHGAEIVYK.[S] , , ,1 ,5 ,P10636-8 ,P10636-8 [357-366] ,0 ,529712534
High ,[K].SKDGTGSDDKK.[A] , , ,1 ,1 ,P10636-8 ,P10636-8 [102-112] ,2 ,1939050.375
High ,[R].QEFEVMEDHAGTYGLGDRK.[D] ,1xOxidation [M6] , ,1 ,16 ,P10636-8 ,P10636-8 [6-24] ,1 ,234584178.3
High ,[R].QEFEVMEDHAGTYGLGDRK.[D] , , ,1 ,8 ,P10636-8 ,P10636-8 [6-24] ,1 ,352983975.5
High ,[R].QEFEVMEDHAGTYGLGDR.[K] ,1xOxidation [M6] , ,1 ,24 ,P10636-8 ,P10636-8 [6-23] ,0 ,541478609
High ,[R].QEFEVMEDHAGTYGLGDR.[K] , , ,1 ,8 ,P10636-8 ,P10636-8 [6-23] ,0 ,663580316
High ,[R].SGYSSPGSPGTPGSR.[S] , , ,1 ,7 ,P10636-8 ,P10636-8 [166-180] ,0 ,1886451761
High ,[K].DQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S] ,1xOxidation [M7] , ,1 ,8 ,P10636-8 ,P10636-8 [25-67] ,1 ,242811709.3
High ,[K].DQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S] , , ,1 ,4 ,P10636-8 ,P10636-8 [25-67] ,1 ,263117809.9
High ,[K].DQGGYTMHQDQEGDTDAGLK.[E] ,1xOxidation [M7] , ,1 ,15 ,P10636-8 ,P10636-8 [25-44] ,0 ,148222051.8
High ,[K].DQGGYTMHQDQEGDTDAGLK.[E] , , ,1 ,5 ,P10636-8 ,P10636-8 [25-44] ,0 ,169405957.9
High ,[K].DNIKHVSGGGSVQIVYKPVDLSK.[V] , , ,1 ,1 ,P10636-8 ,P10636-8 [266-288] ,1 ,
High ,[K].ESPLQTPTEDGSEEPGSETSDAK.[S] , , ,1 ,9 ,P10636-8 ,P10636-8 [45-67] ,0 ,740036840
High ,[K].AKTDHGAEIVYK.[S] , , ,1 ,3 ,P10636-8 ,P10636-8 [355-366] ,1 ,652942514.6
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] ,1xOxidation [M13] , ,1 ,17 ,P10636-8 ,P10636-8 [378-409] ,0 ,465597371.5
High ,[K].HVSGGGSVQIVYKPVDLSK.[V] , , ,1 ,56 ,P10636-8 ,P10636-8 [270-288] ,0 ,617176649.5
High ,[R].HLSNVSSTGSIDMVDSPQLATLADEVSASLAK.[Q] , , ,1 ,13 ,P10636-8 ,P10636-8 [378-409] ,0 ,549771859.3
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S],1xOxidation [M8] , ,1 ,2 ,P10636-8 ,P10636-8 [24-67] ,2 ,132546086.5
High ,[R].KDQGGYTMHQDQEGDTDAGLK.[E] ,1xOxidation [M8] , ,1 ,12 ,P10636-8 ,P10636-8 [24-44] ,1 ,68147217.5
High ,[R].KDQGGYTMHQDQEGDTDAGLK.[E] , , ,1 ,2 ,P10636-8 ,P10636-8 [24-44] ,1 ,47451437.75
High ,[R].KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK.[S], , ,1 ,3 ,P10636-8 ,P10636-8 [24-67] ,2 ,25293893.63
High ,[K].SRLQTAPVPMPDLK.[N] , , ,1 ,1 ,P10636-8 ,P10636-8 [212-225] ,1 ,5722389
High ,[R].IPAKTPPAPK.[T] , , ,1 ,1 ,P10636-8 ,P10636-8 [142-151] ,1 ,2015586.75
High ,[K].CGSLGNIHHKPGGGQVEVK.[S] ,1xCarbamidomethyl [C1] , ,1 ,3 ,P10636-8 ,P10636-8 [293-311] ,0 ,218379137.5
High ,[K].IGSTENLK.[H] , , ,1 ,5 ,P10636-8 ,P10636-8 [231-238] ,0 ,887492361
High ,[R].TPSLPTPPTR.[E] , , ,1 ,4 ,P10636-8 ,P10636-8 [183-192] ,0 ,1085564561
High ,[R].SRTPSLPTPPTR.[E] , , ,1 ,1 ,P10636-8 ,P10636-8 [181-192] ,1 ,308283264
High ,[K].SKIGSTENLKHQPGGGK.[V] , , ,1 ,1 ,P10636-8 ,P10636-8 [229-245] ,2 ,551345.375
High ,[K].GADGKTKIATPR.[G] , , ,1 ,1 ,P10636-8 ,P10636-8 [115-126] ,2 ,593317.6875
High ,[K].SEKLDFKDR.[V] , , ,1 ,2 ,P10636-8 ,P10636-8 [312-320] ,2 ,32312068
High ,[KR].CGSKDNIK.[H] ,1xCarbamidomethyl [C1] , ,2 ,1 ,P10636-8; P27546 ,P10636-8 [262-269]; P27546 [981-988],1 ,4449589
\ No newline at end of file
#This is a TOML Document
#The TOML specification can be found here: https://github.com/toml-lang/toml
title="Abundance Parser Configuration"
#The input source and parsing parameters
[input]
#The CSV input file(s) to read data from
#The files should all be of the same format (i.e. have the same column titles)
#Example:
#input_files=[ "/input/file1",
#              "/input/file2",
#            ]
input_files=["tests/sampleInput.csv"]
#fasta file(s) containing the protein sequence(s) to align against
prot_seq_fasta=["data/htau40.txt","data/1N4RP301STau.txt"]
[output]
#Relative or absolute path to desired output directory
output_directory="/output/"
#The name of the input file is prepended to this stub to form the output name
output_name_stub="residueModificationAnalysis"
#Configuration of parser settings
[parser_config]
#If false, files will be specified via abundance column
using_fileID_column=false
#Title of column containing abundance. If using_fileID_column is set to
#true, only the first entry will be used
abundance_col_titles=["Abundance","Abundance: F1"]
[parser_config.master]
#Settings for using the pre-localized modifications reported for the master
#protein during parsing
#NOTE: keys in this table are already scoped to parser_config.master; a
#"master." prefix would nest them one level deeper (parser_config.master.master)
use=false
#Header (title) of the column containing the localized modifications
modification_header="Modifications in Master Proteins"
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment