Commit 41c71dac authored by Venkat Malladi
Browse files

Transform germline gene tables.

parent 2629dcde
Pipeline #5058 passed with stage
in 5 seconds
#!/usr/bin/env python3
'''Generate Germline Mutation Table'''
import argparse
import datetime
import os
import pandas as pd
# Text passed to argparse as the epilog, i.e. shown after the option list in --help.
EPILOG = '''
For more details:
%(prog)s --help
'''
def get_args():
    '''Parse the command line and return the populated argparse namespace.

    All four options (input gene file, date-shift file, significance map
    and output path) are mandatory.
    '''
    # (short flag, long flag, help text) for every required option, in the
    # order they should appear in --help.
    option_specs = (
        ('-f', '--file', "Germline Cancer Gene File (csv format)."),
        ('-d', '--date', "Date Shift (tsv format)."),
        ('-s', '--significance',
         "Mutation Significance Map File (csv format)."),
        ('-o', '--out', "The output path (csv format)."),
    )

    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    for short_flag, long_flag, help_text in option_specs:
        parser.add_argument(short_flag, long_flag,
                            help=help_text,
                            required=True)

    return parser.parse_args()
def reformat_record(cancer_gene):
    '''Unpivot the wide gene table into one row per patient and gene.

    The MRN, REGISTER_DATE, SERVICE_DATE and CLINIC columns are kept as
    identifiers; the name of every other column goes into 'Gene' and its
    cell value into 'Number'.
    '''
    identifier_columns = ["MRN", "REGISTER_DATE", "SERVICE_DATE", "CLINIC"]
    long_format = cancer_gene.melt(
        id_vars=identifier_columns,
        var_name='Gene',
        value_name='Number')
    return long_format
def convert_number(merged, significance):
    '''Convert numeric significance codes to the appropriate database string.

    Args:
        merged: DataFrame with a 'Number' column of numeric codes. Missing
            values are treated as code 0.
        significance: DataFrame with a 'Significance' column (the code) and
            a 'Number' column (the label for that code).

    Returns:
        A new DataFrame equal to ``merged`` except that 'Number' holds the
        label looked up for each code. Codes that do not appear in
        ``significance`` come back as NaN. The frame passed in is left
        unmodified.
    '''
    labels_by_code = significance.set_index('Significance')['Number']

    # Work on a copy so the caller's frame is not silently rewritten.
    converted = merged.copy()
    converted.loc[converted['Number'].isna(), 'Number'] = 0
    # Cast to int first so the values match the integer codes used as the
    # lookup index (a column containing NaN is read as float).
    converted['Number'] = converted['Number'].astype(int).map(labels_by_code)
    return converted
def calculate_shift(cancer_gene, date_shift):
    '''Apply each patient's date shift to the register and service dates.

    Args:
        cancer_gene: DataFrame with 'MRN', 'REGISTER_DATE', 'SERVICE_DATE'
            and 'CLINIC' columns (plus any others, which are carried over).
        date_shift: DataFrame with 'MRN' and 'Shift' columns; 'Shift' is
            interpreted as a number of seconds.

    Returns:
        A new DataFrame containing only the rows whose MRN appears in both
        inputs (inner join), with both date columns converted to datetimes
        and moved by the patient's shift, the 'Shift' and 'CLINIC' columns
        removed, and 'MRN' cast to int. Neither input frame is modified.
    '''
    # Convert on copies so the caller's frames keep their original columns.
    shifts = date_shift.assign(
        Shift=pd.to_timedelta(date_shift['Shift'], unit='s'))
    genes = cancer_gene.assign(
        REGISTER_DATE=pd.to_datetime(cancer_gene['REGISTER_DATE']),
        SERVICE_DATE=pd.to_datetime(cancer_gene['SERVICE_DATE']))

    # Inner join: patients without a shift entry are dropped.
    merged = genes.merge(shifts, on='MRN', how='inner')

    for date_column in ('REGISTER_DATE', 'SERVICE_DATE'):
        merged[date_column] = merged[date_column] + merged['Shift']

    # The shift itself and the clinic are not part of the output table.
    merged = merged.drop(columns=['Shift', 'CLINIC'])
    merged['MRN'] = merged['MRN'].astype(int)
    return merged
def main():
    '''Build the germline gene table from the command-line inputs.

    Reads the gene, significance-map and date-shift files, reshapes the
    gene table to one row per patient and gene, replaces the numeric codes
    with their labels, applies the per-patient date shift and writes the
    result to 'germline_gene_table.csv' under the --out path.
    '''
    args = get_args()

    # Join directory and file name as separate components. The previous
    # single-argument join was plain string concatenation, which only gave
    # the intended path when --out already ended with a path separator.
    cancer_gene_table = os.path.join(args.out, 'germline_gene_table.csv')

    # Read in files
    cancer_gene_df = pd.read_csv(args.file)
    significance_map = pd.read_csv(args.significance)
    # NOTE(review): the --date help text says tsv, but the file is parsed
    # with the default comma separator - confirm which one is intended.
    date_shift = pd.read_csv(args.date)

    # Reformat to one gene per patient in a row
    cancer_gene_reformat = reformat_record(cancer_gene_df)

    # Fill missing codes and convert them to their labels
    fix_number = convert_number(cancer_gene_reformat, significance_map)

    # Calculate Date Shift
    shifted_df = calculate_shift(fix_number, date_shift)

    # Write out germline gene table
    shifted_df.to_csv(cancer_gene_table, index=False)
# Run the transformation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Significance,Number
0,Not Available
1,Positive
2,Negative
3,Positive and Variant
4,Variant
5,Pending
MRN,REGISTER_DATE,SERVICE_DATE,CLINIC,BRCA1,BRCA2,MULTISITE3,BART,MLH1,MSH2,MSH6,PMS2,MSI,IHCMLH1,IHCMSH2,IHCMSH6,IHCPMS2,BRAF,EPCAM,APC,MYH,MLH1METH,ALK,ATM,ATR,AXIN2,BAP1,BARD1,BMPR1A,BRIP1,CDH1,CDK4,CDKN2A,CHEK1,CHEK2,FAM175A,FLCN,FH,GALNT12,GEN1,GREM1,HOXB1B,MAX,MEN1,MET,MITF,MLH3,MRE11A,NBN,NF1,NF2,PHOX2B,PALB2,PTCH1,PTEN,P16,P53,PRSS1,RAD50,RAD51,RAD51C,RAD51D,RB,RET,SDHA,SDHAF2,SDHC,SDHD,SDHB,SMAD4,STK11,SUFU,TMEM127,TP53BP1,TSC1,TSC2,VHL,XRCC2,OTHER,RESEARCH
934,11/9/2016,11/13/2016,,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
904,1/17/2008,2/2/2008,,,,,,2,2,2,2,,,,,,,2,2,2,,,2,,,,2,2,2,2,2,,,2,,,,,,,,,,,,,,2,,,,2,,2,2,2,,,,2,2,,,,,,,,2,2,,,,,,1,,,
411,3/15/2015,3/19/2015,,2,2,,,2,2,2,2,,,,,,,2,,,,,,,,2,,,,,,,,,,2,2,,,,,,,2,2,,,,,,,,,2,,2,,,,,,,,2,,2,2,2,,,,,,2,2,3,,,
399,10/5/2004,10/24/2004,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,
903,6/13/2011,6/14/2011,,2,2,,,2,2,2,2,,,,,,,2,,,,,,,,2,,,,,,,,,,2,2,,,,,,,2,2,,,,,,,,,2,,2,,,,,,,,2,,2,2,2,,,,,,2,2,4,,,
628,12/13/1995,12/30/1995,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,
558,1/2/2014,1/3/2014,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,,,
590,12/25/2003,1/5/2004,,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
687,12/22/2008,12/27/2008,,,,,,2,2,2,2,,,,,,,2,2,2,,,2,,,,2,2,2,2,2,,,2,,,,,,,,,,,,,,2,,,,2,,2,2,2,,,,2,2,,,,,,,,2,2,,,,,,2,,,
#!/usr/bin/env python3
import os
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3 (matches the python3 shebang)

import pandas as pd
import pytest

import transform_germline_mutations
GERMLINE_STRING = """MRN,REGISTER_DATE,SERVICE_DATE,CLINIC,BRCA1,BRCA2,MULTISITE3,BART,MLH1,MSH2,MSH6,PMS2,MSI,IHCMLH1,IHCMSH2,IHCMSH6,IHCPMS2,BRAF,EPCAM,APC,MYH,MLH1METH,ALK,ATM,ATR,AXIN2,BAP1,BARD1,BMPR1A,BRIP1,CDH1,CDK4,CDKN2A,CHEK1,CHEK2,FAM175A,FLCN,FH,GALNT12,GEN1,GREM1,HOXB1B,MAX,MEN1,MET,MITF,MLH3,MRE11A,NBN,NF1,NF2,PHOX2B,PALB2,PTCH1,PTEN,P16,P53,PRSS1,RAD50,RAD51,RAD51C,RAD51D,RB,RET,SDHA,SDHAF2,SDHC,SDHD,SDHB,SMAD4,STK11,SUFU,TMEM127,TP53BP1,TSC1,TSC2,VHL,XRCC2,OTHER,RESEARCH
934,11/9/2016,11/13/2016,,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""
# Path prefix of the 'standards' directory, built relative to this test file.
standards_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../standards/'
@pytest.fixture
def germline():
    '''Germline table parsed from the inline GERMLINE_STRING CSV.'''
    return pd.read_csv(StringIO(GERMLINE_STRING))
@pytest.fixture
def significance_map():
    '''Significance code-to-label table loaded from the standards directory.'''
    map_path = standards_path + 'germline_significance.csv'
    return pd.read_csv(map_path)
@pytest.fixture
def germline_1(germline):
    '''Germline table with PMS2 set to code 1 in the first row.'''
    row, gene = 0, 'PMS2'
    germline.loc[row, gene] = 1
    return germline
@pytest.fixture
def germline_2(germline):
    '''Germline table with IHCMLH1 set to code 2 in the first row.'''
    row, gene = 0, 'IHCMLH1'
    germline.loc[row, gene] = 2
    return germline
@pytest.fixture
def germline_3(germline):
    '''Germline table with ATR set to code 3 in the first row.'''
    row, gene = 0, 'ATR'
    germline.loc[row, gene] = 3
    return germline
@pytest.fixture
def germline_4(germline):
    '''Germline table with BAP1 set to code 4 in the first row.'''
    row, gene = 0, 'BAP1'
    germline.loc[row, gene] = 4
    return germline
@pytest.fixture
def germline_5(germline):
    '''Germline table with SDHC set to code 5 in the first row.'''
    row, gene = 0, 'SDHC'
    germline.loc[row, gene] = 5
    return germline
@pytest.mark.unit
def test_check_conversion(germline_1):
    '''Reformatting the fixture gives 76 rows and 6 columns.'''
    melted = transform_germline_mutations.reformat_record(germline_1)
    row_count, column_count = melted.shape
    assert row_count == 76
    assert column_count == 6
@pytest.mark.unit
def test_check_convert_number_0(germline_1, significance_map):
    '''A gene left empty in the fixture is reported as 'Not Available'.'''
    melted = transform_germline_mutations.reformat_record(germline_1)
    converted = transform_germline_mutations.convert_number(
        melted, significance_map)
    # .item() requires exactly one matching row, as the index lookup did.
    is_gene = converted['Gene'] == 'MULTISITE3'
    label = converted.loc[is_gene, 'Number'].item()
    assert label == 'Not Available'
@pytest.mark.unit
def test_check_convert_number_1(germline_1, significance_map):
    '''Code 1 is converted to 'Positive'.'''
    melted = transform_germline_mutations.reformat_record(germline_1)
    converted = transform_germline_mutations.convert_number(
        melted, significance_map)
    # .item() requires exactly one matching row, as the index lookup did.
    is_gene = converted['Gene'] == 'PMS2'
    label = converted.loc[is_gene, 'Number'].item()
    assert label == 'Positive'
@pytest.mark.unit
def test_check_convert_number_2(germline_2, significance_map):
    '''Code 2 is converted to 'Negative'.'''
    melted = transform_germline_mutations.reformat_record(germline_2)
    converted = transform_germline_mutations.convert_number(
        melted, significance_map)
    # .item() requires exactly one matching row, as the index lookup did.
    is_gene = converted['Gene'] == 'IHCMLH1'
    label = converted.loc[is_gene, 'Number'].item()
    assert label == 'Negative'
@pytest.mark.unit
def test_check_convert_number_3(germline_3, significance_map):
    '''Code 3 is converted to 'Positive and Variant'.'''
    melted = transform_germline_mutations.reformat_record(germline_3)
    converted = transform_germline_mutations.convert_number(
        melted, significance_map)
    # .item() requires exactly one matching row, as the index lookup did.
    is_gene = converted['Gene'] == 'ATR'
    label = converted.loc[is_gene, 'Number'].item()
    assert label == 'Positive and Variant'
@pytest.mark.unit
def test_check_convert_number_4(germline_4, significance_map):
    '''Code 4 is converted to 'Variant'.'''
    melted = transform_germline_mutations.reformat_record(germline_4)
    converted = transform_germline_mutations.convert_number(
        melted, significance_map)
    # .item() requires exactly one matching row, as the index lookup did.
    is_gene = converted['Gene'] == 'BAP1'
    label = converted.loc[is_gene, 'Number'].item()
    assert label == 'Variant'
@pytest.mark.unit
def test_check_convert_number_5(germline_5, significance_map):
    '''Code 5 is converted to 'Pending'.'''
    melted = transform_germline_mutations.reformat_record(germline_5)
    converted = transform_germline_mutations.convert_number(
        melted, significance_map)
    # .item() requires exactly one matching row, as the index lookup did.
    is_gene = converted['Gene'] == 'SDHC'
    label = converted.loc[is_gene, 'Number'].item()
    assert label == 'Pending'
......@@ -152,6 +152,7 @@ def test_check_unit_coersion(lab, unit_map, unit_conversion):
assert converted_labs.loc[0, 'value_units'] == 'g/dl'
@pytest.mark.unit
def test_check_flag_coersion(lab, unit_map, unit_conversion):
lab.loc[0, 'VALUEFLAG_CD'] = "LL"
filtered_labs = transform_labs.parse_blob(lab)
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment