#!/usr/bin/env python3

'''Generate Demographics'''

import datetime
import os
import argparse
import pandas as pd

# Epilog text handed to argparse.ArgumentParser in get_args(); argparse
# substitutes %(prog)s with the program name when it renders the help.
EPILOG = '''
For more details:
        %(prog)s --help
'''


def get_args(argv=None):
    '''Define and parse the command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse falls back to sys.argv[1:], which is what
            the script did before this parameter existed.

    Returns:
        argparse.Namespace with the attributes ``file``, ``tumor`` and
        ``out``; all three options are required.
    '''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-f', '--file',
                        help="Demographic File (csv format).",
                        required=True)

    parser.add_argument('-t', '--tumor',
                        help="Tumor Registry File (csv format).",
                        required=True)

    parser.add_argument('-o', '--out',
                        help="The output path (csv format).",
                        required=True)

    return parser.parse_args(argv)


39
40
def filter_table(demographics, tumor_registry):
    '''Filter demographics and tumor_registry columns and merge them.

    Args:
        demographics: DataFrame holding at least the columns MRN, GENDER,
            ETHNICITY, RACE, BIRTH_DATE, DEATH_DATE and DEATH_SOURCE.
        tumor_registry: DataFrame holding at least the columns
            "Medical Record Number", "Date of Last Contact-Date" and
            "Vital Status".

    Returns:
        A new DataFrame with one row per demographics row (left merge),
        carrying the selected demographics columns followed by the selected
        tumor registry columns. The registry columns are only populated for
        patients whose registry row has "Vital Status" == 0.

    Raises:
        KeyError: if any of the required columns is missing.
    '''

    # Column filters
    demo_filter_col = ["MRN", "GENDER", "ETHNICITY", "RACE",
                       "BIRTH_DATE", "DEATH_DATE", "DEATH_SOURCE"]
    tumor_filter_col = ["Medical Record Number",
                        "Date of Last Contact-Date", "Vital Status"]

    # Filter
    demo_filter = demographics.loc[:, demo_filter_col]
    tumor_filter = tumor_registry.loc[:, tumor_filter_col]

    # Filter for dead status (the code treats "Vital Status" == 0 as dead)
    tumor_dead = tumor_filter[tumor_filter["Vital Status"] == 0]

    # Left merge keeps every demographics row, matched or not.
    # NOTE(review): if the registry can hold several rows for one MRN, the
    # merge repeats that patient once per registry row -- confirm upstream.
    df_merged = demo_filter.merge(
        tumor_dead,
        left_on='MRN',
        right_on='Medical Record Number',
        how='left')

    return df_merged
Venkat Malladi's avatar
Venkat Malladi committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92


def convert_gender(merged):
    '''Title-case the GENDER column for the database.

    The frame is updated in place and the same object is returned.
    '''

    gender = merged['GENDER']
    merged['GENDER'] = gender.str.title()

    return merged


def convert_ethnicity(merged):
    '''Replace the '@' ethnicity placeholder with 'Unknown' for the database.

    The frame is updated in place and the same object is returned.
    '''

    is_placeholder = merged['ETHNICITY'].eq('@')
    merged.loc[is_placeholder, 'ETHNICITY'] = 'Unknown'

    return merged


def convert_race(merged):
    '''Expand abbreviated race codes to the strings used by the database.

    The frame is updated in place and the same object is returned.
    '''

    expansions = (
        ('AmIndAN', 'American Indian'),
        ('HawPacIs', 'Hawaiian Pacific Islander'),
    )
    for code, label in expansions:
        merged.loc[merged['RACE'] == code, 'RACE'] = label

    return merged


def get_seconds(date_time):
    '''Return the duration of a timedelta-like value in seconds.'''
    seconds = date_time.total_seconds()
    return seconds


def calculate_shift(merged):
    '''Calculate the per-patient date shift and apply it to the death date.

    Each patient's shift is the offset that moves BIRTH_DATE onto
    1800-01-01. The death date is moved by the same offset, and the shift
    itself is stored in a new 'Shift' column as a number of seconds.

    Args:
        merged: DataFrame with the columns BIRTH_DATE, DEATH_DATE,
            DEATH_SOURCE, "Date of Last Contact-Date" and "Vital Status".
            It is modified in place.

    Returns:
        The same DataFrame, with date columns converted to datetimes,
        DEATH_SOURCE / DEATH_DATE updated and the 'Shift' column added.
    '''

    # Convert to DateTime
    for column in ('BIRTH_DATE', 'DEATH_DATE', 'Date of Last Contact-Date'):
        merged[column] = pd.to_datetime(merged[column])

    # Calculate date shift relative to 1800/01/01
    shift_date = datetime.datetime(1800, 1, 1)
    shift = shift_date - merged['BIRTH_DATE']

    # An all-empty DEATH_SOURCE column is loaded as float64; make it an
    # object column so writing the string labels below is always valid.
    merged['DEATH_SOURCE'] = merged['DEATH_SOURCE'].astype(object)

    # Update death source
    has_death_date = merged['DEATH_DATE'].notna()
    merged.loc[has_death_date, 'DEATH_SOURCE'] = 'UTSouthwestern Hospital'

    # Update death date if from tumor registry. "Vital Status" is only
    # non-null on rows that carry registry data, and the registry values
    # take precedence over an existing death date.
    from_registry = merged['Vital Status'].notna()
    merged.loc[from_registry, 'DEATH_SOURCE'] = 'Tumor Registry'
    merged.loc[from_registry, 'DEATH_DATE'] = \
        merged.loc[from_registry, 'Date of Last Contact-Date']

    # Calculate date Shift for death date
    merged['DEATH_DATE'] = merged['DEATH_DATE'] + shift

    # Shift to total seconds (vectorised; missing birth dates give NaN)
    merged['Shift'] = shift.dt.total_seconds()

    return merged


def main():
    '''Run the demographics transformation from the command line.

    Reads the demographics and tumor registry CSV files named on the
    command line, normalises gender / ethnicity / race, date-shifts the
    death date, and writes two files into the --out directory:

    * patients.csv  -- mrn, gender, ethnicity, race, death_date, death_source
    * mrn_shift.csv -- MRN and the applied shift in seconds
    '''
    args = get_args()
    demo = args.file
    tumor = args.tumor
    out_path = args.out

    # Make output files. The path components are joined properly so that
    # an --out value without a trailing separator still yields
    # "<out>/patients.csv" rather than "<out>patients.csv".
    patients = os.path.join(out_path, 'patients.csv')
    shift_file = os.path.join(out_path, 'mrn_shift.csv')

    # Read in files
    demographics = pd.read_csv(demo)
    tumor_registry = pd.read_csv(tumor)

    # Filter table
    filter_df = filter_table(demographics, tumor_registry)

    # Convert Gender
    fix_gender = convert_gender(filter_df)

    # Convert missing Ethnicity
    fix_ethnicity = convert_ethnicity(fix_gender)

    # Convert Race
    fix_race = convert_race(fix_ethnicity)

    # Calculate Date Shift
    shifted_df = calculate_shift(fix_race)

    # Write out patients table; rename() returns a new frame, so the
    # column labels are never assigned on a slice of shifted_df.
    patient_columns = {
        'MRN': 'mrn',
        'GENDER': 'gender',
        'ETHNICITY': 'ethnicity',
        'RACE': 'race',
        'DEATH_DATE': 'death_date',
        'DEATH_SOURCE': 'death_source',
    }
    shifted_filtered = shifted_df[list(patient_columns)].rename(
        columns=patient_columns)
    shifted_filtered.to_csv(patients, index=False)

    # Write out shift table
    shift_only = shifted_df[["MRN", "Shift"]]
    shift_only.to_csv(shift_file, index=False)


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()