Commit d47e352e authored by Venkat Malladi
Browse files

Add tumor registry data and transformation.

parent 7abde7f8
......@@ -24,8 +24,8 @@ def get_args():
help="Demographic File (csv format).",
required=True)
parser.add_argument('-d', '--diagnosis',
help="Diagnosis File (csv format).",
parser.add_argument('-t', '--tumor',
help="Tumor Registry File (csv format).",
required=True)
parser.add_argument('-o', '--out',
......@@ -36,18 +36,24 @@ def get_args():
return args
def merge_tables(demographics, diagnosis):
    '''Join the diagnosis table onto demographics and keep selected columns.

    The join uses the frames' indexes. The index values are copied into an
    explicit "MRN" column before the column selection is applied.

    Returns a DataFrame with the columns MRN, GENDER, ETHNICITY, RACE,
    BIRTH_DATE and Date.
    '''
    joined = demographics.join(diagnosis)
    # Expose the index as a regular column so it survives the selection below.
    joined['MRN'] = joined.index.values

    wanted = ["MRN", "GENDER", "ETHNICITY", "RACE", "BIRTH_DATE", 'Date']
    return joined.loc[:, wanted]


def filter_table(demographics, tumor_registry):
    '''Select demographic columns and attach tumor registry death records.

    Only tumor registry rows whose "Vital Status" equals 0 (the value this
    script treats as dead) are kept. They are left-merged onto the
    demographic rows by matching "MRN" against "Medical Record Number", so
    every demographic row is retained.

    Returns the merged DataFrame.
    '''
    demo_columns = ["MRN", "GENDER", "ETHNICITY", "RACE", "BIRTH_DATE", "DEATH_DATE", "DEATH_SOURCE"]
    tumor_columns = ["Medical Record Number", "Date of Last Contact-Date", "Vital Status"]

    # Restrict each input to the columns of interest.
    demo_subset = demographics.loc[:, demo_columns]
    tumor_subset = tumor_registry.loc[:, tumor_columns]

    # Keep only registry rows flagged as dead.
    is_dead = tumor_subset["Vital Status"] == 0
    dead_records = tumor_subset[is_dead]

    # Left merge: demographic rows without a registry death record get NaN.
    return demo_subset.merge(
        dead_records,
        how='left',
        left_on='MRN',
        right_on='Medical Record Number',
    )
def convert_gender(merged):
......@@ -75,21 +81,6 @@ def convert_race(merged):
return merged
def calculate_age(row):
    '''Calculate age in whole years at diagnosis from a table row.

    row is any mapping-like object (e.g. a DataFrame row) providing
    'BIRTH_DATE' and 'Date' (the diagnosis date) as date-like values with
    year, month and day attributes.

    Returns the integer age, or the string '90 or above' when the age is
    90 or more.
    '''
    birth_date = row['BIRTH_DATE']
    diagnosis_date = row['Date']

    # Subtract one year when the diagnosis falls before that year's birthday.
    # The comparison must pit the diagnosis (month, day) against the birth
    # (month, day); comparing a tuple with itself never subtracts the year.
    birthday_pending = (
        (diagnosis_date.month, diagnosis_date.day)
        < (birth_date.month, birth_date.day)
    )
    age = diagnosis_date.year - birth_date.year - birthday_pending

    # Ages of 90 and over are reported as a single category.
    if age >= 90:
        age = '90 or above'
    return age
def get_seconds(date_time):
    '''Return the duration held by date_time as a total number of seconds.

    date_time must provide a total_seconds() method (e.g. a timedelta).
    '''
    seconds = date_time.total_seconds()
    return seconds
......@@ -99,27 +90,36 @@ def calculate_shift(merged):
# Convert to DateTime
merged['BIRTH_DATE'] = pd.to_datetime(merged['BIRTH_DATE'])
merged['Date'] = pd.to_datetime(merged['Date'])
# Calculate Age of diagnosis
merged['Age'] = merged.apply(calculate_age, axis=1)
merged['age_units'] = 'years'
merged['DEATH_DATE'] = pd.to_datetime(merged['DEATH_DATE'])
merged['Date of Last Contact-Date'] = pd.to_datetime(merged['Date of Last Contact-Date'])
# Calculate date shift relative to 1800/01/01
shift_date = datetime.datetime(1800, 1, 1)
merged['Shift'] = shift_date - merged['BIRTH_DATE']
# Update death source
merged.loc[pd.notna(merged['DEATH_DATE']), 'DEATH_SOURCE'] = 'UTSouthwestern Hospital'
# Update death date if from tumor registry
merged.loc[pd.notna(merged['Vital Status']), 'DEATH_SOURCE'] = 'Tumor Registry'
merged.loc[pd.notna(merged['Vital Status']), 'DEATH_DATE'] = 'Date of Last Contact-Date'
# Calculate date Shift for death date
merged['DEATH_DATE'] = merged['DEATH_DATE'] + merged['Shift']
# Shift to total seconds
time_delta_series = merged['Shift']
merged['Shift'] = time_delta_series.apply(get_seconds)
return merged
def main():
args = get_args()
demo = args.file
diagnosis = args.diagnosis
tumor = args.tumor
out_path = args.out
# Make output files
......@@ -127,14 +127,14 @@ def main():
shift_file = os.path.join(out_path + 'mrn_shift.csv')
# Read in files
demographics = pd.read_csv(demo, index_col='MRN')
diagnosis = pd.read_csv(diagnosis, index_col='MRN')
demographics = pd.read_csv(demo)
tumor_registry = pd.read_csv(tumor)
# Merge tables
merged_df = merge_tables(demographics, diagnosis)
# Filter table
filter_df = filter_table(demographics, tumor_registry)
# Convert Gender
fix_gender = convert_gender(merged_df)
fix_gender = convert_gender(filter_df)
# Convert missing Ethnicity
fix_ethnicity = convert_ethnicity(fix_gender)
......@@ -146,8 +146,8 @@ def main():
shifted_df = calculate_shift(fix_race)
# Write out patients table
shifted_filtered = shifted_df[["MRN", "GENDER", "ETHNICITY", "RACE", "Age", 'age_units']]
shifted_filtered.columns = ['mrn', 'gender', 'ethnicity', 'race', 'age', 'age_units']
shifted_filtered = shifted_df[["MRN", "GENDER", "ETHNICITY", "RACE", 'DEATH_DATE', 'DEATH_SOURCE']]
shifted_filtered.columns = ['mrn', 'gender', 'ethnicity', 'race', 'death_date', 'death_source']
shifted_filtered.to_csv(patients, index=False)
# Write out shift table
......
This diff is collapsed.
mrn,gender,ethnicity,race,age,age_units
934,Female,Hispanic,White,60,years
313,Female,Non-hispanic,Black,55,years
398,Female,Non-hispanic,Black,72,years
215,Female,Non-hispanic,White,34,years
904,Male,Non-hispanic,White,34,years
411,Female,Unknown,Unknown,79,years
266,Male,Hispanic,White,62,years
707,Male,Non-hispanic,American Indian,82,years
350,Female,Unknown,Unknown,28,years
399,Male,Non-hispanic,Asian,78,years
903,Female,Non-hispanic,White,23,years
628,Male,Non-hispanic,White,21,years
558,Male,Hispanic,White,44,years
590,Female,Hispanic,Other,38,years
822,Female,Unknown,Unknown,90 or above,years
532,Male,Declined,Declined,47,years
687,Male,Non-hispanic,White,54,years
423,Female,Declined,White,71,years
33,Male,Non-hispanic,Asian,67,years
88,Female,Non-hispanic,Hawaiian Pacific Islander,60,years
mrn,gender,ethnicity,race,death_date,death_source
934,Female,Hispanic,White,,
313,Female,Non-hispanic,Black,1858-01-14,UTSouthwestern Hospital
398,Female,Non-hispanic,Black,,
215,Female,Non-hispanic,White,,
904,Male,Non-hispanic,White,,
411,Female,Unknown,Unknown,,
266,Male,Hispanic,White,,
707,Male,Non-hispanic,American Indian,,
350,Female,Unknown,Unknown,,
399,Male,Non-hispanic,Asian,,
903,Female,Non-hispanic,White,1824-09-11,UTSouthwestern Hospital
628,Male,Non-hispanic,White,,
558,Male,Hispanic,White,,
590,Female,Hispanic,Other,,
822,Female,Unknown,Unknown,,
532,Male,Declined,Declined,,
687,Male,Non-hispanic,White,,
423,Female,Declined,White,,
33,Male,Non-hispanic,Asian,,
88,Female,Non-hispanic,Hawaiian Pacific Islander,1860-09-18,UTSouthwestern Hospital
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment