transform_germline_mutations.py 3.9 KB
Newer Older
1
2
#!/usr/bin/env python3

Venkat Malladi's avatar
Venkat Malladi committed
3
'''Generate Germline Mutations'''
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

import argparse
import datetime
import os
import pandas as pd


# Help epilog passed to argparse in get_args(); argparse substitutes the
# program name for the %(prog)s placeholder when it renders the help text.
EPILOG = '''
For more details:
        %(prog)s --help
'''


def get_args():
    '''Parse the command-line options of this script.

    Returns:
        The argparse namespace with the attributes ``file``, ``date``,
        ``significance`` and ``out``. All four options are required.
    '''

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
        epilog=EPILOG)

    # (short flag, long flag, help text) for every mandatory option.
    options = (
        ('-f', '--file', "Germline Cancer Gene File (csv format)."),
        ('-d', '--date', "Date Shift (tsv format)."),
        ('-s', '--significance',
         "Mutation Significance Map File (csv format)."),
        ('-o', '--out', "The output path (csv format)."),
    )
    for short_flag, long_flag, help_text in options:
        cli.add_argument(short_flag, long_flag,
                         required=True, help=help_text)

    return cli.parse_args()


def reformat_record(cancer_gene):
    '''Unpivot the table so that each row holds one gene for one record.

    The columns MRN, REGISTER_DATE, SERVICE_DATE and CLINIC are kept as
    identifiers; every other column is folded into a ``Gene`` (former
    column name) / ``Number`` (former cell value) pair.
    '''

    melt_options = {
        'id_vars': ["MRN", "REGISTER_DATE", "SERVICE_DATE", "CLINIC"],
        'var_name': 'Gene',
        'value_name': 'Number',
    }
    return pd.melt(cancer_gene, **melt_options)


def convert_number(merged, significance):
    '''Replace the values in ``Number`` with their mapped database value.

    Missing values are treated as 0 and the column is cast to int. Each
    value is then looked up in the ``Significance`` column of
    ``significance`` and replaced by that row's ``Number`` entry.
    ``merged`` is updated in place and also returned.
    '''

    codes = merged['Number'].fillna(0).astype(int)

    # Series indexed by significance code, holding the replacement values.
    lookup = significance.set_index('Significance')['Number']
    merged['Number'] = codes.map(lookup)

    return merged


67
68
69
70
71
72
73
74
75
76
77
78
79
def reformat_gene(merged):
    '''Tag each gene name with its assay for the database.

    Every name gets the suffix ``(Sequencing)``. Names that start with
    ``IHC`` followed by at least one word character lose that prefix and
    get ``(IHC)`` as suffix instead. ``merged`` is updated in place and
    also returned.
    '''

    ihc_pattern = r"^IHC(?P<one>\w+)\((Sequencing\))"

    def as_ihc(match):
        # Keep only the gene captured after the IHC prefix.
        return match.group('one') + '(IHC)'

    tagged = merged['Gene'].astype(str) + '(Sequencing)'
    merged['Gene'] = tagged.str.replace(ihc_pattern, as_ihc, regex=True)

    return merged


80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def calculate_shift(cancer_gene, date_shift):
    '''Shift the register and service dates by each patient's offset.

    Args:
        cancer_gene: DataFrame with the columns MRN, REGISTER_DATE,
            SERVICE_DATE, Gene and Number. Other columns (for example
            CLINIC) are ignored.
        date_shift: DataFrame with the columns MRN and Shift, where Shift
            is the offset in seconds. Other columns are ignored.

    Returns:
        A new DataFrame with the columns mrn, register_date, service_date,
        target and significance. Both dates are shifted and truncated to
        calendar dates. Rows whose MRN has no entry in ``date_shift`` are
        dropped (inner join).

    Neither input frame is modified.
    '''

    date_columns = ['REGISTER_DATE', 'SERVICE_DATE']

    # Work on copies so the caller's frames keep their original dtypes, and
    # take only the shift columns so extra columns cannot leak into the
    # output or collide with gene-table columns during the merge.
    shifts = date_shift[['MRN', 'Shift']].copy()
    shifts['Shift'] = pd.to_timedelta(shifts['Shift'], unit='s')

    records = cancer_gene.copy()
    for column in date_columns:
        records[column] = pd.to_datetime(records[column])

    # Merge data
    merged = records.merge(shifts, on='MRN', how='inner')

    # Calculate Date Shift, keeping only the calendar date
    for column in date_columns:
        merged[column] = (merged[column] + merged['Shift']).dt.date

    # Int MRN
    merged['MRN'] = merged['MRN'].astype(int)

    # Select and rename by name rather than by position, so the labels stay
    # correct whatever the column order of the inputs.
    germline_columns = {
        'MRN': 'mrn',
        'REGISTER_DATE': 'register_date',
        'SERVICE_DATE': 'service_date',
        'Gene': 'target',
        'Number': 'significance',
    }
    return merged[list(germline_columns)].rename(columns=germline_columns)


def main():
    '''Build the germline gene table from the input files and write it out.

    Reads the cancer gene, significance map and date shift files named on
    the command line, reshapes and relabels the data, applies the date
    shift and writes ``germline_gene_table.csv`` into the ``--out`` path.
    '''
    args = get_args()

    # Make output files. Join the directory and file name so the file lands
    # inside --out even when it is given without a trailing separator.
    cancer_gene_table = os.path.join(args.out, 'germline_gene_table.csv')

    # Read in files
    cancer_gene_df = pd.read_csv(args.file)
    significance_map = pd.read_csv(args.significance)
    # NOTE(review): the --date help text says tsv, but the file is parsed
    # with the default comma separator -- confirm the real format.
    date_shift = pd.read_csv(args.date)

    # Reformat data
    cancer_gene_reformat = reformat_record(cancer_gene_df)

    # Convert missing Number
    fix_number = convert_number(cancer_gene_reformat, significance_map)

    # Convert gene names
    fix_gene = reformat_gene(fix_number)

    # Calculate Date Shift
    shifted_df = calculate_shift(fix_gene, date_shift)

    # Write out germline gene table
    shifted_df.to_csv(cancer_gene_table, index=False)


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()