Skip to content
Snippets Groups Projects
Commit e5de9f0b authored by Vishruth Mullapudi's avatar Vishruth Mullapudi
Browse files

initial functionality for looking at energetics of center chain and per...

initial functionality for looking at energetics of center chain and per residue energy change of mutations
parent 537ab1e7
Branches
1 merge request!3Add preliminary support for looking at the ddG of only a subset of residues and for getting per-residue energy changes
......@@ -9,6 +9,10 @@ from itertools import takewhile, islice, dropwhile
import pandas as pd
global rosetta_output_file_name
global output_database_name
global rosetta_path
global rosetta_db
global USE_MULTITHREADING
global PROCESS_COUNT
def find_finished_jobs(output_folder):
return_dict = {}
......@@ -46,46 +50,117 @@ def rosetta_output_succeeded(potential_struct_dir):
return no_more_batches_line_found and success_line_found
def extract_pose_tables(casefolder, replicate_folders, renumber=None):
    """Collect per-residue Rosetta pose-energy tables for one mutation case.

    For every replicate folder, reads the text between the
    ``#BEGIN_POSE_ENERGIES_TABLE`` and ``#END_POSE_ENERGIES_TABLE`` markers
    out of the wt and mut PDB files and stacks the rows into one
    long-format DataFrame.

    Parameters
    ----------
    casefolder : str
        Path to a case output folder; its basename encodes
        ``<PDB_name>_<Mutated Chains>_<mutant>`` (underscore-separated).
    replicate_folders : iterable of str
        Replicate sub-folder paths/names inside ``casefolder``.
    renumber : optional
        Currently unused; reserved for a future pose-renumbering step
        (see the TODO comment below).

    Returns
    -------
    pandas.DataFrame
        One row per residue per scored state (``wt_dG`` / ``mut_dG``) per
        replicate, with ``case_name`` split into ``PDB_name`` /
        ``Mutated Chains`` / ``mutant`` and the mutant string split into
        ``wt_aa`` / ``res_num`` / ``mutant_aa``.

    Notes
    -----
    Relies on the module-level global ``trajectory_stride`` (set in
    ``__main__``) to pick which wt/mut snapshot PDB to read.
    """
    pose_table = pd.DataFrame()
    for replicate_folder in replicate_folders:
        # 1) get dataframe containing center chains from pdb wt and mut
        # 2) calculate bound ddG from mut-wt aligning on the pose number and chain
        # 3) TODO: optionally renumber (needed for phf I think but not cbd?)
        wt_pdb = os.path.join(casefolder, replicate_folder, "wt_%05d.pdb" % trajectory_stride)
        mut_pdb = os.path.join(casefolder, replicate_folder, "mut_%05d.pdb" % trajectory_stride)
        with open(wt_pdb) as fid:
            # Keep only the lines strictly between the BEGIN/END markers;
            # islice(..., 1, None) drops the BEGIN marker line itself.
            wt_pose_table = pd.DataFrame(
                x.split() for x in takewhile(
                    lambda x: '#END_POSE_ENERGIES_TABLE' not in x,
                    islice(dropwhile(lambda x: '#BEGIN_POSE_ENERGIES_TABLE' not in x, fid), 1, None)))
            # First extracted row is the column-header row of the table.
            wt_pose_table.columns = wt_pose_table.iloc[0]
            # Rows 0-2 are header/summary rows; keep per-residue rows only.
            # NOTE(review): assumes rows 1-2 are the weights/pose totals —
            # confirm against Rosetta's pose-energies table layout.
            wt_pose_table = wt_pose_table.iloc[3:].assign(
                scored_state='wt_dG',
                case_name=os.path.basename(casefolder),
                replicate=os.path.basename(replicate_folder))
        with open(mut_pdb) as fid:
            # Same extraction for the mutant structure.
            mut_pose_table = pd.DataFrame(
                x.split() for x in takewhile(
                    lambda x: '#END_POSE_ENERGIES_TABLE' not in x,
                    islice(dropwhile(lambda x: '#BEGIN_POSE_ENERGIES_TABLE' not in x, fid), 1, None)))
            mut_pose_table.columns = mut_pose_table.iloc[0]
            mut_pose_table = mut_pose_table.iloc[3:].assign(
                scored_state='mut_dG',
                case_name=os.path.basename(casefolder),
                replicate=os.path.basename(replicate_folder))
        pose_table = pd.concat((pose_table, mut_pose_table, wt_pose_table))
    # Case folder name encodes PDB name, mutated chains, and mutant code.
    pose_table[['PDB_name', 'Mutated Chains', 'mutant']] = pose_table.case_name.str.split('_', expand=True)
    # Mutant code looks like e.g. ALA123G: 3-letter wt residue, position, 1-letter mutant.
    pose_table[['wt_aa', 'res_num', "mutant_aa"]] = pose_table.mutant.str.split(r'([A-Z]{3})(\d{1,})([A-Z])', expand=True) \
        .replace("", float("NaN")).dropna(how='all', axis=1)
    pose_table.reset_index(inplace=True)
    pose_table['res_num'] = pose_table['res_num'].astype(int)
    return pose_table
def calc_dGs(df, minposenum, maxposenum):
    """Compute per-residue, per-mutant-total, and center-chain ddG tables.

    Takes the long-format pose-energy table built by ``extract_pose_tables``
    (wt and mut rows stacked, tagged via ``scored_state``) and, per case,
    forms the bound ddG as mut - wt by negating the wt score columns and
    summing, then averages across replicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked pose tables with a ``label`` column of the form
        ``<wt_aa>_<pose_num>`` and score columns still stored as strings.
        NOTE: mutated in place (score columns cast to float, pose_num added).
    minposenum, maxposenum : int
        Half-open pose-number range [minposenum, maxposenum) selecting the
        "center chain" residues.

    Returns
    -------
    (per_res_ddgs, total_ddgs, center_ddgs) : tuple of pandas.DataFrame
        Per-residue ddGs, per-mutant summed ddGs, and ddGs summed over the
        center-chain pose-number window only.
    """
    per_res_ddgs = pd.DataFrame()
    total_ddgs = pd.DataFrame()
    center_ddgs = pd.DataFrame()
    # Everything not in this identifier list is a Rosetta score term that was
    # read from text and must be cast to float before any arithmetic.
    for column in df.columns:
        if column not in ['index', 'case_name', 'PDB_name',
                          'Mutated Chains', 'wt_aa', 'replicate', 'res_num',
                          'mutant_aa', 'scored_state', 'label', 'mutant']:
            df.loc[:, column] = df.loc[:, column].astype(float)
    # label is '<wt_aa>_<pose_num>'; this overwrites the wt_aa column parsed
    # earlier from the mutant code — presumably intentional (same residue),
    # but worth confirming.
    df[['wt_aa', 'pose_num']] = df['label'].str.rsplit('_', 1, expand=True)
    df.loc[:, 'pose_num'] = df['pose_num'].astype(int)
    for case, data in df.groupby('case_name'):
        wt = data.loc[data['scored_state'] == 'wt_dG']
        mut = data.loc[data['scored_state'] == 'mut_dG']
        for column in wt.columns:
            # multiply their score fields by negative 1, so that summing
            # mut + (-wt) below yields the ddG = mut - wt.
            # (NOTE(review): 'wt_aa' is listed twice in this exclusion list;
            # harmless, but a cleanup candidate. Also, assigning into a
            # groupby slice like this may trigger SettingWithCopyWarning.)
            if column not in ['index', 'case_name', 'PDB_name',
                              'Mutated Chains', 'wt_aa', 'replicate', 'res_num',
                              'mutant_aa', 'scored_state', 'label', 'mutant', 'wt_aa',
                              'pose_num']:
                wt.loc[:, column] *= -1.0
        bound_dG = pd.concat([mut, wt])
        # get the bound ddg for each replicate at each pose number (per residue ddG)
        bound_dG = bound_dG.groupby(
            ['PDB_name', 'Mutated Chains', 'replicate', 'res_num',
             'mutant_aa', 'mutant', 'pose_num']).sum().reset_index()
        # average all the nstruct dG's together (round to stabilize output)
        bound_dG = bound_dG.groupby(['PDB_name', 'Mutated Chains',
                                     'res_num', 'mutant_aa', 'mutant', 'pose_num']).mean().round(
            decimals=5).reset_index()
        bound_dG.loc[:, 'scored_state'] = "bound_ddG"
        bound_dG['case_name'] = case
        per_res_ddgs = pd.concat([per_res_ddgs, bound_dG])
        # Sum over all residues to get the whole-mutant ddG.
        total_ddgs = pd.concat([total_ddgs, bound_dG.groupby('mutant').sum().reset_index()])
        # Restrict to the center-chain pose-number window [min, max) and sum;
        # res_num/pose_num sums are meaningless after aggregation, so drop them.
        center_chains = bound_dG[(bound_dG['pose_num'] >= minposenum) & (bound_dG['pose_num'] < maxposenum)]
        temp = center_chains.groupby(['mutant']).sum().drop(columns=['res_num', 'pose_num']).reset_index()
        temp['case'] = case
        center_ddgs = pd.concat([center_ddgs, temp])
    #ddgs = per_case_per_res_dGs.groupby(['mutant', 'pose_num']).sum()
    return per_res_ddgs, total_ddgs, center_ddgs
def main(input_dir, minposenum, maxposenum):
    """Analyze finished mutation-scan output folders and write ddG CSVs.

    Finds finished job folders under ``input_dir``, extracts the pose-energy
    tables (optionally in parallel), computes ddGs via ``calc_dGs``, and
    writes the center-chain and per-residue ddG tables to CSV.

    Parameters
    ----------
    input_dir : str
        Root folder containing the mutation-scan case output folders.
    minposenum, maxposenum : int
        Half-open pose-number range passed through to ``calc_dGs`` to select
        the center-chain residues.

    Notes
    -----
    Relies on the module-level globals ``USE_MULTIPROCESSING`` and
    ``PROCESS_COUNT`` set in ``__main__``. ``total_ddgs`` is computed but
    currently not written anywhere.
    """
    print("Listing Alanine Scan Output Folders")
    pdb_dirs = find_finished_jobs(input_dir)
    print('Found {:d} directories to analyze'.format(len(pdb_dirs)))
    resultdf = pd.DataFrame()
    if USE_MULTIPROCESSING:
        with tqdm(total=len(pdb_dirs), unit="cases") as pbar:
            # ThreadPoolExecutor: the work is dominated by file I/O + pandas
            # (which releases the GIL for much of its C work).
            with concurrent.futures.ThreadPoolExecutor(max_workers=PROCESS_COUNT) as executor:
                futures = {executor.submit(extract_pose_tables, casefolder, replicatefolders): (casefolder, replicatefolders)
                           for casefolder, replicatefolders in pdb_dirs.items()}
                results = {}
                for future in concurrent.futures.as_completed(futures):
                    arg = futures[future]
                    results[arg[0]] = future.result()
                    pbar.update(1)
                resultdf = pd.concat(list(results.values()))
    else:
        for casefolder, replicatefolders in tqdm(pdb_dirs.items()):
            resultdf = pd.concat((resultdf, extract_pose_tables(casefolder, replicatefolders)))
    # Fix the column order for readability of any intermediate dumps and so
    # downstream grouping sees a stable layout.
    resultdf = resultdf.reindex(columns=['PDB_name', 'Mutated Chains',
                                         'wt_aa', 'replicate', 'res_num', 'mutant_aa', 'total',
                                         'scored_state', 'fa_atr', 'fa_dun', 'fa_elec',
                                         'fa_intra_rep', 'fa_rep', 'fa_sol', 'hbond_bb_sc',
                                         'hbond_lr_bb', 'hbond_sc', 'hbond_sr_bb', 'omega',
                                         'p_aa_pp', 'pro_close', 'rama', 'ref', 'yhh_planarity',
                                         'dslf_fa13', 'linear_chainbreak', 'overlap_chainbreak',
                                         'case_name', 'label', 'mutant'])\
        .sort_values(by=['res_num', 'replicate']).reset_index()
    per_res_ddgs, total_ddgs, center_ddgs = calc_dGs(resultdf, minposenum, maxposenum)
    per_res_ddgs.sort_values(by=['res_num', 'pose_num'], inplace=True)
    # Strip any trailing path separator so basename is the folder name itself.
    basename = os.path.basename((input_dir + "/").rstrip("\\/"))
    script_output_folder = 'analysis_output_sample'
    # Ensure the output folder exists so to_csv does not fail on first run.
    os.makedirs(script_output_folder, exist_ok=True)
    center_ddgs.to_csv(os.path.join(script_output_folder, basename + '-center_chain_ddGs.csv'), index=False)
    per_res_ddgs.to_csv(os.path.join(script_output_folder, basename + '-per_res_ddGs.csv'), index=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="""Extracts structures generated by mutation scan script """
)
parser.add_argument(
'output_folder',
type=str,
......@@ -99,16 +174,19 @@ if __name__ == '__main__':
help="The number of Rosetta processes to run in parallel. Take care to not exceed the number of available CPU threads or avaiable system memory when using larger numbers of threads.",
)
parser.add_argument(
"-f",
"--config-file",
type=pathlib.Path,
default="mutation_scan_config.toml",
help="The path to the mutation scan config file. Default: mutation_scan_config.toml",
"-m",
"--min",
type=int,
help="min pose num",
)
parser.add_argument(
"-x",
"--max",
type=int,
help="max pose num",
)
args = vars(parser.parse_args())
with args['config_file'].open() as f:
config = toml.load(f)
# regenerates global values for all the multiprocessing pool child processes
# This is the Rosetta structure output sqlite database found in the alanine scan output subfolders
......@@ -117,16 +195,12 @@ if __name__ == '__main__':
# Important - to correctly name extracted structures by the stride, this trajectory_stride must be used
# Set this to what was used to run the alanine scan with
global trajectory_stride
trajectory_stride = config["mutation_scan_parameters"][
"backrub_sampling_settings"
]["backrub_trajectory_stride"]
trajectory_stride = 10
# Enter the path to your Rosetta installation's score_jd2 binary here
global rosetta_path
global rosetta_db
rosetta_path = config['rosetta_setup']['rosetta_score_path']
rosetta_db = config['rosetta_setup']['rosetta_db_path']
global USE_MULTITHREADING
global PROCESS_COUNT
rosetta_path = '/home/vish/rosetta_3.13/main/source/bin/score_jd2.linuxgccrelease_native'
rosetta_db = '/home/vish/rosetta_3.13/main/database'
if args['num_cpu'] == 1:
USE_MULTIPROCESSING = False
PROCESS_COUNT = 1
......@@ -134,6 +208,6 @@ if __name__ == '__main__':
USE_MULTIPROCESSING = True
PROCESS_COUNT = max(os.cpu_count(), args['num_cpu'])
if os.path.isdir(args['output_folder']):
main(args['output_folder'])
main(args['output_folder'], args['min'], args['max'])
else:
print('ERROR: %s is not a valid directory' % args['output_folder'])
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment