import Bio.motifs import pandas as pd import numpy as np import csv import re import string from sklearn import preprocessing from sklearn.decomposition import PCA import matplotlib.pyplot as plt import seaborn as sns import scipy from scipy.stats import pearsonr, spearmanr, cumfreq # Order of Cell Lines reorder = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] ### Load and Process the ChIP-seq Data #Load the matrix of Input data enhancers_universe_Input= pd.DataFrame.from_csv("Input_filtered_peaks.tsv", sep="\t", header=0) # Filter for these columns Input_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] Input_index = enhancers_universe_Input.name.values Input_tmp = pd.DataFrame(enhancers_universe_Input, columns=Input_columns ) Input_values = Input_tmp.set_index(Input_index) # Filter for only input values Input_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] only_Input_values = pd.DataFrame(Input_values, columns=Input_columns) # Rename columns and reorder only_Input_values = only_Input_values[reorder] x = only_Input_values.stack() y = filter(lambda a: a != 0, x) Input_factor = min(y) Input_values_std_robust = only_Input_values + Input_factor #Load the matrix of H3K4Me1 enhancers_universe_H3K4me1 = pd.DataFrame.from_csv("H3K4me1_filtered_peaks.tsv", sep="\t", header=0) # Filter for these columns H3K4me1_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] H3K4me1_index = enhancers_universe_H3K4me1.name.values H3K4me1_tmp = pd.DataFrame(enhancers_universe_H3K4me1, columns=H3K4me1_columns ) H3K4me1_values = H3K4me1_tmp.set_index(H3K4me1_index) # Filter for only input values H3K4me1_columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] only_H3K4me1_values = pd.DataFrame(H3K4me1_values, columns=H3K4me1_columns) # Rename columns and reorder only_H3K4me1_values = only_H3K4me1_values[reorder] x = only_H3K4me1_values.stack() y = filter(lambda a: a != 0, x) H3K4me1_factor = min(y) H3K4me1_values_std_robust = only_H3K4me1_values + H3K4me1_factor #Load the matrix of H3K27ac enhancers_universe_H3K27ac = pd.DataFrame.from_csv("H3K27ac_filtered_peaks.tsv", sep="\t", header=0) # Filter for these columns H3K27ac_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] H3K27ac_index = enhancers_universe_H3K27ac.name.values H3K27ac_tmp = pd.DataFrame(enhancers_universe_H3K27ac, columns=H3K27ac_columns ) H3K27ac_values = H3K27ac_tmp.set_index(H3K27ac_index) # Filter for only input values H3K27ac_columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] only_H3K27ac_values = pd.DataFrame(H3K27ac_values, columns=H3K27ac_columns) # Rename columns and reorder only_H3K27ac_values = only_H3K27ac_values[reorder] x = only_H3K27ac_values.stack() y = filter(lambda a: a != 0, x) H3K27ac_factor = min(y) H3K27ac_values_std_robust = only_H3K27ac_values + H3K27ac_factor #Divide Histone Marks by Input H3K4me1_values_std_input = H3K4me1_values_std_robust.divide(Input_values_std_robust) H3K27ac_values_std_input = H3K27ac_values_std_robust.divide(Input_values_std_robust) # Scale from 0-1 # H3K4me1 scaler = preprocessing.MinMaxScaler() H3K4me1_values_std_robust_transform = H3K4me1_values_std_input.T norm = scaler.fit_transform(H3K4me1_values_std_robust_transform.values) H3K4me1_scaled = pd.DataFrame(data=norm.T, columns=list(H3K4me1_values_std_robust.columns.values), index = H3K4me1_values_std_robust.index ) # H3k27ac scaler = preprocessing.MinMaxScaler() H3K27ac_values_std_robust_transform = H3K27ac_values_std_input.T norm = scaler.fit_transform(H3K27ac_values_std_robust_transform.values) H3K27ac_scaled = pd.DataFrame(data=norm.T, columns=list(H3K27ac_values_std_robust.columns.values), index = H3K27ac_values_std_robust.index ) ### Parse MEME and TOMTOM Motif data # Loop through meme output meme_cell_dict = { "MEME_op_ES_D0_Histone_enhancers_1kb_zoops": "tomtom_op_ES_D0_Histone_enhancers_1kb", "MEME_op_ES_D2_Histone_enhancers_1kb_zoops": "tomtom_op_ES_D2_Histone_enhancers_1kb", "MEME_op_ES_D5_Histone_enhancers_1kb_zoops": "tomtom_op_ES_D5_Histone_enhancers_1kb", "MEME_op_ES_D7_Histone_enhancers_1kb_zoops": "tomtom_op_ES_D7_Histone_enhancers_1kb", "MEME_op_ES_D10_Histone_enhancers_1kb_zoops": "tomtom_op_ES_D10_Histone_enhancers_1kb" } # Read Target ID to Motif into dictionary motif_id_dict = {} with open("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/Motif/motif_Ids_name.txt", "rb") as data: motif_ids = csv.DictReader(data, delimiter="\t") for line in motif_ids: motif_id_dict[line['ID']] = line['NAME'] meme_tomtom = pd.DataFrame() for meme,tom in meme_cell_dict.iteritems(): # load meme output meme_file = '%s/meme.txt' % (meme) record = Bio.motifs.parse(open(meme_file), 'meme') # Loop through all motifs and make dataframe meme_positions = pd.DataFrame() for motif in record: name = motif.name.split(" ")[1] ones = [1] * len(motif.instances) names = [] for instance in motif.instances: names.append(instance.sequence_name) new = pd.DataFrame({name: ones},index = names) temp = pd.concat([meme_positions, new], axis=1).fillna(0) meme_positions = temp # Read tomtom file tomtom_file = "/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/Motif/%s/tomtom.txt" % (tom) tomtom_dict = {} with open(tomtom_file, "rb") as data: tomtom = csv.DictReader(data, delimiter="\t") for line in tomtom: target = line['Target ID'] motif = line['#Query ID'] pval = float(line['p-value']) tfs = motif_id_dict[target].upper() motif_pvalue = { motif: [pval]} # JASPAR :: means that any TF can either protein, split the protein tf_list = tfs.split("::") for tf in tf_list: # Reduce split form splice to single value [ID]_# single_isoform = tf.split("_")[0] if single_isoform in tomtom_dict.keys(): if motif in tomtom_dict[single_isoform].keys(): tomtom_dict[single_isoform][motif].append(pval) else: tomtom_dict[single_isoform].update(motif_pvalue) else: tomtom_dict[single_isoform] = motif_pvalue # Make dataframe tomtom_motif = pd.DataFrame() for key,motif in tomtom_dict.iteritems(): pvalue_dict = {} # Loop through motifs to see if length greater than 1, if so do pvalue scaling for m,p in motif.iteritems(): if len(p) > 1: stouffer_statistic, stouffer_pval = scipy.stats.combine_pvalues(p,method = 'stouffer', weights = None) pvalue_dict[m] = stouffer_pval else: pvalue_dict[m] = p[0] pvalues = np.array(pvalue_dict.values()) new = pd.DataFrame({key: pvalues},index = pvalue_dict.keys()) temp = pd.concat([tomtom_motif, new], axis=1).fillna(0).sort_index(level=int) tomtom_motif = temp # Reorder tomtom_motif_reorder = tomtom_motif.reindex( list(meme_positions.columns.values)).fillna(0) # dot product meme_tomtom_cell = meme_positions.dot(tomtom_motif_reorder) # Scale and add scaler = preprocessing.MinMaxScaler() meme_tomtom_cell_transform = meme_tomtom_cell.T norm = scaler.fit_transform(meme_tomtom_cell_transform.values) # norm across enhancers for each enhancer meme_tomtom_cell_std = pd.DataFrame(data=norm.T, columns=list(meme_tomtom_cell.columns.values), index = meme_tomtom_cell.index ) # Add to previous data temp = meme_tomtom.add(meme_tomtom_cell_std, fill_value=0).fillna(0).sort_index(level=int) meme_tomtom = temp # Transform meme tom_tom motif_enhancers = meme_tomtom.T # Rename column headers motif_enhancers.rename(columns=lambda x: x.split('-')[0], inplace=True) motif_enhancers.rename(columns=lambda x: x.replace(':', "_"), inplace=True) # Standardize to range 0-1 scaler = preprocessing.MinMaxScaler() motif_enhancers_transform = motif_enhancers.T norm = scaler.fit_transform(motif_enhancers_transform.values) # norm across enhancers for each enhancer motif_enhancers_scaled = pd.DataFrame(data=norm.T, columns=list(motif_enhancers.columns.values), index = motif_enhancers.index) ### Load and Parse FPKM data from RNA-seq # Grab TF FPKM levels fpkm = pd.read_table("rna.tsv") gene_names_mapping = pd.read_csv("../gencode.v19.annotation_protein_coding_ids.txt",names=['gene_id', 'symbol']) fpkm_symbol = fpkm.merge(gene_names_mapping) fpkm_symbol = fpkm_symbol.set_index(['symbol']) # Get only TF's in JASPAR all_motifs = list(motif_enhancers.index) fpkm_tfs = list(fpkm_symbol.index) for i in range(0,len(fpkm_tfs)): tf = fpkm_tfs[i] tfs = tf.split(',') if len(tfs) == 1: fpkm_tfs[i] = tfs[0] else: for t in tfs: if t in all_motifs: fpkm_tfs[i] = t tf_fpkm = fpkm_symbol.loc[fpkm_symbol.index.isin(all_motifs)] # Get subset of only cell line FPKM calues headers = list(tf_fpkm.columns.values) subset = [] for value in headers: if re.search('ES_D',value): subset.append(value) tf_cell_lines = tf_fpkm[subset] # For Fusion 'EWSR1-FLI' take the lowest FPKM and add that to the tf_cell_lines hetero_dimer_motifs = [] hetero_dimer = {} for motif in all_motifs: if re.search("-[a-zA-Z]",motif): tfs = motif.split('-') tf_fpkm_hd = fpkm.loc[fpkm.index.isin(tfs)] tf_fpkm_hd_cell_lines = tf_fpkm_hd[subset] hd_fpkm = tf_fpkm_hd_cell_lines.min(axis=0).to_frame() hd_fpkm_transform = hd_fpkm.T hd_fpkm_transform.name = 'gene_short_name' hd_fpkm_transform.index = [motif] temp = pd.concat([tf_fpkm, hd_fpkm_transform], axis=0) tf_cell_lines = temp # Rename headers for cell lines headers = list(tf_cell_lines.columns.values) new_headers = [] for h in headers: new_headers.append(h.split('_')[1]) # Note 5 TFs not represented ['TCFE2A', 'RAR', 'ZFP423', 'RXR', 'TCFCP2L1'] tf_cell_lines.columns = new_headers tf_cell_lines = tf_fpkm[subset] # Log2 scale FPKM x = tf_cell_lines.stack() y = filter(lambda a: a != 0, x) tf_factor = min(y) # min is 5.2535600000000006e-05 force_zero = np.log2(0.0000005) tf_cell_lines_std = tf_cell_lines.apply(np.log2).replace(-np.inf,force_zero) scaler = preprocessing.RobustScaler() norm = scaler.fit_transform(tf_cell_lines_std.values) tf_cell_lines_std_robust = pd.DataFrame(data=norm, columns=list(tf_cell_lines_std.columns.values), index = tf_cell_lines_std.index ) # Scale from 0-1 scaler = preprocessing.MinMaxScaler() tf_cell_lines_std_robust_transform = tf_cell_lines_std_robust.T norm = scaler.fit_transform(tf_cell_lines_std_robust_transform.values) tf_scaled_tmp = pd.DataFrame(data=norm.T, columns=list(tf_cell_lines_std_robust.columns.values), index = tf_cell_lines_std_robust.index ) # Binarize (.4 cutoff for intial values) threshold_1q = .4 scaler = preprocessing.Binarizer(threshold=threshold_1q) norm = scaler.fit_transform(tf_cell_lines.values) tf_scaled_binarize = pd.DataFrame(data=norm, columns=list(tf_cell_lines.columns.values), index = tf_cell_lines.index ) tf_scaled = tf_scaled_tmp.multiply(tf_scaled_binarize) ### Start Integration Clcuations # 0. Filteration step test = list(motif_enhancers_scaled.columns.values) test_2 = list(H3K27ac_values_std_input.index.values) #needed_rows = [row for row in H3K27ac_scaled.index if row in list(motif_enhancers_scaled.columns.values)] needed_rows = list(set(test_2) & set(test)) H3K27ac_robust_filtered= H3K27ac_values_std_input.loc[needed_rows] H3K4me1_robust_filtered= H3K4me1_values_std_input.loc[needed_rows] H3K27ac_values_std_robust_filtered = H3K27ac_values_std_robust.loc[needed_rows] H3K4me1_values_std_robust_filtered = H3K4me1_values_std_robust.loc[needed_rows] # 0.5 # Scale from 0-1 # H3K4me1 scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)) H3K4me1_values_std_robust_transform = H3K4me1_robust_filtered.T norm = scaler.fit_transform(H3K4me1_values_std_robust_transform.values) H3K4me1_scaled = pd.DataFrame(data=norm.T, columns=list(H3K4me1_values_std_robust_filtered.columns.values), index = H3K4me1_values_std_robust_filtered.index ) # H3k27ac scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)) H3K27ac_values_std_robust_transform = H3K4me1_robust_filtered.T norm = scaler.fit_transform(H3K27ac_values_std_robust_transform.values) H3K27ac_scaled = pd.DataFrame(data=norm.T, columns=list(H3K27ac_values_std_robust_filtered.columns.values), index = H3K27ac_values_std_robust_filtered.index ) # 1. add H3K27ac and H3K4me1 signal H3K27ac_H3K4me1 = H3K27ac_scaled.add(H3K4me1_scaled) # 2. Make Score Matrix ## Enhancers RPKM x Motif Enhancers motif_cell_line = motif_enhancers_scaled.dot(H3K27ac_H3K4me1) needed_rows = [row for row in motif_cell_line.index if row in list(tf_scaled.index)] motif_cell_line_filtered_tfs = motif_cell_line.loc[needed_rows] motif_cell_line_filtered_tfs.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] motif_cell_line_filtered_tfs = motif_cell_line_filtered_tfs[reorder] # reindex tf_scaled_ordered = tf_scaled.reindex(list(motif_cell_line_filtered_tfs.index)) tf_scaled_ordered = tf_scaled_ordered[reorder] # 4. .multiply() to to Element-by-element multiplication Score Enhancers by TF cell_tf_values = motif_cell_line_filtered_tfs.multiply(tf_scaled_ordered) cell_tf_values.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] cell_tf_values_colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"] # 5. Z-score Standardize for each cell line to see important TF's scaler = preprocessing.StandardScaler() norm = scaler.fit_transform(cell_tf_values.values) cell_tf_values_std = pd.DataFrame(data=norm, columns=list(cell_tf_values.columns.values), index = cell_tf_values.index ) # Seaborn settings sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0}) # Cluster Heatmap sns.set_context("paper") hmap = sns.clustermap(cell_tf_values_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors)) plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) plt.savefig('final_full_cluster_heatmap.png') plt.clf() # 6. Reorder based on clustering reorder_clustering = cell_tf_values_std.columns.values[hmap.dendrogram_col.reordered_ind] cell_tf_values_std_ordered = cell_tf_values_std[reorder_clustering] reindex_cluserting = cell_tf_values_std.index.values[hmap.dendrogram_row.reordered_ind] cell_tf_values_std_ordered = cell_tf_values_std_ordered.reindex(reindex_cluserting) cell_tf_values_std_ordered.to_csv("final_full_cluster_z_score.csv", encoding='utf-8') # 7. Rank Order from sklearn import metrics from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.preprocessing import scale from sklearn.manifold import TSNE plt.style.use('classic') #TNSE tnse = TSNE(n_components=2,verbose=2,learning_rate=100,perplexity=50) tnse_fit = tnse.fit_transform(cell_tf_values) k_model = KMeans(n_clusters=3).fit(cell_tf_values) labels = k_model.labels_ centroids = k_model.cluster_centers_ vis_x = tnse_fit[:, 0] vis_y = tnse_fit[:, 1] plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("jet", 3),facecolor='white',marker='o',s=100,linewidth=2) #plt.colorbar(ticks=None) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.savefig('k_means_clustering.png') plt.clf() k_model = KMeans(n_clusters=3,random_state=1).fit(cell_tf_values_std) labels = k_model.labels_ centroids = k_model.cluster_centers_ pca = PCA(n_components=2).fit(cell_tf_values_std) pca_2d = pca.transform(cell_tf_values_std) vis_x = pca_2d[:, 0] vis_y = pca_2d[:, 1] plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("jet", 3),facecolor='white',marker='o',s=50,linewidth='2') plt.colorbar(ticks=range(3)) plt.savefig('k_means_clustering_pca.png') plt.clf() # Seperate into 3 cluster cell_tf_values_std_cluster = cell_tf_values_std cell_tf_values_std_cluster['cluster'] = list(labels) cell_tf_values_std_cluster.to_csv('clustering_tfs.csv') cell_tf_values_std_cluster_1 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 0] cell_tf_values_std_cluster_2 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 1] cell_tf_values_std_cluster_3 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 2] # col_colors colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"] medianprops = dict(linestyle='-', linewidth=4, color='black') box = cell_tf_values_std_cluster_1.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-1.5,5.5)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_1.png') plt.clf() box = cell_tf_values_std_cluster_2.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-1,1)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_2.png') plt.clf() box = cell_tf_values_std_cluster_3.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-1.5,5.5)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_3.png') plt.clf() # Wilcox rank sum test: # Cluster 1 1 e-2 scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D2']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D5']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D2'],cell_tf_values_std_cluster_1['ES_D5']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D2'],cell_tf_values_std_cluster_1['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D2'],cell_tf_values_std_cluster_1['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D5'],cell_tf_values_std_cluster_1['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D5'],cell_tf_values_std_cluster_1['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D7'],cell_tf_values_std_cluster_1['ES_D10']) # Cluster 2 1 e-2 scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D2']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D5']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D2'],cell_tf_values_std_cluster_2['ES_D5']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D2'],cell_tf_values_std_cluster_2['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D2'],cell_tf_values_std_cluster_2['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D5'],cell_tf_values_std_cluster_2['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D5'],cell_tf_values_std_cluster_2['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D7'],cell_tf_values_std_cluster_2['ES_D10']) # Cluster 3 1 e-2 scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D2']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D5']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D2'],cell_tf_values_std_cluster_3['ES_D5']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D2'],cell_tf_values_std_cluster_3['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D2'],cell_tf_values_std_cluster_3['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D5'],cell_tf_values_std_cluster_3['ES_D7']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D5'],cell_tf_values_std_cluster_3['ES_D10']) scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D7'],cell_tf_values_std_cluster_3['ES_D10']) # Look at Cluster 4 for expression of TF's cluster4_tfs = tf_cell_lines.loc[cell_tf_values_std_cluster_4.index.values] cluster4_tfs.to_csv("cluster_4_tfs.csv") box = cluster4_tfs.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-5,65)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_4_tfs_fpkm.png') plt.clf() # Cluster tfs 1 e-3 scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D2']) scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D5']) scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D7']) scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D10']) scipy.stats.ranksums(cluster4_tfs['ES_D2'],cluster4_tfs['ES_D5']) scipy.stats.ranksums(cluster4_tfs['ES_D2'],cluster4_tfs['ES_D7']) scipy.stats.ranksums(cluster4_tfs['ES_D2'],cluster4_tfs['ES_D10']) scipy.stats.ranksums(cluster4_tfs['ES_D5'],cluster4_tfs['ES_D7']) scipy.stats.ranksums(cluster4_tfs['ES_D5'],cluster4_tfs['ES_D10']) scipy.stats.ranksums(cluster4_tfs['ES_D7'],cluster4_tfs['ES_D10']) cluster4_motifs = motif_enhancers.loc[cell_tf_values_std_cluster_4.index.values] cluster4_enhancers = only_rpkm_values.loc[cluster4_motifs.loc[:, (cluster4_motifs != 0).all(axis=0)].columns.values] cluster4_enhancers.to_csv("cluster_4_enhancers.csv") box = cluster4_enhancers.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-5,105)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_4_enhancers_rpkm.png') plt.clf() # Cluster tfs 1 e-4 scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D2']) scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D5']) scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D7']) scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D10']) scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D5']) scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D7']) scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D10']) scipy.stats.ranksums(cluster4_enhancers['ES_D5'],cluster4_enhancers['ES_D7']) scipy.stats.ranksums(cluster4_enhancers['ES_D5'],cluster4_enhancers['ES_D10']) scipy.stats.ranksums(cluster4_enhancers['ES_D7'],cluster4_enhancers['ES_D10']) # Look at Cluster 3 for expression of TF's cluster3_tfs = tf_cell_lines.loc[cell_tf_values_std_cluster_3.index.values] cluster3_tfs.to_csv("cluster_3_tfs.csv") box = cluster3_tfs.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-5,55)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_3_tfs_fpkm.png') plt.clf() # Cluster tfs 1 e-3 (NS) scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D2']) scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D5']) scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D7']) scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D10']) scipy.stats.ranksums(cluster3_tfs['ES_D2'],cluster3_tfs['ES_D5']) scipy.stats.ranksums(cluster3_tfs['ES_D2'],cluster3_tfs['ES_D7']) scipy.stats.ranksums(cluster3_tfs['ES_D2'],cluster3_tfs['ES_D10']) scipy.stats.ranksums(cluster3_tfs['ES_D5'],cluster3_tfs['ES_D7']) scipy.stats.ranksums(cluster3_tfs['ES_D5'],cluster3_tfs['ES_D10']) scipy.stats.ranksums(cluster3_tfs['ES_D7'],cluster3_tfs['ES_D10']) cluster3_motifs = motif_enhancers.loc[cell_tf_values_std_cluster_3.index.values] cluster3_enhancers = only_rpkm_values.loc[cluster3_motifs.loc[:, (cluster3_motifs != 0).all(axis=0)].columns.values] cluster3_enhancers.to_csv("cluster_3_enhancers.csv") box = cluster3_enhancers.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3) plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3) for patch, color in zip(box['boxes'], colors): patch.set_facecolor(color) plt.tick_params(axis='y', direction='out') plt.tick_params(axis='x', direction='out') plt.tick_params(top='off', right='off') plt.grid(b=False) plt.ylim((-5,65)) plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) plt.savefig('box_plot_cluster_3_enhancers_rpkm.png') plt.clf() # Cluster tfs 1 e-12 scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D2']) scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D5']) scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D7']) scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D10']) scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D5']) scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D7']) scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D10']) scipy.stats.ranksums(cluster3_enhancers['ES_D5'],cluster3_enhancers['ES_D7']) scipy.stats.ranksums(cluster3_enhancers['ES_D5'],cluster3_enhancers['ES_D10']) scipy.stats.ranksums(cluster3_enhancers['ES_D7'],cluster3_enhancers['ES_D10']) ## Analysis of only RNA-seq # 1. Z-score Standardize for each cell line to see important TF's scaler = preprocessing.StandardScaler() norm = scaler.fit_transform(tf_scaled_ordered.values) tf_scaled_std = pd.DataFrame(data=norm, columns=list(tf_scaled_ordered.columns.values), index = tf_scaled_ordered.index ) # Seaborn settings sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0}) # Cluster Heatmap sns.set_context("paper") hmap = sns.clustermap(tf_scaled_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors)) plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) plt.savefig('final_full_cluster_heatmap_rna-seq.png') labels = [item.get_text() for item in hmap.ax_heatmap.yaxis.get_majorticklabels()] labels.reverse() with open("final_full_cluster_heatmap_rna-seq.csv", 'wb') as csv_file: wr = csv.writer(csv_file,dialect='excel',quoting=csv.QUOTE_ALL) for tf in labels: wr.writerow([tf,]) # 2. Reorder based on clustering reorder_clustering = tf_scaled_std.columns.values[hmap.dendrogram_col.reordered_ind] tf_scaled_std_ordered = tf_scaled_std[reorder_clustering] reindex_cluserting = tf_scaled_std.index.values[hmap.dendrogram_row.reordered_ind] tf_scaled_std_ordered = tf_scaled_std_ordered.reindex(reindex_cluserting) tf_scaled_std_ordered.to_csv("final_full_cluster_z_score-rnaseq.csv", encoding='utf-8') ## Analysis of only GRO-seq data # 1. Make Score Matrix ## Enhancers RPKM x Motif Enhancers motif_cell_line = motif_enhancers_scaled.dot(rpkm_robust_filtered) needed_rows = [row for row in motif_cell_line.index if row in list(tf_scaled.index)] motif_cell_line_filtered_tfs = motif_cell_line.loc[needed_rows] motif_cell_line_filtered_tfs.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] motif_cell_line_filtered_tfs = motif_cell_line_filtered_tfs[reorder] # 2. Z-score Standardize for each cell line to see important TF's scaler = preprocessing.StandardScaler() norm = scaler.fit_transform(motif_cell_line_filtered_tfs.values) motif_cell_line_filtered_tfs_std = pd.DataFrame(data=norm, columns=list(motif_cell_line_filtered_tfs.columns.values), index = motif_cell_line_filtered_tfs.index ) # Seaborn settings sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0}) # Cluster Heatmap sns.set_context("paper") hmap = sns.clustermap(motif_cell_line_filtered_tfs_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors)) plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) plt.savefig('final_full_cluster_heatmap_gro-seq.png') labels = [item.get_text() for item in hmap.ax_heatmap.yaxis.get_majorticklabels()] labels.reverse() with open("final_full_cluster_heatmap_gro-seq.csv", 'wb') as csv_file: wr = csv.writer(csv_file,dialect='excel',quoting=csv.QUOTE_ALL) for tf in labels: wr.writerow([tf,]) # 3. Reorder based on clustering reorder_clustering = motif_cell_line_filtered_tfs_std.columns.values[hmap.dendrogram_col.reordered_ind] motif_cell_line_filtered_tfs_std_ordered = motif_cell_line_filtered_tfs_std[reorder_clustering] reindex_cluserting = motif_cell_line_filtered_tfs_std.index.values[hmap.dendrogram_row.reordered_ind] motif_cell_line_filtered_tfs_std_ordered = motif_cell_line_filtered_tfs_std_ordered.reindex(reindex_cluserting) motif_cell_line_filtered_tfs_std_ordered.to_csv("final_full_cluster_z_score-groseq.csv", encoding='utf-8') ## Analysis of only GRO-seq data + RNA-seq # 1. Make Score Matrix ## Enhancers RPKM x Motif Enhancers motif_cell_line = motif_enhancers_scaled.dot(rpkm_robust_filtered) needed_rows = [row for row in motif_cell_line.index if row in list(tf_scaled.index)] motif_cell_line_filtered_tfs = motif_cell_line.loc[needed_rows] motif_cell_line_filtered_tfs.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] motif_cell_line_filtered_tfs = motif_cell_line_filtered_tfs[reorder] # reindex tf_scaled_ordered = tf_scaled.reindex(list(motif_cell_line_filtered_tfs.index)) tf_scaled_ordered = tf_scaled_ordered[reorder] # 2. .multiply() to to Element-by-element multiplication Score Enhancers by TF cell_tf_values = motif_cell_line_filtered_tfs.multiply(tf_scaled_ordered) cell_tf_values.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'] cell_tf_values_colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"] # 3. Z-score Standardize for each cell line to see important TF's scaler = preprocessing.StandardScaler() norm = scaler.fit_transform(cell_tf_values.values) cell_tf_values_std = pd.DataFrame(data=norm, columns=list(cell_tf_values.columns.values), index = cell_tf_values.index ) # Seaborn settings sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0}) # Cluster Heatmap sns.set_context("paper") hmap = sns.clustermap(cell_tf_values_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors)) plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) plt.savefig('final_full_cluster_heatmap_gro_rna.png') labels = [item.get_text() for item in hmap.ax_heatmap.yaxis.get_majorticklabels()] labels.reverse() with open("final_full_cluster_heatmap_gro_rna.csv", 'wb') as csv_file: wr = csv.writer(csv_file,dialect='excel',quoting=csv.QUOTE_ALL) for tf in labels: wr.writerow([tf,]) # 4. Reorder based on clustering reorder_clustering = cell_tf_values_std.columns.values[hmap.dendrogram_col.reordered_ind] cell_tf_values_std_ordered = cell_tf_values_std[reorder_clustering] reindex_cluserting = cell_tf_values_std.index.values[hmap.dendrogram_row.reordered_ind] cell_tf_values_std_ordered = cell_tf_values_std_ordered.reindex(reindex_cluserting) cell_tf_values_std_ordered.to_csv("final_full_cluster_z_score_gro_rna.csv", encoding='utf-8')