Skip to content
Snippets Groups Projects
matrix_analysis.py 36.9 KiB
Newer Older
import Bio.motifs
import pandas as pd
import numpy as np
import csv
import re
import string
from sklearn import preprocessing
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import pearsonr, spearmanr, cumfreq


# Order of Cell Lines
reorder = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']


### Load and Process the ChIP-seq Data

#Load the matrix of Input data
enhancers_universe_Input= pd.DataFrame.from_csv("Input_filtered_peaks.tsv", sep="\t", header=0)

# Filter for these columns
Input_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
Input_index = enhancers_universe_Input.name.values
Input_tmp = pd.DataFrame(enhancers_universe_Input, columns=Input_columns )
Input_values = Input_tmp.set_index(Input_index)

# Filter for SSP and SUP
Input_ssp = Input_values[Input_values['name'].str.contains("SSP")]
Input_sunp = Input_values[Input_values['name'].str.contains("SUNP")]
Input_columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
filter_Input_values = pd.concat([Input_ssp,Input_sunp])
only_Input_values = pd.DataFrame(filter_Input_values, columns=Input_columns)


# Rename columns and reorder
only_Input_values = only_Input_values[reorder]
x = only_Input_values.stack()
y = filter(lambda a: a != 0, x)
Input_factor = min(y)
Input_values_std_robust = only_Input_values + Input_factor


#Load the matrix of H3K4Me1
enhancers_universe_H3K4me1 = pd.DataFrame.from_csv("H3K4me1_filtered_peaks.tsv", sep="\t", header=0)

# Filter for these columns
H3K4me1_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
H3K4me1_index = enhancers_universe_H3K4me1.name.values
H3K4me1_tmp = pd.DataFrame(enhancers_universe_H3K4me1, columns=H3K4me1_columns )
H3K4me1_values = H3K4me1_tmp.set_index(H3K4me1_index)

# Filter for SSP and SUP
H3K4me1_ssp = H3K4me1_values[H3K4me1_values['name'].str.contains("SSP")]
H3K4me1_sunp = H3K4me1_values[H3K4me1_values['name'].str.contains("SUNP")]
H3K4me1_columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
filter_H3K4me1_values = pd.concat([H3K4me1_ssp,H3K4me1_sunp])
only_H3K4me1_values = pd.DataFrame(filter_H3K4me1_values, columns=H3K4me1_columns)

# Rename columns and reorder
only_H3K4me1_values = only_H3K4me1_values[reorder]
x = only_H3K4me1_values.stack()
y = filter(lambda a: a != 0, x)
H3K4me1_factor = min(y)
H3K4me1_values_std_robust = only_H3K4me1_values + H3K4me1_factor


#Load the matrix of H3K27ac
enhancers_universe_H3K27ac = pd.DataFrame.from_csv("H3K27ac_filtered_peaks.tsv", sep="\t", header=0)

# Filter for these columns
H3K27ac_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
H3K27ac_index = enhancers_universe_H3K27ac.name.values
H3K27ac_tmp = pd.DataFrame(enhancers_universe_H3K27ac, columns=H3K27ac_columns )
H3K27ac_values = H3K27ac_tmp.set_index(H3K27ac_index)

# Filter for SSP and SUP
H3K27ac_ssp = H3K27ac_values[H3K27ac_values['name'].str.contains("SSP")]
H3K27ac_sunp = H3K27ac_values[H3K27ac_values['name'].str.contains("SUNP")]
H3K27ac_columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
filter_H3K27ac_values = pd.concat([H3K27ac_ssp,H3K27ac_sunp])
only_H3K27ac_values = pd.DataFrame(filter_H3K27ac_values, columns=H3K27ac_columns)

# Rename columns and reorder
only_H3K27ac_values = only_H3K27ac_values[reorder]
x = only_H3K27ac_values.stack()
y = filter(lambda a: a != 0, x)
H3K27ac_factor = min(y)
H3K27ac_values_std_robust = only_H3K27ac_values + H3K27ac_factor


#Divide Histone Marks by Input
H3K4me1_values_std_input = H3K4me1_values_std_robust.divide(Input_values_std_robust)
H3K27ac_values_std_input = H3K27ac_values_std_robust.divide(Input_values_std_robust)

# Scale from 0-1
# H3K4me1
scaler = preprocessing.MinMaxScaler()
H3K4me1_values_std_robust_transform = H3K4me1_values_std_input.T
norm = scaler.fit_transform(H3K4me1_values_std_robust_transform.values)
H3K4me1_scaled = pd.DataFrame(data=norm.T, columns=list(H3K4me1_values_std_robust.columns.values), index = H3K4me1_values_std_robust.index )

# H3k27ac
scaler = preprocessing.MinMaxScaler()
H3K27ac_values_std_robust_transform = H3K27ac_values_std_input.T
norm = scaler.fit_transform(H3K27ac_values_std_robust_transform.values)
H3K27ac_scaled = pd.DataFrame(data=norm.T, columns=list(H3K27ac_values_std_robust.columns.values), index = H3K27ac_values_std_robust.index )

### Load and Process the GRO-seq Enhancer data

# load the matix of location and RPKM values
enhancers_universe = pd.DataFrame.from_csv("GRO_filtered_peaks.tsv", sep="\t", header=0, index_col=None)

# Filter for these columns
rpkm_columns = ['name','ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
rpkm_index = enhancers_universe.name.values
rpkm_tmp = pd.DataFrame(enhancers_universe, columns=rpkm_columns )
rpkm_values = rpkm_tmp.set_index(rpkm_index)

# Filter for SSP and SUP
rpkm_ssp = rpkm_values[rpkm_values['name'].str.contains("SSP")]
rpkm_sunp = rpkm_values[rpkm_values['name'].str.contains("SUNP")]
rpkm_columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
filter_rpkm_values = pd.concat([rpkm_ssp,rpkm_sunp])
only_rpkm_values = pd.DataFrame(filter_rpkm_values, columns=rpkm_columns)

# Log2 scale RPKM
# get mininum value to non-zero value to scale by
x = only_rpkm_values.stack()
y = filter(lambda a: a != 0, x)
rpkm_factor = min(y) # min is 0.000119637562731
force_zero = np.log2(0.0005)
only_rpkm_values_factor = only_rpkm_values + rpkm_factor
rpkm_values_std = only_rpkm_values.apply(np.log2).replace(-np.inf,force_zero)
scaler = preprocessing.RobustScaler()
norm = scaler.fit_transform(rpkm_values_std.values)
rpkm_values_std_robust = pd.DataFrame(data=norm, columns=list(rpkm_values_std.columns.values), index = rpkm_values_std.index )
rpkm_values_std_robust = rpkm_values_std_robust[reorder]

# Scale from 0-1
scaler = preprocessing.MinMaxScaler()
rpkm_values_std_robust_transform = rpkm_values_std_robust.T
norm = scaler.fit_transform(rpkm_values_std_robust_transform.values)
rpkm_scaled = pd.DataFrame(data=norm.T, columns=list(rpkm_values_std_robust.columns.values), index = rpkm_values_std_robust.index )


### Parse MEME and TOMTOM Motif data

# Loop through meme output
meme_cell_dict = {
"MEME_op_ES_D0_gro-seq_enhancers_1kb_zoops": "tomtom_op_ES_D0_gro-seq_enhancers_1kb",
"MEME_op_ES_D2_gro-seq_enhancers_1kb_zoops": "tomtom_op_ES_D2_gro-seq_enhancers_1kb",
"MEME_op_ES_D5_gro-seq_enhancers_1kb_zoops": "tomtom_op_ES_D5_gro-seq_enhancers_1kb",
"MEME_op_ES_D7_gro-seq_enhancers_1kb_zoops":  "tomtom_op_ES_D7_gro-seq_enhancers_1kb",
"MEME_op_ES_D10_gro-seq_enhancers_1kb_zoops":  "tomtom_op_ES_D10_gro-seq_enhancers_1kb"
}

# Read Target ID to Motif into dictionary
motif_id_dict = {}
with open("/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/Motif/motif_Ids_name.txt", "rb") as data:
    motif_ids = csv.DictReader(data, delimiter="\t")
    for line in motif_ids:
        motif_id_dict[line['ID']] = line['NAME']

meme_tomtom = pd.DataFrame()
for meme,tom in meme_cell_dict.iteritems():
    # load meme output
    meme_file = '%s/meme.txt' % (meme)
    record = Bio.motifs.parse(open(meme_file), 'meme')
    # Loop through all motifs and make dataframe
    meme_positions = pd.DataFrame()
    for motif in record:
        name = motif.name.split(" ")[1]
        ones = [1] * len(motif.instances)
        names = []
        for instance in motif.instances:
            names.append(instance.sequence_name)
        new = pd.DataFrame({name: ones},index = names)
        temp = pd.concat([meme_positions, new], axis=1).fillna(0)
        meme_positions = temp
    # Read tomtom file
    tomtom_file = "/Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT_25842977/Motif/%s/tomtom.txt" % (tom)
    tomtom_dict = {}
    with open(tomtom_file, "rb") as data:
        tomtom = csv.DictReader(data, delimiter="\t")
        for line in tomtom:
            target = line['Target ID']
            motif = line['#Query ID']
            pval = float(line['p-value'])
            tfs = motif_id_dict[target].upper()
            motif_pvalue = { motif:  [pval]}
            # JASPAR :: means that any TF can either protein, split the protein
            tf_list = tfs.split("::")
            for tf in tf_list:
                # Reduce split form splice to single value [ID]_#
                single_isoform = tf.split("_")[0]
                if single_isoform in tomtom_dict.keys():
                    if motif in tomtom_dict[single_isoform].keys():
                        tomtom_dict[single_isoform][motif].append(pval)
                    else:
                        tomtom_dict[single_isoform].update(motif_pvalue)
                else:
                    tomtom_dict[single_isoform] = motif_pvalue
    # Make dataframe
    tomtom_motif = pd.DataFrame()
    for key,motif in tomtom_dict.iteritems():
        pvalue_dict = {}
        # Loop through motifs to see if length greater than 1, if so do pvalue scaling
        for m,p in motif.iteritems():
            if len(p) > 1:
                stouffer_statistic, stouffer_pval = scipy.stats.combine_pvalues(p,method = 'stouffer', weights = None)
                pvalue_dict[m] = stouffer_pval
            else:
                pvalue_dict[m] = p[0]
        pvalues = np.array(pvalue_dict.values())
        new = pd.DataFrame({key: pvalues},index = pvalue_dict.keys())
        temp = pd.concat([tomtom_motif, new], axis=1).fillna(0).sort_index(level=int)
        tomtom_motif = temp
    # Reorder
    tomtom_motif_reorder = tomtom_motif.reindex( list(meme_positions.columns.values)).fillna(0)
    # dot product
    meme_tomtom_cell = meme_positions.dot(tomtom_motif_reorder)
    # Scale and add
    scaler = preprocessing.MinMaxScaler()
    meme_tomtom_cell_transform = meme_tomtom_cell.T
    norm = scaler.fit_transform(meme_tomtom_cell_transform.values) # norm across enhancers for each enhancer
    meme_tomtom_cell_std = pd.DataFrame(data=norm.T, columns=list(meme_tomtom_cell.columns.values), index = meme_tomtom_cell.index )
    # Add to previous data
    temp = meme_tomtom.add(meme_tomtom_cell_std, fill_value=0).fillna(0).sort_index(level=int)
    meme_tomtom = temp

# Transform meme tom_tom
motif_enhancers = meme_tomtom.T

# Rename column headers
motif_enhancers.rename(columns=lambda x: x.split('-')[0], inplace=True)
motif_enhancers.rename(columns=lambda x: x.replace(':', "_"), inplace=True)

# Standardize to range 0-1
scaler = preprocessing.MinMaxScaler()
motif_enhancers_transform = motif_enhancers.T
norm = scaler.fit_transform(motif_enhancers_transform.values) # norm across enhancers for each enhancer
motif_enhancers_scaled = pd.DataFrame(data=norm.T, columns=list(motif_enhancers.columns.values), index = motif_enhancers.index)



### Load and Parse FPKM data from RNA-seq

# Grab TF FPKM levels
fpkm = pd.read_table("rna.tsv")
gene_names_mapping = pd.read_csv("../gencode.v19.annotation_protein_coding_ids.txt",names=['gene_id', 'symbol'])
fpkm_symbol = fpkm.merge(gene_names_mapping)
fpkm_symbol = fpkm_symbol.set_index(['symbol'])

# Get only TF's in JASPAR
all_motifs = list(motif_enhancers.index)
fpkm_tfs = list(fpkm_symbol.index)
for i in range(0,len(fpkm_tfs)):
    tf = fpkm_tfs[i]
    tfs = tf.split(',')
    if len(tfs) == 1:
        fpkm_tfs[i] = tfs[0]
    else:
        for t in tfs:
            if t in all_motifs:
                fpkm_tfs[i] = t

tf_fpkm = fpkm_symbol.loc[fpkm_symbol.index.isin(all_motifs)]

# Get subset of only cell line FPKM calues
headers = list(tf_fpkm.columns.values)
subset = []
for value in headers:
    if re.search('ES_D',value):
        subset.append(value)

tf_cell_lines = tf_fpkm[subset]

# For Fusion 'EWSR1-FLI' take the lowest FPKM and add that to the tf_cell_lines
hetero_dimer_motifs = []
hetero_dimer = {}
for motif in all_motifs:
    if re.search("-[a-zA-Z]",motif):
        tfs = motif.split('-')
        tf_fpkm_hd = fpkm.loc[fpkm.index.isin(tfs)]
        tf_fpkm_hd_cell_lines = tf_fpkm_hd[subset]
        hd_fpkm = tf_fpkm_hd_cell_lines.min(axis=0).to_frame()
        hd_fpkm_transform = hd_fpkm.T
        hd_fpkm_transform.name = 'gene_short_name'
        hd_fpkm_transform.index = [motif]
        temp = pd.concat([tf_fpkm, hd_fpkm_transform], axis=0)
        tf_cell_lines = temp


# Rename headers for cell lines
headers = list(tf_cell_lines.columns.values)
new_headers = []
for h in headers:
     new_headers.append(h.split('_')[1])

# Note 5 TFs not represented ['TCFE2A', 'RAR', 'ZFP423', 'RXR', 'TCFCP2L1']

tf_cell_lines.columns = new_headers
tf_cell_lines = tf_fpkm[subset]

# Log2 scale FPKM
x = tf_cell_lines.stack()
y = filter(lambda a: a != 0, x)
tf_factor = min(y) # min is 5.2535600000000006e-05
force_zero = np.log2(0.0000005)
tf_cell_lines_std = tf_cell_lines.apply(np.log2).replace(-np.inf,force_zero)
scaler = preprocessing.RobustScaler()
norm = scaler.fit_transform(tf_cell_lines_std.values)
tf_cell_lines_std_robust = pd.DataFrame(data=norm, columns=list(tf_cell_lines_std.columns.values), index = tf_cell_lines_std.index )

# Scale from 0-1
scaler = preprocessing.MinMaxScaler()
tf_cell_lines_std_robust_transform = tf_cell_lines_std_robust.T
norm = scaler.fit_transform(tf_cell_lines_std_robust_transform.values)
tf_scaled_tmp = pd.DataFrame(data=norm.T, columns=list(tf_cell_lines_std_robust.columns.values), index = tf_cell_lines_std_robust.index )

# Binarize (.4 cutoff for intial values)
threshold_1q = .4
scaler = preprocessing.Binarizer(threshold=threshold_1q)
norm = scaler.fit_transform(tf_cell_lines.values)
tf_scaled_binarize = pd.DataFrame(data=norm, columns=list(tf_cell_lines.columns.values), index = tf_cell_lines.index )
tf_scaled = tf_scaled_tmp.multiply(tf_scaled_binarize)


### Start Integration Clcuations

# 0. Filteration step
test = list(motif_enhancers_scaled.columns.values)
test_2 = list(rpkm_scaled.index.values)
set(test_2).intersection(test)
needed_rows = [row for row in rpkm_scaled.index if row in list(motif_enhancers_scaled.columns.values)]
rpkm_robust_filtered= rpkm_scaled.loc[needed_rows]
H3K27ac_robust_filtered= H3K27ac_scaled.loc[needed_rows]
H3K4me1_robust_filtered= H3K4me1_scaled.loc[needed_rows]
fd

# 1. add H3K27ac and H3K4me1 signal
H3K27ac_H3K4me1 = H3K27ac_scaled.add(H3K4me1_scaled)

# 2. Add H3K27ac by RPKM
rpkm_H3K27ac_H3K4me1 = H3K27ac_H3K4me1.add(rpkm_scaled)

# 3. Make Score Matrix
## Enhancers RPKM x Motif Enhancers
motif_cell_line = motif_enhancers_scaled.dot(rpkm_H3K27ac_H3K4me1)
needed_rows = [row for row in motif_cell_line.index if row in list(tf_scaled.index)]
motif_cell_line_filtered_tfs = motif_cell_line.loc[needed_rows]
motif_cell_line_filtered_tfs.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
motif_cell_line_filtered_tfs = motif_cell_line_filtered_tfs[reorder]

# reindex
tf_scaled_ordered = tf_scaled.reindex(list(motif_cell_line_filtered_tfs.index))
tf_scaled_ordered = tf_scaled_ordered[reorder]

# 4. .multiply() to to Element-by-element multiplication Score Enhancers by TF
cell_tf_values = motif_cell_line_filtered_tfs.multiply(tf_scaled_ordered)
cell_tf_values.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
cell_tf_values_colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"]

# 5. Z-score Standardize for each cell line to see important TF's
scaler = preprocessing.StandardScaler()
norm = scaler.fit_transform(cell_tf_values.values)
cell_tf_values_std = pd.DataFrame(data=norm, columns=list(cell_tf_values.columns.values), index = cell_tf_values.index )


# Seaborn settings
sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0})

# Cluster Heatmap
sns.set_context("paper")
hmap = sns.clustermap(cell_tf_values_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors))
plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.savefig('final_full_cluster_heatmap.png')


# 6. Reorder based on clustering
reorder_clustering = cell_tf_values_std.columns.values[hmap.dendrogram_col.reordered_ind]
cell_tf_values_std_ordered = cell_tf_values_std[reorder_clustering]
reindex_cluserting = cell_tf_values_std.index.values[hmap.dendrogram_row.reordered_ind]
cell_tf_values_std_ordered = cell_tf_values_std_ordered.reindex(reindex_cluserting)
cell_tf_values_std_ordered.to_csv("final_full_cluster_z_score.csv", encoding='utf-8')

# 7. Rank Order
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.manifold import TSNE
tnse = TSNE(n_components=2, random_state=0)
tnse_fit = tnse.fit_transform(cell_tf_values_std)

k_model = KMeans(n_clusters=4,random_state=1).fit(cell_tf_values_std)
labels = k_model.labels_
centroids = k_model.cluster_centers_


vis_x = tnse_fit[:, 0]
vis_y = tnse_fit[:, 1]

plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("jet", 4),facecolor='white')
plt.colorbar(ticks=None)
plt.xlim(-300,  300)
plt.ylim(-300, 300)
plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.savefig('k_means_clustering.png')
plt.clf()



k_model = KMeans(n_clusters=4,random_state=1).fit(cell_tf_values_std)
labels = k_model.labels_
centroids = k_model.cluster_centers_

pca = PCA(n_components=2).fit(cell_tf_values_std)
pca_2d = pca.transform(cell_tf_values_std)

vis_x = pca_2d[:, 0]
vis_y = pca_2d[:, 1]


plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("jet", 4))
plt.colorbar(ticks=range(4))
plt.savefig('k_means_clustering_pca.png')
plt.clf()


# Seperate into 4 cluster
cell_tf_values_std_cluster = cell_tf_values_std
cell_tf_values_std_cluster['cluster'] = list(labels)

cell_tf_values_std_cluster.to_csv('clustering_tfs.csv')

cell_tf_values_std_cluster_1 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 0]
cell_tf_values_std_cluster_2 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 1]
cell_tf_values_std_cluster_3 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 2]
cell_tf_values_std_cluster_4 = cell_tf_values_std_cluster.loc[cell_tf_values_std_cluster['cluster'] == 3]

# col_colors
colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"]
medianprops = dict(linestyle='-', linewidth=2, color='black')

box = cell_tf_values_std_cluster_1.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-1.5,1.5))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_1.png')
plt.clf()


box = cell_tf_values_std_cluster_2.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 1.5)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-1.5,2.5))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_2.png')
plt.clf()

box = cell_tf_values_std_cluster_3.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-1.5,3.5))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_3.png')
plt.clf()


box = cell_tf_values_std_cluster_4.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-1.5,4.5))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_4.png')
plt.clf()


# Wilcox rank sum test:
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D2'])
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D0'],cell_tf_values_std_cluster_1['ES_D10'])


scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D2'],cell_tf_values_std_cluster_1['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D2'],cell_tf_values_std_cluster_1['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D2'],cell_tf_values_std_cluster_1['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D5'],cell_tf_values_std_cluster_1['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D5'],cell_tf_values_std_cluster_1['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_1['ES_D7'],cell_tf_values_std_cluster_1['ES_D10'])


scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D2'])
scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D0'],cell_tf_values_std_cluster_2['ES_D10'])


scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D2'],cell_tf_values_std_cluster_2['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D2'],cell_tf_values_std_cluster_2['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D2'],cell_tf_values_std_cluster_2['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D5'],cell_tf_values_std_cluster_2['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D5'],cell_tf_values_std_cluster_2['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_2['ES_D7'],cell_tf_values_std_cluster_2['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D2'])
scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D0'],cell_tf_values_std_cluster_3['ES_D10'])


scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D2'],cell_tf_values_std_cluster_3['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D2'],cell_tf_values_std_cluster_3['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D2'],cell_tf_values_std_cluster_3['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D5'],cell_tf_values_std_cluster_3['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D5'],cell_tf_values_std_cluster_3['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_3['ES_D7'],cell_tf_values_std_cluster_3['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D0'],cell_tf_values_std_cluster_4['ES_D2'])
scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D0'],cell_tf_values_std_cluster_4['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D0'],cell_tf_values_std_cluster_4['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D0'],cell_tf_values_std_cluster_4['ES_D10'])


scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D2'],cell_tf_values_std_cluster_4['ES_D5'])
scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D2'],cell_tf_values_std_cluster_4['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D2'],cell_tf_values_std_cluster_4['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D5'],cell_tf_values_std_cluster_4['ES_D7'])
scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D5'],cell_tf_values_std_cluster_4['ES_D10'])

scipy.stats.ranksums(cell_tf_values_std_cluster_4['ES_D7'],cell_tf_values_std_cluster_4['ES_D10'])


# Look at Cluster 4 for expression of TF's
cluster4_tfs = tf_cell_lines.loc[cell_tf_values_std_cluster_4.index.values]
cluster4_tfs.to_csv("cluster_4_tfs.csv")
box = cluster4_tfs.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,65))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_4_tfs_fpkm.png')
# Cluster tfs 1 e-3
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D2'])
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D5'])
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D7'])
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D10'])
scipy.stats.ranksums(cluster4_tfs['ES_D2'],cluster4_tfs['ES_D5'])
scipy.stats.ranksums(cluster4_tfs['ES_D2'],cluster4_tfs['ES_D7'])
scipy.stats.ranksums(cluster4_tfs['ES_D2'],cluster4_tfs['ES_D10'])
scipy.stats.ranksums(cluster4_tfs['ES_D5'],cluster4_tfs['ES_D7'])
scipy.stats.ranksums(cluster4_tfs['ES_D5'],cluster4_tfs['ES_D10'])
scipy.stats.ranksums(cluster4_tfs['ES_D7'],cluster4_tfs['ES_D10'])
cluster4_motifs = motif_enhancers.loc[cell_tf_values_std_cluster_4.index.values]
cluster4_enhancers = only_rpkm_values.loc[cluster4_motifs.loc[:, (cluster4_motifs != 0).all(axis=0)].columns.values]
cluster4_enhancers.to_csv("cluster_4_enhancers.csv")
box = cluster4_enhancers.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,105))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_4_enhancers_rpkm.png')
plt.clf()

# Cluster tfs 1 e-4
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D2'])
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D5'])
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D5'])
scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers['ES_D5'],cluster4_enhancers['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers['ES_D5'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers['ES_D7'],cluster4_enhancers['ES_D10'])
# Look at Cluster 3 for expression of TF's
cluster3_tfs = tf_cell_lines.loc[cell_tf_values_std_cluster_3.index.values]
cluster3_tfs.to_csv("cluster_3_tfs.csv")

box = cluster3_tfs.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,55))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_3_tfs_fpkm.png')
plt.clf()

# Cluster tfs 1 e-3 (NS)
scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D2'])
scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D5'])
scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D7'])
scipy.stats.ranksums(cluster3_tfs['ES_D0'],cluster3_tfs['ES_D10'])


scipy.stats.ranksums(cluster3_tfs['ES_D2'],cluster3_tfs['ES_D5'])
scipy.stats.ranksums(cluster3_tfs['ES_D2'],cluster3_tfs['ES_D7'])
scipy.stats.ranksums(cluster3_tfs['ES_D2'],cluster3_tfs['ES_D10'])

scipy.stats.ranksums(cluster3_tfs['ES_D5'],cluster3_tfs['ES_D7'])
scipy.stats.ranksums(cluster3_tfs['ES_D5'],cluster3_tfs['ES_D10'])

scipy.stats.ranksums(cluster3_tfs['ES_D7'],cluster3_tfs['ES_D10'])




cluster3_motifs = motif_enhancers.loc[cell_tf_values_std_cluster_3.index.values]
cluster3_enhancers = only_rpkm_values.loc[cluster3_motifs.loc[:, (cluster3_motifs != 0).all(axis=0)].columns.values]
cluster3_enhancers.to_csv("cluster_3_enhancers.csv")

box = cluster3_enhancers.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,65))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10'])
plt.savefig('box_plot_cluster_3_enhancers_rpkm.png')
plt.clf()

# Cluster tfs 1 e-12
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D2'])
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D5'])
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D10'])


scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D5'])
scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D10'])

scipy.stats.ranksums(cluster3_enhancers['ES_D5'],cluster3_enhancers['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers['ES_D5'],cluster3_enhancers['ES_D10'])

scipy.stats.ranksums(cluster3_enhancers['ES_D7'],cluster3_enhancers['ES_D10'])


## Analysis of only RNA-seq
# 1. Z-score Standardize for each cell line to see important TF's
scaler = preprocessing.StandardScaler()
norm = scaler.fit_transform(tf_scaled_ordered.values)
tf_scaled_std = pd.DataFrame(data=norm, columns=list(tf_scaled_ordered.columns.values), index = tf_scaled_ordered.index )

# Seaborn settings
sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0})

# Cluster Heatmap
sns.set_context("paper")
hmap = sns.clustermap(tf_scaled_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors))
plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.savefig('final_full_cluster_heatmap_rna-seq.png')

labels = [item.get_text() for item in hmap.ax_heatmap.yaxis.get_majorticklabels()]
labels.reverse()
with  open("final_full_cluster_heatmap_rna-seq.csv", 'wb') as csv_file:
    wr = csv.writer(csv_file,dialect='excel',quoting=csv.QUOTE_ALL)
    for tf in labels:
        wr.writerow([tf,])

# 2. Reorder based on clustering
reorder_clustering = tf_scaled_std.columns.values[hmap.dendrogram_col.reordered_ind]
tf_scaled_std_ordered = tf_scaled_std[reorder_clustering]
reindex_cluserting = tf_scaled_std.index.values[hmap.dendrogram_row.reordered_ind]
tf_scaled_std_ordered = tf_scaled_std_ordered.reindex(reindex_cluserting)
tf_scaled_std_ordered.to_csv("final_full_cluster_z_score-rnaseq.csv", encoding='utf-8')


## Analysis of only GRO-seq data
# 1. Make Score Matrix
## Enhancers RPKM x Motif Enhancers
motif_cell_line = motif_enhancers_scaled.dot(rpkm_robust_filtered)
needed_rows = [row for row in motif_cell_line.index if row in list(tf_scaled.index)]
motif_cell_line_filtered_tfs = motif_cell_line.loc[needed_rows]
motif_cell_line_filtered_tfs.columns =  ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
motif_cell_line_filtered_tfs = motif_cell_line_filtered_tfs[reorder]

# 2. Z-score Standardize for each cell line to see important TF's
scaler = preprocessing.StandardScaler()
norm = scaler.fit_transform(motif_cell_line_filtered_tfs.values)
motif_cell_line_filtered_tfs_std = pd.DataFrame(data=norm, columns=list(motif_cell_line_filtered_tfs.columns.values), index = motif_cell_line_filtered_tfs.index )

# Seaborn settings
sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0})

# Cluster Heatmap
sns.set_context("paper")
hmap = sns.clustermap(motif_cell_line_filtered_tfs_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors))
plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.savefig('final_full_cluster_heatmap_gro-seq.png')

labels = [item.get_text() for item in hmap.ax_heatmap.yaxis.get_majorticklabels()]
labels.reverse()
with  open("final_full_cluster_heatmap_gro-seq.csv", 'wb') as csv_file:
    wr = csv.writer(csv_file,dialect='excel',quoting=csv.QUOTE_ALL)
    for tf in labels:
        wr.writerow([tf,])

# 3. Reorder based on clustering
reorder_clustering = motif_cell_line_filtered_tfs_std.columns.values[hmap.dendrogram_col.reordered_ind]
motif_cell_line_filtered_tfs_std_ordered = motif_cell_line_filtered_tfs_std[reorder_clustering]
reindex_cluserting = motif_cell_line_filtered_tfs_std.index.values[hmap.dendrogram_row.reordered_ind]
motif_cell_line_filtered_tfs_std_ordered = motif_cell_line_filtered_tfs_std_ordered.reindex(reindex_cluserting)
motif_cell_line_filtered_tfs_std_ordered.to_csv("final_full_cluster_z_score-groseq.csv", encoding='utf-8')

## Analysis of only GRO-seq data + RNA-seq

# 1. Make Score Matrix
## Enhancers RPKM x Motif Enhancers
motif_cell_line = motif_enhancers_scaled.dot(rpkm_robust_filtered)
needed_rows = [row for row in motif_cell_line.index if row in list(tf_scaled.index)]
motif_cell_line_filtered_tfs = motif_cell_line.loc[needed_rows]
motif_cell_line_filtered_tfs.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
motif_cell_line_filtered_tfs = motif_cell_line_filtered_tfs[reorder]

# reindex
tf_scaled_ordered = tf_scaled.reindex(list(motif_cell_line_filtered_tfs.index))
tf_scaled_ordered = tf_scaled_ordered[reorder]

# 2. .multiply() to to Element-by-element multiplication Score Enhancers by TF
cell_tf_values = motif_cell_line_filtered_tfs.multiply(tf_scaled_ordered)
cell_tf_values.columns = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7',  'ES_D10']
cell_tf_values_colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"]

# 3. Z-score Standardize for each cell line to see important TF's
scaler = preprocessing.StandardScaler()
norm = scaler.fit_transform(cell_tf_values.values)
cell_tf_values_std = pd.DataFrame(data=norm, columns=list(cell_tf_values.columns.values), index = cell_tf_values.index )


# Seaborn settings
sns.axes_style({'image.cmap': u'Blacks','lines.linewidth': 100.0})

# Cluster Heatmap
sns.set_context("paper")
hmap = sns.clustermap(cell_tf_values_std,xticklabels=True, yticklabels=True, cmap="RdBu_r", method = "complete", metric = "euclidean", figsize=(20, 20), col_colors=sns.color_palette(cell_tf_values_colors))
plt.setp(hmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.savefig('final_full_cluster_heatmap_gro_rna.png')

labels = [item.get_text() for item in hmap.ax_heatmap.yaxis.get_majorticklabels()]
labels.reverse()
with  open("final_full_cluster_heatmap_gro_rna.csv", 'wb') as csv_file:
    wr = csv.writer(csv_file,dialect='excel',quoting=csv.QUOTE_ALL)
    for tf in labels:
        wr.writerow([tf,])


# 4. Reorder based on clustering
reorder_clustering = cell_tf_values_std.columns.values[hmap.dendrogram_col.reordered_ind]
cell_tf_values_std_ordered = cell_tf_values_std[reorder_clustering]
reindex_cluserting = cell_tf_values_std.index.values[hmap.dendrogram_row.reordered_ind]
cell_tf_values_std_ordered = cell_tf_values_std_ordered.reindex(reindex_cluserting)
cell_tf_values_std_ordered.to_csv("final_full_cluster_z_score_gro_rna.csv", encoding='utf-8')