diff --git a/GRO_seq_TFSEE/rank_order.py b/GRO_seq_TFSEE/rank_order.py index c55da37dc554c6e7b302f3c276ed9fa380863f1e..d633ec27fa558737193c029ec3f77d7e772199c4 100644 --- a/GRO_seq_TFSEE/rank_order.py +++ b/GRO_seq_TFSEE/rank_order.py @@ -59,3 +59,54 @@ plt.xlabel('Rank Order',fontsize=8, fontweight='bold') plt.ylabel('delta z',fontsize=8, fontweight='bold') plt.savefig('cluster3_enriched_tfs.png') plt.clf() + +tfsee_cluster2 = tfsee[tfsee['cluster'] == 1] + + +tfsee_cluster2['early'] = tfsee_cluster2[['ES_D0','ES_D2','ES_D7','ES_D10']].mean(axis=1) +tfsee_cluster2['late'] = tfsee_cluster2[['ES_D5']].mean(axis=1) +tfsee_cluster2['diff'] = tfsee_cluster2['late'] - tfsee_cluster2['early'] +tfsee_cluster2['rank'] = tfsee_cluster2['diff'].rank() + +x = list(tfsee_cluster2['rank']) + +z = np.polyfit(tfsee_cluster2['rank'], tfsee_cluster2['diff'], 3) +f = np.poly1d(z) +x_new = np.linspace(1, 24, num=len(x)*10) + + +plt.figure(figsize=(25,20)) +plt.plot(x_new, f(x_new), color = 'k', linewidth=4.0) +plt.scatter(x=tfsee_cluster2['rank'], y=tfsee_cluster2['diff'], color='#C42555', s=600) +plt.ylim([-0.5,3.0]) +plt.suptitle('GT TFS', fontsize=8, fontweight='bold') +plt.xlabel('Rank Order',fontsize=8, fontweight='bold') +plt.ylabel('delta z',fontsize=8, fontweight='bold') +plt.savefig('cluster2_enriched_tfs.png') +plt.clf() + + +tfsee_cluster1 = tfsee[tfsee['cluster'] == 0] + + +tfsee_cluster1['early'] = tfsee_cluster1[['ES_D0','ES_D2','ES_D7','ES_D10']].mean(axis=1) +tfsee_cluster1['late'] = tfsee_cluster1[['ES_D5']].mean(axis=1) +tfsee_cluster1['diff'] = tfsee_cluster1['early'] - tfsee_cluster1['late'] +tfsee_cluster1['rank'] = tfsee_cluster1['diff'].rank() + +x = list(tfsee_cluster1['rank']) + +z = np.polyfit(tfsee_cluster1['rank'], tfsee_cluster1['diff'], 3) +f = np.poly1d(z) +x_new = np.linspace(1, 64, num=len(x)*10) + + +plt.figure(figsize=(25,20)) +plt.plot(x_new, f(x_new), color = 'k', linewidth=4.0) +plt.scatter(x=tfsee_cluster1['rank'], y=tfsee_cluster1['diff'], color='#DACF5D', s=600) +plt.ylim([-1.0,1.5]) +plt.suptitle('Not GT TFS', fontsize=8, fontweight='bold') +plt.xlabel('Rank Order',fontsize=8, fontweight='bold') +plt.ylabel('delta z',fontsize=8, fontweight='bold') +plt.savefig('cluster1_enriched_tfs.png') +plt.clf() diff --git a/Histone_TFSEE/box_plot_cluster_2_genes_fpkm.png b/Histone_TFSEE/box_plot_cluster_2_genes_fpkm.png new file mode 100644 index 0000000000000000000000000000000000000000..d4032c391d7e7df8e875065f4acbcd6622385983 Binary files /dev/null and b/Histone_TFSEE/box_plot_cluster_2_genes_fpkm.png differ diff --git a/Histone_TFSEE/box_plot_cluster_3_genes_fpkm.png b/Histone_TFSEE/box_plot_cluster_3_genes_fpkm.png new file mode 100644 index 0000000000000000000000000000000000000000..5bfd380cb29770e43db70f3d4fe98e819a8cc092 Binary files /dev/null and b/Histone_TFSEE/box_plot_cluster_3_genes_fpkm.png differ diff --git a/Histone_TFSEE/closest_genes.py b/Histone_TFSEE/closest_genes.py new file mode 100644 index 0000000000000000000000000000000000000000..9130809588df73b49cb782b4399d0d541685b890 --- /dev/null +++ b/Histone_TFSEE/closest_genes.py @@ -0,0 +1,124 @@ +import pandas as pd +import numpy as np +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import scipy + +# Find nearest genes + +# Grab TF FPKM levels +fpkm = pd.read_table("rna.tsv") +gene_names_mapping = pd.read_csv("../gencode.v19.annotation_protein_coding_ids.txt",names=['gene_id', 'symbol']) +fpkm_symbol = fpkm.merge(gene_names_mapping) +fpkm_symbol = fpkm.set_index(['gene_id']) + +# Enhancers +enhancers_universe = pd.DataFrame.from_csv("Histone_pe_filtered_peaks.bed", sep="\t", header=None, index_col=3) + + + + +# Read in cluster 3 enhancers +cluster_3 = pd.DataFrame.from_csv("cluster_3_enhancers.csv", sep=",", header=0, index_col=0) + +# Choose enhacners exprssed in cluster 3 +enhancers_universe_cluster_3 = enhancers_universe.loc[cluster_3.index.values] +enhancers_universe_cluster_3.to_csv("cluster_3_enhancers_locations.bed", sep="\t",header=None, index=False) + + +# Read in nearest genes +genes_id = pd.DataFrame.from_csv("cluster_3_genes.txt", sep="\t", header=None, index_col=None) + +needed_rows = [row for row in fpkm_symbol.index if row in genes_id[0].values] +cluster3_genes_expressed = fpkm_symbol.loc[needed_rows] + + +# col_colors +plt.style.use('classic') +colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"] +medianprops = dict(linestyle='-', linewidth=12, color='black') +box = cluster3_genes_expressed.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) +plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 12) +plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 12) + +for patch, color in zip(box['boxes'], colors): + patch.set_facecolor(color) + +plt.tick_params(axis='y', direction='out') +plt.tick_params(axis='x', direction='out') +plt.tick_params(top='off', right='off') +plt.grid(b=False) +plt.ylim((-5,70)) +plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) +plt.savefig('box_plot_cluster_3_genes_fpkm.png') +plt.clf() + +# Cluster tfs 1 0.05 +scipy.stats.ranksums(cluster3_genes_expressed['ES_D0'],cluster3_genes_expressed['ES_D2']) +scipy.stats.ranksums(cluster3_genes_expressed['ES_D0'],cluster3_genes_expressed['ES_D5']) +scipy.stats.ranksums(cluster3_genes_expressed['ES_D0'],cluster3_genes_expressed['ES_D7']) +scipy.stats.ranksums(cluster3_genes_expressed['ES_D0'],cluster3_genes_expressed['ES_D10']) + + +scipy.stats.ranksums(cluster3_genes_expressed['ES_D2'],cluster3_genes_expressed['ES_D5']) +scipy.stats.ranksums(cluster3_genes_expressed['ES_D2'],cluster3_genes_expressed['ES_D7']) +scipy.stats.ranksums(cluster3_genes_expressed['ES_D2'],cluster3_genes_expressed['ES_D10']) + +scipy.stats.ranksums(cluster3_genes_expressed['ES_D5'],cluster3_genes_expressed['ES_D7']) +scipy.stats.ranksums(cluster3_genes_expressed['ES_D5'],cluster3_genes_expressed['ES_D10']) + +scipy.stats.ranksums(cluster3_genes_expressed['ES_D7'],cluster3_genes_expressed['ES_D10']) + + + +# Read in cluster 2 enhancers +cluster_2 = pd.DataFrame.from_csv("cluster_2_enhancers.csv", sep=",", header=0, index_col=0) + +# Choose enhacners exprssed in cluster 2 +enhancers_universe_cluster_2 = enhancers_universe.loc[cluster_2.index.values] +enhancers_universe_cluster_2.to_csv("cluster_2_enhancers_locations.bed", sep="\t",header=None, index=False) + + +# Read in nearest genes +genes_id = pd.DataFrame.from_csv("cluster_2_genes.txt", sep="\t", header=None, index_col=None) + +needed_rows = [row for row in fpkm_symbol.index if row in genes_id[0].values] +cluster2_genes_expressed = fpkm_symbol.loc[needed_rows] + + +# col_colors +plt.style.use('classic') +colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"] +medianprops = dict(linestyle='-', linewidth=12, color='black') +box = cluster2_genes_expressed.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops) +plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 12) +plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 12) + +for patch, color in zip(box['boxes'], colors): + patch.set_facecolor(color) + +plt.tick_params(axis='y', direction='out') +plt.tick_params(axis='x', direction='out') +plt.tick_params(top='off', right='off') +plt.grid(b=False) +plt.ylim((-5,70)) +plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']) +plt.savefig('box_plot_cluster_2_genes_fpkm.png') +plt.clf() + +# Cluster tfs 1 e-4 +scipy.stats.ranksums(cluster2_genes_expressed['ES_D0'],cluster2_genes_expressed['ES_D2']) +scipy.stats.ranksums(cluster2_genes_expressed['ES_D0'],cluster2_genes_expressed['ES_D5']) +scipy.stats.ranksums(cluster2_genes_expressed['ES_D0'],cluster2_genes_expressed['ES_D7']) +scipy.stats.ranksums(cluster2_genes_expressed['ES_D0'],cluster2_genes_expressed['ES_D10']) + + +scipy.stats.ranksums(cluster2_genes_expressed['ES_D2'],cluster2_genes_expressed['ES_D5']) +scipy.stats.ranksums(cluster2_genes_expressed['ES_D2'],cluster2_genes_expressed['ES_D7']) +scipy.stats.ranksums(cluster2_genes_expressed['ES_D2'],cluster2_genes_expressed['ES_D10']) + +scipy.stats.ranksums(cluster2_genes_expressed['ES_D5'],cluster2_genes_expressed['ES_D7']) +scipy.stats.ranksums(cluster2_genes_expressed['ES_D5'],cluster2_genes_expressed['ES_D10']) + +scipy.stats.ranksums(cluster2_genes_expressed['ES_D7'],cluster2_genes_expressed['ES_D10']) diff --git a/Histone_TFSEE/closest_genes.sh b/Histone_TFSEE/closest_genes.sh new file mode 100644 index 0000000000000000000000000000000000000000..517ce845203746c9c349bb51af097ca48930c8b7 --- /dev/null +++ b/Histone_TFSEE/closest_genes.sh @@ -0,0 +1,6 @@ +#Closest genes Cluster 3 +bedtools sort -i cluster_3_enhancers_locations.bed | bedtools closest -a - -b gencode.v19.annotation_protein_coding_sorted.gtf | cut -f12 | cut -f1 -d ';' | cut -f2 -d ' ' | sort | uniq | sed 's/"//g' > cluster_3_genes.txt + + +#Closest genes Cluster 2 +bedtools sort -i cluster_2_enhancers_locations.bed | bedtools closest -a - -b gencode.v19.annotation_protein_coding_sorted.gtf | cut -f12 | cut -f1 -d ';' | cut -f2 -d ' ' | sort | uniq | sed 's/"//g' > cluster_2_genes.txt