# closest_genes.py
# Expression (FPKM) of the genes nearest to the cluster 4 and cluster 3 enhancers.
import csv
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns

# Find nearest genes

# Grab TF FPKM levels
# rna.tsv is read as a tab-separated table; the GENCODE id file has no header
# row, so its two columns are named explicitly.
fpkm = pd.read_csv("rna.tsv", sep="\t")
gene_names_mapping = pd.read_csv(
    "../gencode.v19.annotation_protein_coding_ids.txt",
    names=['gene_id', 'symbol'],
)
# No `on=` is given, so merge() joins on whatever column names the two tables
# share (inner join); the joined table is then indexed by gene_id.
fpkm_symbol = fpkm.merge(gene_names_mapping).set_index('gene_id')

# Enhancers
# Load the full enhancer set from a headerless, tab-separated BED file and
# index the rows by column 3 (0-based) -- presumably the enhancer name field;
# confirm against the BED layout.
# pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv is the
# replacement. Unlike from_csv it does not try to parse the index as dates,
# which is what we want for identifier labels.
enhancers_universe = pd.read_csv("GRO-seq_enhancers.bed", sep="\t", header=None, index_col=3)


# Read in cluster 4 enhancers
# The first CSV column becomes the index; it is used below as the set of row
# labels to pull out of enhancers_universe.
# (pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv replaces it.)
cluster_4 = pd.read_csv("cluster_4_enhancers.csv", sep=",", header=0, index_col=0)

# Choose enhancers expressed in cluster 4
enhancers_universe_cluster_4 = enhancers_universe.loc[cluster_4.index.values]
# Written without header and without the index, so only the remaining BED
# columns end up in the output file.
enhancers_universe_cluster_4.to_csv("cluster_4_enhancers_locations.bed", sep="\t", header=False, index=False)


# Read in nearest genes
# Headerless, tab-separated file; column 0 holds the gene ids that are matched
# against the gene_id index of fpkm_symbol.
# (pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv replaces it.)
genes_id = pd.read_csv("cluster_4_genes.txt", sep="\t", header=None, index_col=None)

# Vectorised membership test instead of a Python-level scan of the id array for
# every index label. Selecting with the boolean mask keeps each matching row
# exactly once, even if a gene_id occurs more than once in the index.
is_nearest_gene = fpkm_symbol.index.isin(genes_id[0])
needed_rows = fpkm_symbol.index[is_nearest_gene].tolist()
cluster4_genes_expressed = fpkm_symbol.loc[is_nearest_gene]


# Box plot of the cluster 4 nearest-gene expression, one box per time point
plt.style.use('classic')
time_points = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']
colors = ["#FFD66F", "#2E6A44", "#862743", "#4FA6C7", "#3398CC"]
medianprops = dict(linestyle='-', linewidth=2, color='black')
# return_type='dict' is required: DataFrame.boxplot returns the Axes by
# default, and the code below indexes the result by artist group name.
# The old `manage_xticks=False` keyword no longer exists in matplotlib; it is
# left out (default tick management) and the ticks are set explicitly below.
box = cluster4_genes_expressed.boxplot(
    column=time_points,
    return_type='dict',
    patch_artist=True,
    showfliers=False,
    widths=0.6,
    medianprops=medianprops,
)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth=5)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth=5)

# Fill each box with its own colour (setp above painted face and edge black).
for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
# Booleans, not 'off': a non-empty string is truthy and would keep the ticks.
plt.tick_params(top=False, right=False)
# Positional argument works regardless of whether the keyword is `b` or `visible`.
plt.grid(False)
plt.ylim((-5, 60))
plt.xticks([1, 2, 3, 4, 5], time_points)
plt.savefig('box_plot_cluster_4_genes_fpkm.png')
plt.clf()

# Pairwise rank-sum tests between all time points for the cluster 4 genes.
# NOTE(review): the original comment here read "Cluster tfs 1 e-4" -- possibly
# an intended significance threshold; confirm with the author.
# The results used to be computed and thrown away, so they are printed here.
# combinations() yields the pairs in the same order as the original calls
# (D0 vs the rest, then D2 vs the rest, ...).
for day_a, day_b in itertools.combinations(['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'], 2):
    statistic, pvalue = scipy.stats.ranksums(
        cluster4_genes_expressed[day_a],
        cluster4_genes_expressed[day_b],
    )
    print("cluster 4 {} vs {}: statistic={:.4g}, p-value={:.4g}".format(day_a, day_b, statistic, pvalue))


# Read in cluster 3 enhancers
# The first CSV column becomes the index; it is used below as the set of row
# labels to pull out of enhancers_universe.
# (pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv replaces it.)
cluster_3 = pd.read_csv("cluster_3_enhancers.csv", sep=",", header=0, index_col=0)

# Choose enhancers expressed in cluster 3
enhancers_universe_cluster_3 = enhancers_universe.loc[cluster_3.index.values]
# Written without header and without the index, so only the remaining BED
# columns end up in the output file.
enhancers_universe_cluster_3.to_csv("cluster_3_enhancers_locations.bed", sep="\t", header=False, index=False)


# Read in nearest genes
# This section previously re-read "cluster_4_genes.txt" and saved its figure
# as 'box_plot_cluster_4_genes_fpkm.png', i.e. it repeated the cluster 4
# analysis and overwrote the cluster 4 plot.
# NOTE(review): "cluster_3_genes.txt" is inferred from the cluster 4 naming
# pattern -- confirm that is the name of the cluster 3 nearest-gene file.
# (pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv replaces it.)
genes_id = pd.read_csv("cluster_3_genes.txt", sep="\t", header=None, index_col=None)

# Vectorised membership test; selecting with the boolean mask keeps each
# matching row exactly once, even if a gene_id repeats in the index.
is_nearest_gene = fpkm_symbol.index.isin(genes_id[0])
needed_rows = fpkm_symbol.index[is_nearest_gene].tolist()
cluster3_genes_expressed = fpkm_symbol.loc[is_nearest_gene]


# Box plot of the cluster 3 nearest-gene expression, one box per time point
plt.style.use('classic')
time_points = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']
colors = ["#FFD66F", "#2E6A44", "#862743", "#4FA6C7", "#3398CC"]
medianprops = dict(linestyle='-', linewidth=2, color='black')
# return_type='dict' is required: DataFrame.boxplot returns the Axes by
# default, and the code below indexes the result by artist group name.
# The old `manage_xticks=False` keyword no longer exists in matplotlib; it is
# left out (default tick management) and the ticks are set explicitly below.
box = cluster3_genes_expressed.boxplot(
    column=time_points,
    return_type='dict',
    patch_artist=True,
    showfliers=False,
    widths=0.6,
    medianprops=medianprops,
)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth=5)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth=5)

# Fill each box with its own colour (setp above painted face and edge black).
for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
# Booleans, not 'off': a non-empty string is truthy and would keep the ticks.
plt.tick_params(top=False, right=False)
# Positional argument works regardless of whether the keyword is `b` or `visible`.
plt.grid(False)
plt.ylim((-5, 60))
plt.xticks([1, 2, 3, 4, 5], time_points)
plt.savefig('box_plot_cluster_3_genes_fpkm.png')
plt.clf()

# Pairwise rank-sum tests between all time points for the cluster 3 genes.
# NOTE(review): the original comment here read "Cluster tfs 1 e-4" -- possibly
# an intended significance threshold; confirm with the author.
# The results used to be computed and thrown away, so they are printed here.
for day_a, day_b in itertools.combinations(time_points, 2):
    statistic, pvalue = scipy.stats.ranksums(
        cluster3_genes_expressed[day_a],
        cluster3_genes_expressed[day_b],
    )
    print("cluster 3 {} vs {}: statistic={:.4g}, p-value={:.4g}".format(day_a, day_b, statistic, pvalue))