Skip to content
Snippets Groups Projects
Commit 662ba080 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Update matrix analysis to look at TFS and clossest genes and enhancers expression.

parent 1dd72c6a
Branches
No related merge requests found
GRO_seq_TFSEE/box_plot_cluster_3_enhancers_rpkm.png

12.7 KiB | W: | H:

GRO_seq_TFSEE/box_plot_cluster_3_enhancers_rpkm.png

11.4 KiB | W: | H:

GRO_seq_TFSEE/box_plot_cluster_3_enhancers_rpkm.png
GRO_seq_TFSEE/box_plot_cluster_3_enhancers_rpkm.png
GRO_seq_TFSEE/box_plot_cluster_3_enhancers_rpkm.png
GRO_seq_TFSEE/box_plot_cluster_3_enhancers_rpkm.png
  • 2-up
  • Swipe
  • Onion skin
GRO_seq_TFSEE/box_plot_cluster_4_enhancers_rpkm.png

12.5 KiB | W: | H:

GRO_seq_TFSEE/box_plot_cluster_4_enhancers_rpkm.png

12.9 KiB | W: | H:

GRO_seq_TFSEE/box_plot_cluster_4_enhancers_rpkm.png
GRO_seq_TFSEE/box_plot_cluster_4_enhancers_rpkm.png
GRO_seq_TFSEE/box_plot_cluster_4_enhancers_rpkm.png
GRO_seq_TFSEE/box_plot_cluster_4_enhancers_rpkm.png
  • 2-up
  • Swipe
  • Onion skin
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
# Find nearest genes
# Grab TF FPKM levels
fpkm = pd.read_table("rna.tsv")
gene_names_mapping = pd.read_csv("../gencode.v19.annotation_protein_coding_ids.txt",names=['gene_id', 'symbol'])
fpkm_symbol = fpkm.merge(gene_names_mapping)
fpkm_symbol = fpkm_symbol.set_index(['gene_id'])
# Enhancers
enhancers_universe = pd.DataFrame.from_csv("GRO-seq_enhancers.bed", sep="\t", header=None, index_col=3)
# Read in cluster 4 enhancers
cluster_4 = pd.DataFrame.from_csv("cluster_4_enhancers.csv", sep=",", header=0, index_col=0)
# Choose enhacners exprssed in cluster 4
enhancers_universe_cluster_4 = enhancers_universe.loc[cluster_4.index.values]
enhancers_universe_cluster_4.to_csv("cluster_4_enhancers_locations.bed", sep="\t",header=None, index=False)
# Read in nearest genes
genes_id = pd.DataFrame.from_csv("cluster_4_genes.txt", sep="\t", header=None, index_col=None)
needed_rows = [row for row in fpkm_symbol.index if row in genes_id[0].values]
cluster4_genes_expressed = fpkm_symbol.loc[needed_rows]
# col_colors
plt.style.use('classic')
colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"]
medianprops = dict(linestyle='-', linewidth=2, color='black')
box = cluster4_genes_expressed.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5)
for patch, color in zip(box['boxes'], colors):
patch.set_facecolor(color)
plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,60))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'])
plt.savefig('box_plot_cluster_4_genes_fpkm.png')
plt.clf()
# Cluster tfs 1 e-4
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D2'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D5'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D2'],cluster4_genes_expressed['ES_D5'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D2'],cluster4_genes_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D2'],cluster4_genes_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D5'],cluster4_genes_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D5'],cluster4_genes_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D7'],cluster4_genes_expressed['ES_D10'])
# Read in cluster 3 enhancers
cluster_3 = pd.DataFrame.from_csv("cluster_3_enhancers.csv", sep=",", header=0, index_col=0)
# Choose enhacners exprssed in cluster 3
enhancers_universe_cluster_3 = enhancers_universe.loc[cluster_3.index.values]
enhancers_universe_cluster_3.to_csv("cluster_3_enhancers_locations.bed", sep="\t",header=None, index=False)
# Read in nearest genes
genes_id = pd.DataFrame.from_csv("cluster_4_genes.txt", sep="\t", header=None, index_col=None)
needed_rows = [row for row in fpkm_symbol.index if row in genes_id[0].values]
cluster4_genes_expressed = fpkm_symbol.loc[needed_rows]
# col_colors
plt.style.use('classic')
colors = ["#FFD66F","#2E6A44","#862743", "#4FA6C7", "#3398CC"]
medianprops = dict(linestyle='-', linewidth=2, color='black')
box = cluster4_genes_expressed.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5)
for patch, color in zip(box['boxes'], colors):
patch.set_facecolor(color)
plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,60))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'])
plt.savefig('box_plot_cluster_4_genes_fpkm.png')
plt.clf()
# Cluster tfs 1 e-4
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D2'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D5'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D0'],cluster4_genes_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D2'],cluster4_genes_expressed['ES_D5'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D2'],cluster4_genes_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D2'],cluster4_genes_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D5'],cluster4_genes_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D5'],cluster4_genes_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_genes_expressed['ES_D7'],cluster4_genes_expressed['ES_D10'])
#Closest genes Cluster 4
bedtools sort -i cluster_4_enhancers_locations.bed | bedtools closest -a - -b gencode.v19.annotation_protein_coding_sorted.gtf | cut -f12 | cut -f1 -d ';' | cut -f2 -d ' ' | sort | uniq | sed 's/"//g' > cluster_4_genes.txt
#Closest genes Cluster 3
bedtools sort -i cluster_3_enhancers_locations.bed | bedtools closest -a - -b gencode.v19.annotation_protein_coding_sorted.gtf | cut -f12 | cut -f1 -d ';' | cut -f2 -d ' ' | sort | uniq | sed 's/"//g' > cluster_3_genes.txt
This diff is collapsed.
This diff is collapsed.
......@@ -607,7 +607,7 @@ plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'])
plt.savefig('box_plot_cluster_4_tfs_fpkm.png')
plt.clf()
# Cluster tfs 1 e-3
# Cluster tfs 1 e-12
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D2'])
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D5'])
scipy.stats.ranksums(cluster4_tfs['ES_D0'],cluster4_tfs['ES_D7'])
......@@ -627,12 +627,15 @@ scipy.stats.ranksums(cluster4_tfs['ES_D7'],cluster4_tfs['ES_D10'])
cluster4_motifs = motif_enhancers.loc[cell_tf_values_std_cluster_4.index.values]
cluster4_enhancers = only_rpkm_values.loc[cluster4_motifs.loc[:, (cluster4_motifs != 0).all(axis=0)].columns.values]
cluster4_enhancers.to_csv("cluster_4_enhancers.csv")
cluster4_enhancers = only_rpkm_values.loc[cluster4_motifs.loc[:, (cluster4_motifs >1).any(axis=0)].columns.values]
cluster_4_expressed_enhancers = np.concatenate((rpkm_ssp[(rpkm_ssp['ES_D7'] >= 0.5) | (rpkm_ssp['ES_D10'] >= 0.5)].index.values,rpkm_sunp[(rpkm_sunp['ES_D7'] >= 1) | (rpkm_sunp['ES_D10'] >=1)].index.values))
needed_rows = [row for row in cluster4_enhancers.index if row in cluster_4_expressed_enhancers]
cluster4_enhancers_expressed = cluster4_enhancers.loc[needed_rows]
cluster4_enhancers_expressed.to_csv("cluster_4_enhancers.csv")
box = cluster4_enhancers.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)
box = cluster4_enhancers_expressed.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5)
for patch, color in zip(box['boxes'], colors):
patch.set_facecolor(color)
......@@ -641,26 +644,26 @@ plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,105))
plt.ylim((-5,75))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'])
plt.savefig('box_plot_cluster_4_enhancers_rpkm.png')
plt.clf()
# Cluster tfs 1 e-4
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D2'])
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D5'])
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers['ES_D0'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D0'],cluster4_enhancers_expressed['ES_D2'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D0'],cluster4_enhancers_expressed['ES_D5'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D0'],cluster4_enhancers_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D0'],cluster4_enhancers_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D5'])
scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers['ES_D2'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D2'],cluster4_enhancers_expressed['ES_D5'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D2'],cluster4_enhancers_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D2'],cluster4_enhancers_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers['ES_D5'],cluster4_enhancers['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers['ES_D5'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D5'],cluster4_enhancers_expressed['ES_D7'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D5'],cluster4_enhancers_expressed['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers['ES_D7'],cluster4_enhancers['ES_D10'])
scipy.stats.ranksums(cluster4_enhancers_expressed['ES_D7'],cluster4_enhancers_expressed['ES_D10'])
# Look at Cluster 3 for expression of TF's
......@@ -703,12 +706,15 @@ scipy.stats.ranksums(cluster3_tfs['ES_D7'],cluster3_tfs['ES_D10'])
cluster3_motifs = motif_enhancers.loc[cell_tf_values_std_cluster_3.index.values]
cluster3_enhancers = only_rpkm_values.loc[cluster3_motifs.loc[:, (cluster3_motifs != 0).all(axis=0)].columns.values]
cluster3_enhancers.to_csv("cluster_3_enhancers.csv")
cluster3_enhancers = only_rpkm_values.loc[cluster3_motifs.loc[:, (cluster3_motifs >1).any(axis=0)].columns.values]
cluster_3_expressed_enhancers = np.concatenate((rpkm_ssp[(rpkm_ssp['ES_D0'] >= 0.5) | (rpkm_ssp['ES_D2'] >= 0.5) | (rpkm_ssp['ES_D5'] >= 0.5)].index.values,rpkm_sunp[(rpkm_sunp['ES_D0'] >= 1) | (rpkm_sunp['ES_D2'] >=1) | (rpkm_sunp['ES_D5'] >=1)].index.values))
needed_rows = [row for row in cluster3_enhancers.index if row in cluster_3_expressed_enhancers]
cluster3_enhancers_expressed = cluster3_enhancers.loc[needed_rows]
cluster3_enhancers_expressed.to_csv("cluster_3_enhancers.csv")
box = cluster3_enhancers.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 3)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 3)
box = cluster3_enhancers_expressed.boxplot(column=['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'],patch_artist=True,showfliers=False,manage_xticks=False,widths = 0.6, medianprops = medianprops)
plt.setp(box['whiskers'], color='k', linestyle='-', linewidth = 5)
plt.setp(box['boxes'], color='k', linestyle='-', linewidth = 5)
for patch, color in zip(box['boxes'], colors):
patch.set_facecolor(color)
......@@ -717,26 +723,26 @@ plt.tick_params(axis='y', direction='out')
plt.tick_params(axis='x', direction='out')
plt.tick_params(top='off', right='off')
plt.grid(b=False)
plt.ylim((-5,65))
plt.ylim((-5,55))
plt.xticks([1,2,3,4,5], ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10'])
plt.savefig('box_plot_cluster_3_enhancers_rpkm.png')
plt.clf()
# Cluster tfs 1 e-12
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D2'])
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D5'])
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers['ES_D0'],cluster3_enhancers['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D0'],cluster3_enhancers_expressed['ES_D2'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D0'],cluster3_enhancers_expressed['ES_D5'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D0'],cluster3_enhancers_expressed['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D0'],cluster3_enhancers_expressed['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D5'])
scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers['ES_D2'],cluster3_enhancers['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D2'],cluster3_enhancers_expressed['ES_D5'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D2'],cluster3_enhancers_expressed['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D2'],cluster3_enhancers_expressed['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers['ES_D5'],cluster3_enhancers['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers['ES_D5'],cluster3_enhancers['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D5'],cluster3_enhancers_expressed['ES_D7'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D5'],cluster3_enhancers_expressed['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers['ES_D7'],cluster3_enhancers['ES_D10'])
scipy.stats.ranksums(cluster3_enhancers_expressed['ES_D7'],cluster3_enhancers_expressed['ES_D10'])
## Analysis of only RNA-seq
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment