Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • venkat.malladi/tfsee
  • gcrb/tfsee
Show changes
Showing
with 343 additions and 0 deletions
%% Cell type:code id: tags:
``` python
import pandas as pd
```
%% Cell type:code id: tags:
``` python
# Load the cluster 4 and cluster 3 tables; the first CSV column becomes the index.
cluster4 = pd.read_csv('cluster_4_enhancers.csv', index_col=0)
cluster3 = pd.read_csv('cluster_3_enhancers.csv', index_col=0)
```
%% Cell type:code id: tags:
``` python
# Number of distinct index labels in cluster 4.
len(set(cluster4.index))
```
%% Output
1371
%% Cell type:code id: tags:
``` python
# Number of distinct labels that occur in cluster 3 but not in cluster 4.
len(set(cluster3.index) - set(cluster4.index))
```
%% Output
1101
%% Cell type:code id: tags:
``` python
# Labels that occur in cluster 4 but not in cluster 3.
set(cluster4.index) - set(cluster3.index)
```
%% Output
{'SSP_236665',
'SSP_236666',
'SSP_313311',
'SUNP_1253075',
'SUNP_333951',
'SUNP_415522',
'SUNP_926025'}
%% Cell type:code id: tags:
``` python
# Hand-typed figures: 1101 is the cluster-3-only count printed above.
# NOTE(review): 2465 is presumably the number of distinct cluster 3 labels
# (it equals 1101 + 1364) - confirm with len(set(cluster3.index)).
# The difference is then the number of labels shared by both clusters.
2465-1101
```
%% Output
1364
%% Cell type:code id: tags:
``` python
# Cluster-3-only count as a fraction of 2465 (presumably all distinct
# cluster 3 labels, see the note on the previous cell).
1101/2465.0
```
%% Output
0.4466531440162272
%% Cell type:code id: tags:
``` python
# Row count of cluster 4; it equals the distinct-label count above, so the
# cluster 4 index holds no repeated labels.
len(cluster4.index)
```
%% Output
1371
%% Cell type:code id: tags:
``` python
# Cluster 4 labels minus the 7 listed above as missing from cluster 3,
# i.e. the cluster 4 labels that also occur in cluster 3.
1371-7
```
%% Output
1364
%% Cell type:code id: tags:
``` python
# Fraction of cluster 4 labels that also occur in cluster 3.
1364/1371.0
```
%% Output
0.9948942377826404
%% Cell type:code id: tags:
``` python
```
analysis/GRO_seq_TFSEE/box_plot_cluster_1.png

12.1 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_1_enhancers_rpkm.png

12.6 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_1_genes_fpkm.png

11.6 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_1_tfs_fpkm.png

11.6 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_2.png

13.9 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_2_enhancers_rpkm.png

14.1 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_2_genes_fpkm.png

11.4 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_2_tfs_fpkm.png

12 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_3.png

9.87 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_3_genes_fpkm.png

11.1 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_3_tfs_fpkm.png

12 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_4.png

10.1 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_4_genes_fpkm.png

12 KiB

analysis/GRO_seq_TFSEE/box_plot_cluster_4_tfs_fpkm.png

12.6 KiB

import csv
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
# Expression of the genes nearest to each enhancer cluster.
#
# For every cluster listed in CLUSTERS the script
#   1. writes the coordinates of the cluster's enhancers to
#      cluster_<n>_enhancers_locations.bed,
#   2. reads the nearest-gene IDs from cluster_<n>_genes.txt,
#   3. saves a box plot of those genes' FPKM per time point to
#      box_plot_cluster_<n>_genes_fpkm.png, and
#   4. prints a Wilcoxon rank-sum test for every pair of time points.
#
# NOTE: cluster_<n>_genes.txt is not written by this script. It has to be
# generated from the BED file of step 1 (the accompanying `bedtools closest`
# commands do this), so on a fresh checkout the first run stops at step 2.

# Time-point columns of rna.tsv, in plotting order.
TIME_POINTS = ('ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10')
# Fill colour of each time point's box, aligned with TIME_POINTS.
BOX_COLORS = ("#FFD66F", "#2E6A44", "#862743", "#4FA6C7", "#3398CC")
# (cluster number, upper y-axis limit of its plot), in processing order.
# The original annotations next to the rank-sum tests read "0.05" for
# cluster 4 and "1 e-4" for clusters 3, 1 and 2 - presumably p-value
# thresholds or observed values; TODO confirm.
CLUSTERS = ((4, 60), (3, 50), (1, 55), (2, 50))


def select_expressed_genes(fpkm_by_gene, gene_ids):
    """Return the rows of *fpkm_by_gene* whose index label is in *gene_ids*.

    Rows keep the order they have in *fpkm_by_gene*. A boolean mask is used
    instead of a label list passed to ``.loc`` so that a label occurring k
    times in the index yields k rows (not k * k) and the lookup is linear.
    """
    return fpkm_by_gene[fpkm_by_gene.index.isin(gene_ids)]


def pairwise_ranksums(frame, columns=TIME_POINTS):
    """Run ``scipy.stats.ranksums`` on every unordered pair of *columns*.

    Returns a list of ``(first, second, statistic, pvalue)`` tuples ordered
    as ``itertools.combinations(columns, 2)`` yields the pairs.
    """
    results = []
    for first, second in itertools.combinations(columns, 2):
        statistic, pvalue = scipy.stats.ranksums(frame[first], frame[second])
        results.append((first, second, float(statistic), float(pvalue)))
    return results


def plot_expression_boxes(frame, y_max, out_png):
    """Save a box plot of the TIME_POINTS columns of *frame* to *out_png*.

    The y-axis runs from -5 to *y_max*; outliers are not drawn. The current
    figure is cleared afterwards so the next plot starts empty.
    """
    plt.style.use('classic')
    # Matplotlib is called directly: DataFrame.boxplot returns an Axes by
    # default in current pandas, not the artist dict the styling below needs.
    samples = [frame[column].dropna().values for column in TIME_POINTS]
    # manage_ticks is the Matplotlib >= 3.1 name of the former manage_xticks.
    box = plt.boxplot(
        samples,
        patch_artist=True,
        showfliers=False,
        manage_ticks=False,
        widths=0.6,
        medianprops=dict(linestyle='-', linewidth=2, color='black'),
    )
    plt.setp(box['whiskers'], color='k', linestyle='-', linewidth=5)
    plt.setp(box['boxes'], color='k', linestyle='-', linewidth=5)
    for patch, color in zip(box['boxes'], BOX_COLORS):
        patch.set_facecolor(color)
    plt.tick_params(axis='y', direction='out')
    plt.tick_params(axis='x', direction='out')
    # Booleans, not 'off': current Matplotlib treats that string as truthy.
    plt.tick_params(top=False, right=False)
    # Positional flag works whether the keyword is spelled `b` or `visible`.
    plt.grid(False)
    plt.ylim((-5, y_max))
    plt.xticks(list(range(1, len(TIME_POINTS) + 1)), list(TIME_POINTS))
    plt.savefig(out_png)
    plt.clf()


def analyse_cluster(cluster_id, y_max, fpkm_by_gene, enhancers_universe):
    """Export, plot and test one enhancer cluster.

    Returns ``(expressed, comparisons)``: the FPKM rows of the cluster's
    nearest genes and the output of ``pairwise_ranksums`` on them.
    """
    prefix = "cluster_{}".format(cluster_id)

    # Read in the cluster's enhancers
    cluster = pd.read_csv(prefix + "_enhancers.csv", sep=",", header=0, index_col=0)
    # Choose the enhancers expressed in this cluster. index=False drops the
    # name column that serves as the index, so it is not written to the BED.
    locations = enhancers_universe.loc[cluster.index.values]
    locations.to_csv(prefix + "_enhancers_locations.bed", sep="\t", header=False, index=False)

    # Read in nearest genes (first column of the headerless file)
    genes = pd.read_csv(prefix + "_genes.txt", sep="\t", header=None, index_col=None)
    expressed = select_expressed_genes(fpkm_by_gene, genes[0].values)

    plot_expression_boxes(expressed, y_max, "box_plot_" + prefix + "_genes_fpkm.png")
    return expressed, pairwise_ranksums(expressed)


def main():
    """Run the nearest-gene expression analysis for every cluster."""
    # Grab FPKM levels, indexed by gene ID.
    # NOTE(review): the original also merged rna.tsv with
    # ../gencode.v19.annotation_protein_coding_ids.txt (gene_id, symbol) but
    # overwrote the result on the next line, so the mapping never reached any
    # output. The dead merge is dropped here; use
    # fpkm.merge(mapping).set_index(['gene_id']) if symbols are wanted.
    fpkm = pd.read_csv("rna.tsv", sep="\t")
    fpkm_by_gene = fpkm.set_index(['gene_id'])

    # Enhancers, indexed by the fourth column of the BED file.
    enhancers_universe = pd.read_csv("GRO-seq_enhancers.bed", sep="\t", header=None, index_col=3)

    for cluster_id, y_max in CLUSTERS:
        _, comparisons = analyse_cluster(cluster_id, y_max, fpkm_by_gene, enhancers_universe)
        # The original evaluated the tests and discarded the results.
        for first, second, statistic, pvalue in comparisons:
            print("cluster {}: {} vs {}: statistic={:.4g}, p-value={:.4g}".format(
                cluster_id, first, second, statistic, pvalue))


if __name__ == "__main__":
    main()
# Closest genes for clusters 4, 3, 1 and 2 (same order as before).
#
# For each cluster: sort the enhancer BED file, pair every enhancer with its
# closest feature in the protein-coding annotation, and write the de-duplicated
# gene IDs to cluster_<n>_genes.txt.
annotation=gencode.v19.annotation_protein_coding_sorted.gtf

for cluster in 4 3 1 2; do
    # cut -f12          : 12th tab-separated field of the bedtools output
    #                     (presumably the GTF attribute column - assumes a
    #                     3-column BED input; confirm)
    # cut -f1 -d ';'    : first ';'-separated entry of that field
    # cut -f2 -d ' '    : second space-separated token, i.e. the entry's value
    # sed 's/"//g'      : strip the double quotes around the value
    bedtools sort -i "cluster_${cluster}_enhancers_locations.bed" \
        | bedtools closest -a - -b "$annotation" \
        | cut -f12 \
        | cut -f1 -d ';' \
        | cut -f2 -d ' ' \
        | sort \
        | uniq \
        | sed 's/"//g' \
        > "cluster_${cluster}_genes.txt"
done