"""Box-plot and rank-sum comparison of FPKM levels for cluster 3/4 genes.

For each enhancer cluster the script:

1. writes the locations of the cluster's enhancers to a BED file,
2. selects, from the FPKM table, the genes listed in the cluster's
   nearest-genes file,
3. saves a box plot of their FPKM values across the time points, and
4. prints a Wilcoxon rank-sum test for every pair of time points.

All input and output paths are relative to the current working directory.
"""
import csv
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # `import scipy` alone does not guarantee the submodule is loaded
import seaborn as sns

# Expression columns of the FPKM table, in plotting order.
TIMEPOINTS = ['ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10']
# One face colour per time point (same order as TIMEPOINTS).
BOX_COLORS = ["#FFD66F", "#2E6A44", "#862743", "#4FA6C7", "#3398CC"]


def write_cluster_enhancer_locations(enhancers, cluster_csv, out_bed):
    """Write the rows of *enhancers* that belong to one cluster to a BED file.

    Parameters
    ----------
    enhancers : pandas.DataFrame
        All enhancers, indexed by the identifier used in *cluster_csv*.
    cluster_csv : str
        Comma-separated file with a header row; its first column holds the
        enhancer identifiers of the cluster.
    out_bed : str
        Destination path; written tab-separated without header or index.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cluster, locations)``: the cluster table and the selected rows.
    """
    cluster = pd.read_csv(cluster_csv, sep=",", header=0, index_col=0)
    locations = enhancers.loc[cluster.index.values]
    locations.to_csv(out_bed, sep="\t", header=False, index=False)
    return cluster, locations


def select_expressed_genes(fpkm_by_gene, genes_path):
    """Return the rows of *fpkm_by_gene* whose index appears in *genes_path*.

    *genes_path* is a tab-separated file without header; only its first
    column is used. Row order of *fpkm_by_gene* is preserved. A boolean mask
    is used instead of a label list, so a duplicated index label is not
    returned more than once per row.
    """
    genes_id = pd.read_csv(genes_path, sep="\t", header=None, index_col=None)
    return fpkm_by_gene.loc[fpkm_by_gene.index.isin(genes_id[0])]


def plot_fpkm_boxplot(expression, out_png):
    """Save a box plot of the TIMEPOINTS columns of *expression* to *out_png*."""
    plt.style.use('classic')
    medianprops = dict(linestyle='-', linewidth=2, color='black')
    # return_type='dict' is required to get the artists back; recent pandas
    # returns the Axes by default. `manage_xticks=False` is omitted: the
    # keyword was removed from Matplotlib, and the ticks are set explicitly
    # below anyway.
    box = expression.boxplot(
        column=TIMEPOINTS,
        return_type='dict',
        patch_artist=True,
        showfliers=False,
        widths=0.6,
        medianprops=medianprops,
    )
    plt.setp(box['whiskers'], color='k', linestyle='-', linewidth=5)
    plt.setp(box['boxes'], color='k', linestyle='-', linewidth=5)
    for patch, color in zip(box['boxes'], BOX_COLORS):
        patch.set_facecolor(color)

    plt.tick_params(axis='y', direction='out')
    plt.tick_params(axis='x', direction='out')
    # Booleans instead of the string 'off', which newer Matplotlib rejects
    # or treats as truthy.
    plt.tick_params(top=False, right=False)
    plt.grid(False)
    plt.ylim((-5, 60))
    plt.xticks(range(1, len(TIMEPOINTS) + 1), TIMEPOINTS)
    plt.savefig(out_png)
    plt.clf()


def report_pairwise_ranksums(expression, label):
    """Print and return the rank-sum test for every pair of time points.

    Returns a dict mapping ``(timepoint_a, timepoint_b)`` to the object
    returned by ``scipy.stats.ranksums``; pairs follow TIMEPOINTS order.
    """
    results = {}
    for first, second in itertools.combinations(TIMEPOINTS, 2):
        outcome = scipy.stats.ranksums(expression[first], expression[second])
        statistic, pvalue = outcome
        results[(first, second)] = outcome
        print("{} {} vs {}: statistic={:.4g}, p-value={:.4g}".format(
            label, first, second, statistic, pvalue))
    return results


# Find nearest genes
# Grab TF FPKM levels
fpkm = pd.read_table("rna.tsv")
gene_names_mapping = pd.read_csv(
    "../gencode.v19.annotation_protein_coding_ids.txt",
    names=['gene_id', 'symbol'],
)
fpkm_symbol = fpkm.merge(gene_names_mapping)
fpkm_symbol = fpkm_symbol.set_index(['gene_id'])

# Enhancers, indexed by the fourth BED column.
# read_csv replaces DataFrame.from_csv, which was removed in pandas 1.0.
enhancers_universe = pd.read_csv(
    "GRO-seq_enhancers.bed", sep="\t", header=None, index_col=3)

# ---- Cluster 4 ----
# Choose enhancers expressed in cluster 4
cluster_4, enhancers_universe_cluster_4 = write_cluster_enhancer_locations(
    enhancers_universe,
    "cluster_4_enhancers.csv",
    "cluster_4_enhancers_locations.bed",
)
# Read in nearest genes
cluster4_genes_expressed = select_expressed_genes(
    fpkm_symbol, "cluster_4_genes.txt")
plot_fpkm_boxplot(cluster4_genes_expressed, 'box_plot_cluster_4_genes_fpkm.png')
# Original note: "Cluster tfs 1 e-4"
cluster4_ranksums = report_pairwise_ranksums(
    cluster4_genes_expressed, "cluster 4")

# ---- Cluster 3 ----
# Choose enhancers expressed in cluster 3
cluster_3, enhancers_universe_cluster_3 = write_cluster_enhancer_locations(
    enhancers_universe,
    "cluster_3_enhancers.csv",
    "cluster_3_enhancers_locations.bed",
)
# Read in nearest genes.
# NOTE(review): this section previously re-read "cluster_4_genes.txt" and
# overwrote the cluster 4 figure. The cluster 3 file name below is assumed to
# follow the cluster 4 naming -- confirm it matches the nearest-gene output.
cluster3_genes_expressed = select_expressed_genes(
    fpkm_symbol, "cluster_3_genes.txt")
plot_fpkm_boxplot(cluster3_genes_expressed, 'box_plot_cluster_3_genes_fpkm.png')
cluster3_ranksums = report_pairwise_ranksums(
    cluster3_genes_expressed, "cluster 3")