import pandas as pd import matplotlib.pyplot as plt import numpy as np ssp_file = pd.read_csv("SSP_sum.tsv", sep='\t') ssp_file_filtered = pd.read_csv("SSP_filtered_peaks.tsv" , sep='\t') sunp_potential_enhancers = pd.read_csv("SUNP_filtered_peaks.tsv" , sep='\t') named_enhancers = list(ssp_file_filtered['name']) ssp_potential_enhancers = ssp_file[ssp_file['name'].isin(named_enhancers)] ssp_potential_enhancers.to_csv('SSP_filtered_peaks_sum.tsv', header=True, index=True, sep='\t') # get numbers for each es_d0_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D0'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D0'] < 1]) es_d0_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D0'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D0'] >= 1]) es_d2_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D2'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D2'] < 1]) es_d2_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D2'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D2'] >= 1]) es_d5_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D5'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D5'] < 1]) es_d5_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D5'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D5'] >= 1]) es_d7_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D7'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D7'] < 1]) es_d7_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D7'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D7'] >= 1]) es_d10_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D10'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D10'] < 1]) es_d10_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D10'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D10'] >= 1]) raw_data = {'Cell': ['hES', 'DE', 'GT', 'FG', 'PE'], 'unmarked': [float(es_d0_none),float(es_d2_none),float(es_d5_none),float(es_d7_none),float(es_d10_none)], 'GRO-seq': [es_d0_enhancers,es_d2_enhancers ,es_d5_enhancers,es_d7_enhancers,es_d10_enhancers]} df = pd.DataFrame(raw_data, columns = ['Cell', 'unmarked', 'GRO-seq']) # Create a figure with a single subplot f, ax = plt.subplots(1, figsize=(10,5)) # Set bar width at 1 bar_width = 1 # positions of the left bar-boundaries bar_l = [i for i in range(len(df['unmarked']))] # positions of the x-axis ticks (center of the bars as bar labels) tick_pos = [i+(bar_width/2) for i in bar_l] # Create the total enhancers totals = [i+j for i,j in zip(df['unmarked'], df['GRO-seq''])] # Create the percentage of the total unmarked enhancers value for each cell was unmarked_rel = [i / float(j) * 100 for i,j in zip(df['unmarked'], totals)] # Create the percentage of the total H3K4me1 alone enhancers value for each cell alone_rel = [i / float(j) * 100 for i,j in zip(df['GRO-seq'], totals)] N = 5 ind = np.arange(N) # the x locations for the groups width = 0.5 # the width of the bars: can also be len(x) sequence p1 = plt.bar(ind, alone_rel, width, color='#E62128') p2 = plt.bar(ind, unmarked_rel, width, bottom=alone_rel,color='#D2D5D4') plt.ylabel('% total enhancers') plt.title('Stage') plt.xticks(ind, ('hES', 'DE', 'GT', 'FG', 'PE')) plt.yticks(np.arange(0, 110, 10)) plt.savefig('Gro_percentages.png') plt.clf() # Get 1kb centered files short_short = pd.read_csv("short_short_paired_1kb.bed", sep='\t', header=None, names= ['chr', 'start', 'end', 'name', 'score', 'strand']) short_unpaired = pd.read_csv("short_unpaired_1kb.bed", sep='\t', header=None, names= ['chr', 'start', 'end', 'name', 'score', 'strand']) universe_enhancers = pd.concat([short_short,short_unpaired]) es_d0_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D0'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D0'] >= 1 ]['name']) es_d2_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D2'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D2'] >= 1 ]['name']) es_d5_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D5'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D5'] >= 1 ]['name']) es_d7_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D7'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D7'] >= 1 ]['name']) es_d10_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D10'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D10'] >= 1 ]['name']) es_d0_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d0_names)] es_d2_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d2_names)] es_d5_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d5_names)] es_d7_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d7_names)] es_d10_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d10_names)] es_d0_enhancers.to_csv("ES_D0_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False) es_d2_enhancers.to_csv("ES_D2_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False) es_d5_enhancers.to_csv("ES_D5_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False) es_d7_enhancers.to_csv("ES_D7_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False) es_d10_enhancers.to_csv("ES_D10_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False)