get 1kb center of enhancer ans figures.

cd3784ba · Venkat Malladi · 7010f0ea · cd3784ba · cd3784ba · cd3784ba
Commit cd3784ba authored 8 years ago by Venkat Malladi
--- a/ES_D0_gro-seq_enhancers.bed
+++ b/ES_D0_gro-seq_enhancers.bed
--- a/ES_D0_gro-seq_enhancers_1kb.bed
+++ b/ES_D0_gro-seq_enhancers_1kb.bed
--- a/ES_D10_gro-seq_enhancers.bed
+++ b/ES_D10_gro-seq_enhancers.bed
--- a/ES_D10_gro-seq_enhancers_1kb.bed
+++ b/ES_D10_gro-seq_enhancers_1kb.bed
--- a/ES_D2_gro-seq_enhancers.bed
+++ b/ES_D2_gro-seq_enhancers.bed
--- a/ES_D2_gro-seq_enhancers_1kb.bed
+++ b/ES_D2_gro-seq_enhancers_1kb.bed
--- a/ES_D5_gro-seq_enhancers.bed
+++ b/ES_D5_gro-seq_enhancers.bed
--- a/ES_D5_gro-seq_enhancers_1kb.bed
+++ b/ES_D5_gro-seq_enhancers_1kb.bed
--- a/ES_D7_gro-seq_enhancers.bed
+++ b/ES_D7_gro-seq_enhancers.bed
--- a/ES_D7_gro-seq_enhancers_1kb.bed
+++ b/ES_D7_gro-seq_enhancers_1kb.bed
--- a/Gro_percentages.png
+++ b/Gro_percentages.png
--- a/SSP_filtered_peaks_merged.bed
+++ b/SSP_filtered_peaks_merged.bed
--- a/SSP_filtered_peaks_sum.tsv
+++ b/SSP_filtered_peaks_sum.tsv
--- a/gro-seq_enhancer_plots.py
+++ b/gro-seq_enhancer_plots.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+ssp_file = pd.read_csv("SSP_sum.tsv", sep='\t')
+
+ssp_file_filtered = pd.read_csv("SSP_filtered_peaks.tsv" , sep='\t')
+sunp_potential_enhancers = pd.read_csv("SUNP_filtered_peaks.tsv" , sep='\t')
+
+
+named_enhancers = list(ssp_file_filtered['name'])
+ssp_potential_enhancers = ssp_file[ssp_file['name'].isin(named_enhancers)]
+ssp_potential_enhancers.to_csv('SSP_filtered_peaks_sum.tsv', header=True, index=True, sep='\t')
+
+# get numbers for each
+es_d0_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D0'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D0'] < 1])
+es_d0_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D0'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D0'] >= 1])
+
+es_d2_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D2'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D2'] < 1])
+es_d2_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D2'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D2'] >= 1])
+
+es_d5_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D5'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D5'] < 1])
+es_d5_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D5'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D5'] >= 1])
+
+es_d7_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D7'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D7'] < 1])
+es_d7_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D7'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D7'] >= 1])
+
+es_d10_none = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D10'] < 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D10'] < 1])
+es_d10_enhancers = len(ssp_potential_enhancers[ssp_potential_enhancers['ES_D10'] >= 0.5 ]) + len(sunp_potential_enhancers[sunp_potential_enhancers['ES_D10'] >= 1])
+
+raw_data = {'Cell': ['hES', 'DE', 'GT', 'FG', 'PE'],
+        'unmarked': [float(es_d0_none),float(es_d2_none),float(es_d5_none),float(es_d7_none),float(es_d10_none)],
+        'GRO-seq': [es_d0_enhancers,es_d2_enhancers ,es_d5_enhancers,es_d7_enhancers,es_d10_enhancers]}
+
+df = pd.DataFrame(raw_data, columns = ['Cell', 'unmarked', 'GRO-seq'])
+
+# Create a figure with a single subplot
+f, ax = plt.subplots(1, figsize=(10,5))
+
+# Set bar width at 1
+bar_width = 1
+
+# positions of the left bar-boundaries
+bar_l = [i for i in range(len(df['unmarked']))]
+
+# positions of the x-axis ticks (center of the bars as bar labels)
+tick_pos = [i+(bar_width/2) for i in bar_l]
+
+# Create the total enhancers
+totals = [i+j for i,j in zip(df['unmarked'], df['GRO-seq''])]
+
+# Create the percentage of the total unmarked enhancers value for each cell was
+unmarked_rel = [i / float(j) * 100 for  i,j in zip(df['unmarked'], totals)]
+
+# Create the percentage of the total H3K4me1 alone enhancers value for each cell
+alone_rel = [i / float(j) * 100 for  i,j in zip(df['GRO-seq'], totals)]
+
+
+
+N = 5
+ind = np.arange(N)    # the x locations for the groups
+width = 0.5       # the width of the bars: can also be len(x) sequence
+
+p1 = plt.bar(ind, alone_rel, width, color='#E62128')
+p2 = plt.bar(ind, unmarked_rel, width,
+              bottom=alone_rel,color='#D2D5D4')
+
+plt.ylabel('% total enhancers')
+plt.title('Stage')
+plt.xticks(ind, ('hES', 'DE', 'GT', 'FG', 'PE'))
+plt.yticks(np.arange(0, 110, 10))
+
+plt.savefig('Gro_percentages.png')
+plt.clf()
+
+
+
+# Get 1kb centered files
+short_short = pd.read_csv("short_short_paired_1kb.bed", sep='\t', header=None, names= ['chr', 'start', 'end', 'name', 'score', 'strand'])
+short_unpaired = pd.read_csv("short_unpaired_1kb.bed", sep='\t', header=None, names= ['chr', 'start', 'end', 'name', 'score', 'strand'])
+universe_enhancers = pd.concat([short_short,short_unpaired])
+
+es_d0_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D0'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D0'] >= 1 ]['name'])
+es_d2_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D2'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D2'] >= 1 ]['name'])
+es_d5_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D5'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D5'] >= 1 ]['name'])
+es_d7_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D7'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D7'] >= 1 ]['name'])
+es_d10_names = list(ssp_potential_enhancers[ssp_potential_enhancers['ES_D10'] >= 0.5 ]['name']) + list(sunp_potential_enhancers[sunp_potential_enhancers['ES_D10'] >= 1 ]['name'])
+
+
+es_d0_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d0_names)]
+es_d2_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d2_names)]
+es_d5_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d5_names)]
+es_d7_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d7_names)]
+es_d10_enhancers = universe_enhancers[universe_enhancers['name'].isin(es_d10_names)]
+
+
+es_d0_enhancers.to_csv("ES_D0_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False)
+es_d2_enhancers.to_csv("ES_D2_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False)
+es_d5_enhancers.to_csv("ES_D5_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False)
+es_d7_enhancers.to_csv("ES_D7_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False)
+es_d10_enhancers.to_csv("ES_D10_gro-seq_enhancers.bed", index_label="gene_id",sep='\t', columns=["chr", "start", "end","name"],header=False,index=False)
--- a/groseq_processing.sh
+++ b/groseq_processing.sh
@@ -70,8 +70,37 @@ bedtools intersect -a /Volumes/project/GCRB/Lee_Lab/s163035/Matrix_analysis_PMIT
 ./cutoff_analysis_gro.py --rpkm SUNP_sum.tsv --color '#FF7C21' --factor SUNP -l 1

 # Filter SSP Paired and merge
-cut -f4 SSP_filtered_peaks.bed | sort -u > t
-for i in $(cat t); do grep $i short_short_paired_1kb.bed >> gro_seq_potential_enhancers.bed; done
-cut -f4 SUNP_filtered_peaks.bed | sort -u > t
-for i in $(cat t); do grep $i short_unpaired_1kb.bed >> gro_seq_potential_enhancers.bed; done
 for i in $(cat t); do grep $i SSP_filtered_peaks.bed > tt; bedtools merge -c 4 -o distinct -i tt >> SSP_filtered_peaks_merged.bed; done
+
+
+# Acutal Processing
+python gro-seq_enhancer_plots.py
+
+
+bedtools sort -i ES_D0_gro-seq_enhancers.bed > ES_D0_gro-seq_enhancers_1kb.bed
+bedtools sort -i ES_D2_gro-seq_enhancers.bed > ES_D2_gro-seq_enhancers_1kb.bed
+bedtools sort -i ES_D5_gro-seq_enhancers.bed > ES_D5_gro-seq_enhancers_1kb.bed
+bedtools sort -i ES_D7_gro-seq_enhancers.bed > ES_D7_gro-seq_enhancers_1kb.bed
+bedtools sort -i ES_D10_gro-seq_enhancers.bed > ES_D10_gro-seq_enhancers_1kb.bed
+
+
+# Meme processing
+#!/bin/bash
+
+#SBATCH --job-name=fasta_processing
+#SBATCH --partition=super
+#SBATCH --nodes=2
+#SBATCH --ntasks=64
+#SBATCH --time=0-36:00:00
+#SBATCH --output=fasta_processing.%j.out
+#SBATCH --error=fasta_processing.%j.err
+#SBATCH --mail-user=venkat.malladi@utsouthwestern.edu
+#SBATCH --mail-type=ALL
+# Fasta file processing
+
+module load bedtools
+bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D0_gro-seq_enhancers_1kb.bed -fo  ES_D0_gro-seq_enhancers.fasta -name
+bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D2_gro-seq_enhancers_1kb.bed -fo  ES_D2_gro-seq_enhancers.fasta -name
+bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D5_gro-seq_enhancers_1kb.bed -fo  ES_D5_gro-seq_enhancers.fasta -name
+bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D7_gro-seq_enhancers_1kb.bed -fo  ES_D7_gro-seq_enhancers.fasta -name
+bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D10_gro-seq_enhancers_1kb.bed -fo  ES_D10_gro-seq_enhancers.fasta -name
--- a/histone_centered_processing.sh
+++ b/histone_centered_processing.sh