Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# -*- coding: latin-1 -*-
'''Take an transcript file from GRO-seq and gets the enhancer transcripts.'''
EPILOG = '''
For more details:
%(prog)s --help
'''
import numpy
import pybedtools
import argparse
import os
from subprocess import Popen
def get_args():
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument('-i', '--intergenic_transcripts',
help="The transcripts file to define enhancers",
required = True)
args = parser.parse_args()
return args
def rename_generator_sunp(bed_file,prefix):
for i, feature in enumerate(pybedtools.BedTool(bed_file)):
feature.name = prefix + '_{0}'.format(i + 1)
yield feature
def rename_generator_ssp(bed_file,prefix):
for i, feature in enumerate(pybedtools.BedTool(bed_file)):
name = prefix + '_{0}'.format(i + 1)
feature[3] = name
feature[9] = name
yield feature
def center_overlap(feature,assembly):
"""
Return center overlap with
"""
chromsizes = pybedtools.helpers.chromsizes(assembly)
feature_1_start = int(feature[1])
feature_1_stop = int(feature[2])
feature_2_start = int(feature[7])
feature_2_stop = int(feature[8])
if feature_1_start < feature_2_start:
center = (feature_1_stop + feature_2_start)/2
else:
center = (feature_2_stop + feature_1_start)/2
center_start = center - 500
center_stop = center + 500
if center_start > center_stop:
center_start = center_stop
center_stop = center - 500
elif center_start < chromsizes[feature.chrom][0]:
center_start = chromsizes[feature.chrom][0]
elif center_stop > chromsizes[feature.chrom][1]:
center_stop = chromsizes[feature.chrom][1]
feature.start = center_start
feature.stop = center_stop
feature.name = feature[3]
feature.score = '0'
feature.strand = '.'
return pybedtools.create_interval_from_list(
[feature.chrom, str(feature.start), str(feature.stop), feature.name, feature.score, feature.strand ])
def tss_center(feature,assembly):
"""
Return center overlap with
"""
chromsizes = pybedtools.helpers.chromsizes(assembly)
feature_1_start = int(feature[1])
feature_1_stop = int(feature[2])
start = feature_1_start - 500
stop = feature_1_start + 500
if start > stop:
start = stop
stop = feature_1_stop - 500
elif start < chromsizes[feature.chrom][0]:
start = chromsizes[feature.chrom][0]
elif stop > chromsizes[feature.chrom][1]:
stop = chromsizes[feature.chrom][1]
feature.start = start
feature.stop = stop
feature.name = feature[3]
feature.score = '0'
feature.strand = '.'
return pybedtools.create_interval_from_list(
[feature.chrom, str(feature.start), str(feature.stop), feature.name, feature.score, feature.strand ])
def main():
args = get_args()
# Read in bedtools:
transcripts = pybedtools.BedTool(args.intergenic_transcripts)
# Sort if not sorted
transcripts_sorted = transcripts.sort()
# Sepertate into long and short and merge for 500
short_t = transcripts_sorted.filter(lambda c: c.length <= 9000 and c.length >= 150)
long_t = transcripts_sorted.filter(lambda c: c.length > 9000)
# Save Temp files
short_t.saveas('temp1')
short_t = pybedtools.BedTool('temp1')
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
transcripts = short_t.merge(d=500,s=True,c='4,5,6',o='distinct,mean,distinct').sort()
transcripts.saveas('temp2')
with open('temp3','w') as fh:
p = Popen(('cut', '-f', '1,2,3,5,6,7', 'temp2'), stdout=fh)
out,err = p.communicate()
transcripts_merged = pybedtools.BedTool('temp3')
# Sepertate into short and long and by strand
short_p = transcripts_merged.filter(lambda c: c.strand == '+' and c.length <= 9000 )
short_n = transcripts_merged.filter(lambda c: c.strand == '-' and c.length <= 9000 )
# Save temp files
short_p.saveas('temp4')
short_n.saveas('temp5')
# Read files
short_p = pybedtools.BedTool('temp4')
short_n = pybedtools.BedTool('temp5')
# Seperate into single and bi_directional
# Short-short paired
s_p = short_p.intersect(short_n, u=True)
s_n = short_n.intersect(short_p, u=True)
ss_overlap = s_p.intersect(s_n, S=True, wo=True)
pybedtools.BedTool(rename_generator_ssp(ss_overlap,"SSP")).saveas("temp6")
with open('temp7','w') as fh:
p = Popen(('cut', '-f', '1,2,3,4,5,6', 'temp6'), stdout=fh)
with open('temp8','w') as fh:
p = Popen(('cut', '-f', '7,8,9,10,11,12', 'temp6'), stdout=fh)
temp_p = pybedtools.BedTool('temp7')
temp_n = pybedtools.BedTool('temp8')
temp_p.cat(temp_n,postmerge=False).sort().saveas("short_short_paired.bed")
ss_overlap = pybedtools.BedTool('temp6')
ss_1kb = ss_overlap.each(center_overlap, assembly="hg19")
ss_1kb.saveas('short_short_paired_1kb.bed')
# Unpaired short
short_p_np = short_p.intersect(short_n, v=True)
short_n_np = short_n.intersect(short_p, v=True)
short_np = short_p_np.cat(short_n_np,postmerge=False).sort()
pybedtools.BedTool(rename_generator_sunp(short_np,"SUNP")).saveas("short_unpaired.bed")
short_np = pybedtools.BedTool("short_unpaired.bed")
ss_1kb = short_np.each(tss_center,assembly="hg19")
ss_1kb.saveas('short_unpaired_1kb.bed')
#Remove Files
os.remove('temp1')
os.remove('temp2')
os.remove('temp3')
os.remove('temp4')
os.remove('temp5')
os.remove('temp6')
os.remove('temp7')
os.remove('temp8')
if __name__ == '__main__':
main()