Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
# -*- coding: latin-1 -*-
'''Take an transcript file from GRO-seq and gets the enhancer transcripts.'''
EPILOG = '''
For more details:
%(prog)s --help
'''
import numpy
import pybedtools
import argparse
import os
from subprocess import Popen
def get_args():
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument('-i', '--intergenic_transcripts',
help="The transcripts file to define enhancers",
required = True)
args = parser.parse_args()
return args
def rename_generator_sunp(bed_file,prefix):
for i, feature in enumerate(pybedtools.BedTool(bed_file)):
feature.name = prefix + '_{0}'.format(i + 1)
yield feature
def rename_generator_ssp(bed_file,prefix):
for i, feature in enumerate(pybedtools.BedTool(bed_file)):
name = prefix + '_{0}'.format(i + 1)
feature[3] = name
feature[9] = name
yield feature
def center_overlap(feature,assembly):
"""
Return center overlap with 1kb
"""
chromsizes = pybedtools.helpers.chromsizes(assembly)
feature_1_start = int(feature[1])
feature_1_stop = int(feature[2])
feature_2_start = int(feature[7])
feature_2_stop = int(feature[8])
if feature_1_start < feature_2_start:
center = (feature_1_stop + feature_2_start)/2
else:
center = (feature_2_stop + feature_1_start)/2
center_start = center - 500
center_stop = center + 500
if center_start > center_stop:
center_start = center_stop
center_stop = center - 500
elif center_start < chromsizes[feature.chrom][0]:
center_start = chromsizes[feature.chrom][0]
elif center_stop > chromsizes[feature.chrom][1]:
center_stop = chromsizes[feature.chrom][1]
feature.start = center_start
feature.stop = center_stop
feature.name = feature[3]
feature.score = '0'
feature.strand = '.'
return pybedtools.create_interval_from_list(
[feature.chrom, str(feature.start), str(feature.stop), feature.name, feature.score, feature.strand ])
def tss_center(feature,assembly):
"""
Return tss 1kb surrounding
"""
chromsizes = pybedtools.helpers.chromsizes(assembly)
feature_1_start = int(feature[1])
feature_1_stop = int(feature[2])
feautre_1_strand = feature[5]
if feautre_1_strand == "+":
start = feature_1_start - 500
stop = feature_1_start + 500
elif feautre_1_strand == "-":
start = feature_1_stop - 500
stop = feature_1_stop + 500
if start < chromsizes[feature.chrom][0]:
if stop > chromsizes[feature.chrom][1]:
stop = chromsizes[feature.chrom][1]
feature.start = start
feature.stop = stop
feature.name = feature[3]
feature.score = '0'
feature.strand = '.'
return pybedtools.create_interval_from_list(
[feature.chrom, str(feature.start), str(feature.stop), feature.name, feature.score, feature.strand ])
def main():
args = get_args()
# Read in bedtools:
transcripts = pybedtools.BedTool(args.intergenic_transcripts)
# Sort if not sorted
transcripts_sorted = transcripts.sort()
# Sepertate into long and short and merge for 500
short_t = transcripts_sorted.filter(lambda c: c.length <= 9000 and c.length >= 1)
long_t = transcripts_sorted.filter(lambda c: c.length > 9000)
# Save Temp files
short_t.saveas('temp1')
short_t = pybedtools.BedTool('temp1')
# Sepertate into short and long and by strand
short_p = short_t.filter(lambda c: c.strand == '+' and c.length <= 9000 )
short_n = short_t.filter(lambda c: c.strand == '-' and c.length <= 9000 )
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Save temp files
short_p.saveas('temp4')
short_n.saveas('temp5')
# Read files
short_p = pybedtools.BedTool('temp4')
short_n = pybedtools.BedTool('temp5')
# Seperate into single and bi_directional
# Short-short paired
s_p = short_p.intersect(short_n, u=True)
s_n = short_n.intersect(short_p, u=True)
ss_overlap = s_p.intersect(s_n, S=True, wo=True)
pybedtools.BedTool(rename_generator_ssp(ss_overlap,"SSP")).saveas("temp6")
with open('temp7','w') as fh:
p = Popen(('cut', '-f', '1,2,3,4,5,6', 'temp6'), stdout=fh)
with open('temp8','w') as fh:
p = Popen(('cut', '-f', '7,8,9,10,11,12', 'temp6'), stdout=fh)
temp_p = pybedtools.BedTool('temp7')
temp_n = pybedtools.BedTool('temp8')
temp_p.cat(temp_n,postmerge=False).sort().saveas("short_short_paired.bed")
ss_overlap = pybedtools.BedTool('temp6')
ss_1kb = ss_overlap.each(center_overlap, assembly="hg19")
ss_1kb.saveas('short_short_paired_1kb.bed')
# Unpaired short
short_p_np = short_p.intersect(short_n, v=True)
short_n_np = short_n.intersect(short_p, v=True)
short_np = short_p_np.cat(short_n_np,postmerge=False).sort()
pybedtools.BedTool(rename_generator_sunp(short_np,"SUNP")).saveas("short_unpaired.bed")
short_np = pybedtools.BedTool("short_unpaired.bed")
ss_1kb = short_np.each(tss_center,assembly="hg19")
ss_1kb.saveas('short_unpaired_1kb.bed')
#Remove Files
os.remove('temp1')
os.remove('temp4')
os.remove('temp5')
os.remove('temp6')
os.remove('temp7')
os.remove('temp8')
if __name__ == '__main__':
main()