#!/usr/bin/env python # -*- coding: latin-1 -*- '''Take an transcript file from GRO-seq and gets the enhancer transcripts.''' EPILOG = ''' For more details: %(prog)s --help ''' import numpy import pybedtools import argparse import os from subprocess import Popen def get_args(): parser = argparse.ArgumentParser( description=__doc__, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument('-i', '--intergenic_transcripts', help="The transcripts file to define enhancers", required = True) args = parser.parse_args() return args def rename_generator_sunp(bed_file,prefix): for i, feature in enumerate(pybedtools.BedTool(bed_file)): feature.name = prefix + '_{0}'.format(i + 1) yield feature def rename_generator_ssp(bed_file,prefix): for i, feature in enumerate(pybedtools.BedTool(bed_file)): name = prefix + '_{0}'.format(i + 1) feature[3] = name feature[9] = name yield feature def center_overlap(feature,assembly): """ Return center overlap with 1kb """ chromsizes = pybedtools.helpers.chromsizes(assembly) feature_1_start = int(feature[1]) feature_1_stop = int(feature[2]) feature_2_start = int(feature[7]) feature_2_stop = int(feature[8]) if feature_1_start < feature_2_start: center = (feature_1_stop + feature_2_start)/2 else: center = (feature_2_stop + feature_1_start)/2 center_start = center - 500 center_stop = center + 500 if center_start > center_stop: center_start = center_stop center_stop = center - 500 elif center_start < chromsizes[feature.chrom][0]: center_start = chromsizes[feature.chrom][0] elif center_stop > chromsizes[feature.chrom][1]: center_stop = chromsizes[feature.chrom][1] feature.start = center_start feature.stop = center_stop feature.name = feature[3] feature.score = '0' feature.strand = '.' return pybedtools.create_interval_from_list( [feature.chrom, str(feature.start), str(feature.stop), feature.name, feature.score, feature.strand ]) def tss_center(feature,assembly): """ Return tss 1kb surrounding """ chromsizes = pybedtools.helpers.chromsizes(assembly) feature_1_start = int(feature[1]) feature_1_stop = int(feature[2]) feautre_1_strand = feature[5] if feautre_1_strand == "+": start = feature_1_start - 500 stop = feature_1_start + 500 elif feautre_1_strand == "-": start = feature_1_stop - 500 stop = feature_1_stop + 500 if start < chromsizes[feature.chrom][0]: start = chromsizes[feature.chrom][0] if stop > chromsizes[feature.chrom][1]: stop = chromsizes[feature.chrom][1] feature.start = start feature.stop = stop feature.name = feature[3] feature.score = '0' feature.strand = '.' return pybedtools.create_interval_from_list( [feature.chrom, str(feature.start), str(feature.stop), feature.name, feature.score, feature.strand ]) def main(): args = get_args() # Read in bedtools: transcripts = pybedtools.BedTool(args.intergenic_transcripts) # Sort if not sorted transcripts_sorted = transcripts.sort() # Sepertate into long and short and merge for 500 short_t = transcripts_sorted.filter(lambda c: c.length <= 9000 and c.length >= 1) long_t = transcripts_sorted.filter(lambda c: c.length > 9000) # Save Temp files short_t.saveas('temp1') short_t = pybedtools.BedTool('temp1') # Sepertate into short and long and by strand short_p = short_t.filter(lambda c: c.strand == '+' and c.length <= 9000 ) short_n = short_t.filter(lambda c: c.strand == '-' and c.length <= 9000 ) # Save temp files short_p.saveas('temp4') short_n.saveas('temp5') # Read files short_p = pybedtools.BedTool('temp4') short_n = pybedtools.BedTool('temp5') # Seperate into single and bi_directional # Short-short paired s_p = short_p.intersect(short_n, u=True) s_n = short_n.intersect(short_p, u=True) ss_overlap = s_p.intersect(s_n, S=True, wo=True) pybedtools.BedTool(rename_generator_ssp(ss_overlap,"SSP")).saveas("temp6") with open('temp7','w') as fh: p = Popen(('cut', '-f', '1,2,3,4,5,6', 'temp6'), stdout=fh) with open('temp8','w') as fh: p = Popen(('cut', '-f', '7,8,9,10,11,12', 'temp6'), stdout=fh) temp_p = pybedtools.BedTool('temp7') temp_n = pybedtools.BedTool('temp8') temp_p.cat(temp_n,postmerge=False).sort().saveas("short_short_paired.bed") ss_overlap = pybedtools.BedTool('temp6') ss_1kb = ss_overlap.each(center_overlap, assembly="hg19") ss_1kb.saveas('short_short_paired_1kb.bed') # Unpaired short short_p_np = short_p.intersect(short_n, v=True) short_n_np = short_n.intersect(short_p, v=True) short_np = short_p_np.cat(short_n_np,postmerge=False).sort() pybedtools.BedTool(rename_generator_sunp(short_np,"SUNP")).saveas("short_unpaired.bed") short_np = pybedtools.BedTool("short_unpaired.bed") ss_1kb = short_np.each(tss_center,assembly="hg19") ss_1kb.saveas('short_unpaired_1kb.bed') #Remove Files os.remove('temp1') os.remove('temp4') os.remove('temp5') os.remove('temp6') os.remove('temp7') os.remove('temp8') if __name__ == '__main__': main()