#!/usr/bin/env python # -*- coding: latin-1 -*- '''Take an BED file and list of Alignments and caluclates RPKM for a minimum score''' EPILOG = ''' For more details: %(prog)s --help ''' import numpy import pybedtools from pybedtools.featurefuncs import normalized_to_length import pysam import pandas as pd import argparse import csv import math import os from sklearn import preprocessing def get_args(): parser = argparse.ArgumentParser( description=__doc__, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument('-p', '--peaks', help="The peak file used to calculate RPKM.", required = True) parser.add_argument('-c', '--centered', help="The peak 1kb file.", required = True) parser.add_argument('-e','--experiments', help="Comma seperate file of experiment name followd by file location.", required = True) parser.add_argument('-f','--factor', help="Factor that is being analyzed.", required = True) parser.add_argument('--minimum', default = 1, type=int, help="The minimum RPKM value that at least 1 of experiements has to have per genomic location. Default is --minimum=1") args = parser.parse_args() return args def rpkm(peak_file,aln_file,exp_name,columns): ''' Return Pandas Dataframe of RPKM value and location''' columns.append(exp_name) ## RPKM = numReads / (geneLength/1000 * totalNumReads/1,000,000 ) peak_counts = peak_file.multi_bam_coverage(bams=[aln_file]) total_counts = reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(aln_file,split_lines=True)]) rpkm = peak_counts.each(normalized_to_length, 4, float(math.pow(10,9))/total_counts).saveas("test.bed") rpkm_df = rpkm.to_dataframe() #os.remove('test.bed') rpkm_df.columns = columns columns.remove(exp_name) return rpkm_df def main(): args = get_args() peak_file = pybedtools.BedTool(args.peaks) centered_file = pybedtools.BedTool(args.centered) experiment_dict = csv.DictReader(open(args.experiments)) # Create dataframe from peak file for exp in experiment_dict: rpkm_columns = [] peak_df = peak_file.to_dataframe() centered_df = centered_file.to_dataframe() columns = list(peak_df.columns.values) experiment = exp['experiment'] aln_file = exp['file'] rpkm_columns.append(experiment) peak_df = pd.merge(peak_df,rpkm(peak_file,aln_file,experiment,columns)) # Filter for alteast Min RPKM in experiment condition peak_rpkm_only = peak_df[rpkm_columns] scaler = preprocessing.StandardScaler(with_mean=False) norm = scaler.fit_transform(peak_rpkm_only.values) peak_rpkm_std_robust = pd.DataFrame(data=norm, columns=list(peak_rpkm_only.columns.values), index = peak_rpkm_only.index ) filtered_rpkm = centered_df[peak_df[experiment] >= args.minimum] # Write out RPKM matrix filtered_peaks = filtered_rpkm[columns] pybedtools.BedTool.from_dataframe(filtered_peaks).saveas(experiment + '_' + args.factor + '_filtered_peaks.bed') if __name__ == '__main__': main()