Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
Forked from Venkat Malladi / TFSEE
2 commits ahead of the upstream repository.
rpkm.py 4.64 KiB
#!/usr/bin/env python

# -*- coding: latin-1 -*-
'''Take an BED file and list of Alignments and caluclates RPKM for a minimum score'''

EPILOG = '''
For more details:
        %(prog)s --help
'''

import numpy
import pybedtools
from pybedtools.featurefuncs import normalized_to_length
import pysam
import pandas as pd
import argparse
import csv
import math
import os
from sklearn import preprocessing


def get_args():
    parser = argparse.ArgumentParser(
            description=__doc__, epilog=EPILOG,
            formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('-p', '--peaks',
    	help="The peak file used to calculate RPKM.",
        required = True)
    parser.add_argument('-e','--experiments',
    	help="Comma seperate file of experiment name followd by file location.",
        required = True)
    parser.add_argument('-f','--factor',
    	help="Factor that is being analyzed.",
        required = True)
    parser.add_argument('--minimum',
    	default = 1,
        type=int,
    	help="The minimum RPKM value that at least 1 of experiements has to have per genomic location. Default is --minimum=1")
    args = parser.parse_args()
    return args


def rpkm(peak_file,aln_file,exp_name,columns):
    ''' Return Pandas Dataframe of RPKM value and location'''
    columns.append(exp_name)
    ## RPKM  =   numReads / (geneLength/1000 * totalNumReads/1,000,000 )
    peak_counts = peak_file.multi_bam_coverage(bams=[aln_file])
    total_counts = reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(aln_file,split_lines=True)])
    rpkm = peak_counts.each(normalized_to_length, 3, float(math.pow(10,9))/total_counts).saveas("test.bed")
    rpkm_df = rpkm.to_dataframe()
    #os.remove('test.bed')
    rpkm_df.columns = columns
    columns.remove(exp_name)
    return rpkm_df


def rpkm_strand(peak_file,aln_file,exp_name,columns):
    ''' Return Pandas Dataframe of RPKM value and location'''
    columns.append(exp_name)
    ## RPKM  =   numReads / (geneLength/1000 * totalNumReads/1,000,000 )
    peak_counts = peak_file.multi_bam_coverage(bams=[aln_file],s=True)
    total_counts = reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(aln_file,split_lines=True)])
    rpkm = peak_counts.each(normalized_to_length, 6, float(math.pow(10,9))/float(total_counts)).saveas("test.bed")
    rpkm_df = rpkm.to_dataframe()
    #os.remove('test.bed')
    rpkm_df.columns = columns
    columns.remove(exp_name)
    return rpkm_df


def main():
    args = get_args()
    peak_file = pybedtools.BedTool(args.peaks)
    experiment_dict = csv.DictReader(open(args.experiments))

    # Create dataframe from peak file
    rpkm_columns = []
    peak_df = peak_file.to_dataframe()
    columns = list(peak_df.columns.values)
    for exp in experiment_dict:
        experiment = exp['experiment']
        aln_file = exp['file']
        rpkm_columns.append(experiment)
        if 'strand' in columns:
            peak_df = pd.merge(peak_df,rpkm_strand(peak_file,aln_file,experiment,columns))
        else:
            peak_df = pd.merge(peak_df,rpkm(peak_file,aln_file,experiment,columns))


    # Seperate for Enhancers with Names and no Names
    if 'name' in columns:
        peak_rpkm_only = peak_df[rpkm_columns]
        scaler = preprocessing.StandardScaler(with_mean=False)
        norm = scaler.fit_transform(peak_rpkm_only.values)
        df_name = peak_df[['name']]
        peak_rpkm_std_robust = pd.DataFrame(data=norm, columns=list(peak_rpkm_only.columns.values), index = peak_rpkm_only.index )
        peak_rpkm_std_robust_name = peak_rpkm_std_robust.join(df_name)
        peak_rpkm_only_sum = peak_rpkm_std_robust_name.groupby(['name']).sum()
        peak_filtered_rpkm = peak_rpkm_only_sum[(peak_rpkm_only_sum >= args.minimum).any(axis=1)]
        named_enhancers = list(peak_filtered_rpkm.index.values)
        filtered_rpkm = peak_df[peak_df['name'].isin(named_enhancers)]
    else:
        # Filter for alteast Min RPKM in experiment condition
        peak_rpkm_only = peak_df[rpkm_columns]
        scaler = preprocessing.StandardScaler(with_mean=False)
        norm = scaler.fit_transform(peak_rpkm_only.values)
        peak_rpkm_std_robust = pd.DataFrame(data=norm, columns=list(peak_rpkm_only.columns.values), index = peak_rpkm_only.index )
        filtered_rpkm = peak_df.loc[(peak_rpkm_std_robust >= args.minimum).any(axis=1)]

    # Write out RPKM matrix
    filtered_peaks = filtered_rpkm[columns]
    filtered_rpkm.to_csv(args.factor + '_filtered_peaks.tsv', header=True, index=None, sep='\t')
    pybedtools.BedTool.from_dataframe(filtered_peaks).saveas(args.factor + '_filtered_peaks.bed')


if __name__ == '__main__':
    main()