# cutoff_analysis.py

#!/usr/bin/env python

# -*- coding: latin-1 -*-
'''Plot the log2 distribution of RPKM values from a TSV file.'''

EPILOG = '''
For more details:
        %(prog)s --help
'''


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse
import seaborn as sns

def get_args(argv=None):
    '''Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the script does when run from the command line.

    Returns:
        argparse.Namespace with the attributes ``rpkm``, ``color``,
        ``factor``, ``limit`` (converted to float) and ``protein``
        (``None`` when the option is not given).
    '''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '-r', '--rpkm',
        help="The file with RPKM values.",
        required=True)
    parser.add_argument(
        '-c', '--color',
        help="The hex color to make the graph",
        required=True)
    parser.add_argument(
        '-f', '--factor',
        help="Factor that is being analyzed.",
        required=True)
    parser.add_argument(
        '-l', '--limit',
        help="The RPKM limit to plot",
        type=float,
        required=True)
    # Optional: when omitted, no protein-coding filter is requested.
    parser.add_argument(
        '-p', '--protein',
        help="The file with Protein coding genes.")
    return parser.parse_args(argv)

# Sample columns whose RPKM values are pooled into one distribution.
_SAMPLE_COLUMNS = ('ES_D0', 'ES_D2', 'ES_D5', 'ES_D7', 'ES_D10')


def _pool_columns(frame, columns=_SAMPLE_COLUMNS):
    '''Return the values of *columns* in *frame* as one flat array.

    The columns are concatenated in the order given.

    Raises:
        KeyError: if any of *columns* is absent from *frame*.
    '''
    missing = [name for name in columns if name not in frame.columns]
    if missing:
        raise KeyError(
            'RPKM table is missing column(s): ' + ', '.join(missing))
    return np.concatenate([np.asarray(frame[name]) for name in columns])


def main():
    '''Plot the density of the pooled log2 RPKM values with a cutoff line.

    Reads the tab-separated RPKM table given by ``--rpkm``, optionally
    keeps only the rows whose ``gene_id`` appears in the first column of
    the ``--protein`` file, pools the sample columns, and draws a kernel
    density estimate of their log2 values.  A dashed vertical line marks
    ``log2(--limit)``.  The figure is written to
    ``<factor>_distribution.png`` in the current directory.

    Raises:
        SystemExit: if ``--limit`` is not a positive, finite number,
            because its log2 could not be drawn as a cutoff line.
        KeyError: if an expected column is missing from the RPKM table.
    '''
    # "ticks" already implies the white background of the "white" style,
    # so a single call is enough.
    sns.set_style("ticks")

    args = get_args()
    # Fail before any file is read: log2 of a non-positive value is
    # -inf/nan and would give a meaningless marker position.
    if not (args.limit > 0 and np.isfinite(args.limit)):
        raise SystemExit(
            '--limit must be a positive, finite number (got %r)' % args.limit)

    rpkm_table = pd.read_csv(args.rpkm, sep='\t')

    if args.protein is not None:
        # The protein-coding list has no header; gene ids are in column 0.
        pc_genes = pd.read_csv(args.protein, sep='\t', header=None)
        keep = rpkm_table['gene_id'].isin(pc_genes[0].values)
        rpkm_table = rpkm_table[keep]

    pooled = _pool_columns(rpkm_table)

    # Exact zeros are dropped and a small pseudocount is added before
    # taking the logarithm.
    nonzero = pooled[pooled != 0]
    sns.kdeplot(np.log2(nonzero + 0.00001), color=args.color)
    sns.despine()
    plt.axvline(np.log2(args.limit), color='black', linestyle='dashed',
                linewidth=1)
    plt.savefig(args.factor + '_distribution.png')
    plt.clf()


if __name__ == '__main__':
    main()