#!/usr/bin/env python
# -*- coding: latin-1 -*-

'''Take an gt average FPKM from list of RSEM output'''

EPILOG = '''
For more details:

        %(prog)s --help
'''

import numpy as np
import pandas as pd
import argparse
import csv


def get_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with two required string attributes:
        ``experiments`` (path of the comma separated experiment sheet) and
        ``factor`` (prefix used for the two output file names).
    """
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '-e', '--experiments',
        help="Comma separated file of experiment name followed by file location.",
        required=True)
    parser.add_argument(
        '-f', '--factor',
        help="Factor that is being analyzed.",
        required=True)
    return parser.parse_args()


def _read_fpkm(path, label):
    """Load one tab separated table; return its FPKM column keyed by gene_id.

    Raises KeyError when the table lacks a ``gene_id`` or ``FPKM`` column,
    instead of silently producing a column of NaN.
    """
    table = pd.read_csv(path, sep='\t')
    return table.set_index('gene_id')['FPKM'].rename(label)


def _as_frame(columns, names):
    """Join per-experiment Series side by side, one column per experiment.

    pd.concat along axis=1 returns a DataFrame even for a single Series, so
    the experiment names are always applied as real column labels.
    """
    if not columns:
        # No experiment rows at all: still produce an (empty) table to write.
        return pd.DataFrame()
    frame = pd.concat(columns, axis=1)
    frame.columns = names
    return frame


def main():
    """Average the FPKM of two replicates for every listed experiment.

    The experiment sheet is read with csv.DictReader and must provide the
    columns ``experiment``, ``file1`` and ``file2``.  For every row the two
    replicate tables are aligned on gene_id, and the row-wise mean and
    standard deviation (pandas default, ddof=1) of their FPKM values are
    computed.

    Writes two tab separated files, indexed by ``gene_id`` with one column
    per experiment: ``<factor>.tsv`` (means) and ``<factor>_err.tsv``
    (standard deviations).
    """
    args = get_args()

    names = []
    means = []
    errors = []

    # The context manager closes the sheet once every row has been consumed.
    with open(args.experiments) as handle:
        for exp in csv.DictReader(handle):
            experiment = exp['experiment']
            replicates = pd.concat(
                [_read_fpkm(exp['file1'], 'Rep1_FPKM'),
                 _read_fpkm(exp['file2'], 'Rep2_FPKM')],
                axis=1)
            names.append(experiment)
            means.append(replicates.mean(axis=1))
            errors.append(replicates.std(axis=1))

    # Concatenate once at the end rather than growing a frame per iteration.
    fpkm_all = _as_frame(means, names)
    fpkm_err = _as_frame(errors, names)

    fpkm_all.to_csv(args.factor + ".tsv", index_label="gene_id", sep='\t')
    fpkm_err.to_csv(args.factor + "_err.tsv", index_label="gene_id", sep='\t')


if __name__ == '__main__':
    main()