An error occurred while loading the file. Please try again.
-
Venkat Malladi authoredac7a8825
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
Forked from
Venkat Malladi / TFSEE
2 commits ahead of the upstream repository.
rpkm_name.py 3.19 KiB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
# -*- coding: latin-1 -*-
'''Take an BED file and list of Alignments and caluclates RPKM for a minimum score'''
EPILOG = '''
For more details:
%(prog)s --help
'''
import numpy
import pybedtools
from pybedtools.featurefuncs import normalized_to_length
import pysam
import pandas as pd
import argparse
import csv
import math
import os
from sklearn import preprocessing
def get_args():
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument('-p', '--peaks',
help="The peak file used to calculate RPKM.",
required = True)
parser.add_argument('-c', '--centered',
help="The peak 1kb file.",
required = True)
parser.add_argument('-e','--experiments',
help="Comma seperate file of experiment name followd by file location.",
required = True)
parser.add_argument('-f','--factor',
help="Factor that is being analyzed.",
required = True)
parser.add_argument('--minimum',
default = 1,
type=int,
help="The minimum RPKM value that at least 1 of experiements has to have per genomic location. Default is --minimum=1")
args = parser.parse_args()
return args
def rpkm(peak_file,aln_file,exp_name,columns):
''' Return Pandas Dataframe of RPKM value and location'''
columns.append(exp_name)
## RPKM = numReads / (geneLength/1000 * totalNumReads/1,000,000 )
peak_counts = peak_file.multi_bam_coverage(bams=[aln_file])
total_counts = reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(aln_file,split_lines=True)])
rpkm = peak_counts.each(normalized_to_length, 4, float(math.pow(10,9))/total_counts).saveas("test.bed")
rpkm_df = rpkm.to_dataframe()
#os.remove('test.bed')
rpkm_df.columns = columns
columns.remove(exp_name)
return rpkm_df
def main():
args = get_args()
peak_file = pybedtools.BedTool(args.peaks)
centered_file = pybedtools.BedTool(args.centered)
experiment_dict = csv.DictReader(open(args.experiments))
# Create dataframe from peak file
for exp in experiment_dict:
rpkm_columns = []
peak_df = peak_file.to_dataframe()
centered_df = centered_file.to_dataframe()
columns = list(peak_df.columns.values)
experiment = exp['experiment']
aln_file = exp['file']
rpkm_columns.append(experiment)
peak_df = pd.merge(peak_df,rpkm(peak_file,aln_file,experiment,columns))
# Filter for alteast Min RPKM in experiment condition
peak_rpkm_only = peak_df[rpkm_columns]
scaler = preprocessing.StandardScaler(with_mean=False)
norm = scaler.fit_transform(peak_rpkm_only.values)
peak_rpkm_std_robust = pd.DataFrame(data=norm, columns=list(peak_rpkm_only.columns.values), index = peak_rpkm_only.index )
filtered_rpkm = centered_df[peak_df[experiment] >= args.minimum]
# Write out RPKM matrix
filtered_peaks = filtered_rpkm[columns]
pybedtools.BedTool.from_dataframe(filtered_peaks).saveas(experiment + '_' + args.factor + '_filtered_peaks.bed')
if __name__ == '__main__':
main()