Skip to content
Snippets Groups Projects
Commit 905de27d authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Update processing for histone data.

parent 38234440
Branches
No related merge requests found
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -6,3 +6,38 @@ bedops --everything H3K4me1_filtered_peaks.bed H3K27ac_filtered_peaks.bed | bed
# Get RPKM
./rpkm.py --peaks Histone_putative_enhancers.bed --experiments h3k4me1_list.csv -f H3K4me1_hc
./rpkm.py --peaks Histone_putative_enhancers.bed --experiments h3k27ac_list.csv -f H3K27ac_hc
# Acutal Processing
./define_histone_window.py --enhancers H3K27ac_filtered_peaks.bed
./rpkm_name.py --peaks enhancers_renamed.bed --experiments h3k27ac_list.csv -f H3K27ac
# histone Processing
bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D0_H3K27ac_filtered_peaks.bed -fo ES_D0_H3K27ac_filtered_peaks.fasta -name
bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D2_H3K27ac_filtered_peaks.bed -fo ES_D2_H3K27ac_filtered_peaks.fasta -name
bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D5_H3K27ac_filtered_peaks.bed -fo ES_D5_H3K27ac_filtered_peaks.fasta -name
bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D7_H3K27ac_filtered_peaks.bed -fo ES_D7_H3K27ac_filtered_peaks.fasta -name
bedtools getfasta -fi /project/apps_database/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa -bed ES_D10_H3K27ac_filtered_peaks.bed -fo ES_D10_H3K27ac_filtered_peaks.fasta -name
# Meme processing
#!/bin/bash
#SBATCH --job-name=meme_test
#SBATCH --partition=super
#SBATCH --nodes=2
#SBATCH --ntasks=64
#SBATCH --time=0-36:00:00
#SBATCH --output=meme_test.%j.out
#SBATCH --error=meme_test.%j.err
#SBATCH --mail-user=venkat.malladi@utsouthwestern.edu
#SBATCH --mail-type=ALL
module load meme/4.11.0-intel-mvapich2
meme -p 64 -dna -mod zoops -nmotifs 15 -minw 8 -maxw 15 -maxsize 20000000 -revcomp ES_D0_H3K27ac_filtered_peaks.fasta -o MEME_op_ES_D0_H3K27ac_1kb_zoops
meme -p 64 -dna -mod zoops -nmotifs 15 -minw 8 -maxw 15 -maxsize 20000000 -revcomp ES_D2_H3K27ac_filtered_peaks.fasta -o MEME_op_ES_D2_H3K27ac_1kb_zoops
meme -p 64 -dna -mod zoops -nmotifs 15 -minw 8 -maxw 15 -maxsize 20000000 -revcomp ES_D5_H3K27ac_filtered_peaks.fasta -o MEME_op_ES_D5_H3K27ac_1kb_zoops
meme -p 64 -dna -mod zoops -nmotifs 15 -minw 8 -maxw 15 -maxsize 20000000 -revcomp ES_D7_H3K27ac_filtered_peaks.fasta -o MEME_op_ES_D7_H3K27ac_1kb_zoops
meme -p 64 -dna -mod zoops -nmotifs 15 -minw 8 -maxw 15 -maxsize 20000000 -revcomp ES_D10_H3K27ac_filtered_peaks.fasta -o MEME_op_ES_D10_H3K27ac_1kb_zoops
#!/usr/bin/env python
# -*- coding: latin-1 -*-
'''Take an BED file and list of Alignments and caluclates RPKM for a minimum score'''
EPILOG = '''
For more details:
%(prog)s --help
'''
import numpy
import pybedtools
from pybedtools.featurefuncs import normalized_to_length
import pysam
import pandas as pd
import argparse
import csv
import math
import os
from sklearn import preprocessing
def get_args():
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument('-p', '--peaks',
help="The peak file used to calculate RPKM.",
required = True)
parser.add_argument('-c', '--centered',
help="The peak 1kb file.",
required = True)
parser.add_argument('-e','--experiments',
help="Comma seperate file of experiment name followd by file location.",
required = True)
parser.add_argument('-f','--factor',
help="Factor that is being analyzed.",
required = True)
parser.add_argument('--minimum',
default = 1,
type=int,
help="The minimum RPKM value that at least 1 of experiements has to have per genomic location. Default is --minimum=1")
args = parser.parse_args()
return args
def rpkm(peak_file,aln_file,exp_name,columns):
''' Return Pandas Dataframe of RPKM value and location'''
columns.append(exp_name)
## RPKM = numReads / (geneLength/1000 * totalNumReads/1,000,000 )
peak_counts = peak_file.multi_bam_coverage(bams=[aln_file])
total_counts = reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(aln_file)])
rpkm = peak_counts.each(normalized_to_length, 4, float(math.pow(10,9))/total_counts).saveas("test.bed")
rpkm_df = rpkm.to_dataframe()
#os.remove('test.bed')
rpkm_df.columns = columns
columns.remove(exp_name)
return rpkm_df
def main():
args = get_args()
peak_file = pybedtools.BedTool(args.peaks)
centered_file = pybedtools.BedTool(args.centered)
experiment_dict = csv.DictReader(open(args.experiments))
# Create dataframe from peak file
for exp in experiment_dict:
rpkm_columns = []
peak_df = peak_file.to_dataframe()
centered_df = centered_file.to_dataframe()
columns = list(peak_df.columns.values)
experiment = exp['experiment']
aln_file = exp['file']
rpkm_columns.append(experiment)
peak_df = pd.merge(peak_df,rpkm(peak_file,aln_file,experiment,columns))
# Filter for alteast Min RPKM in experiment condition
peak_rpkm_only = peak_df[rpkm_columns]
scaler = preprocessing.StandardScaler(with_mean=False)
norm = scaler.fit_transform(peak_rpkm_only.values)
peak_rpkm_std_robust = pd.DataFrame(data=norm, columns=list(peak_rpkm_only.columns.values), index = peak_rpkm_only.index )
filtered_rpkm = centered_df[peak_df[experiment] >= args.minimum]
# Write out RPKM matrix
filtered_peaks = filtered_rpkm[columns]
pybedtools.BedTool.from_dataframe(filtered_peaks).saveas(experiment + '_' + args.factor + '_filtered_peaks.bed')
if __name__ == '__main__':
main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment