Skip to content
Snippets Groups Projects
Commit 892d2c1b authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Adding in basics for calling peaks with macs2.

parent 52095af0
Branches
Tags
No related merge requests found
#!/usr/bin/env python3
'''Generate peaks from data.'''
import os
import argparse
import shutil
import logging
from multiprocessing import cpu_count
import utils
from xcor import xcor
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-t', '--tag',
help="The tagAlign file to perform peak calling on.",
required=True)
parser.add_argument('-x', '--xcor',
help="The cross-correlation file (if already calculated).",
required=True)
parser.add_argument('-c', '--con',
help="The control tagAling file used for peak calling.",
required=True)
parser.add_argument('-s', '--sample',
help="The sample id to name the peak files.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome size of reference genome.",
required=True)
parser.add_argument('-z', '--size',
help="The file with chromosome sizes of reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
else:
logger.error('Missing R')
raise Exception('Missing R')
macs_path = shutil.which("macs2")
if r_path:
logger.info('Found MACS2: %s', macs_path)
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
bg_bw_path = shutil.which("bedGraphToBigWig")
if r_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
fragment_length = firstline.split()[2] # third column
logger.info("Fraglen %s" % (fragment_length))
# Generate narrow peaks and preliminary signal tracks
command = 'macs2 callpeak ' + \
'-t %s -c %s ' % (experiment, control) + \
'-f BED -n %s ' % (prefix) + \
'-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
assert returncode == 0, "MACS2 non-zero return"
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
clipped_narrowpeak_fn = '%s-clipped' % (filename)
steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
# Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
# (score must be <1000)
rescaled_narrowpeak_fn = utils.rescale_scores(
clipped_narrowpeak_fn, scores_col=5)
# Sort by Col8 in descending order and replace long peak names in Column 4
# with Peak_<peakRank>
steps = [
'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
]
out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
# For Fold enrichment signal tracks
# This file is a tab delimited file with 2 columns Col1 (chromosome name),
# Col2 (chromosome size in bp).
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_FE.bdg ' % (prefix) + \
'-m FE'
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
fc_signal_fn = "%s.fc_signal.bw" % (prefix)
steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (fc_bedgraph_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
# Compute sval =
# min(no. of reads in ChIP, no. of reads in control) / 1,000,000
out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
chip_reads = out.strip()
out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
control_reads = out.strip()
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_ppois.bdg ' % (prefix) + \
'-m ppois -S %s' % (sval)
logger.info(command)
returncode = utils.block_on(command)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (pvalue_bedgraph_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
assert returncode == 0, "bedGraphToBigWig non-zero return"
def main():
args = get_args()
tag = args.tag
xcor = args.xcor
con = args.con
sample = args.sample
genome_size = args.genome
chrom_size = args.size
paired = args.paired
# Create a file handler
handler = logging.FileHandler('call_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Calculate Cross-correlation if not already calcualted
if xcor == 'Calculate':
xcor_file = xcor(tag, paired)
else:
xcor_file = xcor
# Call Peaks using MACS2
call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size)
if __name__ == '__main__':
main()
...@@ -45,6 +45,14 @@ def run_pipe(steps, outfile=None): ...@@ -45,6 +45,14 @@ def run_pipe(steps, outfile=None):
return out, err return out, err
def block_on(command):
process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
for line in iter(process.stdout.readline, ''):
sys.stdout.write(line)
process.wait()
return process.returncode
def strip_extensions(filename, extensions): def strip_extensions(filename, extensions):
'''Strips extensions to get basename of file.''' '''Strips extensions to get basename of file.'''
...@@ -72,3 +80,45 @@ def count_lines(filename): ...@@ -72,3 +80,45 @@ def count_lines(filename):
'wc -l' 'wc -l'
]) ])
return int(out) return int(out)
def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
n_peaks = count_lines(filename)
sorted_fn = '%s-sorted' % (filename)
rescaled_fn = '%s-rescaled' % (filename)
out, err = run_pipe([
'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn),
r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
sorted_fn)
out, err = run_pipe([
'head -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
max_score = float(out.strip())
logger.info("rescale_scores: max_score = %s" % (max_score))
out, err = run_pipe([
'tail -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
min_score = float(out.strip())
logger.info("rescale_scores: min_score = %s" % (min_score))
a = min_score
b = max_score
x = new_min
y = new_max
if min_score == max_score: # give all peaks new_min
rescale_formula = "x"
else: # n is the unscaled score from scores_col
rescale_formula = "((n-a)*(y-x)/(b-a))+x"
out, err = run_pipe(
[
'cat %s' % (sorted_fn),
r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
% (scores_col, a, b, x, y) +
r"""{$%d=int(%s) ; print $0}'"""
% (scores_col, rescale_formula)
],
rescaled_fn)
return rescaled_fn
...@@ -24,7 +24,7 @@ logger.setLevel(logging.INFO) ...@@ -24,7 +24,7 @@ logger.setLevel(logging.INFO)
def get_args(): def get_args():
'''Define arguments.''' '''Define arguments.'''
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG, description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter) formatter_class=argparse.RawDescriptionHelpFormatter)
...@@ -106,6 +106,8 @@ def xcor(tag, paired): ...@@ -106,6 +106,8 @@ def xcor(tag, paired):
cc_plot_filename, cc_scores_filename) cc_plot_filename, cc_scores_filename)
]) ])
return cc_scores_filename
def main(): def main():
args = get_args() args = get_args()
...@@ -120,7 +122,7 @@ def main(): ...@@ -120,7 +122,7 @@ def main():
check_tools() check_tools()
# Calculate Cross-correlation # Calculate Cross-correlation
xcor(tag, paired) xcor_filename = xcor(tag, paired)
if __name__ == '__main__': if __name__ == '__main__':
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment