From 892d2c1bf31f16c5828d1ba93668cf846f63fd6c Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Tue, 14 Nov 2017 16:25:55 -0600 Subject: [PATCH] Adding in basics for calling peaks with macs2. --- workflow/scripts/call_peaks_macs.py | 250 ++++++++++++++++++++++++++++ workflow/scripts/utils.py | 50 ++++++ workflow/scripts/xcor.py | 6 +- 3 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 workflow/scripts/call_peaks_macs.py diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py new file mode 100644 index 0000000..1031bc8 --- /dev/null +++ b/workflow/scripts/call_peaks_macs.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 + +'''Generate peaks from data.''' + +import os +import argparse +import shutil +import logging +from multiprocessing import cpu_count +import utils +from xcor import xcor + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-t', '--tag', + help="The tagAlign file to perform peak calling on.", + required=True) + + parser.add_argument('-x', '--xcor', + help="The cross-correlation file (if already calculated).", + required=True) + + parser.add_argument('-c', '--con', + help="The control tagAling file used for peak calling.", + required=True) + + parser.add_argument('-s', '--sample', + help="The sample id to name the peak files.", + required=True) + + parser.add_argument('-g', '--genome', + help="The genome size of reference genome.", + required=True) + + parser.add_argument('-z', '--size', + help="The file with chromosome sizes of reference genome.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + r_path = shutil.which("R") + if r_path: + logger.info('Found R: %s', r_path) + else: + logger.error('Missing R') + raise Exception('Missing R') + + macs_path = shutil.which("macs2") + if r_path: + logger.info('Found MACS2: %s', macs_path) + else: + logger.error('Missing MACS2') + raise Exception('Missing MACS2') + + bg_bw_path = shutil.which("bedGraphToBigWig") + if r_path: + logger.info('Found bedGraphToBigWig: %s', bg_bw_path) + else: + logger.error('Missing bedGraphToBigWig') + raise Exception('Missing bedGraphToBigWig') + + +def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes): + + # Extract the fragment length estimate from column 3 of the + # cross-correlation scores file + with open(xcor, 'r') as xcor_fh: + firstline = xcor_fh.readline() + fragment_length = firstline.split()[2] # third column + logger.info("Fraglen %s" % (fragment_length)) + + + # Generate narrow peaks and preliminary signal tracks + + command = 'macs2 callpeak ' + \ + '-t %s -c %s ' % (experiment, control) + \ + '-f BED -n %s ' % (prefix) + \ + '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length) + + logger.info(command) + returncode = utils.block_on(command) + logger.info("MACS2 exited with returncode %d" % (returncode)) + assert returncode == 0, "MACS2 non-zero return" + + # MACS2 sometimes calls features off the end of chromosomes. + # Remove coordinates outside chromosome sizes + + narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix) + clipped_narrowpeak_fn = '%s-clipped' % (filename) + + + steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)] + + out, err = utils.run_pipe(steps) + + # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format + # (score must be <1000) + rescaled_narrowpeak_fn = utils.rescale_scores( + clipped_narrowpeak_fn, scores_col=5) + + # Sort by Col8 in descending order and replace long peak names in Column 4 + # with Peak_<peakRank> + steps = [ + 'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn), + r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""" + ] + + out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn)) + + # For Fold enrichment signal tracks + + # This file is a tab delimited file with 2 columns Col1 (chromosome name), + # Col2 (chromosome size in bp). + + command = 'macs2 bdgcmp ' + \ + '-t %s_treat_pileup.bdg ' % (prefix) + \ + '-c %s_control_lambda.bdg ' % (prefix) + \ + '-o %s_FE.bdg ' % (prefix) + \ + '-m FE' + + logger.info(command) + returncode = utils.block_on(command) + logger.info("MACS2 exited with returncode %d" % (returncode)) + assert returncode == 0, "MACS2 non-zero return" + + # Remove coordinates outside chromosome sizes (MACS2 bug) + fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix) + fc_signal_fn = "%s.fc_signal.bw" % (prefix) + steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)] + + out, err = utils.run_pipe(steps) + + # Convert bedgraph to bigwig + command = 'bedGraphToBigWig ' + \ + '%s ' % (fc_bedgraph_fn) + \ + '%s ' % (chrom_sizes) + \ + '%s' % (fc_signal_fn) + + logger.info(command) + returncode = utils.block_on(command) + logger.info("bedGraphToBigWig exited with returncode %d" % (returncode)) + assert returncode == 0, "bedGraphToBigWig non-zero return" + + # For -log10(p-value) signal tracks + + # Compute sval = + # min(no. of reads in ChIP, no. of reads in control) / 1,000,000 + out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l']) + chip_reads = out.strip() + out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l']) + control_reads = out.strip() + sval = str(min(float(chip_reads), float(control_reads)) / 1000000) + + logger.info("chip_reads = %s, control_reads = %s, sval = %s" % + (chip_reads, control_reads, sval)) + + command = 'macs2 bdgcmp ' + \ + '-t %s_treat_pileup.bdg ' % (prefix) + \ + '-c %s_control_lambda.bdg ' % (prefix) + \ + '-o %s_ppois.bdg ' % (prefix) + \ + '-m ppois -S %s' % (sval) + + logger.info(command) + returncode = utils.block_on(command) + assert returncode == 0, "MACS2 non-zero return" + + # Remove coordinates outside chromosome sizes (MACS2 bug) + pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix) + pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix) + steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)] + + out, err = utils.run_pipe(steps) + + # Convert bedgraph to bigwig + command = 'bedGraphToBigWig ' + \ + '%s ' % (pvalue_bedgraph_fn) + \ + '%s ' % (chrom_sizes) + \ + '%s' % (fc_signal_fn) + + logger.info(command) + returncode = utils.block_on(command) + logger.info("bedGraphToBigWig exited with returncode %d" % (returncode)) + assert returncode == 0, "bedGraphToBigWig non-zero return" + + +def main(): + args = get_args() + tag = args.tag + xcor = args.xcor + con = args.con + sample = args.sample + genome_size = args.genome + chrom_size = args.size + paired = args.paired + + + # Create a file handler + handler = logging.FileHandler('call_peaks.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Calculate Cross-correlation if not already calcualted + if xcor == 'Calculate': + xcor_file = xcor(tag, paired) + else: + xcor_file = xcor + + # Call Peaks using MACS2 + call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py index 2643c6e..ed90dc7 100644 --- a/workflow/scripts/utils.py +++ b/workflow/scripts/utils.py @@ -45,6 +45,14 @@ def run_pipe(steps, outfile=None): return out, err +def block_on(command): + process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + for line in iter(process.stdout.readline, ''): + sys.stdout.write(line) + process.wait() + return process.returncode + + def strip_extensions(filename, extensions): '''Strips extensions to get basename of file.''' @@ -72,3 +80,45 @@ def count_lines(filename): 'wc -l' ]) return int(out) + + +def rescale_scores(filename, scores_col, new_min=10, new_max=1000): + n_peaks = count_lines(filename) + sorted_fn = '%s-sorted' % (filename) + rescaled_fn = '%s-rescaled' % (filename) + + out, err = run_pipe([ + 'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn), + r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""], + sorted_fn) + + out, err = run_pipe([ + 'head -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + max_score = float(out.strip()) + logger.info("rescale_scores: max_score = %s" % (max_score)) + + out, err = run_pipe([ + 'tail -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + min_score = float(out.strip()) + logger.info("rescale_scores: min_score = %s" % (min_score)) + + a = min_score + b = max_score + x = new_min + y = new_max + if min_score == max_score: # give all peaks new_min + rescale_formula = "x" + else: # n is the unscaled score from scores_col + rescale_formula = "((n-a)*(y-x)/(b-a))+x" + out, err = run_pipe( + [ + 'cat %s' % (sorted_fn), + r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" + % (scores_col, a, b, x, y) + + r"""{$%d=int(%s) ; print $0}'""" + % (scores_col, rescale_formula) + ], + rescaled_fn) + return rescaled_fn diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py index 0bf51e6..5283859 100644 --- a/workflow/scripts/xcor.py +++ b/workflow/scripts/xcor.py @@ -24,7 +24,7 @@ logger.setLevel(logging.INFO) def get_args(): '''Define arguments.''' - + parser = argparse.ArgumentParser( description=__doc__, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter) @@ -106,6 +106,8 @@ def xcor(tag, paired): cc_plot_filename, cc_scores_filename) ]) + return cc_scores_filename + def main(): args = get_args() @@ -120,7 +122,7 @@ def main(): check_tools() # Calculate Cross-correlation - xcor(tag, paired) + xcor_filename = xcor(tag, paired) if __name__ == '__main__': -- GitLab