From 892d2c1bf31f16c5828d1ba93668cf846f63fd6c Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Tue, 14 Nov 2017 16:25:55 -0600
Subject: [PATCH] Adding in basics for calling peaks with macs2.

---
 workflow/scripts/call_peaks_macs.py | 250 ++++++++++++++++++++++++++++
 workflow/scripts/utils.py           |  50 ++++++
 workflow/scripts/xcor.py            |   6 +-
 3 files changed, 304 insertions(+), 2 deletions(-)
 create mode 100644 workflow/scripts/call_peaks_macs.py

diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py
new file mode 100644
index 0000000..1031bc8
--- /dev/null
+++ b/workflow/scripts/call_peaks_macs.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+
+'''Generate peaks from data.'''
+
+import os
+import argparse
+import shutil
+import logging
+from multiprocessing import cpu_count
+import utils
+from xcor import xcor
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-t', '--tag',
+                        help="The tagAlign file to perform peak calling on.",
+                        required=True)
+
+    parser.add_argument('-x', '--xcor',
+                        help="The cross-correlation file (if already calculated).",
+                        required=True)
+
+    parser.add_argument('-c', '--con',
+                        help="The control tagAling file used for peak calling.",
+                        required=True)
+
+    parser.add_argument('-s', '--sample',
+                        help="The sample id to name the peak files.",
+                        required=True)
+
+    parser.add_argument('-g', '--genome',
+                        help="The genome size of reference genome.",
+                        required=True)
+
+    parser.add_argument('-z', '--size',
+                        help="The file with chromosome sizes of reference genome.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+# Functions
+
+
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    r_path = shutil.which("R")
+    if r_path:
+        logger.info('Found R: %s', r_path)
+    else:
+        logger.error('Missing R')
+        raise Exception('Missing R')
+
+    macs_path = shutil.which("macs2")
+    if r_path:
+        logger.info('Found MACS2: %s', macs_path)
+    else:
+        logger.error('Missing MACS2')
+        raise Exception('Missing MACS2')
+
+    bg_bw_path = shutil.which("bedGraphToBigWig")
+    if r_path:
+        logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
+    else:
+        logger.error('Missing bedGraphToBigWig')
+        raise Exception('Missing bedGraphToBigWig')
+
+
+def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
+
+    # Extract the fragment length estimate from column 3 of the
+    # cross-correlation scores file
+    with open(xcor, 'r') as xcor_fh:
+        firstline = xcor_fh.readline()
+        fragment_length = firstline.split()[2]  # third column
+        logger.info("Fraglen %s" % (fragment_length))
+
+
+    # Generate narrow peaks and preliminary signal tracks
+
+    command = 'macs2 callpeak ' + \
+              '-t %s -c %s ' % (experiment, control) + \
+              '-f BED -n %s ' % (prefix) + \
+              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("MACS2 exited with returncode %d" % (returncode))
+    assert returncode == 0, "MACS2 non-zero return"
+
+    # MACS2 sometimes calls features off the end of chromosomes.
+    # Remove coordinates outside chromosome sizes
+
+    narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
+    clipped_narrowpeak_fn = '%s-clipped' % (filename)
+
+
+    steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
+            'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
+
+    out, err = utils.run_pipe(steps)
+
+    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
+    # (score must be <1000)
+    rescaled_narrowpeak_fn = utils.rescale_scores(
+        clipped_narrowpeak_fn, scores_col=5)
+
+    # Sort by Col8 in descending order and replace long peak names in Column 4
+    # with Peak_<peakRank>
+    steps = [
+        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
+        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
+    ]
+
+    out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
+
+    # For Fold enrichment signal tracks
+
+    # This file is a tab delimited file with 2 columns Col1 (chromosome name),
+    # Col2 (chromosome size in bp).
+
+    command = 'macs2 bdgcmp ' + \
+          '-t %s_treat_pileup.bdg ' % (prefix) + \
+          '-c %s_control_lambda.bdg ' % (prefix) + \
+          '-o %s_FE.bdg ' % (prefix) + \
+          '-m FE'
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("MACS2 exited with returncode %d" % (returncode))
+    assert returncode == 0, "MACS2 non-zero return"
+
+    # Remove coordinates outside chromosome sizes (MACS2 bug)
+    fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
+    fc_signal_fn = "%s.fc_signal.bw" % (prefix)
+    steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
+            'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
+
+    out, err = utils.run_pipe(steps)
+
+    # Convert bedgraph to bigwig
+    command = 'bedGraphToBigWig ' + \
+          '%s ' % (fc_bedgraph_fn) + \
+          '%s ' % (chrom_sizes) + \
+          '%s' % (fc_signal_fn)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    assert returncode == 0, "bedGraphToBigWig non-zero return"
+
+    # For -log10(p-value) signal tracks
+
+    # Compute sval =
+    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
+    out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
+    chip_reads = out.strip()
+    out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
+    control_reads = out.strip()
+    sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
+
+    logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
+            (chip_reads, control_reads, sval))
+
+    command = 'macs2 bdgcmp ' + \
+          '-t %s_treat_pileup.bdg ' % (prefix) + \
+          '-c %s_control_lambda.bdg ' % (prefix) + \
+          '-o %s_ppois.bdg ' % (prefix) + \
+          '-m ppois -S %s' % (sval)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    assert returncode == 0, "MACS2 non-zero return"
+
+    # Remove coordinates outside chromosome sizes (MACS2 bug)
+    pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
+    pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
+    steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
+            'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
+
+    out, err = utils.run_pipe(steps)
+
+    # Convert bedgraph to bigwig
+    command = 'bedGraphToBigWig ' + \
+          '%s ' % (pvalue_bedgraph_fn) + \
+          '%s ' % (chrom_sizes) + \
+          '%s' % (fc_signal_fn)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    assert returncode == 0, "bedGraphToBigWig non-zero return"
+
+
+def main():
+    args = get_args()
+    tag = args.tag
+    xcor = args.xcor
+    con = args.con
+    sample = args.sample
+    genome_size = args.genome
+    chrom_size = args.size
+    paired = args.paired
+
+
+    # Create a file handler
+    handler = logging.FileHandler('call_peaks.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Calculate Cross-correlation if not already calcualted
+    if xcor == 'Calculate':
+        xcor_file = xcor(tag, paired)
+    else:
+        xcor_file = xcor
+
+    # Call Peaks using MACS2
+    call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py
index 2643c6e..ed90dc7 100644
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
@@ -45,6 +45,14 @@ def run_pipe(steps, outfile=None):
     return out, err
 
 
+def block_on(command):
+    process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
+    for line in iter(process.stdout.readline, ''):
+        sys.stdout.write(line)
+    process.wait()
+    return process.returncode
+
+
 def strip_extensions(filename, extensions):
     '''Strips extensions to get basename of file.'''
 
@@ -72,3 +80,45 @@ def count_lines(filename):
         'wc -l'
         ])
     return int(out)
+
+
+def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
+    n_peaks = count_lines(filename)
+    sorted_fn = '%s-sorted' % (filename)
+    rescaled_fn = '%s-rescaled' % (filename)
+
+    out, err = run_pipe([
+        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn),
+        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
+        sorted_fn)
+
+    out, err = run_pipe([
+        'head -n 1 %s' % (sorted_fn),
+        'cut -f %s' % (scores_col)])
+    max_score = float(out.strip())
+    logger.info("rescale_scores: max_score = %s" % (max_score))
+
+    out, err = run_pipe([
+        'tail -n 1 %s' % (sorted_fn),
+        'cut -f %s' % (scores_col)])
+    min_score = float(out.strip())
+    logger.info("rescale_scores: min_score = %s" % (min_score))
+
+    a = min_score
+    b = max_score
+    x = new_min
+    y = new_max
+    if min_score == max_score:  # give all peaks new_min
+        rescale_formula = "x"
+    else:  # n is the unscaled score from scores_col
+        rescale_formula = "((n-a)*(y-x)/(b-a))+x"
+    out, err = run_pipe(
+        [
+            'cat %s' % (sorted_fn),
+            r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
+            % (scores_col, a, b, x, y) +
+            r"""{$%d=int(%s) ; print $0}'"""
+            % (scores_col, rescale_formula)
+        ],
+        rescaled_fn)
+    return rescaled_fn
diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py
index 0bf51e6..5283859 100644
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -24,7 +24,7 @@ logger.setLevel(logging.INFO)
 
 def get_args():
     '''Define arguments.'''
-    
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -106,6 +106,8 @@ def xcor(tag, paired):
          cc_plot_filename, cc_scores_filename)
     ])
 
+    return cc_scores_filename
+
 
 def main():
     args = get_args()
@@ -120,7 +122,7 @@ def main():
     check_tools()
 
     # Calculate Cross-correlation
-    xcor(tag, paired)
+    xcor_filename = xcor(tag, paired)
 
 
 if __name__ == '__main__':
-- 
GitLab