Skip to content
Snippets Groups Projects
overlap_peaks.py 7.51 KiB
Newer Older
#!/usr/bin/env python3

'''Generate naive overlap peak files and design file for downstream processing.'''

import os
import argparse
import logging
import shutil
Holly Ruess's avatar
Holly Ruess committed
import subprocess
import pandas as pd
Venkat Malladi's avatar
Venkat Malladi committed
import utils

EPILOG = '''
For more details:
        %(prog)s --help
'''

# SETTINGS

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)


def get_args():
    '''Define arguments.'''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-d', '--design',
                        help="The design file of peaks (tsv format).",
                        required=True)

    parser.add_argument('-f', '--files',
                        help="The design file of with bam files (tsv format).",
                        required=True)

Holly Ruess's avatar
Holly Ruess committed
    parser.add_argument('-b', '--blacklist',
                        help="Bed file of blacklisted regions to remove",
                        required=False)

    args = parser.parse_args()
    return args


def check_tools():
    '''Checks for required componenets on user system.'''

    logger.info('Checking for required libraries and components on this system')

    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)
Holly Ruess's avatar
Holly Ruess committed

        # Get Version
        bedtools_version_command = "bedtools --version"
        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)

        # Write to file
        bedtools_file = open("version_bedtools.txt", "wb")
        bedtools_file.write(bedtools_version)
        bedtools_file.close()
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')


Holly Ruess's avatar
Holly Ruess committed
def update_design(design):
Holly Ruess's avatar
Holly Ruess committed
    '''Update design file for differential binding.'''

    logger.info("Running control file update.")

    file_dict = design[['sample_id', 'bam_reads']] \
                .set_index('sample_id').T.to_dict()

    design = design.drop('bam_index', axis=1)

    logger.info("Adding peaks column.")

    design = design.assign(peak='', peak_caller='bed')

    return design


def overlap(experiment, design):
    '''Calculate the overlap of peaks'''

    logger.info("Determining consenus peaks for experiment %s.", experiment)

    # Output File names
    peak_type = 'narrowPeak'
    overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)

    # Intermediate File names
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Assign Pooled and Psuedoreplicate peaks
    pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
    pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
    pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]

    # Remove non true replicate rows
    not_replicates = ['1_pr', '2_pr', 'pooled']
    design_true_reps = design[~design['replicate'].isin(not_replicates)]
    true_rep_peaks = design_true_reps.peaks.unique()

    # Find overlaps
    awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
    cut_command = 'cut -f 1-10'

    # Find pooled peaks that overlap Rep1 and Rep2
    # where overlap is defined as the fractional overlap
    # with any one of the overlapping peak pairs  >= 0.5

    steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
                  awk_command,
                  cut_command,
Holly Ruess's avatar
Holly Ruess committed
                  'sort -u',
                 ]

    iter_true_peaks = iter(true_rep_peaks)
    next(iter_true_peaks)

    if len(true_rep_peaks) > 1:
        for true_peak in true_rep_peaks[1:]:
            steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
                               awk_command,
                               cut_command,
Holly Ruess's avatar
Holly Ruess committed
                               'sort -u',
                              ])

    out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (utils.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
    # where overlap is defined as the fractional overlap
    # with any one of the overlapping peak pairs  >= 0.5

    steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
                    awk_command,
                    cut_command,
                    'sort -u',
                    'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
                    awk_command,
                    cut_command,
Holly Ruess's avatar
Holly Ruess committed
                    'sort -u',
                   ]

    out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates"
          % (utils.count_lines(overlap_pr_fn)))

    # Make union of peak lists
    out, err = utils.run_pipe([
                'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
Holly Ruess's avatar
Holly Ruess committed
                'sort -u',
                ], overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudorepliates"
            % (utils.count_lines(overlapping_peaks_fn)))

    # Make rejected peak list
    out, err = utils.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))

    # Remove temporary files
    os.remove(overlap_tr_fn)
    os.remove(overlap_pr_fn)

Holly Ruess's avatar
Holly Ruess committed
    return os.path.abspath(overlapping_peaks_fn)
Holly Ruess's avatar
Holly Ruess committed
def blacklist_peaks(experiment, blacklist):
    '''Remove blacklist peaks from replicated peaks'''

    logger.info("Removing blacklist regions from replicated peaks")

Holly Ruess's avatar
Holly Ruess committed
    narrowpead_input = "%s.replicated.narrowPeak" % (experiment)
Holly Ruess's avatar
Holly Ruess committed
    # Assign output
    blpo_filename = "%s.replicated_noblacklist.narrowPeak" % (experiment)

    # Remove
    steps = [
Holly Ruess's avatar
Holly Ruess committed
        "bedtools intersect -v -a %s -b %s" % (narrowpead_input, blacklist),
Holly Ruess's avatar
Holly Ruess committed
        r"""awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}'""",
Holly Ruess's avatar
Holly Ruess committed
        ]
    out, err = utils.run_pipe(steps, blpo_filename)


def main():
    args = get_args()
    design = args.design
    files = args.files
Holly Ruess's avatar
Holly Ruess committed
    blacklist = args.files

    # Create a file handler
    handler = logging.FileHandler('consensus_peaks.log')
    logger.addHandler(handler)

Holly Ruess's avatar
Holly Ruess committed
    # Check if tools are present
    check_tools()

    # Read files as dataframes
    design_peaks_df = pd.read_csv(design, sep='\t')
    design_files_df = pd.read_csv(files, sep='\t')

    # Make a design file for
Holly Ruess's avatar
Holly Ruess committed
    design_diff = update_design(design_files_df)

    # Find consenus overlap peaks for each experiment
    for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
        replicated_peak = overlap(experiment, df_experiment)
        design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak

    # Write out file
Holly Ruess's avatar
Holly Ruess committed
    design_diff.columns = ['SampleID',
                           'bamReads',
                           'Condition',
                           'Replicate',
                           'Peaks',
                           'PeakCaller']
Holly Ruess's avatar
Holly Ruess committed
    design_diff.to_csv("design_exQC.tsv", header=True, sep='\t', index=False)
Holly Ruess's avatar
Holly Ruess committed
    # Remove blacklist regions; if blacklist = True
    if blacklist:
        for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
            bl_peaks = blacklist_peaks(experiment, blacklist)

if __name__ == '__main__':
    main()