Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
xcor.py 4.05 KiB
#!/usr/bin/env python3

'''Compute cross-correlation analysis.'''

import os
import argparse
import shutil
import subprocess
import logging
from multiprocessing import cpu_count
import utils

EPILOG = '''
For more details:
        %(prog)s --help
'''

# SETTINGS

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)


# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']


def get_args():
    '''Define arguments.'''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-t', '--tag',
                        help="The tagAlign file to qc on.",
                        required=True)

    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single end.",
                        default=False,
                        action='store_true')

    args = parser.parse_args()
    return args

# Functions


def check_tools():
    '''Checks for required componenets on user system'''

    logger.info('Checking for required libraries and components on this system')

    r_path = shutil.which("R")
    if r_path:
        logger.info('Found R: %s', r_path)

        # Get Version
        r_version_command = "R --version"
        r_version = subprocess.check_output(r_version_command, shell=True)

        # Write to file
        r_file = open("version_r.txt", "wb")
        r_file.write(r_version)
        r_file.close()
    else:
        logger.error('Missing R')
        raise Exception('Missing R')

    phantompeak_path = shutil.which("run_spp.R")
    if phantompeak_path:
        logger.info('Found phantompeak: %s', phantompeak_path)

        # Get Version
        spp_version_command = "R -e \"packageVersion('spp')\""
        spp_version = subprocess.check_output(spp_version_command, shell=True)

        # Write to file
        spp_file = open("version_spp.txt", "wb")
        spp_file.write(spp_version)
        spp_file.close()
    else:
        logger.error('Missing phantompeak')
        raise Exception('Missing phantompeak')


def xcor(tag, paired):
    '''Use spp to calculate cross-correlation stats.'''

    tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
    uncompressed_tag_filename = tag_basename

    # Subsample tagAlign file
    number_reads = 15000000
    subsampled_tag_filename = \
        tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)

    steps = [
        'zcat %s' % (tag),
        'grep -v "chrM"',
        'shuf -n %d --random-source=%s' % (number_reads, tag)]

    if paired:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])

    steps.extend(['gzip -nc'])

    out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)

    # Calculate Cross-correlation QC scores
    cc_scores_filename = tag_basename + ".cc.qc"
    cc_plot_filename = tag_basename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    run_spp_command = shutil.which("run_spp.R")
    out, err = utils.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (run_spp_command, subsampled_tag_filename, cpu_count(),
         cc_plot_filename, cc_scores_filename)
    ])

    return cc_scores_filename


def main():
    args = get_args()
    paired = args.paired
    tag = args.tag

    # Create a file handler
    handler = logging.FileHandler('xcor.log')
    logger.addHandler(handler)

    # Check if tools are present
    check_tools()

    # Calculate Cross-correlation
    xcor(tag, paired)


if __name__ == '__main__':
    main()