#!/usr/bin/env python3 '''Compute cross-correlation analysis.''' import os import argparse import shutil import logging from multiprocessing import cpu_count from python_utils import utils EPILOG = ''' For more details: %(prog)s --help ''' # SETTINGS logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) logger.propagate = False logger.setLevel(logging.INFO) def get_args(): '''Define arguments.''' parser = argparse.ArgumentParser( description=__doc__, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-t', '--tag', help="The tagAlign file to qc on.", required=True) parser.add_argument('-p', '--paired', help="True/False if paired-end or single end.", default=False, action='store_true') args = parser.parse_args() return args # Functions def check_tools(): '''Checks for required componenets on user system''' logger.info('Checking for required libraries and components on this system') r_path = shutil.which("R") if r_path: logger.info('Found R: %s', r_path) else: logger.error('Missing R') raise Exception('Missing R') def xcor(tag, paired): '''Use spp to calculate cross-correlation stats.''' tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz'])) uncompressed_tag_filename = tag_basename out, err = utils.run_pipe([ 'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename) # Subsample tagAlign file numReads = 15000000 subsampled_tag_filename = \ tag_basename + ".%d.tagAlign.gz" % (numReads/1000000) steps = [ 'grep -v "chrM" %s' % (uncompressed_tag_filename), 'shuf -n %d --random-source=%s' % (numReads, uncompressed_tag_filename) ] if paired: steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) steps.extend(['gzip -nc']) out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename) # Calculate Cross-correlation QC scores cc_scores_filename = subsampled_tag_filename + ".cc.qc" cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf" # CC_SCORE FILE format # Filename <tab> # numReads <tab> # estFragLen <tab> # corr_estFragLen <tab> # PhantomPeak <tab> # corr_phantomPeak <tab> # argmin_corr <tab> # min_corr <tab> # phantomPeakCoef <tab> # relPhantomPeakCoef <tab> # QualityTag run_spp_command = shutil.which("run_spp.R") out, err = utils.run_pipe([ "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" % (run_spp_command, subsampled_tag_filename, cpu_count(), cc_plot_filename, cc_scores_filename) ]) return cc_scores_filename def main(): args = get_args() paired = args.paired tag = args.tag # Create a file handler handler = logging.FileHandler('xcor.log') logger.addHandler(handler) # Check if tools are present check_tools() # Calculate Cross-correlation xcor_filename = xcor(tag, paired) if __name__ == '__main__': main()