From 88a757f38a77dc2ba9a5a7d0a40726cc0bee5fac Mon Sep 17 00:00:00 2001 From: s181706 <jonathan.gesell@utsouthwestern.edu> Date: Thu, 23 Jan 2020 15:35:54 -0600 Subject: [PATCH] Added utils.py --- workflow/scripts/utils.py | 136 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 workflow/scripts/utils.py diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py new file mode 100644 index 0000000..5e4478e --- /dev/null +++ b/workflow/scripts/utils.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +# +# * -------------------------------------------------------------------------- +# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md) +# * -------------------------------------------------------------------------- +# + +'''General utilities.''' + + +import shlex +import logging +import subprocess +import sys +import os + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = True + + +def run_pipe(steps, outfile=None): + from subprocess import Popen, PIPE + p = None + p_next = None + first_step_n = 1 + last_step_n = len(steps) + for n, step in enumerate(steps, start=first_step_n): + logger.debug("step %d: %s" % (n, step)) + if n == first_step_n: + if n == last_step_n and outfile: # one-step pipeline with outfile + with open(outfile, 'w') as fh: + print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p = Popen(shlex.split(step), stdout=fh) + break + print("first step shlex to stdout: %s" % (shlex.split(step))) + + p = Popen(shlex.split(step), stdout=PIPE) + elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file + with open(outfile, 'w') as fh: + print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) + p.stdout.close() + p = p_last + else: # handles intermediate steps and, in the case of a pipe to stdout, the last step + print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) + p.stdout.close() + p = p_next + out, err = p.communicate() + return out, err + + +def block_on(command): + process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + for line in iter(process.stdout.readline, b''): + sys.stdout.write(line.decode('utf-8')) + process.communicate() + return process.returncode + + +def strip_extensions(filename, extensions): + '''Strips extensions to get basename of file.''' + + basename = filename + for extension in extensions: + basename = basename.rpartition(extension)[0] or basename + + return basename + + +def count_lines(filename): + import mimetypes + compressed_mimetypes = [ + "compress", + "bzip2", + "gzip" + ] + mime_type = mimetypes.guess_type(filename)[1] + if mime_type in compressed_mimetypes: + catcommand = 'gzip -dc' + else: + catcommand = 'cat' + out, err = run_pipe([ + '%s %s' % (catcommand, filename), + 'wc -l' + ]) + return int(out) + + +def rescale_scores(filename, scores_col, new_min=10, new_max=1000): + sorted_fn = 'sorted-%s' % (filename) + rescaled_fn = 'rescaled-%s' % (filename) + + out, err = run_pipe([ + 'sort -k %dgr,%dgr %s' % (scores_col, scores_col, filename), + r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""], + sorted_fn) + + out, err = run_pipe([ + 'head -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + max_score = float(out.strip()) + logger.info("rescale_scores: max_score = %s", max_score) + + out, err = run_pipe([ + 'tail -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + min_score = float(out.strip()) + logger.info("rescale_scores: min_score = %s", min_score) + + a = min_score + b = max_score + x = new_min + y = new_max + + if min_score == max_score: # give all peaks new_min + rescale_formula = "x" + else: # n is the unscaled score from scores_col + rescale_formula = "((n-a)*(y-x)/(b-a))+x" + + out, err = run_pipe( + [ + 'cat %s' % (sorted_fn), + r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" + % (scores_col, a, b, x, y) + + r"""{$%d=int(%s) ; print $0}'""" + % (scores_col, rescale_formula) + ], + rescaled_fn) + + os.remove(sorted_fn) + + return rescaled_fn -- GitLab