Skip to content
Snippets Groups Projects

Containerized version with meme-chip 5.5.4

#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Call Motifs on called peaks.'''
import os
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run motif search.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome FASTA file.",
required=True)
parser.add_argument('-p', '--peak',
help="The number of peaks to use.",
required=True)
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif serach on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
'head -n %s' % (peak)], outfile=sorted_fn)
# Get fasta file
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
design = args.design
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__ == '__main__':
main()