Skip to content
Snippets Groups Projects
motif_search.py 4.49 KiB
Newer Older
Venkat Malladi's avatar
Venkat Malladi committed
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#

'''Call Motifs on called peaks.'''

Beibei Chen's avatar
Beibei Chen committed
import logging
Venkat Malladi's avatar
Venkat Malladi committed
import shutil
import subprocess
Venkat Malladi's avatar
Venkat Malladi committed
from multiprocessing import Pool
Beibei Chen's avatar
Beibei Chen committed
import pandas as pd
Venkat Malladi's avatar
Venkat Malladi committed

Beibei Chen's avatar
Beibei Chen committed

EPILOG = '''
For more details:
        %(prog)s --help
'''
Beibei Chen's avatar
Beibei Chen committed

Beibei Chen's avatar
Beibei Chen committed

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
Beibei Chen's avatar
Beibei Chen committed

Beibei Chen's avatar
Beibei Chen committed

# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
Venkat Malladi's avatar
Venkat Malladi committed
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
    '''Define arguments.'''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-d', '--design',
                        help="The design file to run motif search.",
                        required=True)

    parser.add_argument('-g', '--genome',
                        help="The genome FASTA file.",
                        required=True)

    parser.add_argument('-p', '--peak',
                        help="The number of peaks to use.",
                        required=True)

    args = parser.parse_args()
    return args

# Functions
Beibei Chen's avatar
Beibei Chen committed

Venkat Malladi's avatar
Venkat Malladi committed

Venkat Malladi's avatar
Venkat Malladi committed
def check_tools():
    '''Checks for required componenets on user system'''

    logger.info('Checking for required libraries and components on this system')

    meme_path = shutil.which("meme")
    if meme_path:
        logger.info('Found meme: %s', meme_path)

        # Get Version
        memechip_version_command = "meme-chip --version"
        memechip_version = subprocess.check_output(memechip_version_command, shell=True)

        # Write to file
        meme_file = open("version_memechip.txt", "wb")
        meme_file.write(b"Version %s" % (memechip_version))
        meme_file.close()
Venkat Malladi's avatar
Venkat Malladi committed
    else:
        logger.error('Missing meme')
        raise Exception('Missing meme')

    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)

        # Get Version
        bedtools_version_command = "bedtools --version"
        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)

        # Write to file
Venkat Malladi's avatar
Venkat Malladi committed
        bedtools_file = open("version_bedtools.txt", "wb")
        bedtools_file.write(bedtools_version)
        bedtools_file.close()
Venkat Malladi's avatar
Venkat Malladi committed
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')


Beibei Chen's avatar
Beibei Chen committed
def run_wrapper(args):
Venkat Malladi's avatar
Venkat Malladi committed
    motif_search(*args)

def motif_search(filename, genome, experiment, peak):
    '''Run motif serach on peaks.'''
    file_basename = os.path.basename(
        utils.strip_extensions(filename, STRIP_EXTENSIONS))
    out_fa = '%s.fa' % (experiment)
    out_motif = '%s_memechip' % (experiment)

    # Sort Bed file and limit number of peaks
    if peak == -1:
        peak = utils.count_lines(filename)
        peak_no = 'all'
    else:
        peak_no = peak

Venkat Malladi's avatar
Venkat Malladi committed
    sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)

    out, err = utils.run_pipe([
        'sort -k %dgr,%dgr %s' % (5, 5, filename),
        'head -n %s' % (peak)], outfile=sorted_fn)

    # Get fasta file
    out, err = utils.run_pipe([
        'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
Venkat Malladi's avatar
Venkat Malladi committed
    if err:
        logger.error("bedtools error: %s", err)

Venkat Malladi's avatar
Venkat Malladi committed
    # Call memechip
    out, err = utils.run_pipe([
        'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
Venkat Malladi's avatar
Venkat Malladi committed
    if err:
        logger.error("meme-chip error: %s", err)


def main():
    args = get_args()
    design = args.design
    genome = args.genome
Venkat Malladi's avatar
Venkat Malladi committed
    # Create a file handler
    handler = logging.FileHandler('motif.log')
    logger.addHandler(handler)

    # Check if tools are present
    check_tools()

    # Read files
    design_df = pd.read_csv(design, sep='\t')

Venkat Malladi's avatar
Venkat Malladi committed
    meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
    work_pool = Pool(min(12, design_df.shape[0]))
    return_list = work_pool.map(run_wrapper, meme_arglist)
    work_pool.close()
    work_pool.join()
Venkat Malladi's avatar
Venkat Malladi committed

if __name__ == '__main__':
    main()