From e2f0afca1f8cabffa04ced5ce68d5ca137a45b9e Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Sun, 16 Dec 2018 23:29:09 -0600 Subject: [PATCH] Update motif search scripts to meet new python standards --- workflow/conf/biohpc.config | 2 +- workflow/main.nf | 4 +- workflow/scripts/motif_search.py | 108 +++++++++++++++++++------------ 3 files changed, 70 insertions(+), 44 deletions(-) diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 7d54c2f..c6d3953 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -57,7 +57,7 @@ process { cpus = 32 } withName: motifSearch { - module = ['python/2.7.x-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0'] + module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0'] cpus = 32 } diff --git a/workflow/main.nf b/workflow/main.nf index e770f8f..2905ed0 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -455,11 +455,11 @@ process motifSearch { output: - file "*meme*" into motifSearch + file "*meme_chip" into motifSearch script: """ - python2.7 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta + python3 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta """ } diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py index b7756cb..5cf2785 100644 --- a/workflow/scripts/motif_search.py +++ b/workflow/scripts/motif_search.py @@ -1,60 +1,86 @@ -#!/usr/bin/env python -# programmer : bbc -# usage: +#!/usr/bin/env python3 + +'''Call Motifs on called peaks.''' + import sys import re from re import sub import string -import argparse as ap +import argparse import logging import subprocess import pandas as pd +import utils from multiprocessing import Pool -logging.basicConfig(level=10) +EPILOG = ''' +For more details: + %(prog)s --help +''' -def prepare_argparser(): - description = "Run memechip command" - epilog = "For command line options of each command, type %(prog)% COMMAND -h" - argparser = ap.ArgumentParser(description=description, epilog = epilog) - argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file") -# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output") - argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file") - # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N") - # argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks") - return(argparser) +# SETTINGS -def rc(seq): - comps = {'A':"T",'C':"G",'G':"C",'T':"A","N":"N"} - return ''.join([comps[x] for x in seq.upper()[::-1]]) +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) -def main(): - argparser = prepare_argparser() - args = argparser.parse_args() - #run(args.infile, args.genome, args.limit, args.output) - #get Pool ready - dfile = pd.read_csv(args.infile, sep='\t') - meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome]*dfile.shape[0],dfile['Condition'].tolist()) - work_pool = Pool(min(12,dfile.shape[0])) - resultList = work_pool.map(run_wrapper, meme_arglist) - work_pool.close() - work_pool.join() +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to run motif search.", + required=True) + + parser.add_argument('-g', '--genome', + help="The genome FASTA file.", + required=True) + + args = parser.parse_args() + return args + +# Functions def run_wrapper(args): - run(*args) - -def run(infile, genome, output): - # Get fasta file - fasta_command = "bedtools getfasta -fi "+genome+' -bed '+infile+' -fo '+output+'.fa' - p = subprocess.Popen(fasta_command, shell=True) - p.communicate() - - #Call memechip - meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa" - p = subprocess.Popen(meme_command, shell=True) - p.communicate() + motif_search(*args) + +def motif_search(filename, genome, experiment): + sorted_fn = 'sorted-%s' % (filename) + out_fa = '%s.fa' % (experiment) + out_motif = '%s_memechip' % (experiment) + + # Sort Bed file by + out, err = run_pipe([ + 'sort -k %dgr,%dgr %s' % (5, 5, filename), + sorted_fn) + + # Get fasta file + out, err = utils.run_pipe([ + '"bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)]) + + #Call memechip + out, err = utils.run_pipe([ + '"meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -meme-norand' % (out_motif, out_fa)]) + +def main(): + args = get_args() + design = args.design + genome = args.genome + + # Read files + design_df = pd.read_csv(design, sep='\t') + + meme_arglist = zip(design_df['Peaks'].tolist(),[genome]*design_df.shape[0],design_df['Experiment'].tolist()) + work_pool = Pool(min(12,design_df.shape[0])) + return_list = work_pool.map(run_wrapper, meme_arglist) + work_pool.close() + work_pool.join() if __name__=="__main__": main() -- GitLab