diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 6e15b213afac9f222e9cde55e60a63f72cbd7951..44e4b23509e4eed5a289150ebb38eec7de26c62e 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -49,6 +49,8 @@ workflow_modules: - 'macs/2.1.0-20151222' - 'UCSC_userApps/v317' - 'R/3.4.1-gccmkl' + - 'meme/4.11.1-gcc-openmpi' + - 'python/2.7.x-anaconda' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 5a4b0920d5d94a1b8e72311e205a50b1fc96aa85..8ae4fd3c7d4cc747893143144bb7cd8bba8d5f25 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -55,6 +55,10 @@ process { module = ['R/3.4.1-gccmkl'] cpus = 32 } + $motifSearch { + module = ['python/2.7.x-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0'] + cpus = 32 + } } @@ -65,16 +69,19 @@ params { bwa = '/project/shared/bicf_workflow_ref/GRCh38' genomesize = 'hs' chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa' } 'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' genomesize = 'hs' chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa' } 'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' genomesize = 'mm' chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa' } } } diff --git a/workflow/main.nf b/workflow/main.nf index 577903a78c3fda3a20a47a1ad104e27397e6f10b..491f1ae59be42fefa943fb79aa22c192cd99150a 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -12,6 +12,7 @@ params.genomes = [] params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false +params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.cutoffRatio = 1.2 // Check inputs @@ -36,6 +37,7 @@ designFile = params.designFile genomeSize = params.genomeSize genome = params.genome chromSizes = params.chromSizes +fasta = params.fasta cutoffRatio = params.cutoffRatio process checkDesignFile { @@ -371,6 +373,7 @@ process consensusPeaks { file '*.rejected.*' into rejectedPeaks file("design_diffPeaks.csv") into designDiffPeaks file("design_annotatePeaks.tsv") into designAnnotatePeaks + file("design_annotatePeaks.tsv") into designMotifSearch file("unqiue_experiments.csv") into uniqueExperiments script: @@ -441,3 +444,23 @@ process diffPeaks { } } + +// Motif Search Peaks +process motifSearch { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designMotifSearch + + output: + + file "*meme*" into motifSearch + + script: + + """ + python2.7 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta + """ +} diff --git a/workflow/scripts/runMemechip.py b/workflow/scripts/motif_search.py similarity index 53% rename from workflow/scripts/runMemechip.py rename to workflow/scripts/motif_search.py index b1f848f0376a449329594d1fe3c67b3b0e454c84..b7756cb98d90b528f9f303e947070193432cc275 100644 --- a/workflow/scripts/runMemechip.py +++ b/workflow/scripts/motif_search.py @@ -1,4 +1,4 @@ -#!/qbrc/software/Python-2.7.7/bin/python +#!/usr/bin/env python # programmer : bbc # usage: @@ -8,12 +8,8 @@ from re import sub import string import argparse as ap import logging -import twobitreader import subprocess import pandas as pd -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord from multiprocessing import Pool logging.basicConfig(level=10) @@ -26,7 +22,7 @@ def prepare_argparser(): # argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output") argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file") # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N") - argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks") + # argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks") return(argparser) def rc(seq): @@ -38,8 +34,8 @@ def main(): args = argparser.parse_args() #run(args.infile, args.genome, args.limit, args.output) #get Pool ready - dfile = pd.read_csv(args.infile) - meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"/genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist()) + dfile = pd.read_csv(args.infile, sep='\t') + meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome]*dfile.shape[0],dfile['Condition'].tolist()) work_pool = Pool(min(12,dfile.shape[0])) resultList = work_pool.map(run_wrapper, meme_arglist) work_pool.close() @@ -49,43 +45,16 @@ def main(): def run_wrapper(args): run(*args) +def run(infile, genome, output): + # Get fasta file + fasta_command = "bedtools getfasta -fi "+genome+' -bed '+infile+' -fo '+output+'.fa' + p = subprocess.Popen(fasta_command, shell=True) + p.communicate() -def run(infile, genome, limit, output): - infile = open(infile).readlines() - logging.debug(len(infile)) - genome = twobitreader.TwoBitFile(genome) - outfile = open(output+".fa","w") - rowcount = 0 - limit = int(limit) - logging.debug(limit) - if limit < 0: - limit = len(infile) - for record in infile: - rowcount += 1 - #logging.debug(record) - if rowcount <=limit: - seqbuf = record.rstrip().split("\t") - try: - #logging.debug(record.chrom) - seq = genome[seqbuf[0]][int(seqbuf[1]):int(seqbuf[2])] - except: - pass - else: - if len(seqbuf)>=5: - if seqbuf[5] == "-": - seq = rc(seq) - if len(seqbuf)>=4: - newfa_name = seqbuf[3] - else: - newfa_name = "_".join(seqbuf) - newfa = SeqRecord(Seq(seq),newfa_name,description="") - #logging.debug(seq) - SeqIO.write(newfa,outfile,"fasta") - outfile.close() #Call memechip meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa" p = subprocess.Popen(meme_command, shell=True) p.communicate() - + if __name__=="__main__": main()