Skip to content
Snippets Groups Projects
Commit 5198436d authored by Venkat Malladi's avatar Venkat Malladi
Browse files

add in motif search.

parent 65bc6b76
Branches
Tags
No related merge requests found
......@@ -49,6 +49,8 @@ workflow_modules:
- 'macs/2.1.0-20151222'
- 'UCSC_userApps/v317'
- 'R/3.4.1-gccmkl'
- 'meme/4.11.1-gcc-openmpi'
- 'python/2.7.x-anaconda'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......
......@@ -55,6 +55,10 @@ process {
module = ['R/3.4.1-gccmkl']
cpus = 32
}
$motifSearch {
module = ['python/2.7.x-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0']
cpus = 32
}
}
......@@ -65,16 +69,19 @@ params {
bwa = '/project/shared/bicf_workflow_ref/GRCh38'
genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa'
}
'GRCh37' {
bwa = '/project/shared/bicf_workflow_ref/GRCh37'
genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa'
}
'GRCm38' {
bwa = '/project/shared/bicf_workflow_ref/GRCm38'
genomesize = 'mm'
chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa'
}
}
}
......
......@@ -12,6 +12,7 @@ params.genomes = []
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.cutoffRatio = 1.2
// Check inputs
......@@ -36,6 +37,7 @@ designFile = params.designFile
genomeSize = params.genomeSize
genome = params.genome
chromSizes = params.chromSizes
fasta = params.fasta
cutoffRatio = params.cutoffRatio
process checkDesignFile {
......@@ -371,6 +373,7 @@ process consensusPeaks {
file '*.rejected.*' into rejectedPeaks
file("design_diffPeaks.csv") into designDiffPeaks
file("design_annotatePeaks.tsv") into designAnnotatePeaks
file("design_annotatePeaks.tsv") into designMotifSearch
file("unqiue_experiments.csv") into uniqueExperiments
script:
......@@ -441,3 +444,23 @@ process diffPeaks {
}
}
// Motif Search Peaks
process motifSearch {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
file designMotifSearch
output:
file "*meme*" into motifSearch
script:
"""
python2.7 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta
"""
}
#!/qbrc/software/Python-2.7.7/bin/python
#!/usr/bin/env python
# programmer : bbc
# usage:
......@@ -8,12 +8,8 @@ from re import sub
import string
import argparse as ap
import logging
import twobitreader
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from multiprocessing import Pool
logging.basicConfig(level=10)
......@@ -26,7 +22,7 @@ def prepare_argparser():
# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output")
argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file")
# argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N")
argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
# argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
return(argparser)
def rc(seq):
......@@ -38,8 +34,8 @@ def main():
args = argparser.parse_args()
#run(args.infile, args.genome, args.limit, args.output)
#get Pool ready
dfile = pd.read_csv(args.infile)
meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"/genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist())
dfile = pd.read_csv(args.infile, sep='\t')
meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome]*dfile.shape[0],dfile['Condition'].tolist())
work_pool = Pool(min(12,dfile.shape[0]))
resultList = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
......@@ -49,43 +45,16 @@ def main():
def run_wrapper(args):
run(*args)
def run(infile, genome, output):
# Get fasta file
fasta_command = "bedtools getfasta -fi "+genome+' -bed '+infile+' -fo '+output+'.fa'
p = subprocess.Popen(fasta_command, shell=True)
p.communicate()
def run(infile, genome, limit, output):
infile = open(infile).readlines()
logging.debug(len(infile))
genome = twobitreader.TwoBitFile(genome)
outfile = open(output+".fa","w")
rowcount = 0
limit = int(limit)
logging.debug(limit)
if limit < 0:
limit = len(infile)
for record in infile:
rowcount += 1
#logging.debug(record)
if rowcount <=limit:
seqbuf = record.rstrip().split("\t")
try:
#logging.debug(record.chrom)
seq = genome[seqbuf[0]][int(seqbuf[1]):int(seqbuf[2])]
except:
pass
else:
if len(seqbuf)>=5:
if seqbuf[5] == "-":
seq = rc(seq)
if len(seqbuf)>=4:
newfa_name = seqbuf[3]
else:
newfa_name = "_".join(seqbuf)
newfa = SeqRecord(Seq(seq),newfa_name,description="")
#logging.debug(seq)
SeqIO.write(newfa,outfile,"fasta")
outfile.close()
#Call memechip
meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa"
p = subprocess.Popen(meme_command, shell=True)
p.communicate()
if __name__=="__main__":
main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment