add in motif search.

5198436d · Venkat Malladi · 65bc6b76 · 5198436d · 5198436d · 5198436d
Commit 5198436d authored 6 years ago by Venkat Malladi
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -49,6 +49,8 @@ workflow_modules:
  - 'macs/2.1.0-20151222'
  - 'UCSC_userApps/v317'
  - 'R/3.4.1-gccmkl'
+  - 'meme/4.11.1-gcc-openmpi'
+  - 'python/2.7.x-anaconda'
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:

--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -55,6 +55,10 @@ process {
    module = ['R/3.4.1-gccmkl']
    cpus = 32
  }
+  $motifSearch {
+    module = ['python/2.7.x-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0']
+    cpus = 32
+  }
 }
@@ -65,16 +69,19 @@ params {
      bwa = '/project/shared/bicf_workflow_ref/GRCh38'
      genomesize = 'hs'
      chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
+      fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa'
    }
    'GRCh37' {
      bwa = '/project/shared/bicf_workflow_ref/GRCh37'
      genomesize = 'hs'
      chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt'
+      fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa'
    }
    'GRCm38' {
      bwa = '/project/shared/bicf_workflow_ref/GRCm38'
      genomesize = 'mm'
      chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
+      fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa'
    }
  }
 }

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -12,6 +12,7 @@ params.genomes = []
 params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
 params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
 params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
+params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
 params.cutoffRatio = 1.2
 // Check inputs
@@ -36,6 +37,7 @@ designFile = params.designFile
 genomeSize = params.genomeSize
 genome = params.genome
 chromSizes = params.chromSizes
+fasta = params.fasta
 cutoffRatio = params.cutoffRatio
 process checkDesignFile {
@@ -371,6 +373,7 @@ process consensusPeaks {
  file '*.rejected.*' into rejectedPeaks
  file("design_diffPeaks.csv") into designDiffPeaks
  file("design_annotatePeaks.tsv") into designAnnotatePeaks
+  file("design_annotatePeaks.tsv") into designMotifSearch
  file("unqiue_experiments.csv") into uniqueExperiments
  script:
@@ -441,3 +444,23 @@ process diffPeaks {
  }
 }
+// Motif Search  Peaks
+process motifSearch {
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+  file designMotifSearch
+  output:
+  file "*meme*" into motifSearch
+  script:
+  """
+  python2.7 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta
+  """
+}
--- a/workflow/scripts/runMemechip.py
+++ b/workflow/scripts/runMemechip.py
-#!/qbrc/software/Python-2.7.7/bin/python
+#!/usr/bin/env python
 # programmer : bbc
 # usage:
@@ -8,12 +8,8 @@ from re import sub
 import string
 import argparse as ap
 import logging
-import twobitreader
 import subprocess
 import pandas as pd
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
 from multiprocessing import Pool
 logging.basicConfig(level=10)
@@ -26,7 +22,7 @@ def prepare_argparser():
 #  argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output")
  argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file")
 # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N")
-  argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
+ # argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
  return(argparser)
 def rc(seq):
@@ -38,8 +34,8 @@ def main():
  args = argparser.parse_args()
  #run(args.infile, args.genome, args.limit, args.output)
  #get Pool ready
-  dfile = pd.read_csv(args.infile)
+  dfile = pd.read_csv(args.infile, sep='\t')
-  meme_arglist =  zip(dfile['Peaks'].tolist(),[args.genome+"/genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist())  
+  meme_arglist =  zip(dfile['Peaks'].tolist(),[args.genome]*dfile.shape[0],dfile['Condition'].tolist())
  work_pool = Pool(min(12,dfile.shape[0]))
  resultList =  work_pool.map(run_wrapper, meme_arglist)
  work_pool.close()
@@ -49,43 +45,16 @@ def main():
 def run_wrapper(args):
  run(*args)
+def run(infile, genome, output):
+  # Get fasta file
+  fasta_command = "bedtools getfasta -fi "+genome+' -bed '+infile+' -fo '+output+'.fa'
+  p = subprocess.Popen(fasta_command, shell=True)
+  p.communicate()
-def run(infile, genome, limit, output):
-  infile = open(infile).readlines()
-  logging.debug(len(infile))
-  genome = twobitreader.TwoBitFile(genome)
-  outfile = open(output+".fa","w")
-  rowcount = 0
-  limit = int(limit)
-  logging.debug(limit)
-  if limit < 0:
-    limit = len(infile)
-  for record in infile:
-    rowcount += 1   
-    #logging.debug(record) 
-    if rowcount <=limit:
-      seqbuf = record.rstrip().split("\t")
-      try:
-        #logging.debug(record.chrom)
-        seq = genome[seqbuf[0]][int(seqbuf[1]):int(seqbuf[2])]
-      except:
-        pass
-      else:
-        if len(seqbuf)>=5: 
-          if seqbuf[5] == "-":
-            seq = rc(seq)
-        if len(seqbuf)>=4:
-          newfa_name = seqbuf[3]
-        else:
-          newfa_name = "_".join(seqbuf)
-        newfa = SeqRecord(Seq(seq),newfa_name,description="")
-        #logging.debug(seq)
-        SeqIO.write(newfa,outfile,"fasta")
-  outfile.close()
  #Call memechip
  meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa"
  p = subprocess.Popen(meme_command, shell=True)
  p.communicate()
 if __name__=="__main__":
  main()