Merge branch 'develop' into 'master'

Merge develop into master See merge request !1

Merge branch 'develop' into 'master'
Merge develop into master See merge request !1
81b4c97b · Beibei Chen · 4e1e0b7c · 12f65b9c · 81b4c97b · 81b4c97b
Commit 81b4c97b authored 8 years ago by Beibei Chen
--- a/env.md
+++ b/env.md
+## Create new env in specific folder
+```shell
+conda create -p /project/shared/bicf_workflow_ref/chipseq_bchen4/ -c r r-essentials
+#Add channels
+conda config --add channels conda-forge
+conda config --add channels r
+conda config --add channels bioconda
+pip install --user twobitreader
+conda install -c r r-xml
+```
+
+Install bioconductor in R console:
+```R
+source("http://bioconductor.org/biocLite.R")
+biocLite()
+biocLite(c("DiffBind","ChIPseeker"))
+```
\ No newline at end of file
--- a/nextflow.config
+++ b/nextflow.config
+process.executor='slurm'
+process.queue='super'
+process.time='5d'
+
+
--- a/workflow/main.nf
+++ b/workflow/main.nf
+#!/usr/bin/env nextflow
+   params.design="$baseDir/../test/samplesheet.csv"
+   params.bams = "$baseDir/../test/*.bam"
+   params.bais = "$baseDir/../test/*.bai"
+   params.peaks = "$baseDir/../test/*.broadPeak"
+   params.genomepath="/project/shared/bicf_workflow_ref/hg19/"
+   species = "hg19"
+   toppeakcount = 200
+   design_file = file(params.design)
+   deeptools_design = Channel.fromPath(params.design)
+   diffbind_design = Channel.fromPath(params.design)
+   chipseeker_design = Channel.fromPath(params.design)
+   meme_design = Channel.fromPath(params.design)
+   index_bams = Channel.fromPath(params.bams)
+   deeptools_bams = Channel.fromPath(params.bams) 
+   deeptools_peaks = Channel.fromPath(params.peaks) 
+   chipseeker_peaks = Channel.fromPath(params.peaks) 
+   diffbind_bams = Channel.fromPath(params.bams) 
+   diffbind_peaks = Channel.fromPath(params.peaks) 
+   meme_peaks = Channel.fromPath(params.peaks)
+   deeptools_bamindex = Channel.fromPath(params.bais)
+   diffbind_bamindex = Channel.fromPath(params.bais) 
+
+//process bamindex {
+//   publishDir "$baseDir/output/", mode: 'copy'
+//   input:
+//     file index_bam_files from index_bams
+//   output:
+//     file "*bai" into deeptools_bamindex
+//     file "*bai" into diffbind_bamindex
+//
+//   script:
+//     """
+//     module load python/2.7.x-anaconda
+//     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+//     module load samtools/intel/1.3
+//     samtools index $index_bam_files
+//     """
+//}
+
+process run_deeptools {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+     file deeptools_design_file from deeptools_design
+     file deeptools_bam_files from deeptools_bams.toList()
+     file deeptools_peak_files from deeptools_peaks.toList()
+     file deeptools_bam_indexes from deeptools_bamindex.toList()
+   output:
+     stdout result
+     script:
+     """
+     module load python/2.7.x-anaconda
+     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+     module load deeptools/2.3.5
+     python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
+"""
+}
+
+
+process run_diffbind {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+     file diffbind_design_file from diffbind_design
+     file diffbind_bam_files from diffbind_bams.toList()
+     file diffbind_peak_files from diffbind_peaks.toList()
+     file diffbind_bam_indexes from diffbind_bamindex.toList()
+   output:
+     file "diffpeak.design" into diffpeaksdesign_chipseeker
+     file "diffpeak.design" into diffpeaksdesign_meme
+     file "*_diffbind.bed" into diffpeaks_meme
+     file "*_diffbind.bed" into diffpeaks_chipseeker
+   script:
+     """
+     module load python/2.7.x-anaconda
+     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+     Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
+"""
+}
+
+process run_chipseeker_diffpeak {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+     file diffpeak_design_file from diffpeaksdesign_chipseeker
+     file diffpeaks from diffpeaks_chipseeker
+   output:
+     stdout result_chipseeker
+   script:
+     """
+     module load python/2.7.x-anaconda
+     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+     Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file hg19
+"""
+}
+
+process run_chipseeker_originalpeak {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+     file design_file from chipseeker_design
+     file chipseeker_peak_files from chipseeker_peaks.toList()
+   output:
+     stdout result1
+   script:
+     """
+     module load python/2.7.x-anaconda
+     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+     Rscript $baseDir/scripts/runChipseeker.R $design_file ${species}
+"""
+}
+
+process run_meme_original {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+     file design_meme from meme_design
+     file meme_peak_files from meme_peaks.toList()
+   output:
+     stdout result_meme_original
+   script:
+     """
+     module load python/2.7.x-anaconda
+     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+     module load meme/4.11.1-gcc-openmpi
+     python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
+"""
+}
+
+process run_meme_diffpeak {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+     file peaks_meme from diffpeaks_meme
+     file diffpeak_design from diffpeaksdesign_meme
+   output:
+     stdout result_meme
+   script:
+     """
+     module load python/2.7.x-anaconda
+     source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
+     module load meme/4.11.1-gcc-openmpi
+     python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
+"""
+}
+
--- a/workflow/scripts/__init__.py
+++ b/workflow/scripts/__init__.py
+
--- a/workflow/scripts/process.py
+++ b/workflow/scripts/process.py
+#!/usr/bin/python
+# programmer : bbc
+# usage: main function to call all the procedures for chip-seq analysis
+import sys
+import os
+import argparse as ap
+import logging
+import pandas as pd
+import glob
+import subprocess
+from multiprocessing import Pool
+import runDeepTools
+import runMemechip
+logging.basicConfig(level=10)
+
+
+def prepare_argparser():
+  description = "Make wig file for given bed using bam"
+  epilog = "For command line options of each command, type %(prog)% COMMAND -h"
+  argparser = ap.ArgumentParser(description=description, epilog = epilog)
+  argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input design file")
+  argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
+  argparser.add_argument("--top-peak",dest="toppeak",type=int, default=-1, help = "Only use top peaks for motif call")
+  #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
+  #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
+  return(argparser)
+
+def memechip_wrapper(args):
+  #print args  
+  runMemechip.run(*args)
+
+def main():
+  argparser = prepare_argparser()
+  args = argparser.parse_args()
+  #dfile = pd.read_csv(args.infile)
+
+  #for testing, add testing path to all input files
+  test_path = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/"
+  designfile = pd.read_csv(args.infile)
+  designfile['Peaks'] = designfile['Peaks'].apply(lambda x: test_path+x)
+  designfile['bamReads'] = designfile['bamReads'].apply(lambda x: test_path+x)
+  designfile['bamControl'] = designfile['bamControl'].apply(lambda x: test_path+x)
+  designfile.to_csv(args.infile+"_new",index=False)
+  dfile = pd.read_csv(args.infile+"_new")
+  #call deeptools
+  runDeepTools.run(args.infile+"_new", args.genome) 
+  #call diffbind
+  this_script = os.path.abspath(__file__).split("/")
+  folder = "/".join(this_script[0:len(this_script)-1])
+  
+  diffbind_command = "Rscript "+folder+"/runDiffBind.R "+args.infile+"_new"
+  #logging.debug(diffbind_command)
+  p = subprocess.Popen(diffbind_command, shell=True)
+  p.communicate()
+  #call chipseeker on original peaks and overlapping peaks
+  chipseeker_command = "Rscript "+folder+"/runChipseeker.R "+",".join(dfile['Peaks'].tolist())+" "+",".join(dfile['SampleID'])
+#BC##  logging.debug(chipseeker_command)
+  p = subprocess.Popen(chipseeker_command, shell=True)
+  p.communicate()
+  overlapping_peaks = glob.glob('*diffbind.bed')
+  overlapping_peak_names = []
+  for pn in overlapping_peaks:
+    overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non"))
+  chipseeker_overlap_command = "Rscript "+folder+"/runChipseeker.R "+",".join(overlapping_peaks)+" "+",".join(overlapping_peak_names)
+  p = subprocess.Popen(chipseeker_overlap_command, shell=True)
+  p.communicate()
+  #MEME-chip on all peaks
+  meme_arglist =  zip(dfile['Peaks'].tolist(),[test_path+"hg19.2bit"]*dfile.shape[0],[str(args.toppeak)]*dfile.shape[0],dfile['SampleID'].tolist())
+#BC#  #print meme_arglist
+  work_pool = Pool(min(12,dfile.shape[0]))
+  resultList = work_pool.map(memechip_wrapper, meme_arglist)
+  work_pool.close()
+  work_pool.join()
+ 
+
+if __name__=="__main__":
+  main()
--- a/workflow/scripts/runChipseeker.R
+++ b/workflow/scripts/runChipseeker.R
+args = commandArgs(trailingOnly=TRUE)
+#if (length(args)==0) {
+#  stop("At least one argument must be supplied (input file).n", call.=FALSE)
+#} else if (length(args)==1) {
+#  # default output file
+#  args[3] = "out.txt"
+#}
+
+library(ChIPseeker)
+if(args[2]=="hg19")
+{ 
+library(TxDb.Hsapiens.UCSC.hg19.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+}
+if(args[2]=="mm10")
+{ 
+library(TxDb.Hsapiens.UCSC.mm10.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene
+}
+if(args[2]=="hg38")
+{ 
+library(TxDb.Hsapiens.UCSC.hg38.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
+}
+
+design<-read.csv(args[1])
+files<-as.list(as.character(design$Peaks))
+names(files)<-design$SampleID
+
+
+peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
+for(index in c(1:length(peakAnnoList)))
+{
+  filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
+  write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
+  #draw individual plot
+  pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") 
+  vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="") 
+  upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
+  pdf(pie_name)
+  plotAnnoPie(peakAnnoList[[index]])
+  dev.off()
+  pdf(vennpie_name)
+  vennpie(peakAnnoList[[index]])
+  dev.off()
+  pdf(upsetplot_name)
+  upsetplot(peakAnnoList[[index]])
+  dev.off()
+
+
+}
+
--- a/workflow/scripts/runDeepTools.py
+++ b/workflow/scripts/runDeepTools.py
+#!/usr/bin/python
+# programmer : bbc
+# usage:
+
+import sys
+import argparse as ap
+import logging
+import subprocess
+import pandas as pd
+from multiprocessing import Pool
+logging.basicConfig(level=10)
+
+
+def prepare_argparser():
+  description = "Make wig file for given bed using bam"
+  epilog = "For command line options of each command, type %(prog)% COMMAND -h"
+  argparser = ap.ArgumentParser(description=description, epilog = epilog)
+  argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
+  argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
+  #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
+  #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
+  #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
+  return(argparser)
+
+def run_qc(files, controls, labels):
+  mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
+  p = subprocess.Popen(mbs_command, shell=True)
+  #logging.debug(mbs_command)
+  p.communicate()
+  pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers  -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
+  #logging.debug(pcor_command)
+  p = subprocess.Popen(pcor_command, shell=True)
+  p.communicate()
+  #plotCoverage
+  pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
+  p = subprocess.Popen(pcov_command, shell=True)
+  p.communicate()
+  #draw fingerprints plots
+  for treat,ctrl,name in zip(files,controls,labels):
+    fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
+    p = subprocess.Popen(fp_command, shell=True)
+    p.communicate()
+
+def bam2bw_wrapper(command):
+  p = subprocess.Popen(command, shell=True)
+  p.communicate()
+
+def run_signal(files, labels, genome):
+  #compute matrix and draw profile and heatmap
+  gene_bed = genome+"gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
+  bw_commands = []
+  for f in files:
+    bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
+  work_pool = Pool(min(len(files), 12))
+  work_pool.map(bam2bw_wrapper, bw_commands)
+  work_pool.close()
+  work_pool.join()
+  
+  cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
+  p = subprocess.Popen(cm_command, shell=True)
+  p.communicate()
+  hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
+  p = subprocess.Popen(hm_command, shell=True)
+  p.communicate()  
+
+def run(dfile,genome):
+  #parse dfile, suppose data files are the same folder as design file
+  dfile = pd.read_csv(dfile)
+  #QC: multiBamSummary and plotCorrelation
+  run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) 
+  #signal plots
+  run_signal(dfile['bamReads'],dfile['SampleID'],genome)
+
+def main():
+  argparser = prepare_argparser()
+  args = argparser.parse_args()
+  run(args.infile, args.genome)
+  
+
+if __name__=="__main__":
+  main()
--- a/workflow/scripts/runDeepTools.pyc
+++ b/workflow/scripts/runDeepTools.pyc
--- a/workflow/scripts/runDiffBind.R
+++ b/workflow/scripts/runDiffBind.R
+library("DiffBind")
+
+#build dba object from sample sheet and do analysis
+args <- commandArgs(TRUE)
+data <- dba(sampleSheet=args[1])
+data <- dba.count(data)
+data <- dba.contrast(data, minMembers = 2, categories=DBA_CONDITION)
+data <- dba.analyze(data)
+
+#Draw figures
+pdf("diffbind.samples.heatmap.pdf")
+plot(data)
+dev.off()
+
+pdf("diffbind.samples.pca.pdf")
+dba.plotPCA(data, DBA_TISSUE, label=DBA_CONDITION)
+dev.off()
+
+#Save peak reads count
+normcount <- dba.peakset(data, bRetrieve=T)
+write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F)
+
+#Retriving the differentially bound sites
+#make new design file for chipseeker at the same time
+new_SampleID = c()
+new_Peaks = c()
+for (i in c(1:length(data$contrasts)))
+{
+ contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs",
+                      data$contrasts[[i]]$name2,"diffbind.bed",sep="_")
+ contrast_name = paste(data$contrasts[[i]]$name1,"vs",
+                      data$contrasts[[i]]$name2,"diffbind.xls",sep="_")
+ new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_"))
+ new_Peaks = c(new_Peaks, contrast_bed_name)
+ report <- dba.report(data, contrast=i, th=1, bCount=TRUE)
+ report <- as.data.frame(report)
+ print(head(report))
+ colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand")
+ #print(head(report))
+ write.table(report,contrast_name,sep="\t",quote=F,row.names=F)
+ write.table(report[,c(1:3)],contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F)
+}
+#Write new design file
+newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks)
+write.csv(newdesign,"diffpeak.design",row.names=F,quote=F)
--- a/workflow/scripts/runMemechip.py
+++ b/workflow/scripts/runMemechip.py
+#!/qbrc/software/Python-2.7.7/bin/python
+# programmer : bbc
+# usage:
+
+import sys
+import re
+from re import sub
+import string
+import argparse as ap
+import logging
+import twobitreader
+import subprocess
+import pandas as pd
+import pybedtools
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from multiprocessing import Pool
+logging.basicConfig(level=10)
+
+
+def prepare_argparser():
+  description = "Run memechip command"
+  epilog = "For command line options of each command, type %(prog)% COMMAND -h"
+  argparser = ap.ArgumentParser(description=description, epilog = epilog)
+  argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file")
+#  argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output")
+  argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file")
+ # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N")
+  argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
+  return(argparser)
+
+def rc():
+  comps = {'A':"T",'C':"G",'G':"C",'T':"A","N":"N"}
+  return ''.join([comps[x] for x in seq.upper()[::-1]])
+
+def main():
+  argparser = prepare_argparser()
+  args = argparser.parse_args()
+  #run(args.infile, args.genome, args.limit, args.output)
+  #get Pool ready
+  dfile = pd.read_csv(args.infile)
+  meme_arglist =  zip(dfile['Peaks'].tolist(),[args.genome+"genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist())  
+  work_pool = Pool(min(12,dfile.shape[0]))
+  resultList =  work_pool.map(run_wrapper, meme_arglist)
+  work_pool.close()
+  work_pool.join()
+
+
+def run_wrapper(args):
+  run(*args)
+
+
+def run(infile, genome, limit, output):
+  infile = pybedtools.BedTool(infile)
+  logging.debug(len(infile))
+  genome = twobitreader.TwoBitFile(genome)
+  outfile = open(output+".fa","w")
+  rowcount = 0
+  limit = int(limit)
+  logging.debug(limit)
+  if limit ==-1:
+    limit = len(infile)
+  for record in infile:
+    rowcount += 1   
+    #logging.debug(record) 
+    if rowcount <=limit:
+      #logging.debug(rowcount)
+      try:
+        #logging.debug(record.chrom)
+        seq = genome[record.chrom][record.start:record.stop]
+      except:
+        pass
+      else:
+        if record.strand == "-":
+          seq = rc(seq)
+        if len(record.fields)>=4:
+          newfa_name = record.name
+        else:
+          newfa_name = "_".join(record.fields)
+        newfa = SeqRecord(Seq(seq),newfa_name,description="")
+        #logging.debug(seq)
+        SeqIO.write(newfa,outfile,"fasta")
+  outfile.close()
+  #Call memechip
+  meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa"
+  p = subprocess.Popen(meme_command, shell=True)
+  p.communicate()
+ 
+if __name__=="__main__":
+  main()
--- a/workflow/scripts/runMemechip.pyc
+++ b/workflow/scripts/runMemechip.pyc