diff --git a/env.md b/env.md new file mode 100644 index 0000000000000000000000000000000000000000..54176d86aa83a2fd8cf7825ad2fee7643c6234d3 --- /dev/null +++ b/env.md @@ -0,0 +1,17 @@ +## Create new env in specific folder +```shell +conda create -p /project/shared/bicf_workflow_ref/chipseq_bchen4/ -c r r-essentials +#Add channels +conda config --add channels conda-forge +conda config --add channels r +conda config --add channels bioconda +pip install --user twobitreader +conda install -c r r-xml +``` + +Install bioconductor in R console: +```R +source("http://bioconductor.org/biocLite.R") +biocLite() +biocLite(c("DiffBind","ChIPseeker")) +``` \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000000000000000000000000000000000000..7fc857a03a9f33a5aad2a64d6b608b555422204a --- /dev/null +++ b/nextflow.config @@ -0,0 +1,5 @@ +process.executor='slurm' +process.queue='super' +process.time='5d' + + diff --git a/workflow/main.nf b/workflow/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..f5dc46d69e06961430646f7f59bc66d488c9c820 --- /dev/null +++ b/workflow/main.nf @@ -0,0 +1,141 @@ +#!/usr/bin/env nextflow + params.design="$baseDir/../test/samplesheet.csv" + params.bams = "$baseDir/../test/*.bam" + params.bais = "$baseDir/../test/*.bai" + params.peaks = "$baseDir/../test/*.broadPeak" + params.genomepath="/project/shared/bicf_workflow_ref/hg19/" + species = "hg19" + toppeakcount = 200 + design_file = file(params.design) + deeptools_design = Channel.fromPath(params.design) + diffbind_design = Channel.fromPath(params.design) + chipseeker_design = Channel.fromPath(params.design) + meme_design = Channel.fromPath(params.design) + index_bams = Channel.fromPath(params.bams) + deeptools_bams = Channel.fromPath(params.bams) + deeptools_peaks = Channel.fromPath(params.peaks) + chipseeker_peaks = Channel.fromPath(params.peaks) + diffbind_bams = Channel.fromPath(params.bams) + diffbind_peaks = Channel.fromPath(params.peaks) + meme_peaks = Channel.fromPath(params.peaks) + deeptools_bamindex = Channel.fromPath(params.bais) + diffbind_bamindex = Channel.fromPath(params.bais) + +//process bamindex { +// publishDir "$baseDir/output/", mode: 'copy' +// input: +// file index_bam_files from index_bams +// output: +// file "*bai" into deeptools_bamindex +// file "*bai" into diffbind_bamindex +// +// script: +// """ +// module load python/2.7.x-anaconda +// source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ +// module load samtools/intel/1.3 +// samtools index $index_bam_files +// """ +//} + +process run_deeptools { + publishDir "$baseDir/output", mode: 'copy' + input: + file deeptools_design_file from deeptools_design + file deeptools_bam_files from deeptools_bams.toList() + file deeptools_peak_files from deeptools_peaks.toList() + file deeptools_bam_indexes from deeptools_bamindex.toList() + output: + stdout result + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + module load deeptools/2.3.5 + python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}} +""" +} + + +process run_diffbind { + publishDir "$baseDir/output", mode: 'copy' + input: + file diffbind_design_file from diffbind_design + file diffbind_bam_files from diffbind_bams.toList() + file diffbind_peak_files from diffbind_peaks.toList() + file diffbind_bam_indexes from diffbind_bamindex.toList() + output: + file "diffpeak.design" into diffpeaksdesign_chipseeker + file "diffpeak.design" into diffpeaksdesign_meme + file "*_diffbind.bed" into diffpeaks_meme + file "*_diffbind.bed" into diffpeaks_chipseeker + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file +""" +} + +process run_chipseeker_diffpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file diffpeak_design_file from diffpeaksdesign_chipseeker + file diffpeaks from diffpeaks_chipseeker + output: + stdout result_chipseeker + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file hg19 +""" +} + +process run_chipseeker_originalpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file design_file from chipseeker_design + file chipseeker_peak_files from chipseeker_peaks.toList() + output: + stdout result1 + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + Rscript $baseDir/scripts/runChipseeker.R $design_file ${species} +""" +} + +process run_meme_original { + publishDir "$baseDir/output", mode: 'copy' + input: + file design_meme from meme_design + file meme_peak_files from meme_peaks.toList() + output: + stdout result_meme_original + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + module load meme/4.11.1-gcc-openmpi + python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} +""" +} + +process run_meme_diffpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file peaks_meme from diffpeaks_meme + file diffpeak_design from diffpeaksdesign_meme + output: + stdout result_meme + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + module load meme/4.11.1-gcc-openmpi + python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} +""" +} + diff --git a/workflow/scripts/__init__.py b/workflow/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/workflow/scripts/__init__.py @@ -0,0 +1 @@ + diff --git a/workflow/scripts/process.py b/workflow/scripts/process.py new file mode 100644 index 0000000000000000000000000000000000000000..cc5ab1422743af749ab723d23fada13be475c485 --- /dev/null +++ b/workflow/scripts/process.py @@ -0,0 +1,77 @@ +#!/usr/bin/python +# programmer : bbc +# usage: main function to call all the procedures for chip-seq analysis +import sys +import os +import argparse as ap +import logging +import pandas as pd +import glob +import subprocess +from multiprocessing import Pool +import runDeepTools +import runMemechip +logging.basicConfig(level=10) + + +def prepare_argparser(): + description = "Make wig file for given bed using bam" + epilog = "For command line options of each command, type %(prog)% COMMAND -h" + argparser = ap.ArgumentParser(description=description, epilog = epilog) + argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input design file") + argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19") + argparser.add_argument("--top-peak",dest="toppeak",type=int, default=-1, help = "Only use top peaks for motif call") + #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"]) + #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header") + return(argparser) + +def memechip_wrapper(args): + #print args + runMemechip.run(*args) + +def main(): + argparser = prepare_argparser() + args = argparser.parse_args() + #dfile = pd.read_csv(args.infile) + + #for testing, add testing path to all input files + test_path = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/" + designfile = pd.read_csv(args.infile) + designfile['Peaks'] = designfile['Peaks'].apply(lambda x: test_path+x) + designfile['bamReads'] = designfile['bamReads'].apply(lambda x: test_path+x) + designfile['bamControl'] = designfile['bamControl'].apply(lambda x: test_path+x) + designfile.to_csv(args.infile+"_new",index=False) + dfile = pd.read_csv(args.infile+"_new") + #call deeptools + runDeepTools.run(args.infile+"_new", args.genome) + #call diffbind + this_script = os.path.abspath(__file__).split("/") + folder = "/".join(this_script[0:len(this_script)-1]) + + diffbind_command = "Rscript "+folder+"/runDiffBind.R "+args.infile+"_new" + #logging.debug(diffbind_command) + p = subprocess.Popen(diffbind_command, shell=True) + p.communicate() + #call chipseeker on original peaks and overlapping peaks + chipseeker_command = "Rscript "+folder+"/runChipseeker.R "+",".join(dfile['Peaks'].tolist())+" "+",".join(dfile['SampleID']) +#BC## logging.debug(chipseeker_command) + p = subprocess.Popen(chipseeker_command, shell=True) + p.communicate() + overlapping_peaks = glob.glob('*diffbind.bed') + overlapping_peak_names = [] + for pn in overlapping_peaks: + overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non")) + chipseeker_overlap_command = "Rscript "+folder+"/runChipseeker.R "+",".join(overlapping_peaks)+" "+",".join(overlapping_peak_names) + p = subprocess.Popen(chipseeker_overlap_command, shell=True) + p.communicate() + #MEME-chip on all peaks + meme_arglist = zip(dfile['Peaks'].tolist(),[test_path+"hg19.2bit"]*dfile.shape[0],[str(args.toppeak)]*dfile.shape[0],dfile['SampleID'].tolist()) +#BC# #print meme_arglist + work_pool = Pool(min(12,dfile.shape[0])) + resultList = work_pool.map(memechip_wrapper, meme_arglist) + work_pool.close() + work_pool.join() + + +if __name__=="__main__": + main() diff --git a/workflow/scripts/runChipseeker.R b/workflow/scripts/runChipseeker.R new file mode 100644 index 0000000000000000000000000000000000000000..7f7702fd511d7ca112b42e0398477abd9d295a89 --- /dev/null +++ b/workflow/scripts/runChipseeker.R @@ -0,0 +1,52 @@ +args = commandArgs(trailingOnly=TRUE) +#if (length(args)==0) { +# stop("At least one argument must be supplied (input file).n", call.=FALSE) +#} else if (length(args)==1) { +# # default output file +# args[3] = "out.txt" +#} + +library(ChIPseeker) +if(args[2]=="hg19") +{ +library(TxDb.Hsapiens.UCSC.hg19.knownGene) +txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +} +if(args[2]=="mm10") +{ +library(TxDb.Hsapiens.UCSC.mm10.knownGene) +txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene +} +if(args[2]=="hg38") +{ +library(TxDb.Hsapiens.UCSC.hg38.knownGene) +txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene +} + +design<-read.csv(args[1]) +files<-as.list(as.character(design$Peaks)) +names(files)<-design$SampleID + + +peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) +for(index in c(1:length(peakAnnoList))) +{ + filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="") + write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F) + #draw individual plot + pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") + vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="") + upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="") + pdf(pie_name) + plotAnnoPie(peakAnnoList[[index]]) + dev.off() + pdf(vennpie_name) + vennpie(peakAnnoList[[index]]) + dev.off() + pdf(upsetplot_name) + upsetplot(peakAnnoList[[index]]) + dev.off() + + +} + diff --git a/workflow/scripts/runDeepTools.py b/workflow/scripts/runDeepTools.py new file mode 100644 index 0000000000000000000000000000000000000000..51757f7d52943a39c6b09a83753136c0b86d9d7d --- /dev/null +++ b/workflow/scripts/runDeepTools.py @@ -0,0 +1,81 @@ +#!/usr/bin/python +# programmer : bbc +# usage: + +import sys +import argparse as ap +import logging +import subprocess +import pandas as pd +from multiprocessing import Pool +logging.basicConfig(level=10) + + +def prepare_argparser(): + description = "Make wig file for given bed using bam" + epilog = "For command line options of each command, type %(prog)% COMMAND -h" + argparser = ap.ArgumentParser(description=description, epilog = epilog) + argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file") + argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19") + #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format") + #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"]) + #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header") + return(argparser) + +def run_qc(files, controls, labels): + mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz" + p = subprocess.Popen(mbs_command, shell=True) + #logging.debug(mbs_command) + p.communicate() + pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels) + #logging.debug(pcor_command) + p = subprocess.Popen(pcor_command, shell=True) + p.communicate() + #plotCoverage + pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10" + p = subprocess.Popen(pcov_command, shell=True) + p.communicate() + #draw fingerprints plots + for treat,ctrl,name in zip(files,controls,labels): + fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png" + p = subprocess.Popen(fp_command, shell=True) + p.communicate() + +def bam2bw_wrapper(command): + p = subprocess.Popen(command, shell=True) + p.communicate() + +def run_signal(files, labels, genome): + #compute matrix and draw profile and heatmap + gene_bed = genome+"gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed" + bw_commands = [] + for f in files: + bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw")) + work_pool = Pool(min(len(files), 12)) + work_pool.map(bam2bw_wrapper, bw_commands) + work_pool.close() + work_pool.join() + + cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz" + p = subprocess.Popen(cm_command, shell=True) + p.communicate() + hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png" + p = subprocess.Popen(hm_command, shell=True) + p.communicate() + +def run(dfile,genome): + #parse dfile, suppose data files are the same folder as design file + dfile = pd.read_csv(dfile) + #QC: multiBamSummary and plotCorrelation + run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) + #signal plots + run_signal(dfile['bamReads'],dfile['SampleID'],genome) + +def main(): + argparser = prepare_argparser() + args = argparser.parse_args() + run(args.infile, args.genome) + + +if __name__=="__main__": + main() diff --git a/workflow/scripts/runDeepTools.pyc b/workflow/scripts/runDeepTools.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90a7657d67359ee5b228fa1b5f01d1661b8421ca Binary files /dev/null and b/workflow/scripts/runDeepTools.pyc differ diff --git a/workflow/scripts/runDiffBind.R b/workflow/scripts/runDiffBind.R new file mode 100644 index 0000000000000000000000000000000000000000..f4c7a6bc6bcc47e56b6a72ffde2109c362297d21 --- /dev/null +++ b/workflow/scripts/runDiffBind.R @@ -0,0 +1,45 @@ +library("DiffBind") + +#build dba object from sample sheet and do analysis +args <- commandArgs(TRUE) +data <- dba(sampleSheet=args[1]) +data <- dba.count(data) +data <- dba.contrast(data, minMembers = 2, categories=DBA_CONDITION) +data <- dba.analyze(data) + +#Draw figures +pdf("diffbind.samples.heatmap.pdf") +plot(data) +dev.off() + +pdf("diffbind.samples.pca.pdf") +dba.plotPCA(data, DBA_TISSUE, label=DBA_CONDITION) +dev.off() + +#Save peak reads count +normcount <- dba.peakset(data, bRetrieve=T) +write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F) + +#Retriving the differentially bound sites +#make new design file for chipseeker at the same time +new_SampleID = c() +new_Peaks = c() +for (i in c(1:length(data$contrasts))) +{ + contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs", + data$contrasts[[i]]$name2,"diffbind.bed",sep="_") + contrast_name = paste(data$contrasts[[i]]$name1,"vs", + data$contrasts[[i]]$name2,"diffbind.xls",sep="_") + new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_")) + new_Peaks = c(new_Peaks, contrast_bed_name) + report <- dba.report(data, contrast=i, th=1, bCount=TRUE) + report <- as.data.frame(report) + print(head(report)) + colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand") + #print(head(report)) + write.table(report,contrast_name,sep="\t",quote=F,row.names=F) + write.table(report[,c(1:3)],contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F) +} +#Write new design file +newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks) +write.csv(newdesign,"diffpeak.design",row.names=F,quote=F) diff --git a/workflow/scripts/runMemechip.py b/workflow/scripts/runMemechip.py new file mode 100644 index 0000000000000000000000000000000000000000..b2a15a3a06c55fa1f8c0d1bf851b61f529659cb5 --- /dev/null +++ b/workflow/scripts/runMemechip.py @@ -0,0 +1,91 @@ +#!/qbrc/software/Python-2.7.7/bin/python +# programmer : bbc +# usage: + +import sys +import re +from re import sub +import string +import argparse as ap +import logging +import twobitreader +import subprocess +import pandas as pd +import pybedtools +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from multiprocessing import Pool +logging.basicConfig(level=10) + + +def prepare_argparser(): + description = "Run memechip command" + epilog = "For command line options of each command, type %(prog)% COMMAND -h" + argparser = ap.ArgumentParser(description=description, epilog = epilog) + argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file") +# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output") + argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file") + # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N") + argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks") + return(argparser) + +def rc(): + comps = {'A':"T",'C':"G",'G':"C",'T':"A","N":"N"} + return ''.join([comps[x] for x in seq.upper()[::-1]]) + +def main(): + argparser = prepare_argparser() + args = argparser.parse_args() + #run(args.infile, args.genome, args.limit, args.output) + #get Pool ready + dfile = pd.read_csv(args.infile) + meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist()) + work_pool = Pool(min(12,dfile.shape[0])) + resultList = work_pool.map(run_wrapper, meme_arglist) + work_pool.close() + work_pool.join() + + +def run_wrapper(args): + run(*args) + + +def run(infile, genome, limit, output): + infile = pybedtools.BedTool(infile) + logging.debug(len(infile)) + genome = twobitreader.TwoBitFile(genome) + outfile = open(output+".fa","w") + rowcount = 0 + limit = int(limit) + logging.debug(limit) + if limit ==-1: + limit = len(infile) + for record in infile: + rowcount += 1 + #logging.debug(record) + if rowcount <=limit: + #logging.debug(rowcount) + try: + #logging.debug(record.chrom) + seq = genome[record.chrom][record.start:record.stop] + except: + pass + else: + if record.strand == "-": + seq = rc(seq) + if len(record.fields)>=4: + newfa_name = record.name + else: + newfa_name = "_".join(record.fields) + newfa = SeqRecord(Seq(seq),newfa_name,description="") + #logging.debug(seq) + SeqIO.write(newfa,outfile,"fasta") + outfile.close() + #Call memechip + meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa" + p = subprocess.Popen(meme_command, shell=True) + p.communicate() + +if __name__=="__main__": + main() diff --git a/workflow/scripts/runMemechip.pyc b/workflow/scripts/runMemechip.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d397028f858912677cfad02e2de1d34f683a954a Binary files /dev/null and b/workflow/scripts/runMemechip.pyc differ