diff --git a/workflow/main.nf b/workflow/main.nf index 1507f0d92f03c6939868b2b91c4bca68bda8cb7a..762cdef93f32a8e8634fbe47be6f1b65cd044576 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -5,6 +5,8 @@ params.testpath="/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/" params.design="/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/samplesheet.csv" params.genomepath="/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/hg19/" + species = "hg19" + toppeakcount = 200 // design_file = file(params.design) // bams=file(params.bams) //gtf_file = file("$params.genome/gencode.gtf") @@ -20,6 +22,9 @@ process processdesign { output: file "new_design" into deeptools_design file "new_design" into diffbind_design + file "new_design" into chipseeker_design + file "new_design" into meme_design + script: """ @@ -51,45 +56,75 @@ process run_diffbind { input: file diffbind_design_file from diffbind_design output: - file "*_diffbind.bed" into diffpeaks_chipseeker - file "*_diffbind.bed" into diffpeaks_meme - - script: + file "diffpeak.design" into diffpeaksdesign_chipseeker + file "diffpeak.design" into diffpeaksdesign_meme + file "*diffbind.bed" into diffpeaks_meme + file "*diffbind.bed" into diffpeaks_chipseeker + script: """ module load python/2.7.x-anaconda source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file """ } - -//process run_chipseeker { +// +//process run_chipseeker_diffpeak { // publishDir "$baseDir/output", mode: 'copy' // input: -// file diffbind_design_file from diffbind_design -// file annotation Tdx +// file diffpeak_design_file from diffpeaksdesign_chipseeker +// file diffpeaks from diffpeaks_chipseeker // output: -// file "*_diffbind.bed" into diffpeaks_chipseeker -// file "*_diffbind.bed" into diffpeaks_meme -// -// script: +// stdout result +// script: // """ // module load python/2.7.x-anaconda // source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ -// Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file +// Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file hg19 //""" //} - -//process run_meme { -// publishDir "$baseDir/output", mode: 'copy' +// +//process run_chipseeker_originalpeak { +//// publishDir "$baseDir/output", mode: 'copy' // input: -// file peaks_meme from diffpeaks_meme.flatten() +// file design_file from chipseeker_design // output: -// stdout result -// script: +// stdout result1 +// script: // """ // module load python/2.7.x-anaconda // source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ -// module load meme/4.11.1-gcc-openmpi -// python $baseDir/scripts/runMemechip.py -i $peaks_meme +// Rscript $baseDir/scripts/runChipseeker.R $design_file ${species} //""" //} + +process run_meme_original { + publishDir "$baseDir/output", mode: 'copy' + input: + file design_meme from meme_design + output: + stdout result_meme_original + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + module load meme/4.11.1-gcc-openmpi + python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} +""" +} + +process run_meme_diffpeak { + publishDir "$baseDir/output", mode: 'copy' + input: + file peaks_meme from diffpeaks_meme + file diffpeak_design from diffpeaksdesign_meme + output: + stdout result_meme + script: + """ + module load python/2.7.x-anaconda + source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/ + module load meme/4.11.1-gcc-openmpi + python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} +""" +} + diff --git a/workflow/scripts/process.py b/workflow/scripts/process.py index f01fd8d4dc583400635c76e53c1f4b8ee987e952..cc5ab1422743af749ab723d23fada13be475c485 100644 --- a/workflow/scripts/process.py +++ b/workflow/scripts/process.py @@ -57,7 +57,7 @@ def main(): #BC## logging.debug(chipseeker_command) p = subprocess.Popen(chipseeker_command, shell=True) p.communicate() - overlapping_peaks = glob.glob('*diffbind.xls') + overlapping_peaks = glob.glob('*diffbind.bed') overlapping_peak_names = [] for pn in overlapping_peaks: overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non")) diff --git a/workflow/scripts/runChipseeker.R b/workflow/scripts/runChipseeker.R index ecb33a5fc78a2a8d4963a5d5362c2eb76c42ca12..7f7702fd511d7ca112b42e0398477abd9d295a89 100644 --- a/workflow/scripts/runChipseeker.R +++ b/workflow/scripts/runChipseeker.R @@ -7,25 +7,26 @@ args = commandArgs(trailingOnly=TRUE) #} library(ChIPseeker) -if args[3]=="hg19" +if(args[2]=="hg19") { library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene } -if args[3]=="mm10" +if(args[2]=="mm10") { library(TxDb.Hsapiens.UCSC.mm10.knownGene) txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene } -if args[3]=="hg38" +if(args[2]=="hg38") { library(TxDb.Hsapiens.UCSC.hg38.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene } -files<-as.list(unlist(strsplit(args[1],","))) -names(files)<-as.list(unlist(strsplit(args[2],","))) -print(files) +design<-read.csv(args[1]) +files<-as.list(as.character(design$Peaks)) +names(files)<-design$SampleID + peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) for(index in c(1:length(peakAnnoList))) diff --git a/workflow/scripts/runDiffBind.R b/workflow/scripts/runDiffBind.R index d6e88d121dd5e412415c5cc73e404fbe97d4102c..019ced9d1d787d6dd6bfa7c92f2731eb40a6c4ab 100644 --- a/workflow/scripts/runDiffBind.R +++ b/workflow/scripts/runDiffBind.R @@ -21,18 +21,25 @@ normcount <- dba.peakset(data, bRetrieve=T) write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F) #Retriving the differentially bound sites +#make new design file for chipseeker at the same time +new_SampleID = c() +new_Peaks = c() for (i in c(1:length(data$contrasts))) { - contrast_name = paste(data$contrasts[[i]]$name1,"vs", - #contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs", + contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs", data$contrasts[[i]]$name2,"diffbind.bed",sep="_") + contrast_name = paste(data$contrasts[[i]]$name1,"vs", + data$contrasts[[i]]$name2,"diffbind.xls",sep="_") + new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_")) + new_Peaks = c(new_Peaks, contrast_bed_name) report <- dba.report(data, contrast=i, th=1, bCount=TRUE) report <- as.data.frame(report) print(head(report)) colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand") - print(head(report)) + #print(head(report)) write.table(report,contrast_name,sep="\t",quote=F,row.names=F) - #write.table(as.data.frame(report),contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F) - + write.table(report,contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F) } - +#Write new design file +newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks) +write.csv(newdesign,"diffpeak.design",row.names=F,quote=F) diff --git a/workflow/scripts/runMemechip.py b/workflow/scripts/runMemechip.py index c5aea826df4478a5eaf60aac94bbdbca3851967e..1157cf1a8ab82938442efd8a331d92ed4f443816 100644 --- a/workflow/scripts/runMemechip.py +++ b/workflow/scripts/runMemechip.py @@ -10,10 +10,12 @@ import argparse as ap import logging import twobitreader import subprocess +import pandas as pd import pybedtools from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from multiprocessing import Pool logging.basicConfig(level=10) @@ -22,9 +24,9 @@ def prepare_argparser(): epilog = "For command line options of each command, type %(prog)% COMMAND -h" argparser = ap.ArgumentParser(description=description, epilog = epilog) argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file") - argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output") +# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output") argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file") - argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N") + # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N") argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks") return(argparser) @@ -35,19 +37,32 @@ def rc(): def main(): argparser = prepare_argparser() args = argparser.parse_args() - run(args.infile, args.genome, args.limit, args.output) + #run(args.infile, args.genome, args.limit, args.output) + #get Pool ready + dfile = pd.read_csv(args.infile) + meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"hg19.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist()) + work_pool = Pool(min(12,dfile.shape[0])) + resultList = work_pool.map(run_wrapper, meme_arglist) + work_pool.close() + work_pool.join() + + +def run_wrapper(args): + run(*args) + def run(infile, genome, limit, output): infile = pybedtools.BedTool(infile) genome = twobitreader.TwoBitFile(genome) outfile = open(output+".fa","w") - rowcount = 1 + rowcount = 0 limit = int(limit) if limit ==-1: limit = len(infile) for record in infile: - while rowcount <=limit: - rowcount += 1 + rowcount += 1 + if rowcount <=limit: + # logging.debug(rowcount) try: seq = genome[record.chrom][record.start:record.stop] except: @@ -57,8 +72,8 @@ def run(infile, genome, limit, output): seq = rc(seq) newfa_name = record.name#"_".join(record.fields) newfa = SeqRecord(Seq(seq),newfa_name,description="") - SeqIO.write(newfa,output+".fa","fasta") - outfile.close() + SeqIO.write(newfa,outfile,"fasta") + outfile.close() #Call memechip meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa" p = subprocess.Popen(meme_command, shell=True)