Skip to content
Snippets Groups Projects
Commit eff4984e authored by Beibei Chen's avatar Beibei Chen
Browse files

meme original works

parent c18f918e
Branches
Tags
No related merge requests found
......@@ -5,6 +5,8 @@
params.testpath="/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/"
params.design="/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/samplesheet.csv"
params.genomepath="/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/hg19/"
species = "hg19"
toppeakcount = 200
// design_file = file(params.design)
// bams=file(params.bams)
//gtf_file = file("$params.genome/gencode.gtf")
......@@ -20,6 +22,9 @@ process processdesign {
output:
file "new_design" into deeptools_design
file "new_design" into diffbind_design
file "new_design" into chipseeker_design
file "new_design" into meme_design
script:
"""
......@@ -51,45 +56,75 @@ process run_diffbind {
input:
file diffbind_design_file from diffbind_design
output:
file "*_diffbind.bed" into diffpeaks_chipseeker
file "*_diffbind.bed" into diffpeaks_meme
script:
file "diffpeak.design" into diffpeaksdesign_chipseeker
file "diffpeak.design" into diffpeaksdesign_meme
file "*diffbind.bed" into diffpeaks_meme
file "*diffbind.bed" into diffpeaks_chipseeker
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
"""
}
//process run_chipseeker {
//
//process run_chipseeker_diffpeak {
// publishDir "$baseDir/output", mode: 'copy'
// input:
// file diffbind_design_file from diffbind_design
// file annotation Tdx
// file diffpeak_design_file from diffpeaksdesign_chipseeker
// file diffpeaks from diffpeaks_chipseeker
// output:
// file "*_diffbind.bed" into diffpeaks_chipseeker
// file "*_diffbind.bed" into diffpeaks_meme
//
// script:
// stdout result
// script:
// """
// module load python/2.7.x-anaconda
// source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
// Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
// Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file hg19
//"""
//}
//process run_meme {
// publishDir "$baseDir/output", mode: 'copy'
//
//process run_chipseeker_originalpeak {
//// publishDir "$baseDir/output", mode: 'copy'
// input:
// file peaks_meme from diffpeaks_meme.flatten()
// file design_file from chipseeker_design
// output:
// stdout result
// script:
// stdout result1
// script:
// """
// module load python/2.7.x-anaconda
// source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
// module load meme/4.11.1-gcc-openmpi
// python $baseDir/scripts/runMemechip.py -i $peaks_meme
// Rscript $baseDir/scripts/runChipseeker.R $design_file ${species}
//"""
//}
process run_meme_original {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_meme from meme_design
output:
stdout result_meme_original
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
"""
}
process run_meme_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file peaks_meme from diffpeaks_meme
file diffpeak_design from diffpeaksdesign_meme
output:
stdout result_meme
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
"""
}
......@@ -57,7 +57,7 @@ def main():
#BC## logging.debug(chipseeker_command)
p = subprocess.Popen(chipseeker_command, shell=True)
p.communicate()
overlapping_peaks = glob.glob('*diffbind.xls')
overlapping_peaks = glob.glob('*diffbind.bed')
overlapping_peak_names = []
for pn in overlapping_peaks:
overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non"))
......
......@@ -7,25 +7,26 @@ args = commandArgs(trailingOnly=TRUE)
#}
library(ChIPseeker)
if args[3]=="hg19"
if(args[2]=="hg19")
{
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
}
if args[3]=="mm10"
if(args[2]=="mm10")
{
library(TxDb.Hsapiens.UCSC.mm10.knownGene)
txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene
}
if args[3]=="hg38"
if(args[2]=="hg38")
{
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
}
files<-as.list(unlist(strsplit(args[1],",")))
names(files)<-as.list(unlist(strsplit(args[2],",")))
print(files)
design<-read.csv(args[1])
files<-as.list(as.character(design$Peaks))
names(files)<-design$SampleID
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
for(index in c(1:length(peakAnnoList)))
......
......@@ -21,18 +21,25 @@ normcount <- dba.peakset(data, bRetrieve=T)
write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F)
#Retriving the differentially bound sites
#make new design file for chipseeker at the same time
new_SampleID = c()
new_Peaks = c()
for (i in c(1:length(data$contrasts)))
{
contrast_name = paste(data$contrasts[[i]]$name1,"vs",
#contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs",
contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.bed",sep="_")
contrast_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.xls",sep="_")
new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_"))
new_Peaks = c(new_Peaks, contrast_bed_name)
report <- dba.report(data, contrast=i, th=1, bCount=TRUE)
report <- as.data.frame(report)
print(head(report))
colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand")
print(head(report))
#print(head(report))
write.table(report,contrast_name,sep="\t",quote=F,row.names=F)
#write.table(as.data.frame(report),contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F)
write.table(report,contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F)
}
#Write new design file
newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks)
write.csv(newdesign,"diffpeak.design",row.names=F,quote=F)
......@@ -10,10 +10,12 @@ import argparse as ap
import logging
import twobitreader
import subprocess
import pandas as pd
import pybedtools
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from multiprocessing import Pool
logging.basicConfig(level=10)
......@@ -22,9 +24,9 @@ def prepare_argparser():
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file")
argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output")
# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output")
argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file")
argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N")
# argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N")
argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
return(argparser)
......@@ -35,19 +37,32 @@ def rc():
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
run(args.infile, args.genome, args.limit, args.output)
#run(args.infile, args.genome, args.limit, args.output)
#get Pool ready
dfile = pd.read_csv(args.infile)
meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"hg19.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist())
work_pool = Pool(min(12,dfile.shape[0]))
resultList = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
def run_wrapper(args):
run(*args)
def run(infile, genome, limit, output):
infile = pybedtools.BedTool(infile)
genome = twobitreader.TwoBitFile(genome)
outfile = open(output+".fa","w")
rowcount = 1
rowcount = 0
limit = int(limit)
if limit ==-1:
limit = len(infile)
for record in infile:
while rowcount <=limit:
rowcount += 1
rowcount += 1
if rowcount <=limit:
# logging.debug(rowcount)
try:
seq = genome[record.chrom][record.start:record.stop]
except:
......@@ -57,8 +72,8 @@ def run(infile, genome, limit, output):
seq = rc(seq)
newfa_name = record.name#"_".join(record.fields)
newfa = SeqRecord(Seq(seq),newfa_name,description="")
SeqIO.write(newfa,output+".fa","fasta")
outfile.close()
SeqIO.write(newfa,outfile,"fasta")
outfile.close()
#Call memechip
meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa"
p = subprocess.Popen(meme_command, shell=True)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment