Skip to content
Snippets Groups Projects
Commit 81b4c97b authored by Beibei Chen's avatar Beibei Chen
Browse files

Merge branch 'develop' into 'master'

Merge develop into master

See merge request !1
parents 4e1e0b7c 12f65b9c
Branches
Tags
No related merge requests found
env.md 0 → 100644
## Create new env in specific folder
```shell
conda create -p /project/shared/bicf_workflow_ref/chipseq_bchen4/ -c r r-essentials
#Add channels
conda config --add channels conda-forge
conda config --add channels r
conda config --add channels bioconda
pip install --user twobitreader
conda install -c r r-xml
```
Install bioconductor in R console:
```R
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite(c("DiffBind","ChIPseeker"))
```
\ No newline at end of file
process.executor='slurm'
process.queue='super'
process.time='5d'
#!/usr/bin/env nextflow
params.design="$baseDir/../test/samplesheet.csv"
params.bams = "$baseDir/../test/*.bam"
params.bais = "$baseDir/../test/*.bai"
params.peaks = "$baseDir/../test/*.broadPeak"
params.genomepath="/project/shared/bicf_workflow_ref/hg19/"
species = "hg19"
toppeakcount = 200
design_file = file(params.design)
deeptools_design = Channel.fromPath(params.design)
diffbind_design = Channel.fromPath(params.design)
chipseeker_design = Channel.fromPath(params.design)
meme_design = Channel.fromPath(params.design)
index_bams = Channel.fromPath(params.bams)
deeptools_bams = Channel.fromPath(params.bams)
deeptools_peaks = Channel.fromPath(params.peaks)
chipseeker_peaks = Channel.fromPath(params.peaks)
diffbind_bams = Channel.fromPath(params.bams)
diffbind_peaks = Channel.fromPath(params.peaks)
meme_peaks = Channel.fromPath(params.peaks)
deeptools_bamindex = Channel.fromPath(params.bais)
diffbind_bamindex = Channel.fromPath(params.bais)
//process bamindex {
// publishDir "$baseDir/output/", mode: 'copy'
// input:
// file index_bam_files from index_bams
// output:
// file "*bai" into deeptools_bamindex
// file "*bai" into diffbind_bamindex
//
// script:
// """
// module load python/2.7.x-anaconda
// source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
// module load samtools/intel/1.3
// samtools index $index_bam_files
// """
//}
process run_deeptools {
publishDir "$baseDir/output", mode: 'copy'
input:
file deeptools_design_file from deeptools_design
file deeptools_bam_files from deeptools_bams.toList()
file deeptools_peak_files from deeptools_peaks.toList()
file deeptools_bam_indexes from deeptools_bamindex.toList()
output:
stdout result
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
module load deeptools/2.3.5
python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
"""
}
process run_diffbind {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffbind_design_file from diffbind_design
file diffbind_bam_files from diffbind_bams.toList()
file diffbind_peak_files from diffbind_peaks.toList()
file diffbind_bam_indexes from diffbind_bamindex.toList()
output:
file "diffpeak.design" into diffpeaksdesign_chipseeker
file "diffpeak.design" into diffpeaksdesign_meme
file "*_diffbind.bed" into diffpeaks_meme
file "*_diffbind.bed" into diffpeaks_chipseeker
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
"""
}
process run_chipseeker_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffpeak_design_file from diffpeaksdesign_chipseeker
file diffpeaks from diffpeaks_chipseeker
output:
stdout result_chipseeker
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file hg19
"""
}
process run_chipseeker_originalpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_file from chipseeker_design
file chipseeker_peak_files from chipseeker_peaks.toList()
output:
stdout result1
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
Rscript $baseDir/scripts/runChipseeker.R $design_file ${species}
"""
}
process run_meme_original {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_meme from meme_design
file meme_peak_files from meme_peaks.toList()
output:
stdout result_meme_original
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
"""
}
process run_meme_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file peaks_meme from diffpeaks_meme
file diffpeak_design from diffpeaksdesign_meme
output:
stdout result_meme
script:
"""
module load python/2.7.x-anaconda
source activate /project/shared/bicf_workflow_ref/chipseq_bchen4/
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
"""
}
#!/usr/bin/python
# programmer : bbc
# usage: main function to call all the procedures for chip-seq analysis
import sys
import os
import argparse as ap
import logging
import pandas as pd
import glob
import subprocess
from multiprocessing import Pool
import runDeepTools
import runMemechip
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input design file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
argparser.add_argument("--top-peak",dest="toppeak",type=int, default=-1, help = "Only use top peaks for motif call")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def memechip_wrapper(args):
#print args
runMemechip.run(*args)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
#dfile = pd.read_csv(args.infile)
#for testing, add testing path to all input files
test_path = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/"
designfile = pd.read_csv(args.infile)
designfile['Peaks'] = designfile['Peaks'].apply(lambda x: test_path+x)
designfile['bamReads'] = designfile['bamReads'].apply(lambda x: test_path+x)
designfile['bamControl'] = designfile['bamControl'].apply(lambda x: test_path+x)
designfile.to_csv(args.infile+"_new",index=False)
dfile = pd.read_csv(args.infile+"_new")
#call deeptools
runDeepTools.run(args.infile+"_new", args.genome)
#call diffbind
this_script = os.path.abspath(__file__).split("/")
folder = "/".join(this_script[0:len(this_script)-1])
diffbind_command = "Rscript "+folder+"/runDiffBind.R "+args.infile+"_new"
#logging.debug(diffbind_command)
p = subprocess.Popen(diffbind_command, shell=True)
p.communicate()
#call chipseeker on original peaks and overlapping peaks
chipseeker_command = "Rscript "+folder+"/runChipseeker.R "+",".join(dfile['Peaks'].tolist())+" "+",".join(dfile['SampleID'])
#BC## logging.debug(chipseeker_command)
p = subprocess.Popen(chipseeker_command, shell=True)
p.communicate()
overlapping_peaks = glob.glob('*diffbind.bed')
overlapping_peak_names = []
for pn in overlapping_peaks:
overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non"))
chipseeker_overlap_command = "Rscript "+folder+"/runChipseeker.R "+",".join(overlapping_peaks)+" "+",".join(overlapping_peak_names)
p = subprocess.Popen(chipseeker_overlap_command, shell=True)
p.communicate()
#MEME-chip on all peaks
meme_arglist = zip(dfile['Peaks'].tolist(),[test_path+"hg19.2bit"]*dfile.shape[0],[str(args.toppeak)]*dfile.shape[0],dfile['SampleID'].tolist())
#BC# #print meme_arglist
work_pool = Pool(min(12,dfile.shape[0]))
resultList = work_pool.map(memechip_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__=="__main__":
main()
args = commandArgs(trailingOnly=TRUE)
#if (length(args)==0) {
# stop("At least one argument must be supplied (input file).n", call.=FALSE)
#} else if (length(args)==1) {
# # default output file
# args[3] = "out.txt"
#}
library(ChIPseeker)
if(args[2]=="hg19")
{
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
}
if(args[2]=="mm10")
{
library(TxDb.Hsapiens.UCSC.mm10.knownGene)
txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene
}
if(args[2]=="hg38")
{
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
}
design<-read.csv(args[1])
files<-as.list(as.character(design$Peaks))
names(files)<-design$SampleID
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
for(index in c(1:length(peakAnnoList)))
{
filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
#draw individual plot
pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
pdf(pie_name)
plotAnnoPie(peakAnnoList[[index]])
dev.off()
pdf(vennpie_name)
vennpie(peakAnnoList[[index]])
dev.off()
pdf(upsetplot_name)
upsetplot(peakAnnoList[[index]])
dev.off()
}
#!/usr/bin/python
# programmer : bbc
# usage:
import sys
import argparse as ap
import logging
import subprocess
import pandas as pd
from multiprocessing import Pool
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
#argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def run_qc(files, controls, labels):
mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
p = subprocess.Popen(mbs_command, shell=True)
#logging.debug(mbs_command)
p.communicate()
pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
#logging.debug(pcor_command)
p = subprocess.Popen(pcor_command, shell=True)
p.communicate()
#plotCoverage
pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
p = subprocess.Popen(pcov_command, shell=True)
p.communicate()
#draw fingerprints plots
for treat,ctrl,name in zip(files,controls,labels):
fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
p = subprocess.Popen(fp_command, shell=True)
p.communicate()
def bam2bw_wrapper(command):
p = subprocess.Popen(command, shell=True)
p.communicate()
def run_signal(files, labels, genome):
#compute matrix and draw profile and heatmap
gene_bed = genome+"gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
bw_commands = []
for f in files:
bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
work_pool = Pool(min(len(files), 12))
work_pool.map(bam2bw_wrapper, bw_commands)
work_pool.close()
work_pool.join()
cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
p = subprocess.Popen(cm_command, shell=True)
p.communicate()
hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
p = subprocess.Popen(hm_command, shell=True)
p.communicate()
def run(dfile,genome):
#parse dfile, suppose data files are the same folder as design file
dfile = pd.read_csv(dfile)
#QC: multiBamSummary and plotCorrelation
run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
#signal plots
run_signal(dfile['bamReads'],dfile['SampleID'],genome)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
run(args.infile, args.genome)
if __name__=="__main__":
main()
File added
library("DiffBind")
#build dba object from sample sheet and do analysis
args <- commandArgs(TRUE)
data <- dba(sampleSheet=args[1])
data <- dba.count(data)
data <- dba.contrast(data, minMembers = 2, categories=DBA_CONDITION)
data <- dba.analyze(data)
#Draw figures
pdf("diffbind.samples.heatmap.pdf")
plot(data)
dev.off()
pdf("diffbind.samples.pca.pdf")
dba.plotPCA(data, DBA_TISSUE, label=DBA_CONDITION)
dev.off()
#Save peak reads count
normcount <- dba.peakset(data, bRetrieve=T)
write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F)
#Retriving the differentially bound sites
#make new design file for chipseeker at the same time
new_SampleID = c()
new_Peaks = c()
for (i in c(1:length(data$contrasts)))
{
contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.bed",sep="_")
contrast_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.xls",sep="_")
new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_"))
new_Peaks = c(new_Peaks, contrast_bed_name)
report <- dba.report(data, contrast=i, th=1, bCount=TRUE)
report <- as.data.frame(report)
print(head(report))
colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand")
#print(head(report))
write.table(report,contrast_name,sep="\t",quote=F,row.names=F)
write.table(report[,c(1:3)],contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F)
}
#Write new design file
newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks)
write.csv(newdesign,"diffpeak.design",row.names=F,quote=F)
#!/qbrc/software/Python-2.7.7/bin/python
# programmer : bbc
# usage:
import sys
import re
from re import sub
import string
import argparse as ap
import logging
import twobitreader
import subprocess
import pandas as pd
import pybedtools
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from multiprocessing import Pool
logging.basicConfig(level=10)
def prepare_argparser():
description = "Run memechip command"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file")
# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output")
argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file")
# argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N")
argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks")
return(argparser)
def rc():
comps = {'A':"T",'C':"G",'G':"C",'T':"A","N":"N"}
return ''.join([comps[x] for x in seq.upper()[::-1]])
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
#run(args.infile, args.genome, args.limit, args.output)
#get Pool ready
dfile = pd.read_csv(args.infile)
meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist())
work_pool = Pool(min(12,dfile.shape[0]))
resultList = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
def run_wrapper(args):
run(*args)
def run(infile, genome, limit, output):
infile = pybedtools.BedTool(infile)
logging.debug(len(infile))
genome = twobitreader.TwoBitFile(genome)
outfile = open(output+".fa","w")
rowcount = 0
limit = int(limit)
logging.debug(limit)
if limit ==-1:
limit = len(infile)
for record in infile:
rowcount += 1
#logging.debug(record)
if rowcount <=limit:
#logging.debug(rowcount)
try:
#logging.debug(record.chrom)
seq = genome[record.chrom][record.start:record.stop]
except:
pass
else:
if record.strand == "-":
seq = rc(seq)
if len(record.fields)>=4:
newfa_name = record.name
else:
newfa_name = "_".join(record.fields)
newfa = SeqRecord(Seq(seq),newfa_name,description="")
#logging.debug(seq)
SeqIO.write(newfa,outfile,"fasta")
outfile.close()
#Call memechip
meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa"
p = subprocess.Popen(meme_command, shell=True)
p.communicate()
if __name__=="__main__":
main()
File added
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment