Skip to content
Snippets Groups Projects
Commit 7deea97e authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Update annotation script.

parent 0620eafd
Branches
Tags
No related merge requests found
...@@ -41,16 +41,14 @@ documentation_files: ...@@ -41,16 +41,14 @@ documentation_files:
workflow_modules: workflow_modules:
- 'python/3.6.1-2-anaconda' - 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1' - 'trimgalore/0.4.1'
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.12' - 'bwa/intel/0.7.12'
- 'samtools/1.4.1'
- 'sambamba/0.6.6' - 'sambamba/0.6.6'
- 'bedtools/2.26.0' - 'bedtools/2.26.0'
- 'deeptools/2.5.0.1' - 'deeptools/2.5.0.1'
- 'phantompeakqualtools/1.2' - 'phantompeakqualtools/1.2'
- 'macs/2.1.0-20151222' - 'macs/2.1.0-20151222'
- 'UCSC_userApps/v317' - 'UCSC_userApps/v317'
- 'R/3.4.1-gccmkl'
# A list of parameters used by the workflow, defining how to present them, # A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter: # options etc in the web interface. For each parameter:
...@@ -113,13 +111,14 @@ workflow_parameters: ...@@ -113,13 +111,14 @@ workflow_parameters:
description: | description: |
A design file listing sample id, fastq files, corresponding control id A design file listing sample id, fastq files, corresponding control id
and additional information about the sample. and additional information about the sample.
regex: ".*tsv"
- id: genome - id: genome
type: select type: select
choices: choices:
- [ 'GRCh38', 'Human GRCh38'] - [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37'] - [ 'GRCh37', 'Human GRCh37']
- [ 'GRCh38', 'Mouse GRCh38'] - [ 'GRCm38', 'Mouse GRCm38']
required: true required: true
description: | description: |
Reference species and genome used for alignment and subsequent analysis. Reference species and genome used for alignment and subsequent analysis.
......
...@@ -12,7 +12,7 @@ process { ...@@ -12,7 +12,7 @@ process {
cpus = 32 cpus = 32
} }
$alignReads{ $alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3'] module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.4.1']
cpus = 32 cpus = 32
} }
$filterReads{ $filterReads{
...@@ -47,6 +47,11 @@ process { ...@@ -47,6 +47,11 @@ process {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0'] module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
executor = 'local' executor = 'local'
} }
$consensusPeaks {
module = ['R/3.4.1-gccmkl']
executor = 'local'
}
} }
params { params {
......
...@@ -388,12 +388,14 @@ process peakAnnotation { ...@@ -388,12 +388,14 @@ process peakAnnotation {
file consensusPeaks file consensusPeaks
output: output:
file "*chipseeker*" into chipseeker_originalpeak_output
script: file "*chipseeker*" into peakAnnotation
"""
module load python/2.7.x-anaconda script:
module load R/3.3.2-gccmkl """
Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} module load python/2.7.x-anaconda
""" module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R $design_file $genome
"""
} }
#!/bin/Rscript #!/bin/Rscript
library(ChIPseeker) # Load libraries
source("http://bioconductor.org/biocLite.R")
if(!require("ChIPseeker")){
biocLite("ChIPseeker")
library("ChIPseeker")
}
# Currently mouse or human
if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){
biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
}
if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){
biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene")
library("TxDb.Mmusculus.UCSC.mm10.knownGene")
}
if (!require("TxDb.Hsapiens.UCSC.hg38.knownGene")){
biocLite("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
}
# Create parser object
parser <- ArgumentParser()
# Specify our desired options
parser$add_argument("-d", "--design", help = "File path to design file", required = TRUE)
parser$add_argument("-g", "--genome", help = "The genome assembly", required = TRUE)
# Parse arguments # Create parser object
args <- parser$parse_args() args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 2) {
stop("Usage: annotate_peaks.r [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
}
design_file <- args[1]
genome <-args[2]
# Load UCSC Known Genes # Load UCSC Known Genes
if(args$genome=='GRCh37') { if(genome=='GRCh37') {
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
} else if(args$genome=='GRCm38') { } else if(genome=='GRCm38') {
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
} } else if(genome=='GRCh38') {
else if(args$genome=='GRCh38') {
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
} }
# Load design file
design <- read.csv(design_file, sep ='\t')
files <- as.list(as.character(design$Peaks))
names(files) <- design$Condition
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
for(index in c(1:length(peakAnnoList)))
{ for(index in c(1:length(peakAnnoList))) {
filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="") filename <- paste(names(files)[index],".chipseeker_annotation.xls",sep="")
write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F) write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
#draw individual plot
# Draw individual plots
# Define names of Plots
pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="") vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="") upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
# Pie Plots
pdf(pie_name) pdf(pie_name)
plotAnnoPie(peakAnnoList[[index]]) plotAnnoPie(peakAnnoList[[index]])
dev.off() dev.off()
# Venn Diagrams
pdf(vennpie_name) pdf(vennpie_name)
vennpie(peakAnnoList[[index]]) vennpie(peakAnnoList[[index]])
dev.off() dev.off()
# Upset Plot
pdf(upsetplot_name) pdf(upsetplot_name)
upsetplot(peakAnnoList[[index]]) upsetplot(peakAnnoList[[index]])
dev.off() dev.off()
} }
...@@ -166,7 +166,7 @@ def overlap(experiment, design): ...@@ -166,7 +166,7 @@ def overlap(experiment, design):
os.remove(overlap_tr_fn) os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn) os.remove(overlap_pr_fn)
return overlapping_peaks_fn return os.path.abspath(overlapping_peaks_fn)
def main(): def main():
...@@ -186,14 +186,16 @@ def main(): ...@@ -186,14 +186,16 @@ def main():
design_diff = update_design(design_files_df) design_diff = update_design(design_files_df)
# Make a design file for annotating Peaks # Make a design file for annotating Peaks
design_anno = pd.DataFrame() anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns = anno_cols)
# Find consenus overlap peaks for each experiment # Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'): for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment) replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
design_anno.loc[experiment] = [experiment, replicated_peak]
# Write out file # Write out design files
design_diff.columns = ['SampleID', design_diff.columns = ['SampleID',
'bamReads', 'bamReads',
'Condition', 'Condition',
...@@ -207,6 +209,7 @@ def main(): ...@@ -207,6 +209,7 @@ def main():
'PeakCaller'] 'PeakCaller']
design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False) design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
if __name__ == '__main__': if __name__ == '__main__':
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment