Skip to content
Snippets Groups Projects
Commit 7deea97e authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Update annotation script.

parent 0620eafd
Branches
Tags
1 merge request!18Resolve "Add in current chip-analysis functionality."
......@@ -41,16 +41,14 @@ documentation_files:
workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1'
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.12'
- 'samtools/1.4.1'
- 'sambamba/0.6.6'
- 'bedtools/2.26.0'
- 'deeptools/2.5.0.1'
- 'phantompeakqualtools/1.2'
- 'macs/2.1.0-20151222'
- 'UCSC_userApps/v317'
- 'R/3.4.1-gccmkl'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......@@ -113,13 +111,14 @@ workflow_parameters:
description: |
A design file listing sample id, fastq files, corresponding control id
and additional information about the sample.
regex: ".*tsv"
- id: genome
type: select
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37']
- [ 'GRCh38', 'Mouse GRCh38']
- [ 'GRCm38', 'Mouse GRCm38']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
......
......@@ -12,7 +12,7 @@ process {
cpus = 32
}
$alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.4.1']
cpus = 32
}
$filterReads{
......@@ -47,6 +47,11 @@ process {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
executor = 'local'
}
$consensusPeaks {
module = ['R/3.4.1-gccmkl']
executor = 'local'
}
}
params {
......
......@@ -388,12 +388,14 @@ process peakAnnotation {
file consensusPeaks
output:
file "*chipseeker*" into chipseeker_originalpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
"""
output:
file "*chipseeker*" into peakAnnotation
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R $design_file $genome
"""
}
#!/bin/Rscript
library(ChIPseeker)
# Load libraries
source("http://bioconductor.org/biocLite.R")
if(!require("ChIPseeker")){
biocLite("ChIPseeker")
library("ChIPseeker")
}
# Currently mouse or human
if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){
biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
}
if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){
biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene")
library("TxDb.Mmusculus.UCSC.mm10.knownGene")
}
if (!require("TxDb.Hsapiens.UCSC.hg38.knownGene")){
biocLite("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
}
# Create parser object
parser <- ArgumentParser()
# Specify our desired options
parser$add_argument("-d", "--design", help = "File path to design file", required = TRUE)
parser$add_argument("-g", "--genome", help = "The genome assembly", required = TRUE)
# Parse arguments
args <- parser$parse_args()
# Create parser object
args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 2) {
stop("Usage: annotate_peaks.r [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
}
design_file <- args[1]
genome <-args[2]
# Load UCSC Known Genes
if(args$genome=='GRCh37') {
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
if(genome=='GRCh37') {
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
} else if(args$genome=='GRCm38') {
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
} else if(genome=='GRCm38') {
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
}
else if(args$genome=='GRCh38') {
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
} else if(genome=='GRCh38') {
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
}
# Load design file
design <- read.csv(design_file, sep ='\t')
files <- as.list(as.character(design$Peaks))
names(files) <- design$Condition
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
for(index in c(1:length(peakAnnoList)))
{
filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
for(index in c(1:length(peakAnnoList))) {
filename <- paste(names(files)[index],".chipseeker_annotation.xls",sep="")
write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
#draw individual plot
# Draw individual plots
# Define names of Plots
pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
# Pie Plots
pdf(pie_name)
plotAnnoPie(peakAnnoList[[index]])
dev.off()
# Venn Diagrams
pdf(vennpie_name)
vennpie(peakAnnoList[[index]])
dev.off()
# Upset Plot
pdf(upsetplot_name)
upsetplot(peakAnnoList[[index]])
dev.off()
}
......@@ -166,7 +166,7 @@ def overlap(experiment, design):
os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn)
return overlapping_peaks_fn
return os.path.abspath(overlapping_peaks_fn)
def main():
......@@ -186,14 +186,16 @@ def main():
design_diff = update_design(design_files_df)
# Make a design file for annotating Peaks
design_anno = pd.DataFrame()
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns = anno_cols)
# Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
design_anno.loc[experiment] = [experiment, replicated_peak]
# Write out file
# Write out design files
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
......@@ -207,6 +209,7 @@ def main():
'PeakCaller']
design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
if __name__ == '__main__':
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment