Skip to content
Snippets Groups Projects
Commit c505f1cb authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '47-AnnotatePeaks' into 'master'

Resolve "annotate peaks"

Closes #47

See merge request !57
parents f6036083 ebaa8534
No related merge requests found
...@@ -10,6 +10,8 @@ All notable changes to this project will be documented in this file. ...@@ -10,6 +10,8 @@ All notable changes to this project will be documented in this file.
- Add PlotProfile Option - Add PlotProfile Option
- Add Python version to MultiQC - Add Python version to MultiQC
- Add and Update tests - Add and Update tests
- Use GTF files instead of TxDb and org libraries in Annotate Peaks
- Make gtf and geneName files as param inputs
## [publish_1.0.6 ] - 2019-05-31 ## [publish_1.0.6 ] - 2019-05-31
### Added ### Added
......
...@@ -74,25 +74,28 @@ params { ...@@ -74,25 +74,28 @@ params {
// Reference file paths on BioHPC // Reference file paths on BioHPC
genomes { genomes {
'GRCh38' { 'GRCh38' {
bwa = '/project/shared/bicf_workflow_ref/GRCh38' bwa = '/project/shared/bicf_workflow_ref/human/GRCh38'
genomesize = 'hs' genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt' chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa' fasta = '/project/shared/bicf_workflow_ref/human/GRCh38/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/GRCh38/gencode.gtf' gtf = '/project/shared/bicf_workflow_ref/human/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt'
} }
'GRCh37' { 'GRCh37' {
bwa = '/project/shared/bicf_workflow_ref/GRCh37' bwa = '/project/shared/bicf_workflow_ref/human/GRCh37'
genomesize = 'hs' genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt' chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh37/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa' fasta = '/project/shared/bicf_workflow_ref/human/GRCh37/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/GRCh37/gencode.gtf' gtf = '/project/shared/bicf_workflow_ref/human/GRCh37/gencode.v19.chr_patch_hapl_scaff.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh37/genenames.txt'
} }
'GRCm38' { 'GRCm38' {
bwa = '/project/shared/bicf_workflow_ref/GRCm38' bwa = '/project/shared/bicf_workflow_ref/mouse/GRCm38'
genomesize = 'mm' genomesize = 'mm'
chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt' chromsizes = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa' fasta = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/GRCm38/gencode.gtf' gtf = '/project/shared/bicf_workflow_ref/mouse/GRCm38/gencode.vM20.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genenames.txt'
} }
} }
} }
......
...@@ -33,14 +33,27 @@ params.multiqc = "$baseDir/conf/multiqc_config.yaml" ...@@ -33,14 +33,27 @@ params.multiqc = "$baseDir/conf/multiqc_config.yaml"
if (params.astrocyte) { if (params.astrocyte) {
print("Running under astrocyte") print("Running under astrocyte")
referenceLocation = "/project/shared/bicf_workflow_ref" referenceLocation = "/project/shared/bicf_workflow_ref"
params.bwaIndex = "$referenceLocation/$params.genome" if (params.genome == 'GRCh37') {
params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt" params.bwaIndex = "$referenceLocation/human/$params.genome"
params.fasta = "$referenceLocation/$params.genome/genome.fa" params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt"
params.gtf = "$referenceLocation/$params.genome/gencode.gtf" params.fasta = "$referenceLocation/human/$params.genome/genome.fa"
if (params.genome == 'GRCh37' || params.genome == 'GRCh38') { params.gtf = "$referenceLocation/human/$params.genome/gencode.v19.chr_patch_hapl_scaff.annotation.gtf"
params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt"
params.genomeSize = 'hs' params.genomeSize = 'hs'
} else if (params.genome == 'GRCm38') { } else if (params.genome == 'GRCm38') {
params.bwaIndex = "$referenceLocation/mouse/$params.genome"
params.chromSizes = "$referenceLocation/mouse/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/mouse/$params.genome/genome.fa"
params.gtf = "$referenceLocation/mouse/$params.genome/gencode.vM20.annotation.gtf"
params.geneNames = "$referenceLocation/mouse/$params.genome/genenames.txt"
params.genomeSize = 'mm' params.genomeSize = 'mm'
} else if (params.genome == 'GRCh38') {
params.bwaIndex = "$referenceLocation/human/$params.genome"
params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/human/$params.genome/genome.fa"
params.gtf = "$referenceLocation/human/$params.genome/gencode.v25.chr_patch_hapl_scaff.annotation.gtf"
params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt"
params.genomeSize = 'hs'
} }
} else { } else {
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
...@@ -48,6 +61,7 @@ if (params.astrocyte) { ...@@ -48,6 +61,7 @@ if (params.astrocyte) {
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false
params.geneNames = params.genome ? params.genomes[ params.genome ].geneNames ?: false : false
} }
...@@ -84,7 +98,9 @@ skipMotif = params.skipMotif ...@@ -84,7 +98,9 @@ skipMotif = params.skipMotif
skipPlotProfile = params.skipPlotProfile skipPlotProfile = params.skipPlotProfile
references = params.references references = params.references
multiqc = params.multiqc multiqc = params.multiqc
gtfFile = Channel.fromPath(params.gtf) gtfFile_plotProfile = Channel.fromPath(params.gtf)
gtfFile_annotPeaks = Channel.fromPath(params.gtf)
geneNames = Channel.fromPath(params.geneNames)
// Check design file for errors // Check design file for errors
process checkDesignFile { process checkDesignFile {
...@@ -469,7 +485,7 @@ process plotProfile { ...@@ -469,7 +485,7 @@ process plotProfile {
input: input:
file ("*.pooled.fc_signal.bw") from bigwigs.collect() file ("*.pooled.fc_signal.bw") from bigwigs.collect()
file gtf from gtfFile file gtf from gtfFile_plotProfile
output: output:
...@@ -524,6 +540,8 @@ process peakAnnotation { ...@@ -524,6 +540,8 @@ process peakAnnotation {
input: input:
file designAnnotatePeaks file designAnnotatePeaks
file gtf from gtfFile_annotPeaks
file geneNames
output: output:
...@@ -534,7 +552,7 @@ process peakAnnotation { ...@@ -534,7 +552,7 @@ process peakAnnotation {
""" """
module load R/3.3.2-gccmkl module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $gtf $geneNames
""" """
} }
......
...@@ -6,40 +6,27 @@ ...@@ -6,40 +6,27 @@
#* -------------------------------------------------------------------------- #* --------------------------------------------------------------------------
#* #*
#Currently Human or Mouse
# Load libraries # Load libraries
library("ChIPseeker") library("ChIPseeker")
library(GenomicFeatures)
# Currently mouse or human
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Mmusculus.UCSC.mm10.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("org.Hs.eg.db")
library("org.Mm.eg.db")
# Create parser object # Create parser object
args <- commandArgs(trailingOnly=TRUE) args <- commandArgs(trailingOnly=TRUE)
# Check input args # Check input args
if (length(args) != 2) { if (length(args) != 3) {
stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE) stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE)
} }
design_file <- args[1] design_file <- args[1]
genome_assembly <- args[2] gtf <- args[2]
geneNames <- args[3]
# Load UCSC Known Genes # Load UCSC Known Genes
if(genome_assembly=='GRCh37') { txdb <- makeTxDbFromGFF(gtf)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene sym <- read.table(geneNames, header=T, sep='\t') [,4:5]
annodb <- 'org.Hs.eg.db'
} else if(genome_assembly=='GRCm38') {
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
annodb <- 'org.Mm.eg.db'
} else if(genome_assembly=='GRCh38') {
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
annodb <- 'org.Hs.eg.db'
}
# Output version of ChIPseeker # Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker') chipseeker_version = packageVersion('ChIPseeker')
...@@ -54,18 +41,18 @@ names(files) <- design$Condition ...@@ -54,18 +41,18 @@ names(files) <- design$Condition
# Granges of files # Granges of files
peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE) peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, annoDb=annodb, tssRegion=c(-3000, 3000), verbose=FALSE) peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue", column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue",
"pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd", "pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd",
"geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS", "geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS", "symbol")
"ENSEMBL", "symbol", "geneName")
for(index in c(1:length(peakAnnoList))) { for(index in c(1:length(peakAnnoList))) {
filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="") filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="")
df <- as.data.frame(peakAnnoList[[index]]) df <- as.data.frame(peakAnnoList[[index]])
colnames(df) <- column_names df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T)
write.table(df[ , !(names(df) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F) colnames(df_final) <- column_names
write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
# Draw individual plots # Draw individual plots
......
...@@ -41,4 +41,4 @@ def test_upsetplot_pairedend(): ...@@ -41,4 +41,4 @@ def test_upsetplot_pairedend():
def test_annotation_pairedend(): def test_annotation_pairedend():
annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv' annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file) assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) >= 25494 assert utils.count_lines(annotation_file) >= 25466
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment