diff --git a/CHANGELOG.md b/CHANGELOG.md index bf5947140341380f402e5a180abd1d652139d0f0..bc28c53bfc58c5288b3b912d06152a1657e3db0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ All notable changes to this project will be documented in this file. - Add PlotProfile Option - Add Python version to MultiQC - Add and Update tests +- Use GTF files instead of TxDb and org libraries in Annotate Peaks +- Make gtf and geneName files as param inputs ## [publish_1.0.6 ] - 2019-05-31 ### Added diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 7237cad47346ee0699fcadcf73dcd2a70f3431a5..b2135584edf920a7fc432589ec0ae99a76d573e0 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -74,25 +74,28 @@ params { // Reference file paths on BioHPC genomes { 'GRCh38' { - bwa = '/project/shared/bicf_workflow_ref/GRCh38' + bwa = '/project/shared/bicf_workflow_ref/human/GRCh38' genomesize = 'hs' - chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt' - fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa' - gtf = '/project/shared/bicf_workflow_ref/GRCh38/gencode.gtf' + chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh38/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/human/GRCh38/genome.fa' + gtf = '/project/shared/bicf_workflow_ref/human/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf' + geneNames = '/project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt' } 'GRCh37' { - bwa = '/project/shared/bicf_workflow_ref/GRCh37' + bwa = '/project/shared/bicf_workflow_ref/human/GRCh37' genomesize = 'hs' - chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt' - fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa' - gtf = '/project/shared/bicf_workflow_ref/GRCh37/gencode.gtf' + chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh37/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/human/GRCh37/genome.fa' + gtf = '/project/shared/bicf_workflow_ref/human/GRCh37/gencode.v19.chr_patch_hapl_scaff.annotation.gtf' + geneNames = '/project/shared/bicf_workflow_ref/human/GRCh37/genenames.txt' } 'GRCm38' { - bwa = '/project/shared/bicf_workflow_ref/GRCm38' + bwa = '/project/shared/bicf_workflow_ref/mouse/GRCm38' genomesize = 'mm' - chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt' - fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa' - gtf = '/project/shared/bicf_workflow_ref/GRCm38/gencode.gtf' + chromsizes = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genome.fa' + gtf = '/project/shared/bicf_workflow_ref/mouse/GRCm38/gencode.vM20.annotation.gtf' + geneNames = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genenames.txt' } } } diff --git a/workflow/main.nf b/workflow/main.nf index da287e857bdf38a5131992d6b1adfd7ff5a9cc1b..f654471c74632c7efdd23836df63c9aba78c2099 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -33,14 +33,27 @@ params.multiqc = "$baseDir/conf/multiqc_config.yaml" if (params.astrocyte) { print("Running under astrocyte") referenceLocation = "/project/shared/bicf_workflow_ref" - params.bwaIndex = "$referenceLocation/$params.genome" - params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt" - params.fasta = "$referenceLocation/$params.genome/genome.fa" - params.gtf = "$referenceLocation/$params.genome/gencode.gtf" - if (params.genome == 'GRCh37' || params.genome == 'GRCh38') { + if (params.genome == 'GRCh37') { + params.bwaIndex = "$referenceLocation/human/$params.genome" + params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt" + params.fasta = "$referenceLocation/human/$params.genome/genome.fa" + params.gtf = "$referenceLocation/human/$params.genome/gencode.v19.chr_patch_hapl_scaff.annotation.gtf" + params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt" params.genomeSize = 'hs' } else if (params.genome == 'GRCm38') { + params.bwaIndex = "$referenceLocation/mouse/$params.genome" + params.chromSizes = "$referenceLocation/mouse/$params.genome/genomefile.txt" + params.fasta = "$referenceLocation/mouse/$params.genome/genome.fa" + params.gtf = "$referenceLocation/mouse/$params.genome/gencode.vM20.annotation.gtf" + params.geneNames = "$referenceLocation/mouse/$params.genome/genenames.txt" params.genomeSize = 'mm' + } else if (params.genome == 'GRCh38') { + params.bwaIndex = "$referenceLocation/human/$params.genome" + params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt" + params.fasta = "$referenceLocation/human/$params.genome/genome.fa" + params.gtf = "$referenceLocation/human/$params.genome/gencode.v25.chr_patch_hapl_scaff.annotation.gtf" + params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt" + params.genomeSize = 'hs' } } else { params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false @@ -48,6 +61,7 @@ if (params.astrocyte) { params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false + params.geneNames = params.genome ? params.genomes[ params.genome ].geneNames ?: false : false } @@ -84,7 +98,9 @@ skipMotif = params.skipMotif skipPlotProfile = params.skipPlotProfile references = params.references multiqc = params.multiqc -gtfFile = Channel.fromPath(params.gtf) +gtfFile_plotProfile = Channel.fromPath(params.gtf) +gtfFile_annotPeaks = Channel.fromPath(params.gtf) +geneNames = Channel.fromPath(params.geneNames) // Check design file for errors process checkDesignFile { @@ -469,7 +485,7 @@ process plotProfile { input: file ("*.pooled.fc_signal.bw") from bigwigs.collect() - file gtf from gtfFile + file gtf from gtfFile_plotProfile output: @@ -524,6 +540,8 @@ process peakAnnotation { input: file designAnnotatePeaks + file gtf from gtfFile_annotPeaks + file geneNames output: @@ -534,7 +552,7 @@ process peakAnnotation { """ module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome + Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $gtf $geneNames """ } diff --git a/workflow/scripts/annotate_peaks.R b/workflow/scripts/annotate_peaks.R index 764106f7a529389e3de41f73b81600e0c37b92f8..853f7aa677a796b6893762b6679573f2049766d3 100644 --- a/workflow/scripts/annotate_peaks.R +++ b/workflow/scripts/annotate_peaks.R @@ -6,40 +6,27 @@ #* -------------------------------------------------------------------------- #* +#Currently Human or Mouse + # Load libraries library("ChIPseeker") - -# Currently mouse or human - -library("TxDb.Hsapiens.UCSC.hg19.knownGene") -library("TxDb.Mmusculus.UCSC.mm10.knownGene") -library("TxDb.Hsapiens.UCSC.hg38.knownGene") -library("org.Hs.eg.db") -library("org.Mm.eg.db") - +library(GenomicFeatures) # Create parser object args <- commandArgs(trailingOnly=TRUE) # Check input args -if (length(args) != 2) { - stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE) +if (length(args) != 3) { + stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE) } design_file <- args[1] -genome_assembly <- args[2] +gtf <- args[2] +geneNames <- args[3] # Load UCSC Known Genes -if(genome_assembly=='GRCh37') { - txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene - annodb <- 'org.Hs.eg.db' -} else if(genome_assembly=='GRCm38') { - txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene - annodb <- 'org.Mm.eg.db' -} else if(genome_assembly=='GRCh38') { - txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene - annodb <- 'org.Hs.eg.db' -} +txdb <- makeTxDbFromGFF(gtf) +sym <- read.table(geneNames, header=T, sep='\t') [,4:5] # Output version of ChIPseeker chipseeker_version = packageVersion('ChIPseeker') @@ -54,18 +41,18 @@ names(files) <- design$Condition # Granges of files peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE) -peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, annoDb=annodb, tssRegion=c(-3000, 3000), verbose=FALSE) +peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd", - "geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS", - "ENSEMBL", "symbol", "geneName") + "geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS", "symbol") for(index in c(1:length(peakAnnoList))) { filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="") df <- as.data.frame(peakAnnoList[[index]]) - colnames(df) <- column_names - write.table(df[ , !(names(df) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F) + df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T) + colnames(df_final) <- column_names + write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F) # Draw individual plots diff --git a/workflow/tests/test_annotate_peaks.py b/workflow/tests/test_annotate_peaks.py index e43a63c691584eb46f30e2a874dfe097fba17078..22e1d78ef149890c1b518225f6e01b2e259aabdb 100644 --- a/workflow/tests/test_annotate_peaks.py +++ b/workflow/tests/test_annotate_peaks.py @@ -41,4 +41,4 @@ def test_upsetplot_pairedend(): def test_annotation_pairedend(): annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv' assert os.path.exists(annotation_file) - assert utils.count_lines(annotation_file) >= 25494 + assert utils.count_lines(annotation_file) >= 25466