diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 7db5da7e1e95561cd005a3b725ddc8633fb76e13..6e15b213afac9f222e9cde55e60a63f72cbd7951 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -41,16 +41,14 @@ documentation_files: workflow_modules: - 'python/3.6.1-2-anaconda' - 'trimgalore/0.4.1' - - 'cutadapt/1.9.1' - - 'fastqc/0.11.2' - 'bwa/intel/0.7.12' - - 'samtools/1.4.1' - 'sambamba/0.6.6' - 'bedtools/2.26.0' - 'deeptools/2.5.0.1' - 'phantompeakqualtools/1.2' - 'macs/2.1.0-20151222' - 'UCSC_userApps/v317' + - 'R/3.4.1-gccmkl' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: @@ -113,13 +111,14 @@ workflow_parameters: description: | A design file listing sample id, fastq files, corresponding control id and additional information about the sample. + regex: ".*tsv" - id: genome type: select choices: - [ 'GRCh38', 'Human GRCh38'] - [ 'GRCh37', 'Human GRCh37'] - - [ 'GRCh38', 'Mouse GRCh38'] + - [ 'GRCm38', 'Mouse GRCm38'] required: true description: | Reference species and genome used for alignment and subsequent analysis. diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 647927efa383ced7c7bcf0eed33482f6df562091..032ea21c155d843c63babbc977244440a8dc9f34 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -12,7 +12,7 @@ process { cpus = 32 } $alignReads{ - module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3'] + module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.4.1'] cpus = 32 } $filterReads{ @@ -47,6 +47,11 @@ process { module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0'] executor = 'local' } + $consensusPeaks { + module = ['R/3.4.1-gccmkl'] + executor = 'local' + } + } params { diff --git a/workflow/main.nf b/workflow/main.nf index 6da4f6dc8b3736d9cf5efd08a1987e9f9794e121..36a1bf92494d59615fae5bf07959e779c0115938 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -388,12 +388,14 @@ process peakAnnotation { file consensusPeaks - output: - file "*chipseeker*" into chipseeker_originalpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} -""" + output: + + file "*chipseeker*" into peakAnnotation + + script: + """ + module load python/2.7.x-anaconda + module load R/3.3.2-gccmkl + Rscript $baseDir/scripts/annotate_peaks.R $design_file $genome + """ } diff --git a/workflow/scripts/annotate_peaks.R b/workflow/scripts/annotate_peaks.R new file mode 100644 index 0000000000000000000000000000000000000000..74c60b3a137144c3335d1dc7a14ac45f3e308d7f --- /dev/null +++ b/workflow/scripts/annotate_peaks.R @@ -0,0 +1,79 @@ +#!/bin/Rscript + +# Load libraries +source("http://bioconductor.org/biocLite.R") +if(!require("ChIPseeker")){ + biocLite("ChIPseeker") + library("ChIPseeker") +} + +# Currently mouse or human +if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){ + biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene") + library("TxDb.Hsapiens.UCSC.hg19.knownGene") +} +if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){ + biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene") + library("TxDb.Mmusculus.UCSC.mm10.knownGene") +} +if (!require("TxDb.Hsapiens.UCSC.hg38.knownGene")){ + biocLite("TxDb.Hsapiens.UCSC.hg38.knownGene") + library("TxDb.Hsapiens.UCSC.hg38.knownGene") +} + + + +# Create parser object +args <- commandArgs(trailingOnly=TRUE) + +# Check input args +if (length(args) != 2) { + stop("Usage: annotate_peaks.r [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE) +} + +design_file <- args[1] +genome <-args[2] + +# Load UCSC Known Genes +if(genome=='GRCh37') { + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +} else if(genome=='GRCm38') { + txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene +} else if(genome=='GRCh38') { + txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene +} + +# Load design file +design <- read.csv(design_file, sep ='\t') +files <- as.list(as.character(design$Peaks)) +names(files) <- design$Condition + + +peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) + +for(index in c(1:length(peakAnnoList))) { + filename <- paste(names(files)[index],".chipseeker_annotation.xls",sep="") + write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F) + + # Draw individual plots + + # Define names of Plots + pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") + vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="") + upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="") + + # Pie Plots + pdf(pie_name) + plotAnnoPie(peakAnnoList[[index]]) + dev.off() + + # Venn Diagrams + pdf(vennpie_name) + vennpie(peakAnnoList[[index]]) + dev.off() + + # Upset Plot + pdf(upsetplot_name) + upsetplot(peakAnnoList[[index]]) + dev.off() +} diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py index 82f63b5ead547b93fa3b308692db2c1d2a7d3819..a5ca5c2ee49282abd330ab8b3394c2cce6df8586 100644 --- a/workflow/scripts/overlap_peaks.py +++ b/workflow/scripts/overlap_peaks.py @@ -166,7 +166,7 @@ def overlap(experiment, design): os.remove(overlap_tr_fn) os.remove(overlap_pr_fn) - return overlapping_peaks_fn + return os.path.abspath(overlapping_peaks_fn) def main(): @@ -186,14 +186,16 @@ def main(): design_diff = update_design(design_files_df) # Make a design file for annotating Peaks - design_anno = pd.DataFrame() + anno_cols = ['Condition', 'Peaks'] + design_anno = pd.DataFrame(columns = anno_cols) # Find consenus overlap peaks for each experiment for experiment, df_experiment in design_peaks_df.groupby('experiment_id'): replicated_peak = overlap(experiment, df_experiment) design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak + design_anno.loc[experiment] = [experiment, replicated_peak] - # Write out file + # Write out design files design_diff.columns = ['SampleID', 'bamReads', 'Condition', @@ -207,6 +209,7 @@ def main(): 'PeakCaller'] design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False) + design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False) if __name__ == '__main__': diff --git a/workflow/scripts/runChipseeker.R b/workflow/scripts/runChipseeker.R deleted file mode 100644 index 8f8b0d011b511d8842c6d25d9842b3ef2a50974e..0000000000000000000000000000000000000000 --- a/workflow/scripts/runChipseeker.R +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/Rscript - -library(ChIPseeker) - - -# Create parser object -parser <- ArgumentParser() - -# Specify our desired options -parser$add_argument("-d", "--design", help = "File path to design file", required = TRUE) -parser$add_argument("-g", "--genome", help = "The genome assembly", required = TRUE) - -# Parse arguments -args <- parser$parse_args() - - -# Load UCSC Known Genes -if(args$genome=='GRCh37') { - library(TxDb.Hsapiens.UCSC.hg19.knownGene) - txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -} else if(args$genome=='GRCm38') { - library(TxDb.Mmusculus.UCSC.mm10.knownGene) - txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene -} -else if(args$genome=='GRCh38') { - library(TxDb.Hsapiens.UCSC.hg38.knownGene) - txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene -} - - -peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) -for(index in c(1:length(peakAnnoList))) -{ - filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="") - write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F) - #draw individual plot - pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") - vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="") - upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="") - pdf(pie_name) - plotAnnoPie(peakAnnoList[[index]]) - dev.off() - pdf(vennpie_name) - vennpie(peakAnnoList[[index]]) - dev.off() - pdf(upsetplot_name) - upsetplot(peakAnnoList[[index]]) - dev.off() - - -}