From 7deea97e71828fd845f84e3900ecd077045adbce Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Tue, 24 Jul 2018 21:31:17 -0500
Subject: [PATCH] Update annotation script.

---
 astrocyte_pkg.yml                 |  7 ++-
 workflow/conf/biohpc.config       |  7 ++-
 workflow/main.nf                  | 18 +++----
 workflow/scripts/annotate_peaks.R | 79 +++++++++++++++++++++++++++++++
 workflow/scripts/overlap_peaks.py |  9 ++--
 workflow/scripts/runChipseeker.R  | 51 --------------------
 6 files changed, 104 insertions(+), 67 deletions(-)
 create mode 100644 workflow/scripts/annotate_peaks.R
 delete mode 100644 workflow/scripts/runChipseeker.R

diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml
index 7db5da7..6e15b21 100644
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -41,16 +41,14 @@ documentation_files:
 workflow_modules:
   - 'python/3.6.1-2-anaconda'
   - 'trimgalore/0.4.1'
-  - 'cutadapt/1.9.1'
-  - 'fastqc/0.11.2'
   - 'bwa/intel/0.7.12'
-  - 'samtools/1.4.1'
   - 'sambamba/0.6.6'
   - 'bedtools/2.26.0'
   - 'deeptools/2.5.0.1'
   - 'phantompeakqualtools/1.2'
   - 'macs/2.1.0-20151222'
   - 'UCSC_userApps/v317'
+  - 'R/3.4.1-gccmkl'
 
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
@@ -113,13 +111,14 @@ workflow_parameters:
     description: |
       A design file listing sample id, fastq files, corresponding control id
       and additional information about the sample.
+    regex: ".*tsv"
 
   - id: genome
     type: select
     choices:
       - [ 'GRCh38', 'Human GRCh38']
       - [ 'GRCh37', 'Human GRCh37']
-      - [ 'GRCh38', 'Mouse GRCh38']
+      - [ 'GRCm38', 'Mouse GRCm38']
     required: true
     description: |
       Reference species and genome used for alignment and subsequent analysis.
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 647927e..032ea21 100644
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -12,7 +12,7 @@ process {
     cpus = 32
   }
   $alignReads{
-    module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
+    module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.4.1']
     cpus = 32
   }
   $filterReads{
@@ -47,6 +47,11 @@ process {
     module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
     executor = 'local'
   }
+  $consensusPeaks {
+    module = ['R/3.4.1-gccmkl']
+    executor = 'local'
+  }
+
 }
 
 params {
diff --git a/workflow/main.nf b/workflow/main.nf
index 6da4f6d..36a1bf9 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -388,12 +388,14 @@ process peakAnnotation {
 
   file consensusPeaks
 
-   output:
-     file "*chipseeker*" into chipseeker_originalpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
-"""
+ output:
+
+  file "*chipseeker*" into peakAnnotation
+
+ script:
+   """
+   module load python/2.7.x-anaconda
+   module load R/3.3.2-gccmkl
+   Rscript $baseDir/scripts/annotate_peaks.R $design_file $genome
+   """
 }
diff --git a/workflow/scripts/annotate_peaks.R b/workflow/scripts/annotate_peaks.R
new file mode 100644
index 0000000..74c60b3
--- /dev/null
+++ b/workflow/scripts/annotate_peaks.R
@@ -0,0 +1,79 @@
+#!/bin/Rscript
+
+# Load libraries
+source("http://bioconductor.org/biocLite.R")
+if(!require("ChIPseeker")){
+    biocLite("ChIPseeker")
+    library("ChIPseeker")
+}
+
+# Currently mouse or human
+if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){
+    biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
+    library("TxDb.Hsapiens.UCSC.hg19.knownGene")
+}
+if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){
+    biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene")
+    library("TxDb.Mmusculus.UCSC.mm10.knownGene")
+}
+if (!require("TxDb.Hsapiens.UCSC.hg38.knownGene")){
+    biocLite("TxDb.Hsapiens.UCSC.hg38.knownGene")
+    library("TxDb.Hsapiens.UCSC.hg38.knownGene")
+}
+
+
+
+# Create parser object
+args <- commandArgs(trailingOnly=TRUE)
+
+# Check input args
+if (length(args) != 2) {
+  stop("Usage: annotate_peaks.r [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
+}
+
+design_file <- args[1]
+genome <-args[2]
+
+# Load UCSC Known Genes
+if(genome=='GRCh37') {
+    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+} else if(genome=='GRCm38')  {
+    txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
+} else if(genome=='GRCh38')  {
+    txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
+}
+
+# Load design file
+design <- read.csv(design_file, sep ='\t')
+files <- as.list(as.character(design$Peaks))
+names(files) <- design$Condition
+
+
+peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
+
+for(index in c(1:length(peakAnnoList))) {
+  filename <- paste(names(files)[index],".chipseeker_annotation.xls",sep="")
+  write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
+
+  # Draw individual plots
+
+  # Define names of Plots
+  pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
+  vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
+  upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
+
+  # Pie Plots
+  pdf(pie_name)
+  plotAnnoPie(peakAnnoList[[index]])
+  dev.off()
+
+  # Venn Diagrams
+  pdf(vennpie_name)
+  vennpie(peakAnnoList[[index]])
+  dev.off()
+
+  # Upset Plot
+  pdf(upsetplot_name)
+  upsetplot(peakAnnoList[[index]])
+  dev.off()
+}
diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py
index 82f63b5..a5ca5c2 100644
--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -166,7 +166,7 @@ def overlap(experiment, design):
     os.remove(overlap_tr_fn)
     os.remove(overlap_pr_fn)
 
-    return overlapping_peaks_fn
+    return os.path.abspath(overlapping_peaks_fn)
 
 
 def main():
@@ -186,14 +186,16 @@ def main():
     design_diff = update_design(design_files_df)
 
     # Make a design file for annotating Peaks
-    design_anno = pd.DataFrame()
+    anno_cols = ['Condition', 'Peaks']
+    design_anno = pd.DataFrame(columns = anno_cols)
 
     # Find consenus overlap peaks for each experiment
     for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
         replicated_peak = overlap(experiment, df_experiment)
         design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
+        design_anno.loc[experiment] = [experiment, replicated_peak]
 
-    # Write out file
+    # Write out design files
     design_diff.columns = ['SampleID',
                             'bamReads',
                             'Condition',
@@ -207,6 +209,7 @@ def main():
                             'PeakCaller']
 
     design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False)
+    design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
 
 
 if __name__ == '__main__':
diff --git a/workflow/scripts/runChipseeker.R b/workflow/scripts/runChipseeker.R
deleted file mode 100644
index 8f8b0d0..0000000
--- a/workflow/scripts/runChipseeker.R
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/Rscript
-
-library(ChIPseeker)
-
-
-# Create parser object
-parser <- ArgumentParser()
-
-# Specify our desired options
-parser$add_argument("-d", "--design", help = "File path to design file", required = TRUE)
-parser$add_argument("-g", "--genome", help = "The genome assembly", required = TRUE)
-
-# Parse arguments
-args <- parser$parse_args()
-
-
-# Load UCSC Known Genes
-if(args$genome=='GRCh37') {
-    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
-    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
-} else if(args$genome=='GRCm38')  {
-    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
-    txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
-}
-else if(args$genome=='GRCh38')  {
-    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
-    txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
-}
-
-
-peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
-for(index in c(1:length(peakAnnoList)))
-{
-  filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
-  write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
-  #draw individual plot
-  pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
-  vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
-  upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
-  pdf(pie_name)
-  plotAnnoPie(peakAnnoList[[index]])
-  dev.off()
-  pdf(vennpie_name)
-  vennpie(peakAnnoList[[index]])
-  dev.off()
-  pdf(upsetplot_name)
-  upsetplot(peakAnnoList[[index]])
-  dev.off()
-
-
-}
-- 
GitLab