Update annotation script.

7deea97e · Venkat Malladi · 0620eafd · 7deea97e · 7deea97e · 7deea97e
Commit 7deea97e authored 6 years ago by Venkat Malladi
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -41,16 +41,14 @@ documentation_files:
 workflow_modules:
  - 'python/3.6.1-2-anaconda'
  - 'trimgalore/0.4.1'
-  - 'cutadapt/1.9.1'
-  - 'fastqc/0.11.2'
  - 'bwa/intel/0.7.12'
-  - 'samtools/1.4.1'
  - 'sambamba/0.6.6'
  - 'bedtools/2.26.0'
  - 'deeptools/2.5.0.1'
  - 'phantompeakqualtools/1.2'
  - 'macs/2.1.0-20151222'
  - 'UCSC_userApps/v317'
+  - 'R/3.4.1-gccmkl'
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
@@ -113,13 +111,14 @@ workflow_parameters:
    description: |
      A design file listing sample id, fastq files, corresponding control id
      and additional information about the sample.
+    regex: ".*tsv"
  - id: genome
    type: select
    choices:
      - [ 'GRCh38', 'Human GRCh38']
      - [ 'GRCh37', 'Human GRCh37']
-      - [ 'GRCh38', 'Mouse GRCh38']
+      - [ 'GRCm38', 'Mouse GRCm38']
    required: true
    description: |
      Reference species and genome used for alignment and subsequent analysis.

--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -12,7 +12,7 @@ process {
    cpus = 32
  }
  $alignReads{
-    module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
+    module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.4.1']
    cpus = 32
  }
  $filterReads{
@@ -47,6 +47,11 @@ process {
    module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
    executor = 'local'
  }
+  $consensusPeaks {
+    module = ['R/3.4.1-gccmkl']
+    executor = 'local'
+  }
 }
 params {

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -388,12 +388,14 @@ process peakAnnotation {
  file consensusPeaks
-   output:
+ output:
-     file "*chipseeker*" into chipseeker_originalpeak_output
-   script:
+  file "*chipseeker*" into peakAnnotation
-     """
-     module load python/2.7.x-anaconda
+ script:
-     module load R/3.3.2-gccmkl
+   """
-     Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
+   module load python/2.7.x-anaconda
-"""
+   module load R/3.3.2-gccmkl
+   Rscript $baseDir/scripts/annotate_peaks.R $design_file $genome
+   """
 }
--- a/workflow/scripts/runChipseeker.R
+++ b/workflow/scripts/runChipseeker.R
 #!/bin/Rscript
-library(ChIPseeker)
+# Load libraries
+source("http://bioconductor.org/biocLite.R")
+if(!require("ChIPseeker")){
+    biocLite("ChIPseeker")
+    library("ChIPseeker")
+}
+# Currently mouse or human
+if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene")){
+    biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
+    library("TxDb.Hsapiens.UCSC.hg19.knownGene")
+}
+if (!require("TxDb.Mmusculus.UCSC.mm10.knownGene")){
+    biocLite("TxDb.Mmusculus.UCSC.mm10.knownGene")
+    library("TxDb.Mmusculus.UCSC.mm10.knownGene")
+}
+if (!require("TxDb.Hsapiens.UCSC.hg38.knownGene")){
+    biocLite("TxDb.Hsapiens.UCSC.hg38.knownGene")
+    library("TxDb.Hsapiens.UCSC.hg38.knownGene")
+}
-# Create parser object
-parser <- ArgumentParser()
-# Specify our desired options
-parser$add_argument("-d", "--design", help = "File path to design file", required = TRUE)
-parser$add_argument("-g", "--genome", help = "The genome assembly", required = TRUE)
-# Parse arguments
+# Create parser object
-args <- parser$parse_args()
+args <- commandArgs(trailingOnly=TRUE)
+# Check input args
+if (length(args) != 2) {
+  stop("Usage: annotate_peaks.r [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
+}
+design_file <- args[1]
+genome <-args[2]
 # Load UCSC Known Genes
-if(args$genome=='GRCh37') {
+if(genome=='GRCh37') {
-    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
-} else if(args$genome=='GRCm38')  {
+} else if(genome=='GRCm38')  {
-    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
-}
+} else if(genome=='GRCh38')  {
-else if(args$genome=='GRCh38')  {
-    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
 }
+# Load design file
+design <- read.csv(design_file, sep ='\t')
+files <- as.list(as.character(design$Peaks))
+names(files) <- design$Condition
 peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
-for(index in c(1:length(peakAnnoList)))
-{
+for(index in c(1:length(peakAnnoList))) {
-  filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
+  filename <- paste(names(files)[index],".chipseeker_annotation.xls",sep="")
  write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
-  #draw individual plot
+  # Draw individual plots
+  # Define names of Plots
  pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
  vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
  upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
+  # Pie Plots
  pdf(pie_name)
  plotAnnoPie(peakAnnoList[[index]])
  dev.off()
+  # Venn Diagrams
  pdf(vennpie_name)
  vennpie(peakAnnoList[[index]])
  dev.off()
+  # Upset Plot
  pdf(upsetplot_name)
  upsetplot(peakAnnoList[[index]])
  dev.off()
 }
--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -166,7 +166,7 @@ def overlap(experiment, design):
    os.remove(overlap_tr_fn)
    os.remove(overlap_pr_fn)
-    return overlapping_peaks_fn
+    return os.path.abspath(overlapping_peaks_fn)
 def main():
@@ -186,14 +186,16 @@ def main():
    design_diff = update_design(design_files_df)
    # Make a design file for annotating Peaks
-    design_anno = pd.DataFrame()
+    anno_cols = ['Condition', 'Peaks']
+    design_anno = pd.DataFrame(columns = anno_cols)
    # Find consenus overlap peaks for each experiment
    for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
        replicated_peak = overlap(experiment, df_experiment)
        design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
+        design_anno.loc[experiment] = [experiment, replicated_peak]
-    # Write out file
+    # Write out design files
    design_diff.columns = ['SampleID',
                            'bamReads',
                            'Condition',
@@ -207,6 +209,7 @@ def main():
                            'PeakCaller']
    design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False)
+    design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
 if __name__ == '__main__':