From 68c077f5260411a6aaf11539cf4cb665782a2696 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei.chen@utsouthwestern.edu>
Date: Mon, 21 Nov 2016 13:26:42 -0600
Subject: [PATCH] update astrocyte_package

---
 astrocyte_package.yml |  67 +--------
 workflow/main.nf      | 309 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 313 insertions(+), 63 deletions(-)
 create mode 100644 workflow/main.nf

diff --git a/astrocyte_package.yml b/astrocyte_package.yml
index abd6e7a..697e657 100644
--- a/astrocyte_package.yml
+++ b/astrocyte_package.yml
@@ -39,14 +39,7 @@ documentation_files:
 # A list of clueter environment modules that this workflow requires to run.
 # Specify versioned module names to ensure reproducability.
 workflow_modules:
-  - 'trimgalore/0.4.1'
-  - 'cutadapt/1.9.1'
-  - 'hisat2/2.0.1-beta-intel'
-  - 'samtools/intel/1.3'
-  - 'picard/1.127'
-  - 'subread/1.5.0-intel'
-  - 'stringtie/1.1.2-intel'
-  - 'speedseq/20160506'
+  - 'deeptools/2.3.5'
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
 #
@@ -81,50 +74,14 @@ workflow_modules:
 
 workflow_parameters:
 
-  - id: fastqs
+  - id: bams
     type: files
     required: true
     description: |
-      One or more input paired-end FASTQ files from a RNASeq experiment and a design file with the link between the same name and the sample group
-    regex: ".*(fastq|fq)*"
+      One or more BAM alignments from a ChIP-Seq experiment and a design file with the link between the same name and the sample group. All files will be imported from GreenCenter ChIP-seq peak calling pipeline
+    regex: ".*(bam)*"
     min: 1
 
-  - id: stranded
-    type: select
-    required: true
-    choices:
-      - [ '0', 'Unstranded']
-      - [ '1', 'Stranded']
-      - [ '2', 'Reverse Stranded']
-    description: |
-      In the case that the sequence libraries where generated using a stranded specific protocol.
-      
-  - id: pairs
-    type: select
-    required: true
-    choices:
-      - [ 'pe', 'Paired End']
-      - [ 'se', 'Single End']
-    description: |
-      In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.
-
-  - id: align
-    type: select
-    required: true
-    choices:
-      - [ 'hisat', 'HiSAT2']
-      - [ 'star', 'STAR']
-    description: |
-      Alignment tool
-      
-  - id: markdups
-    type: select
-    required: true
-    choices:
-      - [ 'mark', 'Remove Duplicates']
-      - [ 'keep', 'Keep All Sequences']
-    description: |
-      Duplicate reads are defined as originating from the same original fragment of DNA. Duplicates are identified as read pairs having identical 5-prime positions (coordinate and strand) for both reads in a mate pair and optionally, matching unique molecular identifier reads.
 
   - id: design
     type: file
@@ -144,20 +101,6 @@ workflow_parameters:
     description: |
       Reference genome for alignment
 
-  - id: geneset
-    type: select
-    choices:
-      - ['h.all.v5.1.symbols.gmt','Hallmark Gene Sets']
-      - ['c2.all.v5.1.symbols.gmt','Curated Gene Sets']
-      - ['c3.all.v5.1.symbols.gmt','Motif Gene Sets']
-      - ['c5.all.v5.1.entrez.gmt','Gene Ontology Gene Sets']
-      - ['c6.all.v5.1.symbols.gmt','Oncogenic Signatures']
-      - ['c7.all.v5.1.entrez.gmt','Immunological Signatures']
-      
-    required: true
-    description: |
-      Gene Set Definitions used for QuSAGE Analysis -- see http://software.broadinstitute.org/gsea/msigdb/ for geneset descriptions
-
 
 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
@@ -187,8 +130,6 @@ vizapp_cran_packages:
 vizapp_bioc_packages:
   - qusage
   - ballgown
-  - edgeR
-  - DESeq2
 vizapp_github_packages:
   - js229/Vennerable
 
diff --git a/workflow/main.nf b/workflow/main.nf
new file mode 100644
index 0000000..5a77874
--- /dev/null
+++ b/workflow/main.nf
@@ -0,0 +1,309 @@
+#!/usr/bin/env nextflow
+	
+// Default parameter values to run tests
+params.fastqs="$baseDir/../test_data/*.fastq.gz"
+params.design="$baseDir/../test_data/design.pe.txt"
+params.genome="/project/shared/bicf_workflow_ref/GRCh38/"
+params.markdups="mark"
+params.stranded="0"
+params.pairs="pe"
+params.geneset = 'h.all.v5.1.symbols.gmt'
+params.align = 'hisat'
+
+design_file = file(params.design)
+fastqs=file(params.fastqs)
+design_file = file(params.design)
+gtf_file = file("$params.genome/gencode.gtf")
+genenames = file("$params.genome/genenames.txt")
+geneset = file("$params.genome/gsea_gmt/$params.geneset")
+
+// params genome is the directory
+// base name for the index is always genome
+index_path = file(params.genome)
+index_name = "genome"
+star_index = 'star_index/'
+
+// Pair handling, helper function taken from rnatoy
+// which is covered by the GNU General Public License v3
+// https://github.com/nextflow-io/rnatoy/blob/master/main.nf
+
+def fileMap = [:]
+
+fastqs.each {
+    final fileName = it.getFileName().toString()
+    prefix = fileName.lastIndexOf('/')
+    fileMap[fileName] = it
+}
+def prefix = []
+new File(params.design).withReader { reader ->
+    def hline = reader.readLine()
+    def header = hline.split("\t")
+    prefixidx = header.findIndexOf{it == 'SampleID'};
+    oneidx = header.findIndexOf{it == 'FullPathToFqR1'};
+    twoidx = header.findIndexOf{it == 'FullPathToFqR2'};
+    if (twoidx == -1) {
+       twoidx = oneidx
+       }      
+    while (line = reader.readLine()) {
+    	   def row = line.split("\t")
+	   if (fileMap.get(row[oneidx]) != null) {
+	      prefix << tuple(row[prefixidx],fileMap.get(row[oneidx]),fileMap.get(row[twoidx]))
+	   }
+	  
+} 
+}
+if( ! prefix) { error "Didn't match any input files with entries in the design file" }
+
+if (params.pairs == 'pe') {
+Channel
+  .from(prefix)
+  .set { read_pe }
+Channel
+  .empty()
+  .set { read_se } 
+}
+if (params.pairs == 'se') {
+Channel
+  .from(prefix)
+  .into { read_se }
+Channel
+  .empty()
+  .set { read_pe }
+}
+
+//
+// Trim raw reads using trimgalore
+//
+process trimpe {
+  input:
+  set pair_id, file(read1), file(read2) from read_pe
+  output:
+  set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_val_1.fq.gz"), file("${read2.baseName.split("\\.", 2)[0]}_val_2.fq.gz") into trimpe
+  script:
+  """
+  module load trimgalore/0.4.1 cutadapt/1.9.1
+  trim_galore --paired -q 25 --illumina --gzip --length 35 ${read1} ${read2}
+  """
+}
+process trimse {
+  input:
+  set pair_id, file(read1) from read_se
+  output:
+  set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_trimmed.fq.gz") into trimse
+  script:
+  """
+  module load trimgalore/0.4.1 cutadapt/1.9.1
+  trim_galore -q 25 --illumina --gzip --length 35 ${read1}
+  """
+}
+
+//
+// Align trimmed reads to genome indes with hisat2
+// Sort and index with samtools
+// QC aligned reads with fastqc
+// Alignment stats with samtools
+//
+process alignpe {
+
+  publishDir "$baseDir/output", mode: 'copy'
+  cpus 32
+
+  input:
+  set pair_id, file(fq1), file(fq2) from trimpe
+  output:
+  set pair_id, file("${pair_id}.bam") into alignpe
+  file("${pair_id}.flagstat.txt") into alignstats_pe
+  file("${pair_id}.hisatout.txt") into hsatoutpe
+  set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqcpe
+  script:
+  if (params.align == 'hisat')
+  """
+  module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506
+  hisat2 -p 30 --no-unal --dta -x ${index_path}/${index_name} -1 ${fq1} -2 ${fq2} -S out.sam 2> ${pair_id}.hisatout.txt
+  sambamba view -t 30 -f bam -S -o output.bam out.sam
+  sambamba sort -t 30 -o ${pair_id}.bam output.bam
+  sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt
+  fastqc -f bam ${pair_id}.bam
+  """
+  else
+  """
+  module load star/2.4.2a samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506
+  STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
+  mv outLog.final.out ${pair_id}.hisatout.txt
+  sambamba sort -t 30 -o ${pair_id}.bam outAligned.sortedByCoord.out.bam
+  sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt
+  fastqc -f bam ${pair_id}.bam
+  """
+}
+process alignse {
+
+  publishDir "$baseDir/output", mode: 'copy'
+  cpus 32
+
+  input:
+  set pair_id, file(fq1) from trimse
+  output:
+  set pair_id, file("${pair_id}.bam") into alignse
+  file("${pair_id}.flagstat.txt") into alignstats_se
+  file("${pair_id}.hisatout.txt") into hsatoutse
+  set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqcse
+  script:
+  if (params.align == 'hisat')
+  """
+  module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 speedseq/20160506 picard/1.127
+  hisat2 -p 30 --no-unal --dta -x ${index_path}/${index_name} -U ${fq1} -S out.sam 2> ${pair_id}.hisatout.txt
+  sambamba view -t 30 -f bam -S -o output.bam out.sam
+  sambamba sort -t 30 -o ${pair_id}.bam output.bam
+  sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt
+  fastqc -f bam ${pair_id}.bam
+  """
+  else
+  """
+  module load star/2.4.2a samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506
+  STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype SAM --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
+  mv outLog.final.out ${pair_id}.hisatout.txt
+  sambamba view -t 30 -f bam -S -o output.bam outAligned.out.sam
+  sambamba sort -t 30 -o ${pair_id}.bam output.bam
+  sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt
+  fastqc -f bam ${pair_id}.bam
+  """
+}
+
+// From here on we are the same for PE and SE, so merge channels and carry on
+
+Channel
+  .empty()
+  .mix(alignse, alignpe)
+  .tap { aligned2 }
+  .set { aligned }
+
+Channel
+  .empty()
+  .mix(alignstats_se, alignstats_pe)
+  .set { alignstats }
+
+Channel
+  .empty()
+  .mix(hsatoutse, hsatoutpe)
+  .set { hsatout }
+
+//
+// Summarize all flagstat output
+//
+process parse_alignstat {
+
+  publishDir "$baseDir/output", mode: 'copy'
+
+  input:
+  file(txt) from alignstats.toList()
+  file(txt) from  hsatout.toList()
+
+  output:
+  file('alignment.summary.txt')
+
+  """
+  perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
+  """
+}
+
+//
+// Identify duplicate reads with Picard
+//
+process markdups {
+
+  memory '4GB'
+  publishDir "$baseDir/output", mode: 'copy'
+
+  input:
+  set pair_id, file(sbam) from aligned
+  output:
+  set pair_id, file("${pair_id}.dedup.bam") into deduped1
+  set pair_id, file("${pair_id}.dedup.bam") into deduped2
+  script:
+  if (params.markdups == 'mark')
+  """
+  module load picard/1.127 speedseq/20160506
+  sambamba markdup -t 20 -r ${sbam} ${pair_id}.dedup.bam
+  """
+  else
+  """
+  cp ${sbam} ${pair_id}.dedup.bam
+  """
+}
+
+//
+// Read summarization with subread
+//
+process featurect {
+
+  publishDir "$baseDir/output", mode: 'copy'
+  cpus 32
+
+  input:
+  set pair_id, file(dbam) from deduped1
+  file gtf_file
+  output:
+  file("${pair_id}.cts")  into counts
+  """
+  module load module subread/1.5.0-intel
+  featureCounts -s params.stranded -T 30 -p -g gene_name -a ${gtf_file} -o ${pair_id}.cts ${dbam}
+  """
+}
+
+//
+// Assemble transcripts with stringtie
+//
+
+process stringtie {
+
+  publishDir "$baseDir/output", mode: 'copy'
+  cpus 32
+
+  input:
+  set pair_id, file(dbam) from deduped2
+  file gtf_file
+  output:
+  file("${pair_id}_stringtie") into strcts
+  """
+  module load stringtie/1.1.2-intel
+  mkdir ${pair_id}_stringtie
+  cd ${pair_id}_stringtie
+  stringtie ../${dbam} -p 30 -G ../${gtf_file} -B -e -o denovo.gtf -A ../geneabund.stringtie.tab
+  """
+}
+
+process statanal {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+   file count_file from counts.toList()
+   file design_file name 'design.txt'
+   file genenames
+   file geneset name 'geneset.gmt'
+   output:
+   file "*.txt" into txtfiles
+   file "*.png" into psfiles
+   file "*"
+   """
+   module load R/3.2.1-intel
+   perl $baseDir/scripts/concat_cts.pl -o ./ *.cts
+   cp design.txt design.shiny.txt
+   cp geneset.gmt geneset.shiny.gmt
+   Rscript  $baseDir/scripts/dea.R
+   perl $baseDir/scripts/concat_edgeR.pl *.edgeR.txt
+ """
+}
+
+process buildrda {
+   publishDir "$baseDir/output", mode: 'copy'
+   input:
+   file stringtie_dir from strcts.toList()
+   file design_file name 'design.txt'
+   output:
+   file "bg.rda" into rdafiles
+   """
+   module load R/3.2.1-intel
+   Rscript $baseDir/scripts/build_ballgown.R *_stringtie
+   """
+ }
+
+
-- 
GitLab