add all files

8aac26de · Brandi Cantarel · 8aac26de · 8aac26de · 8aac26de · 8aac26de
Commit 8aac26de authored 7 years ago by Brandi Cantarel
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+# Gitlab CI Script for astrocyte/rnaseq
+# Brandi L. Cantarel - 2017
+
+variables:
+  CLEAN_REPO_PATH: "/project/BICF/BICF_Core/s166458/astrocyte_test/${CI_RUNNER_ID}"
+  TEST_BRANCH: "master"
+
+before_script:
+  - module load nextflow/0.27.6
+  - mkdir -p "${CLEAN_REPO_PATH}"
+  - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/rnaseq test_data
+
+stages:
+  - integration
+
+test_human
+  stage: integration
+  script:
+    - nextflow run workflow/main.nf --input test_data/ --design ${CLEAN_REPO_PATH}/test_data/design.rnaseq.txt --output ${CLEAN_REPO_PATH}/human_output
+  artifacts:
+    paths:
+      - ${CLEAN_REPO_PATH}
+    expire_in: 2 days
+
+test_mouse
+  stage: integration
+  script:
+    - nextflow run workflow/main.nf --input test_data/ --design ${CLEAN_REPO_PATH}/test_data/mouse_se.design.txt --pairs se --fusion skip --genome /project/shared/bicf_workflow_ref/GRCm38 --markdups null --output ${CLEAN_REPO_PATH}/mouse_output
+  artifacts:
+    paths:
+      - ${CLEAN_REPO_PATH}
+    expire_in: 2 days
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "workflow/process_scripts"]
+	path = workflow/process_scripts
+	url = git@git.biohpc.swmed.edu:BICF/Astrocyte/process_scripts.git
--- a/CHANGES.md
+++ b/CHANGES.md
--- a/LICENSE.md
+++ b/LICENSE.md
+Copyright © 2016. The University of Texas Southwestern Medical Center
+5323 Harry Hines Boulevard Dallas, Texas, 75390 Telephone 214-648-3111
\ No newline at end of file
--- a/README.md
+++ b/README.md
+## RNASeq Analysis Worklow
+
+```
+module load nextflow/0.24.1-SNAPSHOT
+nextflow run workflow/main.nf
+```
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
+#
+# metadata for the example astrocyte ChipSeq workflow package
+#
+
+# -----------------------------------------------------------------------------
+# BASIC INFORMATION
+# -----------------------------------------------------------------------------
+
+# A unique identifier for the workflow package, text/underscores only
+name: 'rnaseq_bicf'
+# Who wrote this?
+author: 'Brandi Cantarel'
+# A contact email address for questions
+email: 'biohpc-help@utsouthwestern.edu'
+# A more informative title for the workflow package
+title: 'BICF RNASeq Analysis Workflow'
+# A summary of the workflow package in plain text
+description: |
+  This is a workflow package for the BioHPC/BICF RNASeq workflow system.
+  It implements differential expression analysis, gene set enrichment analysis,
+  gene fusion analysis and variant identification using RNASeq data.
+  
+
+# -----------------------------------------------------------------------------
+# DOCUMENTATION
+# -----------------------------------------------------------------------------
+
+# A list of documentation file in .md format that should be viewable from the
+# web interface. These files are in the 'docs' subdirectory. The first file
+# listed will be used as a documentation index and is index.md by convention
+documentation_files:
+  - 'index.md'
+
+# -----------------------------------------------------------------------------
+# NEXTFLOW WORKFLOW CONFIGURATION
+# -----------------------------------------------------------------------------
+
+# Remember - The workflow file is always named 'workflow/main.f'
+#            The workflow must publish all final output into $baseDir
+
+# A list of clueter environment modules that this workflow requires to run.
+# Specify versioned module names to ensure reproducability.
+workflow_modules:
+  - 'trimgalore/0.4.1'
+  - 'cutadapt/1.9.1'
+  - 'hisat2/2.0.1-beta-intel'
+  - 'samtools/intel/1.3'
+  - 'picard/1.127'
+  - 'subread/1.5.0-intel'
+  - 'stringtie/1.1.2-intel'
+  - 'speedseq/20160506'
+  - 'python/2.7.x-anaconda'
+# A list of parameters used by the workflow, defining how to present them,
+# options etc in the web interface. For each parameter:
+#
+# REQUIRED INFORMATION
+#  id:         The name of the parameter in the NEXTFLOW workflow
+#  type:       The type of the parameter, one of:
+#                string    - A free-format string
+#                integer   - An integer
+#                real      - A real number
+#                file      - A single file from user data
+#                files     - One or more files from user data
+#                select    - A selection from a list of values
+#  required:    true/false, must the parameter be entered/chosen?
+#  description: A user friendly description of the meaning of the parameter
+#
+# OPTIONAL INFORMATION
+#  default:   A default value for the parameter (optional)
+#  min:       Minium value/characters/files for number/string/files types
+#  max:       Maxumum value/characters/files for number/string/files types
+#  regex:     A regular expression that describes valid entries / filenames
+#
+# SELECT TYPE
+#  choices:   A set of choices presented to the user for the parameter.
+#             Each choice is a pair of value and description, e.g.
+#
+#             choices:
+#               - [ 'myval', 'The first option']
+#               - [ 'myval', 'The second option']
+#
+# NOTE - All parameters are passed to NEXTFLOW as strings... but they
+#        are validated by astrocyte using the information provided above
+
+workflow_parameters:
+
+  - id: fastqs
+    type: files
+    required: true
+    description: |
+      One or more input paired-end FASTQ files from a RNASeq experiment and a design file with the link between the same name and the sample group
+    regex: ".*(fastq|fq)*"
+    min: 1
+
+  - id: stranded
+    type: select
+    required: true
+    choices:
+      - [ '0', 'Unstranded']
+      - [ '1', 'Stranded']
+      - [ '2', 'Reverse Stranded']
+    description: |
+      In the case that the sequence libraries where generated using a stranded specific protocol.
+      
+  - id: pairs
+    type: select
+    required: true
+    choices:
+      - [ 'pe', 'Paired End']
+      - [ 'se', 'Single End']
+    description: |
+      In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.
+
+  - id: align
+    type: select
+    required: true
+    choices:
+      - [ 'hisat', 'HiSAT2']
+      - [ 'star', 'STAR']
+    description: |
+      Alignment tool
+      
+  - id: fusion
+    type: select
+    required: true
+    choices:
+      - [ 'skip', 'No Fusion Detection']
+      - [ 'detect', 'Fusion Detection']
+    description: |
+      Run Star Fusion
+
+  - id: markdups
+    type: select
+    required: true
+    choices:
+      - [ 'picard', 'Remove Duplicates']
+      - [ 'null', 'Keep All Sequences']
+    description: |
+      Duplicate reads are defined as originating from the same original fragment of DNA. Duplicates are identified as read pairs having identical 5-prime positions (coordinate and strand) for both reads in a mate pair and optionally, matching unique molecular identifier reads.
+
+  - id: dea
+    type: select
+    required: true
+    choices:
+      - [ 'detect', 'Run Statistical Analysis']
+      - [ 'skip', 'Skip Statistical Analysis']
+    description: |
+      Runs deSEq2 and EdgeR
+
+  - id: design
+    type: file
+    required: true
+    regex: ".*txt"
+    description: |
+      A design file listing pairs of sample name and sample group.
+      Columns must include: SampleID,SampleName,SampleGroup,FullPathToFqR1,FullPathToFqR2
+
+  - id: genome
+    type: select
+    choices:
+      - [ '/project/shared/bicf_workflow_ref/GRCh38', 'Human GRCh38']
+      - [ '/project/shared/bicf_workflow_ref/GRCh37', 'Human GRCh37']
+      - [ '/project/shared/bicf_workflow_ref/GRCm38', 'Mouse GRCm38']
+    required: true
+    description: |
+      Reference genome for alignment
+
+  - id: geneset
+    type: select
+    choices:
+      - ['h.all.v5.1.symbols.gmt','Hallmark Gene Sets']
+      - ['c2.all.v5.1.symbols.gmt','Curated Gene Sets']
+      - ['c3.all.v5.1.symbols.gmt','Motif Gene Sets']
+      - ['c5.all.v5.1.entrez.gmt','Gene Ontology Gene Sets']
+      - ['c6.all.v5.1.symbols.gmt','Oncogenic Signatures']
+      - ['c7.all.v5.1.entrez.gmt','Immunological Signatures']
+      
+    required: true
+    description: |
+      Gene Set Definitions used for QuSAGE Analysis -- see http://software.broadinstitute.org/gsea/msigdb/ for geneset descriptions
+
+
+# -----------------------------------------------------------------------------
+# SHINY APP CONFIGURATION
+# -----------------------------------------------------------------------------
+
+# Remember - The vizapp is always 'vizapp/server.R' 'vizapp/ui.R'
+#            The workflow must publish all final output into $baseDir
+
+# Name of the R module that the vizapp will run against
+vizapp_r_module: 'R/3.2.1-intel'
+
+# List of any CRAN packages, not provided by the modules, that must be made
+# available to the vizapp
+vizapp_cran_packages:
+  - sqldf
+  - shiny
+  - Vennerable
+  - DT
+  - ggplot2
+  - gplots
+  - gtools
+  - RColorBrewer
+
+
+# # List of any Bioconductor packages, not provided by the modules, that must be made
+# available to the vizapp
+vizapp_bioc_packages:
+  - qusage
+  - ballgown
+  - edgeR
+  - DESeq2
+vizapp_github_packages:
+  - js229/Vennerable
\ No newline at end of file
--- a/docs/index.md
+++ b/docs/index.md
+# Astrocyte RNASeq Workflow Package
+
+## Workflow SOP
+
+This SOP describes the analysis pipeline of RNA sequencing data. This pipeline includes (1) quality control, (2) variant calling analysis, (3) identification of fusion genes, and (4) statistical analyses of gene expression and isoform expression. The result R data of the statistical analysis can be visualized using a custom R shiny service.
+
+For each file this workflow:
+
+    1) Trim the ends of sequences with remaining adapter or quality scores < 25. Remove any sequence less than 35bp after trimming, then generate a file for capturing information about how many sequences were trimmed.
+    2) Trimmed Fastq files are aligned to the selected reference genome using HiSAT2 (Kim et al. 2015) or STAR (Dobin et al. 2012)
+    3) Marks Duplicates using SAMBAMBA
+    4) Features (genes, transcripts and exons) are counted using featureCounts (Liao et al 2014) and StringTie (Pertea et al. 2015) using the Gencode feature table(Harrow et al. 2012)
+    5) Basic pairwise differential expression analysis is performed using EdgeR (Robinson et al. 2010) and DESeq
+    6) Abundances of transcripts are calculated using ballgown (Frazee et al. 2014)
+    7) Identify gene fusions or fused transcripts using STAR-Fusion.
+
+## Workflow Parameters
+
+    fastq - Choose one or more RNASeq read files to process.
+    genome - Choose a genomic reference (genome).
+    stranded - If a stranded library is created, please select the appropriate protocol to ensure strand specific analysis
+    markdups - The default is to remove duplicates, you can skip this function.
+    pairs - Indicate if this is paired-end or single-end sequencing data 
+    dea - Perform Differential Expression analysis, please skip if there are < 2 sample groups
+    geneset - Select a set of known genesets for pathway analysis
+    fusion - Perform Gene Fusion Identification
+    design - This file matches the fastq files to data abot the sample
+
+ The following columns are necessary, must be named as in template and can be in any order:
+
+    SampleID
+        This ID should match the name in the fastq file ie S0001.R1.fastq.gz the sample ID is S0001
+        Note: SampleID shouldn't start with numbers ie 10C should be changed to S10C
+    SampleName
+        This ID can be the identifier of the researcher or clinician
+    SubjectID
+        Used in order to link samples from the same patient
+    SampleGroup
+	This is the group that will be used for pairwise differential expression analysis
+    FqR1
+	Name of the fastq file R1
+    FqR2
+	Name of the fastq file R2
+
+There are some optional columns that might help with the analysis:
+      Tissue
+      Gender
+      CultureDate
+      SequenceRun
+      Organism
+      CellPopulation
+      Treatment
+      GeneticFeature (WT or KO)
+      Race
+      Ethnicity
+      Age
+      
+
+### Test Data
+
+
+### Credits
+This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility (BICF), Department of Bioinformatics
+
+### Workflow SOP
+![workflow](pipeline_rnaseq.001.png)
+
+### References
+
+* FastQC: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ o http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/
+* STAR-Fusion: https://github.com/STAR-Fusion/STAR-Fusion
+* STAR: https://github.com/alexdobin/STAR
+    * Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013 Jan 1;29(1):15-21. doi: 10.1093/bioinformatics/bts635. Epub 2012 Oct 25. PubMed PMID: 23104886; PubMed Central PMCID: PMC3530905.
+* HISAT, StringTie and Ballgown protocol
+    * Pertea M, Kim D, Pertea GM, Leek JT, Salzberg SL. Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. Nat Protoc. 2016 Sep;11(9):1650-67.
+* StringTie: https://ccb.jhu.edu/software/stringtie/
+    * Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT, Salzberg SL. StringTie enables improved reconstruction of a transcriptome from RNA-seq reads. Nat Biotechnol. 2015 Mar;33(3):290-5.
+* Ballgown
+    * http://biorxiv.org/content/biorxiv/early/2014/09/05/003665.full.pdf o http://bioconductor.org/packages/release/bioc/html/ballgown.html o https://github.com/alyssafrazee/ballgown
+* HISAT: https://ccb.jhu.edu/software/hisat2/index.shtml
+    * Kim D, Langmead B, Salzberg SL. HISAT: a fast spliced aligner with low memory requirements. Nat Methods. 2015 Apr;12(4):357-60.
+* SAMtools: http://samtools.sourceforge.net/
+    * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9.
+    * Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93.
+* Sambamba: http://lomereiter.github.io/sambamba/
+    * Tarasov A, Vilella AJ, Cuppen E, Nijman IJ, Prins P. Sambamba: fast processing of NGS alignment formats. Bioinformatics. 2015 Jun 15;31(12):2032-4. * Picard: https://broadinstitute.github.io/picard/
+* SpeedSeq: https://github.com/hall-lab/speedseq
+    * Chiang C, Layer RM, Faust GG, Lindberg MR, Rose DB, Garrison EP, Marth GT, Quinlan AR, Hall IM. SpeedSeq: ultra-fast personal genome analysis and interpretation. Nat Methods. 2015 Oct;12(10):966-8.
+* GATK: https://software.broadinstitute.org/gatk/
+    * McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303.
+    * DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8.
+    * Van der Auwera GA, Carneiro MO, Hartl C, Poplin R, Del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella KV, Altshuler D, Gabriel S, DePristo MA. From FastQ data to high confidence variant calls: the Genome Analysis Toolkit best practices pipeline. Curr Protoc Bioinformatics. 2013;43:11.10.1-33. o http://gatkforums.broadinstitute.org/gatk * COSMIC: http://cancer.sanger.ac.uk/cosmic
+* BEDTools: http://bedtools.readthedocs.io/en/latest/
+    * Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. * VCFtools: http://vcftools.sourceforge.net/index.html
+    * Danecek P, Auton A, Abecasis G, Albers CA, Banks E, DePristo MA, Handsaker RE, Lunter G, Marth GT, Sherry ST, McVean G, Durbin R; 1000 Genomes Project Analysis Group. The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8.
+* SnpSift: http://snpeff.sourceforge.net/
+    * Cingolani P, Patel VM, Coon M, Nguyen T, Land SJ, Ruden DM, Lu X. Using Drosophila melanogaster as a Model for Genotoxic Chemical Mutational Studies with a New Program, SnpSift. Front Genet. 2012 Mar 15;3:35.
+* featureCounts: http://subread.sourceforge.net
+    * Liao Y, Smyth GK, Shi W. featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. 2014 Apr 1;30(7):923-30.
+* edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html
+    * Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics. 2010 Jan 1;26(1):139-40.
+    * McCarthy DJ, Chen Y, Smyth GK. Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Res. 2012 May;40(10):4288-97.
--- a/docs/index.md.vc
+++ b/docs/index.md.vc
+# Astrocyte RNASeq Workflow Package
+
+## Workflow SOP
+
+This SOP describes the analysis pipeline of RNA sequencing data. This pipeline includes (1) quality control, (2) variant calling analysis, (3) identification of fusion genes, and (4) statistical analyses of gene expression and isoform expression. The result R data of the statistical analysis can be visualized using a custom R shiny service.
+
+For each file this workflow:
+
+    1) Trim the ends of sequences with remaining adapter or quality scores < 25. Remove any sequence less than 35bp after trimming, then generate a file for capturing information about how many sequences were trimmed.
+    2) Trimmed Fastq files are aligned to the selected reference genome using HiSAT2 (Kim et al. 2015) or STAR (Dobin et al. 2012)
+    3) Marks Duplicates using SAMBAMBA
+    4) Features (genes, transcripts and exons) are counted using featureCounts (Liao et al 2014) and StringTie (Pertea et al. 2015) using the Gencode feature table(Harrow et al. 2012)
+    5) Basic pairwise differential expression analysis is performed using EdgeR (Robinson et al. 2010) and DESeq
+    6) Abundances of transcripts are calculated using ballgown (Frazee et al. 2014)
+    7) Identify gene fusions or fused transcripts using STAR-Fusion.
+    8) Identify coding variants:
+       a) Using GATK, repair the format problem of input BAM file, (2) add read group information, (3) handle N-split mapping of reads caused by RNA splicing, (4) base quality score recalibration (BQSR): detect systematic errors made by the sequencer when it estimates the quality score of each base call and build a model of covariation based on the data and a set of known variants, then adjust the base quality scores in the data based on the model.
+       b) Detect variants using SAMTOOLS with mapping quality >= 50 and base calling quality >= 20. 
+       c) Detect low frequency variants using SAMTOOLS with variant allele fraction > 0.01 and depth > 50 and has been previously identified.
+       d) Detect variants using SpeedSeq
+       e) Detect variants using GATK
+
+## Workflow Parameters
+
+    fastq - Choose one or more RNASeq read files to process.
+    genome - Choose a genomic reference (genome).
+    stranded - If a stranded library is created, please select the appropriate protocol to ensure strand specific analysis
+    markdups - The default is to remove duplicates, you can skip this function.
+    pairs - Indicate if this is paired-end or single-end sequencing data 
+    dea - Perform Differential Expression analysis, please skip if there are < 2 sample groups
+    geneset - Select a set of known genesets for pathway analysis
+    fusion - Perform Gene Fusion Identification
+    variant - Perform variant detection
+    tumor - Perform Cancer specific analysis in variant detection
+    design - This file matches the fastq files to data abot the sample
+
+ The following columns are necessary, must be named as in template and can be in any order:
+
+    SampleID
+        This ID should match the name in the fastq file ie S0001.R1.fastq.gz the sample ID is S0001
+        Note: SampleID shouldn't start with numbers ie 10C should be changed to S10C
+    SampleName
+        This ID can be the identifier of the researcher or clinician
+    SubjectID
+        Used in order to link samples from the same patient
+    SampleGroup
+	This is the group that will be used for pairwise differential expression analysis
+    FqR1
+	Name of the fastq file R1
+    FqR2
+	Name of the fastq file R2
+
+There are some optional columns that might help with the analysis:
+      Tissue
+      Gender
+      CultureDate
+      SequenceRun
+      Organism
+      CellPopulation
+      Treatment
+      GeneticFeature (WT or KO)
+      Race
+      Ethnicity
+      Age
+      
+
+### Test Data
+
+
+### Credits
+This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility (BICF), Department of Bioinformatics
+
+### Workflow SOP
+![workflow](pipeline_rnaseq.001.png)
+
+### References
+
+* FastQC: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ o http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/
+* STAR-Fusion: https://github.com/STAR-Fusion/STAR-Fusion
+* STAR: https://github.com/alexdobin/STAR
+    * Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013 Jan 1;29(1):15-21. doi: 10.1093/bioinformatics/bts635. Epub 2012 Oct 25. PubMed PMID: 23104886; PubMed Central PMCID: PMC3530905.
+* HISAT, StringTie and Ballgown protocol
+    * Pertea M, Kim D, Pertea GM, Leek JT, Salzberg SL. Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. Nat Protoc. 2016 Sep;11(9):1650-67.
+* StringTie: https://ccb.jhu.edu/software/stringtie/
+    * Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT, Salzberg SL. StringTie enables improved reconstruction of a transcriptome from RNA-seq reads. Nat Biotechnol. 2015 Mar;33(3):290-5.
+* Ballgown
+    * http://biorxiv.org/content/biorxiv/early/2014/09/05/003665.full.pdf o http://bioconductor.org/packages/release/bioc/html/ballgown.html o https://github.com/alyssafrazee/ballgown
+* HISAT: https://ccb.jhu.edu/software/hisat2/index.shtml
+    * Kim D, Langmead B, Salzberg SL. HISAT: a fast spliced aligner with low memory requirements. Nat Methods. 2015 Apr;12(4):357-60.
+* SAMtools: http://samtools.sourceforge.net/
+    * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9.
+    * Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93.
+* Sambamba: http://lomereiter.github.io/sambamba/
+    * Tarasov A, Vilella AJ, Cuppen E, Nijman IJ, Prins P. Sambamba: fast processing of NGS alignment formats. Bioinformatics. 2015 Jun 15;31(12):2032-4. * Picard: https://broadinstitute.github.io/picard/
+* SpeedSeq: https://github.com/hall-lab/speedseq
+    * Chiang C, Layer RM, Faust GG, Lindberg MR, Rose DB, Garrison EP, Marth GT, Quinlan AR, Hall IM. SpeedSeq: ultra-fast personal genome analysis and interpretation. Nat Methods. 2015 Oct;12(10):966-8.
+* GATK: https://software.broadinstitute.org/gatk/
+    * McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303.
+    * DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8.
+    * Van der Auwera GA, Carneiro MO, Hartl C, Poplin R, Del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella KV, Altshuler D, Gabriel S, DePristo MA. From FastQ data to high confidence variant calls: the Genome Analysis Toolkit best practices pipeline. Curr Protoc Bioinformatics. 2013;43:11.10.1-33. o http://gatkforums.broadinstitute.org/gatk * COSMIC: http://cancer.sanger.ac.uk/cosmic
+* BEDTools: http://bedtools.readthedocs.io/en/latest/
+    * Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. * VCFtools: http://vcftools.sourceforge.net/index.html
+    * Danecek P, Auton A, Abecasis G, Albers CA, Banks E, DePristo MA, Handsaker RE, Lunter G, Marth GT, Sherry ST, McVean G, Durbin R; 1000 Genomes Project Analysis Group. The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8.
+* SnpSift: http://snpeff.sourceforge.net/
+    * Cingolani P, Patel VM, Coon M, Nguyen T, Land SJ, Ruden DM, Lu X. Using Drosophila melanogaster as a Model for Genotoxic Chemical Mutational Studies with a New Program, SnpSift. Front Genet. 2012 Mar 15;3:35.
+* featureCounts: http://subread.sourceforge.net
+    * Liao Y, Smyth GK, Shi W. featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. 2014 Apr 1;30(7):923-30.
+* edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html
+    * Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics. 2010 Jan 1;26(1):139-40.
+    * McCarthy DJ, Chen Y, Smyth GK. Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Res. 2012 May;40(10):4288-97.
--- a/docs/pipeline_rnaseq.001.png
+++ b/docs/pipeline_rnaseq.001.png
--- a/docs/pipeline_rnaseq.key
+++ b/docs/pipeline_rnaseq.key
--- a/docs/pipeline_rnaseq.pdf
+++ b/docs/pipeline_rnaseq.pdf
--- a/vizapp/functions.R
+++ b/vizapp/functions.R
+col.grp <- function(n,b) {
+  colorvec <- vector(mode="character", length=length(n))
+  plotcols <- rainbow(length(b))
+  for (i in 1:length(n)) {
+    for (j in 1:length(b)) {
+      if ( n[i] == b[j] ) {
+        colorvec[i] = plotcols[j]
+      }
+    }
+  }
+  c(colorvec)
+}
+bg.colors <- function(n) {
+  colorvec <- vector(mode="character", length=nrow(n))
+  r <- rainbow(4)
+  for (i in 1:nrow(n)) {
+    if (n[i,1] >= 2) {
+      if (n[i,2] >= 2) {
+        colorvec[i] = "purple"
+      }
+      else {
+        colorvec[i] = "cyan"
+        
+      }
+    }
+    if (n[i,1] <= -2) {
+      if (n[i,2] >= 2) {
+        colorvec[i] = "red"
+      }
+      else {
+        colorvec[i] = "salmon"
+      }
+    }
+    if (n[i,2] >= -2 && n[i,2] <= 2) {
+      colorvec[i] = "grey"
+    }
+    if (n[i,1] >= -2 && n[i,1] <= 2) {
+      colorvec[i] = "grey"
+    }
+}
+  c(colorvec)
+}
+
--- a/vizapp/load_libs.R
+++ b/vizapp/load_libs.R
+options("repos"="http://cran.rstudio.com/")
+update.packages()
+
+install.packages("sqldf",dep=TRUE)
+install.packages("gmp",dep=TRUE)
+install.packages(c('gplots','lattice','latticeExtra','vegan','labdsv','cluster','ggplot2'))
+install.packages("Vennerable", repos="http://R-Forge.R-project.org",type='source')
+source("http://bioconductor.org/biocLite.R")
+biocLite(c('graph', 'RBGL', 'RColorBrewer', 'reshape', 'gtools',"edgeR", "DESeq2","qusage","ballgown"))
--- a/vizapp/server.R
+++ b/vizapp/server.R
+library(shiny)
+library(Vennerable)
+library(qusage)
+library(DT)
+library(ggplot2)
+library(ballgown)
+library(sqldf)
+library(reshape2)
+library("gplots")
+
+shinyServer(function(input, output, session) {
+
+    data.dir <- Sys.getenv('outputDir')
+    rda.dir <- data.dir
+    symsyn <- read.table(file='symbol2synonym.txt',header=FALSE)
+    names(symsyn) <- c('symbol','synonyms')
+    source('functions.R', local = TRUE)
+    source('tools/intro.R', local = TRUE)
+    source('tools/gc.R', local = TRUE)
+    source('tools/qc.R', local = TRUE)
+    source('tools/altsplice.R', local = TRUE)
+    source('tools/dea.R', local = TRUE)
+    source('tools/gsea.R', local = TRUE)
+    #source('tools/fastqc.R', local = TRUE)
+    source('tools/gc_ui.R', local = TRUE)
+    source('tools/qc_ui.R', local = TRUE)
+    source('tools/dea_ui.R', local = TRUE)
+    source('tools/gsea_ui.R', local = TRUE)
+    #source('tools/fastqc_ui.R', local = TRUE)
+    source('tools/altsplice_ui.R', local = TRUE)
+
+})
+
--- a/vizapp/symbol2synonym.txt
+++ b/vizapp/symbol2synonym.txt
--- a/vizapp/tools/altsplice.R
+++ b/vizapp/tools/altsplice.R
+
+ get.bgdata <- function(var) {
+   rdafile <- paste(data.dir,'bg.rda',sep='/')
+   load(rdafile)
+   genetrx <- indexes(bg)$t2g
+   genetrx$SYMBOL <- geneNames(bg)
+
+   geneid <- as.character(unique(genetrx[genetrx$SYMBOL %in% var$symsearch,]$g_id))
+   if (exists("var$enssearch") && var$enssearch != '') {
+       geneid <- as.character(unique(genetrx$g_id[grep(enssearch,genetrx$g_id)]))
+     }
+   genebg <- subset(bg,paste0("gene_id=='",geneid,"'"))
+   rownames(genetrx) <- transcriptNames(bg)
+   fpkm <- texpr(bg,meas='FPKM')
+   keep <- genetrx[genetrx$g_id == geneid,]$t_id
+   cttbl <- fpkm[keep,]
+   grps <- pData(bg)$group
+   trxnames <-  genetrx[genetrx$g_id == geneid,]
+   dtm <- melt(t(cttbl))
+   names(dtm) <- c('sample','transcript','value')
+   dtm$grps <- rep(grps,length(keep))
+   if (length(keep) < 2) {
+     dtm$transcript <- rep(keep,length(keep))
+   }
+   dtm$grptrx <- paste(dtm$grps,dtm$transcript,sep='.')
+   test <- stattest(gown=genebg, pData=pData(bg), feature='transcript',covariate='group', libadjust=FALSE,getFC=TRUE)
+   if (length(keep) > 1) {
+     agg = collapseTranscripts(gene=geneid, gown=bg, k=var$kct, method='kmeans')
+     test <- stattest(gowntable=agg$tab, pData=pData(bg), feature='transcript_cluster',covariate='group', libadjust=FALSE,getFC=TRUE)
+   }
+   return(list(stattbl=test,gid=geneid,obj=genebg,cttbl=dtm,tname=trxnames))
+ }
+
+find_sym <- function (sym) {
+if (!(toupper(sym) %in% toupper(symsyn$symbol))) {
+     syns <- symsyn[grep(input$symsearch,symsyn$synonym,ignore.case=TRUE),]$symbol
+     synlist <- paste(as.character(syns),collapse=',')
+   if (length(syns) > 1) {
+     paste("Please Use Official Gene Symbols",synlist,sep=':')
+   }else {"Please Use Official Gene Symbols"}  	 
+ }else {NULL}  	 
+}	 
+
+getgeneid <- eventReactive(input$altButton,{
+ if (input$symsearch == '' & input$enssearch != '') {
+    validate (find_sym(input$symsearch))
+    }   
+get.bgdata(input)
+})
+
+output$plot.cluster <- renderPlot({
+  par(oma=c(4,4,1,1)) 
+  gid <- getgeneid()$gid
+  bg <- getgeneid()$obj
+  tname <- getgeneid()$tname
+  if (nrow(tname) > 1) {
+    plotLatentTranscripts(gene=gid, gown=bg, k=input$kct, method='kmeans', returncluster=FALSE)
+  }
+})
+
+output$gene.stat <- DT::renderDataTable({
+  getgeneid()$stattbl
+},escape=FALSE)
+
+output$trx.name <- DT::renderDataTable({
+  getgeneid()$tname
+},escape=FALSE)
+
+
+output$plot.means <- renderPlot({
+  par(oma=c(4,4,1,1))
+  par(cex.main=0.75)
+  gid <- getgeneid()$gid
+  bg <- getgeneid()$obj
+  tname <- getgeneid()$tname
+  if (nrow(tname) > 1) {
+    plotMeans(gid, bg, groupvar='group', meas='cov', colorby='transcript')
+  }
+}, height = 900, width = 900)
+
+output$trx.gene <- renderPlot({
+  countTable <- getgeneid()$cttbl
+  par(oma=c(4,4,1,1))
+  p <- ggplot(countTable,aes(x=grptrx,y=log2(value+1))) + geom_boxplot(aes(fill = factor(grptrx))) + geom_jitter(height = 0) + theme(legend.position="left",axis.text.x=element_text(angle=45,hjust=1, vjust=1),legend.key.height=unit(0.5,"line"),legend.text=element_text(size=8),legend.title=element_blank()) + ylab("Relative Abundance (FPKM)") + xlab("")
+  print(p)
+})
--- a/vizapp/tools/altsplice_ui.R
+++ b/vizapp/tools/altsplice_ui.R
+output$ui_altsplice <- renderUI({
+  fluidPage(
+       includeCSS("www/style.css"),
+         sidebarLayout(
+            sidebarPanel(
+                uiOutput("dir.splice"),
+                       textInput("symsearch", "Search By Gene Symbol", 'IL1B'),
+                       textInput("enssearch", "Search By ENS ID",''),
+                       numericInput("kct",label = "Number of Clusters",value = 2),   
+                       actionButton("altButton", "GO")
+                ),
+            mainPanel(
+                tabsetPanel(
+                       tabPanel("Gene Compare",
+                             dataTableOutput('gene.stat'),
+                             dataTableOutput('trx.name'),
+                             plotOutput("plot.cluster"),
+                             plotOutput("trx.gene"),
+                             plotOutput("plot.means")
+                             )
+                    )
+                )
+            )
+        )
+    
+})
+
--- a/vizapp/tools/dea.R
+++ b/vizapp/tools/dea.R
+# UI-elements for DEA
+
+ctfile <-  paste(data.dir,'countTable.logCPM.txt',sep='/')
+samfile <-  paste(data.dir,'design.shiny.txt',sep='/')
+cts <- read.table(file=ctfile,header=TRUE,sep='\t')
+samtbl <- read.table(samfile,header=TRUE,sep="\t")
+samples <- colnames(cts[,4:length(cts)])
+mergetbl <- merge(as.data.frame(samples),samtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE)
+grps <- mergetbl$SampleGroup
+grpnames <- levels(factor(grps))
+col.blocks <-col.grp(grps,grpnames)
+MSIG.geneSets <- read.gmt(paste(data.dir,'geneset.shiny.gmt',sep='/'))
+
+output$pick.dea <- renderUI({
+    flist <- list.files(data.dir,pattern="*edgeR.txt$")
+    selectInput("file", "Choose Pair", choices=flist)
+})
+output$pick.pathway <- renderUI({
+    pathways <- names(MSIG.geneSets)
+    pathchoices = setNames(1:length(pathways),pathways)
+    selectInput("deapathname", "Choose Pair", choices=pathchoices)
+})
+
+get.data <- function(var) {
+    f <- paste(data.dir,var$file,sep='/')
+    comp <- read.table(f,header=TRUE,sep='\t')
+    comp.filt <- na.omit(comp[abs(comp$logFC) >= var$fc.thresh & comp$rawP <= var$pval.thresh,])
+    if (var$adjust == 'FDR') {
+        comp.filt <- na.omit(comp[abs(comp$logFC) >= var$fc.thresh & comp$fdr <= var$pval.thresh,])
+    }
+    if (var$adjust == 'BONF') {
+        comp.filt <- na.omit(comp[abs(comp$logFC) >= var$fc.thresh & comp$bonf <= var$pval.thresh,])
+    }
+    genelist <- as.character(head(comp.filt[order(comp.filt$fdr),]$symbol,n=var$numgenes))
+    if (var$heatmap == 'hgeneset') {
+        genelist <- unlist(MSIG.geneSets[as.numeric(var$deapathname)])
+    }
+   if (var$heatmap == 'cgeneset') {
+         genelist <- unlist(strsplit(as.character(var$genes), "[;\n]+"))
+   }
+   return(list(filt=comp.filt,glist=genelist))
+}
+
+tbls <-  eventReactive(input$deButton,{get.data(input)})
+
+output$selectgenes <- renderUI({
+   symnames <- tbls()$glist
+   textAreaInput("genes", "Gene Symbols separated by ';'",value=paste(symnames,collapse=";"), width = '95%',rows=10)
+})
+
+output$dge.c <- DT::renderDataTable({
+    t1 <- tbls()$filt
+    t1$symbol <- paste("<a href=http://www.genecards.org/cgi-bin/carddisp.pl?gene=",t1$symbol,'>',t1$symbol,"</a>",sep='')
+    t1$ensembl <- paste("<a href=http://www.ensembl.org/Homo_sapiens/Gene/Summary?g=",t1$ensembl,'>',t1$ensembl,"</a>",sep='')
+    t1
+},escape=FALSE,filter = 'top',options = list(lengthMenu = c(10, 25, 50, 200, -1)))
+
+output$downloadC <- downloadHandler(
+                                     file <- paste(input$file,".filt.txt",sep=""),
+                                     content = function(file) {
+                                       write.table(tbls()$filt,file,quote=FALSE,row.names=FALSE,sep='\t')
+                                     })
+
+plotHeatmap <- reactive({
+	syms <- tbls()$glist
+	ct2 <- cts[cts$SYMBOL %in% syms,]
+	subset <- ct2[,4:length(ct2)]
+	row.names(subset) <- ct2$SYMBOL
+	STREE <- hclust(dist(t(subset)))
+      	zscores <- scale(t(subset))
+      	ngenes <- length(colnames(zscores))
+      	textscale <- (1/(ngenes/30))
+      	if (textscale > 1) {
+           textscale <-1
+      	}
+      	if (textscale < 0.1) {
+           textscale <- 0.1
+      	}
+	heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,srtRow=45,srtCol=45,trace="none",margins=c(8,16))
+	legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20)
+})
+output$plot.heatmap <- renderPlot({
+	plotHeatmap()
+})
+output$downloadpdf = downloadHandler(
+      filename = "output.heatmap.pdf",
+      content = function(file) {
+      pdf(file = file,paper="letter")
+      plotHeatmap()
+      dev.off()
+      })
+
+output$hm.comp <- renderImage({
+  f1 <- paste(data.dir,input$file,sep='/')
+  png <- gsub('edgeR.txt','heatmap.edgeR.png',f1)
+  list(src=png, alt=paste("HeatMap Comparison"))
+},deleteFile=FALSE)
+
+output$hmcomp.desc <- renderText({ 
+      paste("Heatmap of all genes with an FDR < 0.05 using EdgeR Results",sep='')
+    })
--- a/vizapp/tools/dea_ui.R
+++ b/vizapp/tools/dea_ui.R
+#######################################
+# Shiny interface for data functions
+#######################################
+# data ui and tabs
+output$ui_dea <- renderUI({
+    list(
+        includeCSS("www/style.css"),
+        sidebarLayout(
+            sidebarPanel(
+                numericInput("fc.thresh",
+                             "LogFold Change Threshold:", 1),
+                numericInput("gene.thresh",
+                             "Gene Mean Threshold:", 5),
+                numericInput("pval.thresh",
+                             "P-Value Threshold:", 0.05),
+                selectInput("adjust", "Choose P-Value Correction", choices=c("raw","FDR",'BONF'),selected='FDR'),
+                uiOutput("pick.dea"),
+	     	selectInput(
+			"heatmap", "HeatMap",
+      	     		c(Top = "top",
+               		HallmarkGeneSet = "hgeneset",
+	       		CustomGeneSet= "cgeneset")
+	       	),
+	     	conditionalPanel(
+      	     	condition = "input.heatmap == 'cgeneset'",
+	     		  uiOutput("selectgenes")
+		),
+	     	conditionalPanel(
+      	     	condition = "input.heatmap == 'top'",
+	      	numericInput("numgenes","Number Top Genes:", 50)
+		),
+	     	conditionalPanel(
+      	     	condition = "input.heatmap == 'hgeneset'",
+	     	uiOutput("pick.pathway")
+		),
+	    	actionButton("deButton", "Go")
+	    ),
+            mainPanel(
+                tabsetPanel(
+                    tabPanel("Differential Gene Set Comparison",
+                             downloadButton('downloadC', 'Download CSV'),
+                             dataTableOutput('dge.c')
+                             ),
+                    tabPanel("Heatmap",
+                             #downloadButton('downloadpdf', 'Download Heatmap'),
+			     h1("HeatMap Comparison"),
+			     h3("Top 50 User Defined Genes"),
+			     plotOutput("plot.heatmap"),
+			     h3("All Differentially Expressed Genes"),
+                             imageOutput("hm.comp",width="1200px",height="1200px"),
+			     textOutput("hmcomp.desc")
+                             )
+                    )
+                )
+            )
+        )
+})
--- a/vizapp/tools/fastqc.R
+++ b/vizapp/tools/fastqc.R
+output$dir.fqc <- renderUI({
+    datadir <- dir(data.dir)
+    selectInput("dset.fqc", "Choose Dataset", choices=datadir,selected='coarse_cellpops')
+})
+output$pick.fqc <- renderUI({
+    flist <- list.files(paste(data.dir,input$dset.fqc,"fastqc",sep='/'),pattern="*html$")
+    flist <- gsub(".sort_fastqc.html",'',flist)
+    selectInput("samp", "Choose Sample", choices=flist)
+})
+
+getpage <- function(var) {
+	return(includeHTML(paste(data.dir,"/",var$dset.fqc,"/fastqc/",var$samp,".sort_fastqc.html",sep='')))
+}
+output$inc <- renderUI({getpage(input)})
\ No newline at end of file