Skip to content
Snippets Groups Projects
Commit 8aac26de authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

add all files

parents
No related merge requests found
Pipeline #1672 failed
Showing with 842 additions and 0 deletions
# Gitlab CI Script for astrocyte/rnaseq
# Brandi L. Cantarel - 2017
variables:
CLEAN_REPO_PATH: "/project/BICF/BICF_Core/s166458/astrocyte_test/${CI_RUNNER_ID}"
TEST_BRANCH: "master"
before_script:
- module load nextflow/0.27.6
- mkdir -p "${CLEAN_REPO_PATH}"
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/rnaseq test_data
stages:
- integration
test_human
stage: integration
script:
- nextflow run workflow/main.nf --input test_data/ --design ${CLEAN_REPO_PATH}/test_data/design.rnaseq.txt --output ${CLEAN_REPO_PATH}/human_output
artifacts:
paths:
- ${CLEAN_REPO_PATH}
expire_in: 2 days
test_mouse
stage: integration
script:
- nextflow run workflow/main.nf --input test_data/ --design ${CLEAN_REPO_PATH}/test_data/mouse_se.design.txt --pairs se --fusion skip --genome /project/shared/bicf_workflow_ref/GRCm38 --markdups null --output ${CLEAN_REPO_PATH}/mouse_output
artifacts:
paths:
- ${CLEAN_REPO_PATH}
expire_in: 2 days
[submodule "workflow/process_scripts"]
path = workflow/process_scripts
url = git@git.biohpc.swmed.edu:BICF/Astrocyte/process_scripts.git
Copyright © 2016. The University of Texas Southwestern Medical Center
5323 Harry Hines Boulevard Dallas, Texas, 75390 Telephone 214-648-3111
\ No newline at end of file
## RNASeq Analysis Worklow
```
module load nextflow/0.24.1-SNAPSHOT
nextflow run workflow/main.nf
```
#
# metadata for the example astrocyte ChipSeq workflow package
#
# -----------------------------------------------------------------------------
# BASIC INFORMATION
# -----------------------------------------------------------------------------
# A unique identifier for the workflow package, text/underscores only
name: 'rnaseq_bicf'
# Who wrote this?
author: 'Brandi Cantarel'
# A contact email address for questions
email: 'biohpc-help@utsouthwestern.edu'
# A more informative title for the workflow package
title: 'BICF RNASeq Analysis Workflow'
# A summary of the workflow package in plain text
description: |
This is a workflow package for the BioHPC/BICF RNASeq workflow system.
It implements differential expression analysis, gene set enrichment analysis,
gene fusion analysis and variant identification using RNASeq data.
# -----------------------------------------------------------------------------
# DOCUMENTATION
# -----------------------------------------------------------------------------
# A list of documentation file in .md format that should be viewable from the
# web interface. These files are in the 'docs' subdirectory. The first file
# listed will be used as a documentation index and is index.md by convention
documentation_files:
- 'index.md'
# -----------------------------------------------------------------------------
# NEXTFLOW WORKFLOW CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The workflow file is always named 'workflow/main.f'
# The workflow must publish all final output into $baseDir
# A list of clueter environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducability.
workflow_modules:
- 'trimgalore/0.4.1'
- 'cutadapt/1.9.1'
- 'hisat2/2.0.1-beta-intel'
- 'samtools/intel/1.3'
- 'picard/1.127'
- 'subread/1.5.0-intel'
- 'stringtie/1.1.2-intel'
- 'speedseq/20160506'
- 'python/2.7.x-anaconda'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
#
# REQUIRED INFORMATION
# id: The name of the parameter in the NEXTFLOW workflow
# type: The type of the parameter, one of:
# string - A free-format string
# integer - An integer
# real - A real number
# file - A single file from user data
# files - One or more files from user data
# select - A selection from a list of values
# required: true/false, must the parameter be entered/chosen?
# description: A user friendly description of the meaning of the parameter
#
# OPTIONAL INFORMATION
# default: A default value for the parameter (optional)
# min: Minium value/characters/files for number/string/files types
# max: Maxumum value/characters/files for number/string/files types
# regex: A regular expression that describes valid entries / filenames
#
# SELECT TYPE
# choices: A set of choices presented to the user for the parameter.
# Each choice is a pair of value and description, e.g.
#
# choices:
# - [ 'myval', 'The first option']
# - [ 'myval', 'The second option']
#
# NOTE - All parameters are passed to NEXTFLOW as strings... but they
# are validated by astrocyte using the information provided above
workflow_parameters:
- id: fastqs
type: files
required: true
description: |
One or more input paired-end FASTQ files from a RNASeq experiment and a design file with the link between the same name and the sample group
regex: ".*(fastq|fq)*"
min: 1
- id: stranded
type: select
required: true
choices:
- [ '0', 'Unstranded']
- [ '1', 'Stranded']
- [ '2', 'Reverse Stranded']
description: |
In the case that the sequence libraries where generated using a stranded specific protocol.
- id: pairs
type: select
required: true
choices:
- [ 'pe', 'Paired End']
- [ 'se', 'Single End']
description: |
In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.
- id: align
type: select
required: true
choices:
- [ 'hisat', 'HiSAT2']
- [ 'star', 'STAR']
description: |
Alignment tool
- id: fusion
type: select
required: true
choices:
- [ 'skip', 'No Fusion Detection']
- [ 'detect', 'Fusion Detection']
description: |
Run Star Fusion
- id: markdups
type: select
required: true
choices:
- [ 'picard', 'Remove Duplicates']
- [ 'null', 'Keep All Sequences']
description: |
Duplicate reads are defined as originating from the same original fragment of DNA. Duplicates are identified as read pairs having identical 5-prime positions (coordinate and strand) for both reads in a mate pair and optionally, matching unique molecular identifier reads.
- id: dea
type: select
required: true
choices:
- [ 'detect', 'Run Statistical Analysis']
- [ 'skip', 'Skip Statistical Analysis']
description: |
Runs deSEq2 and EdgeR
- id: design
type: file
required: true
regex: ".*txt"
description: |
A design file listing pairs of sample name and sample group.
Columns must include: SampleID,SampleName,SampleGroup,FullPathToFqR1,FullPathToFqR2
- id: genome
type: select
choices:
- [ '/project/shared/bicf_workflow_ref/GRCh38', 'Human GRCh38']
- [ '/project/shared/bicf_workflow_ref/GRCh37', 'Human GRCh37']
- [ '/project/shared/bicf_workflow_ref/GRCm38', 'Mouse GRCm38']
required: true
description: |
Reference genome for alignment
- id: geneset
type: select
choices:
- ['h.all.v5.1.symbols.gmt','Hallmark Gene Sets']
- ['c2.all.v5.1.symbols.gmt','Curated Gene Sets']
- ['c3.all.v5.1.symbols.gmt','Motif Gene Sets']
- ['c5.all.v5.1.entrez.gmt','Gene Ontology Gene Sets']
- ['c6.all.v5.1.symbols.gmt','Oncogenic Signatures']
- ['c7.all.v5.1.entrez.gmt','Immunological Signatures']
required: true
description: |
Gene Set Definitions used for QuSAGE Analysis -- see http://software.broadinstitute.org/gsea/msigdb/ for geneset descriptions
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The vizapp is always 'vizapp/server.R' 'vizapp/ui.R'
# The workflow must publish all final output into $baseDir
# Name of the R module that the vizapp will run against
vizapp_r_module: 'R/3.2.1-intel'
# List of any CRAN packages, not provided by the modules, that must be made
# available to the vizapp
vizapp_cran_packages:
- sqldf
- shiny
- Vennerable
- DT
- ggplot2
- gplots
- gtools
- RColorBrewer
# # List of any Bioconductor packages, not provided by the modules, that must be made
# available to the vizapp
vizapp_bioc_packages:
- qusage
- ballgown
- edgeR
- DESeq2
vizapp_github_packages:
- js229/Vennerable
\ No newline at end of file
# Astrocyte RNASeq Workflow Package
## Workflow SOP
This SOP describes the analysis pipeline of RNA sequencing data. This pipeline includes (1) quality control, (2) variant calling analysis, (3) identification of fusion genes, and (4) statistical analyses of gene expression and isoform expression. The result R data of the statistical analysis can be visualized using a custom R shiny service.
For each file this workflow:
1) Trim the ends of sequences with remaining adapter or quality scores < 25. Remove any sequence less than 35bp after trimming, then generate a file for capturing information about how many sequences were trimmed.
2) Trimmed Fastq files are aligned to the selected reference genome using HiSAT2 (Kim et al. 2015) or STAR (Dobin et al. 2012)
3) Marks Duplicates using SAMBAMBA
4) Features (genes, transcripts and exons) are counted using featureCounts (Liao et al 2014) and StringTie (Pertea et al. 2015) using the Gencode feature table(Harrow et al. 2012)
5) Basic pairwise differential expression analysis is performed using EdgeR (Robinson et al. 2010) and DESeq
6) Abundances of transcripts are calculated using ballgown (Frazee et al. 2014)
7) Identify gene fusions or fused transcripts using STAR-Fusion.
## Workflow Parameters
fastq - Choose one or more RNASeq read files to process.
genome - Choose a genomic reference (genome).
stranded - If a stranded library is created, please select the appropriate protocol to ensure strand specific analysis
markdups - The default is to remove duplicates, you can skip this function.
pairs - Indicate if this is paired-end or single-end sequencing data
dea - Perform Differential Expression analysis, please skip if there are < 2 sample groups
geneset - Select a set of known genesets for pathway analysis
fusion - Perform Gene Fusion Identification
design - This file matches the fastq files to data abot the sample
The following columns are necessary, must be named as in template and can be in any order:
SampleID
This ID should match the name in the fastq file ie S0001.R1.fastq.gz the sample ID is S0001
Note: SampleID shouldn't start with numbers ie 10C should be changed to S10C
SampleName
This ID can be the identifier of the researcher or clinician
SubjectID
Used in order to link samples from the same patient
SampleGroup
This is the group that will be used for pairwise differential expression analysis
FqR1
Name of the fastq file R1
FqR2
Name of the fastq file R2
There are some optional columns that might help with the analysis:
Tissue
Gender
CultureDate
SequenceRun
Organism
CellPopulation
Treatment
GeneticFeature (WT or KO)
Race
Ethnicity
Age
### Test Data
### Credits
This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility (BICF), Department of Bioinformatics
### Workflow SOP
![workflow](pipeline_rnaseq.001.png)
### References
* FastQC: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ o http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/
* STAR-Fusion: https://github.com/STAR-Fusion/STAR-Fusion
* STAR: https://github.com/alexdobin/STAR
* Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013 Jan 1;29(1):15-21. doi: 10.1093/bioinformatics/bts635. Epub 2012 Oct 25. PubMed PMID: 23104886; PubMed Central PMCID: PMC3530905.
* HISAT, StringTie and Ballgown protocol
* Pertea M, Kim D, Pertea GM, Leek JT, Salzberg SL. Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. Nat Protoc. 2016 Sep;11(9):1650-67.
* StringTie: https://ccb.jhu.edu/software/stringtie/
* Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT, Salzberg SL. StringTie enables improved reconstruction of a transcriptome from RNA-seq reads. Nat Biotechnol. 2015 Mar;33(3):290-5.
* Ballgown
* http://biorxiv.org/content/biorxiv/early/2014/09/05/003665.full.pdf o http://bioconductor.org/packages/release/bioc/html/ballgown.html o https://github.com/alyssafrazee/ballgown
* HISAT: https://ccb.jhu.edu/software/hisat2/index.shtml
* Kim D, Langmead B, Salzberg SL. HISAT: a fast spliced aligner with low memory requirements. Nat Methods. 2015 Apr;12(4):357-60.
* SAMtools: http://samtools.sourceforge.net/
* Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9.
* Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93.
* Sambamba: http://lomereiter.github.io/sambamba/
* Tarasov A, Vilella AJ, Cuppen E, Nijman IJ, Prins P. Sambamba: fast processing of NGS alignment formats. Bioinformatics. 2015 Jun 15;31(12):2032-4. * Picard: https://broadinstitute.github.io/picard/
* SpeedSeq: https://github.com/hall-lab/speedseq
* Chiang C, Layer RM, Faust GG, Lindberg MR, Rose DB, Garrison EP, Marth GT, Quinlan AR, Hall IM. SpeedSeq: ultra-fast personal genome analysis and interpretation. Nat Methods. 2015 Oct;12(10):966-8.
* GATK: https://software.broadinstitute.org/gatk/
* McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303.
* DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8.
* Van der Auwera GA, Carneiro MO, Hartl C, Poplin R, Del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella KV, Altshuler D, Gabriel S, DePristo MA. From FastQ data to high confidence variant calls: the Genome Analysis Toolkit best practices pipeline. Curr Protoc Bioinformatics. 2013;43:11.10.1-33. o http://gatkforums.broadinstitute.org/gatk * COSMIC: http://cancer.sanger.ac.uk/cosmic
* BEDTools: http://bedtools.readthedocs.io/en/latest/
* Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. * VCFtools: http://vcftools.sourceforge.net/index.html
* Danecek P, Auton A, Abecasis G, Albers CA, Banks E, DePristo MA, Handsaker RE, Lunter G, Marth GT, Sherry ST, McVean G, Durbin R; 1000 Genomes Project Analysis Group. The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8.
* SnpSift: http://snpeff.sourceforge.net/
* Cingolani P, Patel VM, Coon M, Nguyen T, Land SJ, Ruden DM, Lu X. Using Drosophila melanogaster as a Model for Genotoxic Chemical Mutational Studies with a New Program, SnpSift. Front Genet. 2012 Mar 15;3:35.
* featureCounts: http://subread.sourceforge.net
* Liao Y, Smyth GK, Shi W. featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. 2014 Apr 1;30(7):923-30.
* edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html
* Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics. 2010 Jan 1;26(1):139-40.
* McCarthy DJ, Chen Y, Smyth GK. Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Res. 2012 May;40(10):4288-97.
# Astrocyte RNASeq Workflow Package
## Workflow SOP
This SOP describes the analysis pipeline of RNA sequencing data. This pipeline includes (1) quality control, (2) variant calling analysis, (3) identification of fusion genes, and (4) statistical analyses of gene expression and isoform expression. The result R data of the statistical analysis can be visualized using a custom R shiny service.
For each file this workflow:
1) Trim the ends of sequences with remaining adapter or quality scores < 25. Remove any sequence less than 35bp after trimming, then generate a file for capturing information about how many sequences were trimmed.
2) Trimmed Fastq files are aligned to the selected reference genome using HiSAT2 (Kim et al. 2015) or STAR (Dobin et al. 2012)
3) Marks Duplicates using SAMBAMBA
4) Features (genes, transcripts and exons) are counted using featureCounts (Liao et al 2014) and StringTie (Pertea et al. 2015) using the Gencode feature table(Harrow et al. 2012)
5) Basic pairwise differential expression analysis is performed using EdgeR (Robinson et al. 2010) and DESeq
6) Abundances of transcripts are calculated using ballgown (Frazee et al. 2014)
7) Identify gene fusions or fused transcripts using STAR-Fusion.
8) Identify coding variants:
a) Using GATK, repair the format problem of input BAM file, (2) add read group information, (3) handle N-split mapping of reads caused by RNA splicing, (4) base quality score recalibration (BQSR): detect systematic errors made by the sequencer when it estimates the quality score of each base call and build a model of covariation based on the data and a set of known variants, then adjust the base quality scores in the data based on the model.
b) Detect variants using SAMTOOLS with mapping quality >= 50 and base calling quality >= 20.
c) Detect low frequency variants using SAMTOOLS with variant allele fraction > 0.01 and depth > 50 and has been previously identified.
d) Detect variants using SpeedSeq
e) Detect variants using GATK
## Workflow Parameters
fastq - Choose one or more RNASeq read files to process.
genome - Choose a genomic reference (genome).
stranded - If a stranded library is created, please select the appropriate protocol to ensure strand specific analysis
markdups - The default is to remove duplicates, you can skip this function.
pairs - Indicate if this is paired-end or single-end sequencing data
dea - Perform Differential Expression analysis, please skip if there are < 2 sample groups
geneset - Select a set of known genesets for pathway analysis
fusion - Perform Gene Fusion Identification
variant - Perform variant detection
tumor - Perform Cancer specific analysis in variant detection
design - This file matches the fastq files to data abot the sample
The following columns are necessary, must be named as in template and can be in any order:
SampleID
This ID should match the name in the fastq file ie S0001.R1.fastq.gz the sample ID is S0001
Note: SampleID shouldn't start with numbers ie 10C should be changed to S10C
SampleName
This ID can be the identifier of the researcher or clinician
SubjectID
Used in order to link samples from the same patient
SampleGroup
This is the group that will be used for pairwise differential expression analysis
FqR1
Name of the fastq file R1
FqR2
Name of the fastq file R2
There are some optional columns that might help with the analysis:
Tissue
Gender
CultureDate
SequenceRun
Organism
CellPopulation
Treatment
GeneticFeature (WT or KO)
Race
Ethnicity
Age
### Test Data
### Credits
This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility (BICF), Department of Bioinformatics
### Workflow SOP
![workflow](pipeline_rnaseq.001.png)
### References
* FastQC: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ o http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/
* STAR-Fusion: https://github.com/STAR-Fusion/STAR-Fusion
* STAR: https://github.com/alexdobin/STAR
* Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013 Jan 1;29(1):15-21. doi: 10.1093/bioinformatics/bts635. Epub 2012 Oct 25. PubMed PMID: 23104886; PubMed Central PMCID: PMC3530905.
* HISAT, StringTie and Ballgown protocol
* Pertea M, Kim D, Pertea GM, Leek JT, Salzberg SL. Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. Nat Protoc. 2016 Sep;11(9):1650-67.
* StringTie: https://ccb.jhu.edu/software/stringtie/
* Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT, Salzberg SL. StringTie enables improved reconstruction of a transcriptome from RNA-seq reads. Nat Biotechnol. 2015 Mar;33(3):290-5.
* Ballgown
* http://biorxiv.org/content/biorxiv/early/2014/09/05/003665.full.pdf o http://bioconductor.org/packages/release/bioc/html/ballgown.html o https://github.com/alyssafrazee/ballgown
* HISAT: https://ccb.jhu.edu/software/hisat2/index.shtml
* Kim D, Langmead B, Salzberg SL. HISAT: a fast spliced aligner with low memory requirements. Nat Methods. 2015 Apr;12(4):357-60.
* SAMtools: http://samtools.sourceforge.net/
* Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9.
* Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93.
* Sambamba: http://lomereiter.github.io/sambamba/
* Tarasov A, Vilella AJ, Cuppen E, Nijman IJ, Prins P. Sambamba: fast processing of NGS alignment formats. Bioinformatics. 2015 Jun 15;31(12):2032-4. * Picard: https://broadinstitute.github.io/picard/
* SpeedSeq: https://github.com/hall-lab/speedseq
* Chiang C, Layer RM, Faust GG, Lindberg MR, Rose DB, Garrison EP, Marth GT, Quinlan AR, Hall IM. SpeedSeq: ultra-fast personal genome analysis and interpretation. Nat Methods. 2015 Oct;12(10):966-8.
* GATK: https://software.broadinstitute.org/gatk/
* McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303.
* DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8.
* Van der Auwera GA, Carneiro MO, Hartl C, Poplin R, Del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella KV, Altshuler D, Gabriel S, DePristo MA. From FastQ data to high confidence variant calls: the Genome Analysis Toolkit best practices pipeline. Curr Protoc Bioinformatics. 2013;43:11.10.1-33. o http://gatkforums.broadinstitute.org/gatk * COSMIC: http://cancer.sanger.ac.uk/cosmic
* BEDTools: http://bedtools.readthedocs.io/en/latest/
* Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. * VCFtools: http://vcftools.sourceforge.net/index.html
* Danecek P, Auton A, Abecasis G, Albers CA, Banks E, DePristo MA, Handsaker RE, Lunter G, Marth GT, Sherry ST, McVean G, Durbin R; 1000 Genomes Project Analysis Group. The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8.
* SnpSift: http://snpeff.sourceforge.net/
* Cingolani P, Patel VM, Coon M, Nguyen T, Land SJ, Ruden DM, Lu X. Using Drosophila melanogaster as a Model for Genotoxic Chemical Mutational Studies with a New Program, SnpSift. Front Genet. 2012 Mar 15;3:35.
* featureCounts: http://subread.sourceforge.net
* Liao Y, Smyth GK, Shi W. featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. 2014 Apr 1;30(7):923-30.
* edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html
* Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics. 2010 Jan 1;26(1):139-40.
* McCarthy DJ, Chen Y, Smyth GK. Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Res. 2012 May;40(10):4288-97.
docs/pipeline_rnaseq.001.png

99.6 KiB

File added
File added
col.grp <- function(n,b) {
colorvec <- vector(mode="character", length=length(n))
plotcols <- rainbow(length(b))
for (i in 1:length(n)) {
for (j in 1:length(b)) {
if ( n[i] == b[j] ) {
colorvec[i] = plotcols[j]
}
}
}
c(colorvec)
}
bg.colors <- function(n) {
colorvec <- vector(mode="character", length=nrow(n))
r <- rainbow(4)
for (i in 1:nrow(n)) {
if (n[i,1] >= 2) {
if (n[i,2] >= 2) {
colorvec[i] = "purple"
}
else {
colorvec[i] = "cyan"
}
}
if (n[i,1] <= -2) {
if (n[i,2] >= 2) {
colorvec[i] = "red"
}
else {
colorvec[i] = "salmon"
}
}
if (n[i,2] >= -2 && n[i,2] <= 2) {
colorvec[i] = "grey"
}
if (n[i,1] >= -2 && n[i,1] <= 2) {
colorvec[i] = "grey"
}
}
c(colorvec)
}
options("repos"="http://cran.rstudio.com/")
update.packages()
install.packages("sqldf",dep=TRUE)
install.packages("gmp",dep=TRUE)
install.packages(c('gplots','lattice','latticeExtra','vegan','labdsv','cluster','ggplot2'))
install.packages("Vennerable", repos="http://R-Forge.R-project.org",type='source')
source("http://bioconductor.org/biocLite.R")
biocLite(c('graph', 'RBGL', 'RColorBrewer', 'reshape', 'gtools',"edgeR", "DESeq2","qusage","ballgown"))
library(shiny)
library(Vennerable)
library(qusage)
library(DT)
library(ggplot2)
library(ballgown)
library(sqldf)
library(reshape2)
library("gplots")
shinyServer(function(input, output, session) {
data.dir <- Sys.getenv('outputDir')
rda.dir <- data.dir
symsyn <- read.table(file='symbol2synonym.txt',header=FALSE)
names(symsyn) <- c('symbol','synonyms')
source('functions.R', local = TRUE)
source('tools/intro.R', local = TRUE)
source('tools/gc.R', local = TRUE)
source('tools/qc.R', local = TRUE)
source('tools/altsplice.R', local = TRUE)
source('tools/dea.R', local = TRUE)
source('tools/gsea.R', local = TRUE)
#source('tools/fastqc.R', local = TRUE)
source('tools/gc_ui.R', local = TRUE)
source('tools/qc_ui.R', local = TRUE)
source('tools/dea_ui.R', local = TRUE)
source('tools/gsea_ui.R', local = TRUE)
#source('tools/fastqc_ui.R', local = TRUE)
source('tools/altsplice_ui.R', local = TRUE)
})
This diff is collapsed.
get.bgdata <- function(var) {
rdafile <- paste(data.dir,'bg.rda',sep='/')
load(rdafile)
genetrx <- indexes(bg)$t2g
genetrx$SYMBOL <- geneNames(bg)
geneid <- as.character(unique(genetrx[genetrx$SYMBOL %in% var$symsearch,]$g_id))
if (exists("var$enssearch") && var$enssearch != '') {
geneid <- as.character(unique(genetrx$g_id[grep(enssearch,genetrx$g_id)]))
}
genebg <- subset(bg,paste0("gene_id=='",geneid,"'"))
rownames(genetrx) <- transcriptNames(bg)
fpkm <- texpr(bg,meas='FPKM')
keep <- genetrx[genetrx$g_id == geneid,]$t_id
cttbl <- fpkm[keep,]
grps <- pData(bg)$group
trxnames <- genetrx[genetrx$g_id == geneid,]
dtm <- melt(t(cttbl))
names(dtm) <- c('sample','transcript','value')
dtm$grps <- rep(grps,length(keep))
if (length(keep) < 2) {
dtm$transcript <- rep(keep,length(keep))
}
dtm$grptrx <- paste(dtm$grps,dtm$transcript,sep='.')
test <- stattest(gown=genebg, pData=pData(bg), feature='transcript',covariate='group', libadjust=FALSE,getFC=TRUE)
if (length(keep) > 1) {
agg = collapseTranscripts(gene=geneid, gown=bg, k=var$kct, method='kmeans')
test <- stattest(gowntable=agg$tab, pData=pData(bg), feature='transcript_cluster',covariate='group', libadjust=FALSE,getFC=TRUE)
}
return(list(stattbl=test,gid=geneid,obj=genebg,cttbl=dtm,tname=trxnames))
}
find_sym <- function (sym) {
if (!(toupper(sym) %in% toupper(symsyn$symbol))) {
syns <- symsyn[grep(input$symsearch,symsyn$synonym,ignore.case=TRUE),]$symbol
synlist <- paste(as.character(syns),collapse=',')
if (length(syns) > 1) {
paste("Please Use Official Gene Symbols",synlist,sep=':')
}else {"Please Use Official Gene Symbols"}
}else {NULL}
}
getgeneid <- eventReactive(input$altButton,{
if (input$symsearch == '' & input$enssearch != '') {
validate (find_sym(input$symsearch))
}
get.bgdata(input)
})
output$plot.cluster <- renderPlot({
par(oma=c(4,4,1,1))
gid <- getgeneid()$gid
bg <- getgeneid()$obj
tname <- getgeneid()$tname
if (nrow(tname) > 1) {
plotLatentTranscripts(gene=gid, gown=bg, k=input$kct, method='kmeans', returncluster=FALSE)
}
})
output$gene.stat <- DT::renderDataTable({
getgeneid()$stattbl
},escape=FALSE)
output$trx.name <- DT::renderDataTable({
getgeneid()$tname
},escape=FALSE)
output$plot.means <- renderPlot({
par(oma=c(4,4,1,1))
par(cex.main=0.75)
gid <- getgeneid()$gid
bg <- getgeneid()$obj
tname <- getgeneid()$tname
if (nrow(tname) > 1) {
plotMeans(gid, bg, groupvar='group', meas='cov', colorby='transcript')
}
}, height = 900, width = 900)
output$trx.gene <- renderPlot({
countTable <- getgeneid()$cttbl
par(oma=c(4,4,1,1))
p <- ggplot(countTable,aes(x=grptrx,y=log2(value+1))) + geom_boxplot(aes(fill = factor(grptrx))) + geom_jitter(height = 0) + theme(legend.position="left",axis.text.x=element_text(angle=45,hjust=1, vjust=1),legend.key.height=unit(0.5,"line"),legend.text=element_text(size=8),legend.title=element_blank()) + ylab("Relative Abundance (FPKM)") + xlab("")
print(p)
})
output$ui_altsplice <- renderUI({
fluidPage(
includeCSS("www/style.css"),
sidebarLayout(
sidebarPanel(
uiOutput("dir.splice"),
textInput("symsearch", "Search By Gene Symbol", 'IL1B'),
textInput("enssearch", "Search By ENS ID",''),
numericInput("kct",label = "Number of Clusters",value = 2),
actionButton("altButton", "GO")
),
mainPanel(
tabsetPanel(
tabPanel("Gene Compare",
dataTableOutput('gene.stat'),
dataTableOutput('trx.name'),
plotOutput("plot.cluster"),
plotOutput("trx.gene"),
plotOutput("plot.means")
)
)
)
)
)
})
# UI-elements for DEA
ctfile <- paste(data.dir,'countTable.logCPM.txt',sep='/')
samfile <- paste(data.dir,'design.shiny.txt',sep='/')
cts <- read.table(file=ctfile,header=TRUE,sep='\t')
samtbl <- read.table(samfile,header=TRUE,sep="\t")
samples <- colnames(cts[,4:length(cts)])
mergetbl <- merge(as.data.frame(samples),samtbl,by.x="samples",by.y="SampleID",all.x=TRUE,sort=FALSE)
grps <- mergetbl$SampleGroup
grpnames <- levels(factor(grps))
col.blocks <-col.grp(grps,grpnames)
MSIG.geneSets <- read.gmt(paste(data.dir,'geneset.shiny.gmt',sep='/'))
output$pick.dea <- renderUI({
flist <- list.files(data.dir,pattern="*edgeR.txt$")
selectInput("file", "Choose Pair", choices=flist)
})
output$pick.pathway <- renderUI({
pathways <- names(MSIG.geneSets)
pathchoices = setNames(1:length(pathways),pathways)
selectInput("deapathname", "Choose Pair", choices=pathchoices)
})
get.data <- function(var) {
f <- paste(data.dir,var$file,sep='/')
comp <- read.table(f,header=TRUE,sep='\t')
comp.filt <- na.omit(comp[abs(comp$logFC) >= var$fc.thresh & comp$rawP <= var$pval.thresh,])
if (var$adjust == 'FDR') {
comp.filt <- na.omit(comp[abs(comp$logFC) >= var$fc.thresh & comp$fdr <= var$pval.thresh,])
}
if (var$adjust == 'BONF') {
comp.filt <- na.omit(comp[abs(comp$logFC) >= var$fc.thresh & comp$bonf <= var$pval.thresh,])
}
genelist <- as.character(head(comp.filt[order(comp.filt$fdr),]$symbol,n=var$numgenes))
if (var$heatmap == 'hgeneset') {
genelist <- unlist(MSIG.geneSets[as.numeric(var$deapathname)])
}
if (var$heatmap == 'cgeneset') {
genelist <- unlist(strsplit(as.character(var$genes), "[;\n]+"))
}
return(list(filt=comp.filt,glist=genelist))
}
tbls <- eventReactive(input$deButton,{get.data(input)})
output$selectgenes <- renderUI({
symnames <- tbls()$glist
textAreaInput("genes", "Gene Symbols separated by ';'",value=paste(symnames,collapse=";"), width = '95%',rows=10)
})
output$dge.c <- DT::renderDataTable({
t1 <- tbls()$filt
t1$symbol <- paste("<a href=http://www.genecards.org/cgi-bin/carddisp.pl?gene=",t1$symbol,'>',t1$symbol,"</a>",sep='')
t1$ensembl <- paste("<a href=http://www.ensembl.org/Homo_sapiens/Gene/Summary?g=",t1$ensembl,'>',t1$ensembl,"</a>",sep='')
t1
},escape=FALSE,filter = 'top',options = list(lengthMenu = c(10, 25, 50, 200, -1)))
output$downloadC <- downloadHandler(
file <- paste(input$file,".filt.txt",sep=""),
content = function(file) {
write.table(tbls()$filt,file,quote=FALSE,row.names=FALSE,sep='\t')
})
plotHeatmap <- reactive({
syms <- tbls()$glist
ct2 <- cts[cts$SYMBOL %in% syms,]
subset <- ct2[,4:length(ct2)]
row.names(subset) <- ct2$SYMBOL
STREE <- hclust(dist(t(subset)))
zscores <- scale(t(subset))
ngenes <- length(colnames(zscores))
textscale <- (1/(ngenes/30))
if (textscale > 1) {
textscale <-1
}
if (textscale < 0.1) {
textscale <- 0.1
}
heatmap.2(zscores, col = bluered(100),Rowv = as.dendrogram(STREE), RowSideColors = col.blocks,dendrogram='row', cexCol=textscale,srtRow=45,srtCol=45,trace="none",margins=c(8,16))
legend("topright",legend=grpnames,col=rainbow(length(grpnames)),pch=20)
})
output$plot.heatmap <- renderPlot({
plotHeatmap()
})
output$downloadpdf = downloadHandler(
filename = "output.heatmap.pdf",
content = function(file) {
pdf(file = file,paper="letter")
plotHeatmap()
dev.off()
})
output$hm.comp <- renderImage({
f1 <- paste(data.dir,input$file,sep='/')
png <- gsub('edgeR.txt','heatmap.edgeR.png',f1)
list(src=png, alt=paste("HeatMap Comparison"))
},deleteFile=FALSE)
output$hmcomp.desc <- renderText({
paste("Heatmap of all genes with an FDR < 0.05 using EdgeR Results",sep='')
})
#######################################
# Shiny interface for data functions
#######################################
# data ui and tabs
output$ui_dea <- renderUI({
list(
includeCSS("www/style.css"),
sidebarLayout(
sidebarPanel(
numericInput("fc.thresh",
"LogFold Change Threshold:", 1),
numericInput("gene.thresh",
"Gene Mean Threshold:", 5),
numericInput("pval.thresh",
"P-Value Threshold:", 0.05),
selectInput("adjust", "Choose P-Value Correction", choices=c("raw","FDR",'BONF'),selected='FDR'),
uiOutput("pick.dea"),
selectInput(
"heatmap", "HeatMap",
c(Top = "top",
HallmarkGeneSet = "hgeneset",
CustomGeneSet= "cgeneset")
),
conditionalPanel(
condition = "input.heatmap == 'cgeneset'",
uiOutput("selectgenes")
),
conditionalPanel(
condition = "input.heatmap == 'top'",
numericInput("numgenes","Number Top Genes:", 50)
),
conditionalPanel(
condition = "input.heatmap == 'hgeneset'",
uiOutput("pick.pathway")
),
actionButton("deButton", "Go")
),
mainPanel(
tabsetPanel(
tabPanel("Differential Gene Set Comparison",
downloadButton('downloadC', 'Download CSV'),
dataTableOutput('dge.c')
),
tabPanel("Heatmap",
#downloadButton('downloadpdf', 'Download Heatmap'),
h1("HeatMap Comparison"),
h3("Top 50 User Defined Genes"),
plotOutput("plot.heatmap"),
h3("All Differentially Expressed Genes"),
imageOutput("hm.comp",width="1200px",height="1200px"),
textOutput("hmcomp.desc")
)
)
)
)
)
})
output$dir.fqc <- renderUI({
datadir <- dir(data.dir)
selectInput("dset.fqc", "Choose Dataset", choices=datadir,selected='coarse_cellpops')
})
output$pick.fqc <- renderUI({
flist <- list.files(paste(data.dir,input$dset.fqc,"fastqc",sep='/'),pattern="*html$")
flist <- gsub(".sort_fastqc.html",'',flist)
selectInput("samp", "Choose Sample", choices=flist)
})
getpage <- function(var) {
return(includeHTML(paste(data.dir,"/",var$dset.fqc,"/fastqc/",var$samp,".sort_fastqc.html",sep='')))
}
output$inc <- renderUI({getpage(input)})
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment