#
# metadata for the BICF RNASeq astrocyte workflow package
#

# -----------------------------------------------------------------------------
# BASIC INFORMATION
# -----------------------------------------------------------------------------

# A unique identifier for the workflow package, text/underscores only
name: 'rnaseq_bicf'
# Author of the workflow package
author: 'Brandi Cantarel'
# A contact email address for questions about the package
email: 'biohpc-help@utsouthwestern.edu'
# A more informative, human-readable title for the workflow package
title: 'BICF RNASeq Analysis Workflow'
# A summary of the workflow package in plain text.
# Written as a literal block scalar (|), so the line breaks below are kept.
description: |
  This is a workflow package for the BioHPC/BICF RNASeq workflow system.
  It implements a simple RNASeq analysis workflow using TrimGalore, HiSAT,FeatureCounts,
  StringTie and statistical analysis using EdgeR and Ballgown, plus a simple R Shiny
  visualization application.

# -----------------------------------------------------------------------------
# DOCUMENTATION
# -----------------------------------------------------------------------------

# A list of documentation files in .md format that should be viewable from the
# web interface. These files are in the 'docs' subdirectory. The first file
# listed will be used as the documentation index and is index.md by convention.
documentation_files:
  - 'index.md'

# -----------------------------------------------------------------------------
# NEXTFLOW WORKFLOW CONFIGURATION
# -----------------------------------------------------------------------------

# Remember - The workflow file is always named 'workflow/main.nf'
#            The workflow must publish all final output into $baseDir

# A list of cluster environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducibility.
workflow_modules:
  - 'trimgalore/0.4.1'
  - 'cutadapt/1.9.1'
  - 'hisat2/2.0.1-beta-intel'
  - 'samtools/intel/1.3'
  - 'picard/1.127'
  - 'subread/1.5.0-intel'
  - 'stringtie/1.1.2-intel'

# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
#
# REQUIRED INFORMATION
#  id:         The name of the parameter in the NEXTFLOW workflow
#  type:       The type of the parameter, one of:
#                string    - A free-format string
#                integer   - An integer
#                real      - A real number
#                file      - A single file from user data
#                files     - One or more files from user data
#                select    - A selection from a list of values
#  required:    true/false, must the parameter be entered/chosen?
#  description: A user friendly description of the meaning of the parameter
#
# OPTIONAL INFORMATION
#  default:   A default value for the parameter (optional)
#  min:       Minimum value/characters/files for number/string/files types
#  max:       Maximum value/characters/files for number/string/files types
#  regex:     A regular expression that describes valid entries / filenames
#
# SELECT TYPE
#  choices:   A set of choices presented to the user for the parameter.
#             Each choice is a pair of value and description, e.g.
#
#             choices:
#               - [ 'myval1', 'The first option']
#               - [ 'myval2', 'The second option']
#
# NOTE - All parameters are passed to NEXTFLOW as strings... but they
#        are validated by astrocyte using the information provided above

workflow_parameters:

  # Input reads: one or more user files (type 'files', min 1) whose names
  # must match the regex below, i.e. gzipped FASTQ files.
  - id: fastqs
    type: files
    required: true
    description: |
      One or more input paired-end FASTQ files from a RNASeq experiment and a design file with the link between the sample name and the sample group
    regex: ".*(fastq|fq).gz"
    min: 1

  # Sequencing layout selector: 'pe' (paired end) or 'se' (single end).
  - id: pairs
    type: select
    required: true
    choices:
      - [ 'pe', 'Paired End']
      - [ 'se', 'Single End']
    description: |
      In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.

  # Duplicate-read handling selector.
  # NOTE(review): the value 'mark' is labelled 'Remove Duplicates' — confirm
  # the workflow removes duplicates for this value rather than only flagging them.
  - id: markdups
    type: select
    required: true
    choices:
      - [ 'mark', 'Remove Duplicates']
      - [ 'keep', 'Keep All Sequences']
    description: |
      Duplicate reads are defined as originating from the same original fragment of DNA. Duplicates are identified as read pairs having identical 5-prime positions (coordinate and strand) for both reads in a mate pair and optionally, matching unique molecular identifier reads.

  # Design file: a single user file describing the samples; the expected
  # columns are spelled out in the description shown to the user.
  - id: design
    type: file
    required: true
    description: |
      A design file listing pairs of sample name and sample group.
      Columns must include: SampleID,SampleName,SampleGroup,FullPathToFqR1,FullPathToFqR2

      FullPathToFqR1 and FullPathToFqR2 are the file names of the R1 and R2 read files.
      ie Sample1.R1.fastq.gz, Sample_1.fastq.gz, Sample.fq.gz
      Single end mode needs only the FullPathToFqR1 column

      Optional columns are encouraged, such as:
      SubjectID,Tissue,Gender,CultureDate,Organism,CellPopulation

  # Reference genome selector. Each choice is [ index path, label ];
  # the path is the value passed to the workflow.
  - id: genome
    type: select
    choices:
      - [ '/project/apps_database/hisat2_index/GRCh38', 'Human GRCh38']
      - [ '/project/apps_database/hisat2_index/GRCh37', 'Human GRCh37']
      - [ '/project/apps_database/hisat2_index/GRCm38', 'Mouse GRCm38']
    required: true
    description: |
      Reference genome for alignment
  # Gene annotation selector. Each choice is [ GTF path, label ];
  # the path is the value passed to the workflow.
  # NOTE(review): the mouse path contains 'GRCh38' under Mus_musculus and uses
  # 'Genes/gencode' where the human GENCODE entry uses 'Genes.gencode' —
  # paths are left as-is; confirm they match the directories on disk.
  - id: gtf
    type: select
    choices:
      - [ '/project/apps_database/iGenomes/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/genes.gtf', 'GENCODE Human GRCh38' ]
      - [ '/project/apps_database/iGenomes/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf', 'NCBI Human GRCh38' ]
      - [ '/project/apps_database/iGenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf', 'Ensembl Human GRCh37' ]
      - [ '/project/apps_database/iGenomes/Mus_musculus/NCBI/GRCh38/Annotation/Genes/gencode/genes.gtf', 'NCBI Mouse GRCm38' ]
    required: true
    description: |
      The gene annotation file to use


# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------

# Remember - The vizapp is always 'vizapp/server.R' 'vizapp/ui.R'
#            The workflow must publish all final output into $baseDir

# Name of the R module that the vizapp will run against.
# The value includes an explicit version string, pinning the R release used.
vizapp_r_module: 'R/3.2.1-Intel'

# CRAN packages the vizapp needs that the modules do not already provide.
# One package name per entry.
vizapp_cran_packages:
  - 'sqldf'
  - 'shiny'
  - 'Vennerable'
  - 'DT'
  - 'ggplot2'
  - 'gplots'
  - 'gtools'
  - 'RColorBrewer'


# Bioconductor packages the vizapp needs that the modules do not already
# provide. One package name per entry.
vizapp_bioc_packages:
  - 'qusage'
  - 'ballgown'
  - 'edgeR'
  - 'DESeq2'