Skip to content
Snippets Groups Projects
Commit 6ef47c9a authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge branch 'develop' into 'master'

Develop

See merge request !16
parents a2733bff 1360363e
Branches
Tags
4 merge requests!29Develop,!20Master,!19Master,!16Develop
Pipeline #3191 passed with stage
in 22 minutes
......@@ -24,6 +24,120 @@ wheels/
.installed.cfg
*.egg
# PyInstaller
# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows
# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Perl ###
!Build/
.last_cover_stats
/META.yml
/META.json
/MYMETA.*
*.o
*.pm.tdy
*.bs
# Devel::Cover
cover_db/
# Devel::NYTProf
nytprof.out
# Dist::Zilla
/.build/
# Module::Build
_build/
Build
Build.bat
# Module::Install
inc/
# ExtUtils::MakeMaker
/blib/
/_eumm/
/*.gz
/Makefile
/Makefile.old
/MANIFEST.bak
/pm_to_blib
/*.zip
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
......@@ -37,6 +151,7 @@ pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
......@@ -44,6 +159,7 @@ nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
......@@ -52,6 +168,7 @@ coverage.xml
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
......@@ -69,6 +186,10 @@ target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
......@@ -84,6 +205,8 @@ celerybeat-schedule
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
......@@ -97,16 +220,94 @@ ENV/
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
### Python Patch ###
.venv/
### R ###
# History files
.Rhistory
.Rapp.history
# Session Data files
.RData
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md
### R.Bookdown Stack ###
# R package: bookdown caching files
/*_files/
### Windows ###
# Windows thumbnail cache files
Thumbs.db
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows
# nextflow analysis folders/files
/test_data/*.fastq.gz
/test_data/*.fastq
/test_data/*
/workflow/.nextflow/*
/workflow/work/*
/workflow/output/*
/.nextflow/*
/work/*
/output/*
pipeline_trace*.txt*
.nextflow*.log*
report.html*
timeline*.html*
*~
!.gitkeep
before_script:
- module load python/3.6.1-2-anaconda
- module load nextflow/0.27.6
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/cellranger_count/*fastq.gz test_data/
- module load nextflow/0.31.1_Ignite
- mkdir test_data/v2s2r100k
- mkdir test_data/v3s2r100k
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/v2s2r100k/* test_data/v2s2r100k/
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/v3s2r100k/* test_data/v3s2r100k/
stages:
- integration
simple_test:
simple_cr2v2ref1.2.0:
stage: integration
script:
- nextflow run workflow/main.nf
- nextflow run workflow/main.nf --fastq "$CI_PROJECT_DIR/test_data/v2s2r100k/*.fastq.gz" --designFile "$CI_PROJECT_DIR/test_data/v2s2r100k/design.csv" --genome 'GRCh38-1.2.0' --kitVersion 'two' --version '2.1.1'
simple_cr2v2ref3.0.0:
stage: integration
script:
- nextflow run workflow/main.nf --fastq "$CI_PROJECT_DIR/test_data/v2s2r100k/*.fastq.gz" --designFile "$CI_PROJECT_DIR/test_data/v2s2r100k/design.csv" --genome 'GRCh38-3.0.0' --kitVersion 'two' --version '2.1.1'
simple_cr3v2ref3.0.0:
stage: integration
script:
- nextflow run workflow/main.nf --fastq "$CI_PROJECT_DIR/test_data/v2s2r100k/*.fastq.gz" --designFile "$CI_PROJECT_DIR/test_data/v2s2r100k/design.csv" --genome 'GRCh38-3.0.0' --kitVersion 'two' --version '3.0.2'
simple_cr3v3ref3.0.0:
stage: integration
script:
- nextflow run workflow/main.nf --fastq "$CI_PROJECT_DIR/test_data/v3s2r100k/*.fastq.gz" --designFile "$CI_PROJECT_DIR/test_data/v3s2r100k/design.csv" --genome 'GRCh38-3.0.0' --kitVersion 'three' --version '3.0.2'
......@@ -10,4 +10,85 @@ The pipeline uses Nextflow, a bioinformatics workflow tool.
This pipeline is primarily used with a SLURM cluster on the BioHPC Cluster. However, the pipeline should be able to run on any system that Nextflow supports.
Additionally, the pipeline is designed to work with Astrocyte Workflow System using a simple web interface.
\ No newline at end of file
Additionally, the pipeline is designed to work with Astrocyte Workflow System using a simple web interface.
To Run:
-------
* Available parameters:
* **--fastq**
* path to the fastq location
* only R1 and R2 are required, but I2 may also be included
* eg: **--fastq '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/v3s2r100k/\*.fastq.gz'**
* **--designFile**
* path to design file (csv format) location
* column 1 = "Sample"
* column 2 = "fastq_R1"
* column 3 = "fastq_R2"
* can have repeated "Sample" if there are multiple fastq R1/R2 pairs for the samples
* eg: **--designFile '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/v3s2r100k/design.csv'**
* **--genome**
* reference genome
* requires workflow/conf/biohpc.config to work
* name of available 10x Genomics premade reference genomes:
* *'GRCh38-3.0.0'* = Human GRCh38 release 93
* *'GRCh38-1.2.0'* = Human GRCh38 release 84
* *'hg19-3.0.0'* = Human GRCh37 (hg19) release 87
* *'hg19-1.2.0'* = Human GRCh37 (hg19) release 84
* *'mm10-3.0.0'* = Mouse GRCm38 (mm10) release 93
* *'mm10-1.2.0'* = Mouse GRCm38 (mm10) release 84
* *'hg19_and_mm10-3.0.0'* = Human GRCh37 (hg19) + Mouse GRCm38 (mm10) release 93
* *'hg19_and_mm10-1.2.0'* = Human GRCh37 (hg19) + Mouse GRCm38 (mm10) release 84
* *'ercc92-1.2.0'* = ERCC.92 Spike-In
* if --genome is used then --genomeLocationFull is not necessary
* eg: **--genome 'GRCh38-3.0.0'**
* **--genomeLocationFull**
* path to a custom genome
* if --genomeLocationFull is used, --genome is not necessary and is overridden
* eg: **--genomeLocationFull '/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0'**
* **--expectCells**
* expected number of cells to be detected
* guides cellranger in its cutoff for background/low quality cells
* as a guide it doesn't have to be exact
* 0-10000
* if --expectCells is used then --forceCells is not necessary
* only used if --forceCells is not entered or set to 0
* eg: **--expectCells 10000**
* **--forceCells**
* forces filtering of the top number of cells matching this parameter
* 0-10000
* if --forceCells is used then --expectCells is not necessary and is overridden
* eg: **--forceCells 10000**
* **--kitVersion**
* the library chemistry version number for the 10x Genomics Gene Expression kit
* setting to auto will attempt to autodetect from the detected cycle strategy in the fastqs
* version numbers are spelled out
* --kitVersion is only used if --version (cellranger version) is greater than 2 (i.e. 3.0.1 or 3.0.2)
* --version (cellranger version) 2.1.1 can only read --kitVersion of two (2)
* options:
* *'auto'*
* *'three'*
* *'two'*
* eg: **--kitVersion 'three'**
* **--version**
* cellranger version
* --version (cellranger version) 2.1.1 can only read --kitVersion of two (2)
* options:
* *'3.0.2'*
* *'3.0.1'*
* *'2.1.1'*
* eg: **--version '3.0.2'**
* **--outDir**
* optional output directory for run
* eg: **--outDir 'test'**
* FULL EXAMPLE:
**nextflow main.nf --fastq '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/v3s2r100k/\*.fastq.gz' --designFile '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/v3s2r100k/design.csv' --genome 'GRCh38-3.0.0' --kitVersion 'three' --version '3.0.2' --outDir 'test'**
* Design example:
| Sample | fastq_R1 | fastq_R2 |
|---------|------------------------------------|------------------------------------|
| sample1 | pbmc_1k_v2_S1_L001_R1_001.fastq.gz | pbmc_1k_v2_S1_L001_R2_001.fastq.gz |
| sample2 | pbmc_1k_v2_S2_L001_R1_001.fastq.gz | pbmc_1k_v2_S2_L001_R2_001.fastq.gz |
| sample2 | pbmc_1k_v2_S2_L002_R1_001.fastq.gz | pbmc_1k_v2_S2_L002_R2_001.fastq.gz |
\ No newline at end of file
......@@ -41,6 +41,8 @@ documentation_files:
workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'cellranger/2.1.1'
- 'cellranger/3.0.1'
- 'cellranger/3.0.2'
- 'bcl2fastq/2.17.1.14'
# A list of parameters used by the workflow, defining how to present them,
......@@ -95,11 +97,15 @@ workflow_parameters:
- id: genome
type: select
choices:
- [ '/project/apps_database/cellranger/refdata-cellranger-GRCh38-1.2.0', 'Human GRCh38']
- [ '/project/apps_database/cellranger/refdata-cellranger-hg19-1.2.0', 'Human GRCh37 (hg19)']
- [ '/project/apps_database/cellranger/refdata-cellranger-mm10-1.2.0', 'Mouse GRCm38 (mm10)']
- [ '/project/apps_database/cellranger/refdata-cellranger-hg19_and_mm10-1.2.0', 'Human GRCh37 (hg19) + Mouse GRCm38 (mm19)']
- [ '/project/apps_database/cellranger/refdata-cellranger-ercc92-1.2.0', 'ERCC.92 Spike-In']
- [ 'GRCh38-3.0.0', 'Human GRCh38 release 93']
- [ 'GRCh38-1.2.0', 'Human GRCh38 release 84']
- [ 'hg19-3.0.0', 'Human GRCh37 (hg19) release 87']
- [ 'hg19-1.2.0', 'Human GRCh37 (hg19) release 84']
- [ 'mm10-3.0.0', 'Mouse GRCm38 (mm10) release 93']
- [ 'mm10-1.2.0', 'Mouse GRCm38 (mm10) release 84']
- [ 'hg19_and_mm10-3.0.0', 'Human GRCh37 (hg19) + Mouse GRCm38 (mm10) release 93']
- [ 'hg19_and_mm10-1.2.0', 'Human GRCh37 (hg19) + Mouse GRCm38 (mm10) release 84']
- [ 'ercc92-1.2.0', 'ERCC.92 Spike-In']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
......@@ -122,6 +128,29 @@ workflow_parameters:
description: |
Force pipeline to use this number of cells, bypassing the cell detection algorithm. Use this if the number of cells estimated by Cell Ranger is not consistent with the barcode rank plot. A value of 0 ignores this option. Any value other than 0 overrides expect-cells.
- id: kitVersion
type: select
default: 'auto'
choices:
- [ 'auto', 'Auto Detect']
- [ 'three', '3']
- [ 'two', '2']
required: true
description: |
10x single cell gene expression chemistry version (only used in cellranger version 3.x).
- id: version
type: select
default: '3.0.2'
choices:
- [ '3.0.2', '3.0.2']
- [ '3.0.1', '3.0.1']
- [ '2.1.1', '2.1.1']
required: true
description: |
10x cellranger version.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------
......
// Configuration profiles. The 'standard' profile includes the BioHPC settings file;
// the README notes that --genome lookups require that file to work.
profiles {
standard {
includeConfig 'workflow/conf/biohpc.config'
}
}
Sample,fastq_R1,fastq_R2
D17PrPzF_BE,/work/urology/ghenry/Grimoire/Astrocyte/cellranger_count/test_data/D17PrPzF_BE_S1_L001_R1_001.fastq.gz,/work/urology/ghenry/Grimoire/Astrocyte/cellranger_count/test_data/D17PrPzF_BE_S1_L001_R2_001.fastq.gz
D17PrPzF_BE,/work/urology/ghenry/Grimoire/Astrocyte/cellranger_count/test_data/D17PrPzF_BE_S1_L002_R1_001.fastq.gz,/work/urology/ghenry/Grimoire/Astrocyte/cellranger_count/test_data/D17PrPzF_BE_S1_L002_R2_001.fastq.gz
D17PrPzF_LE,/work/urology/ghenry/Grimoire/Astrocyte/cellranger_count/test_data/D17PrPzF_LE_S3_L001_R1_001.fastq.gz,/work/urology/ghenry/Grimoire/Astrocyte/cellranger_count/test_data/D17PrPzF_LE_S3_L001_R2_001.fastq.gz
......@@ -7,8 +7,65 @@ process {
module = ['python/3.6.1-2-anaconda']
executor = 'local'
}
$count {
$count211 {
module = ['cellranger/2.1.1']
memory = '120GB'
}
$count301 {
module = ['cellranger/3.0.1']
memory = '120GB'
}
$count302 {
module = ['cellranger/3.0.2']
memory = '120GB'
}
}
// Lookup tables read by main.nf to resolve --genome and --kitVersion.
params {
// Reference file paths on BioHPC
// Each key is a value accepted by --genome. 'loc' is only a path prefix:
// main.nf appends the key itself (params.genomeLocation + params.genome)
// to build the full reference location.
genomes {
'GRCh38-3.0.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'GRCh38-1.2.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'hg19-3.0.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'hg19-1.2.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'mm10-3.0.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'mm10-1.2.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'hg19_and_mm10-3.0.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'hg19_and_mm10-1.2.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
'ercc92-1.2.0' {
loc = '/project/apps_database/cellranger/refdata-cellranger-'
}
}
// Chemistry mapping parameter
// Maps the spelled-out --kitVersion value to the string that main.nf passes
// to `cellranger count --chemistry` in the count301/count302 processes.
chemistry {
'auto' {
param = 'auto'
}
'one' {
param = 'SC3Pv1'
}
'two' {
param = 'SC3Pv2'
}
'three' {
param = 'SC3Pv3'
}
}
}
......
......@@ -6,9 +6,17 @@
// Define Input variables
params.fastq = "$baseDir/../test_data/*.fastq.gz"
params.designFile = "$baseDir/../test_data/design.csv"
params.genome = '/project/apps_database/cellranger/refdata-cellranger-GRCh38-1.2.0'
params.genome = 'GRCh38-3.0.0'
params.genomes = []
params.genomeLocation = params.genome ? params.genomes[ params.genome ].loc ?: false : false
params.genomeLocationFull = params.genomeLocation+params.genome
params.expectCells = 10000
params.forceCells = 0
params.kitVersion = 'three'
params.chemistry = []
params.chemistryParam = params.kitVersion ? params.chemistry[ params.kitVersion ].param ?: false : false
params.version = '3.0.2'
params.outDir = "$baseDir/output"
// Define regular variables
designLocation = Channel
......@@ -20,14 +28,17 @@ fastqList = Channel
.map { file -> [ file.getFileName().toString(), file.toString() ].join("\t") }
.collectFile(name: 'fileList.tsv', newLine: true)
// Channel emitting the reference genome location (path prefix + genome name,
// or a user-supplied --genomeLocationFull). Abort the run when nothing exists
// at that path, reporting the path that was actually looked up.
refLocation = Channel
  .fromPath(params.genomeLocationFull)
  .ifEmpty { exit 1, "reference not found: ${params.genomeLocationFull}" }
expectCells = params.expectCells
forceCells = params.forceCells
chemistryParam = params.chemistryParam
version = params.version
outDir = params.outDir
process checkDesignFile {
publishDir "$baseDir/output", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -36,12 +47,11 @@ process checkDesignFile {
output:
file("design.csv") into designPaths
file("design.checked.csv") into designPaths
script:
"""
module load python/3.6.1-2-anaconda
python3 $baseDir/scripts/check_design.py -d $designLocation -f $fastqList
"""
}
......@@ -53,33 +63,117 @@ samples = designPaths
.groupTuple()
//.subscribe { println it }
// Duplicate variables
// Give each version-specific count process (count211 / count301 / count302)
// its own copy of the shared inputs, named with the matching version suffix.
samples.into {
samples211
samples301
samples302
}
refLocation.into {
refLocation211
refLocation301
refLocation302
}
// Plain values are aliased under per-process names.
expectCells211 = expectCells
expectCells301 = expectCells
expectCells302 = expectCells
forceCells211 = forceCells
forceCells301 = forceCells
forceCells302 = forceCells
// No chemistryParam211: count211 does not take a chemistry input.
chemistryParam301 = chemistryParam
chemistryParam302 = chemistryParam
// Runs `cellranger count` once per sample. The `when` guard restricts execution
// to runs where the requested cellranger version is 2.1.1.
process count211 {
tag "count211-$sample"
// Copy published outputs into <outDir>/<process name>.
publishDir "$outDir/${task.process}", mode: 'copy'
input:
// Sample name plus its R1 and R2 fastq files, staged under names of the form
// <sample>_S1_L00?_R1_001.fastq.gz / <sample>_S1_L00?_R2_001.fastq.gz.
set sample, file("${sample}_S1_L00?_R1_001.fastq.gz"), file("${sample}_S1_L00?_R2_001.fastq.gz") from samples211
// Reference genome; .first() is applied to the reference channel, presumably so
// the single reference can be reused for every sample (TODO confirm).
file ref from refLocation211.first()
expectCells211
forceCells211
output:
// Everything found beneath an outs/ directory at any depth.
file("**/outs/**") into outPaths211
when:
version == '2.1.1'
script:
// A forceCells value of 0 means the option is unset, so --expect-cells is used;
// any other value switches the command to --force-cells instead.
// Unlike count301/count302, no --chemistry option is passed here.
if (forceCells211 == 0){
"""
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --expect-cells=$expectCells211
"""
} else {
"""
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --force-cells=$forceCells211
"""
}
}
process count {
tag "$sample"
process count301 {
tag "count301-$sample"
publishDir "$baseDir/output", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
set sample, file("${sample}_S1_L00?_R1_001.fastq.gz"), file("${sample}_S1_L00?_R2_001.fastq.gz") from samples
file ref from refLocation.first()
expectCells
forceCells
set sample, file("${sample}_S1_L00?_R1_001.fastq.gz"), file("${sample}_S1_L00?_R2_001.fastq.gz") from samples301
file ref from refLocation301.first()
expectCells301
forceCells301
chemistryParam301
output:
file("**/outs/**") into outPaths
file("**/outs/**") into outPaths301
when:
version == '3.0.1'
script:
if (forceCells301 == 0){
"""
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --expect-cells=$expectCells301 --chemistry="$chemistryParam301"
"""
} else {
"""
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --force-cells=$forceCells301 --chemistry="$chemistryParam301"
"""
}
}
process count302 {
tag "count302-$sample"
publishDir "$outDir/${task.process}", mode: 'copy'
input:
set sample, file("${sample}_S1_L00?_R1_001.fastq.gz"), file("${sample}_S1_L00?_R2_001.fastq.gz") from samples302
file ref from refLocation302.first()
expectCells302
forceCells302
chemistryParam302
output:
file("**/outs/**") into outPaths302
when:
version == '3.0.2'
script:
if (forceCells == 0){
"""
module load cellranger/2.1.1
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --expect-cells=$expectCells
"""
if (forceCells302 == 0){
"""
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --expect-cells=$expectCells302 --chemistry="$chemistryParam302"
"""
} else {
"""
module load cellranger/2.1.1
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --force-cells=$forceCells
"""
"""
cellranger count --id="$sample" --transcriptome="./$ref" --fastqs=. --sample="$sample" --force-cells=$forceCells302 --chemistry="$chemistryParam302"
"""
}
}
......@@ -100,7 +100,7 @@ def main():
new_design_df = check_design_headers(design_df)
check_files(design_df, fastq_df)
new_design_df.to_csv('design.csv', header=True, sep=',', index=False)
new_design_df.to_csv('design.checked.csv', header=True, sep=',', index=False)
if __name__ == '__main__':
main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment