Merge branch '50-seurat.object' into 'develop'

Resolve "Generate Raw RDS file" Closes #50 See merge request !76

Merge branch '50-seurat.object' into 'develop'
Resolve "Generate Raw RDS file" Closes #50 See merge request !76
3944b9de · Gervaise Henry · 079739df · 436b0259 · 3944b9de · 3944b9de
Commit 3944b9de authored 4 years ago by Gervaise Henry
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -3,22 +3,25 @@ before_script:
  - module load python/3.6.1-2-anaconda
  - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
  - module load nextflow/20.01.0
-  - module load singularity/3.0.2
+  - module load singularity/3.5.3
  - mkdir -p test_data/hu.v2s1r500
  - mkdir -p test_data/hu.v3s1r500
  - mkdir -p test_data/mu.v3s1r500
  - mkdir -p test_data/hu.v3s2r10k
  - mkdir -p test_data/mu.v3s2r10k
  - mkdir -p test_data/hu.v2s2r10k
+  - mkdir -p test_data/output
  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v2s1r500/* test_data/hu.v2s1r500/
  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v3s1r500/* test_data/hu.v3s1r500/
  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/mu.v3s1r500/* test_data/mu.v3s1r500/
  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v3s2r10k/* test_data/hu.v3s2r10k/
  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/mu.v3s2r10k/* test_data/mu.v3s2r10k/
  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v2s2r10k/* test_data/hu.v2s2r10k/
+  - ln -sfn /project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/output/* test_data/output/

 stages:
  - astrocyte_test
+  - module_test
  - container_test
  - reference_test
  - multiSample_test
@@ -34,8 +37,8 @@ astrocyte_cli:
    when:
      - always

-2.1.1_test:
-  stage: container_test
+module_2.1.1_test:
+  stage: module_test
  only:
    - branches
  except:
@@ -45,20 +48,20 @@ astrocyte_cli:
      - tags
  script:
    - module load cellranger/2.1.1
-    - cellranger count --id=test --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+    - cellranger count --id=module-211 --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
-      - test/outs/web_summary.html
+      - module-211/outs/web_summary.html
    expire_in: 2 days
  retry:
    max: 0
    when:
      - always

-3.0.2_test:
-  stage: container_test
+module_3.0.2_test:
+  stage: module_test
  only:
    - branches
  except:
@@ -68,20 +71,20 @@ astrocyte_cli:
      - tags
  script:
    - module load cellranger/3.0.2
-    - cellranger count --id=test --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+    - cellranger count --id=module-302 --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
-      - test/outs/web_summary.html
+      - module-302/outs/web_summary.html
    expire_in: 2 days
  retry:
    max: 0
    when:
      - always

-3.1.0_test:
-  stage: container_test
+module_3.1.0_test:
+  stage: module_test
  only:
    - branches
  except:
@@ -91,14 +94,172 @@ astrocyte_cli:
      - tags
  script:
    - module load cellranger/3.1.0
-    - cellranger count --id=test --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+    - cellranger count --id=module-310 --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - module-310/outs/web_summary.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+# Smoke test: run cellranger count from the cellranger/4.0.0 environment module
+# on the hu.v2s1r500 test data (branches only; not develop, master or tags).
+module_4.0.0_test:
+  stage: module_test
+  only:
+    - branches
+  except:
+    refs:
+      - develop
+      - master
+      - tags
+  script:
+    - module load cellranger/4.0.0
+    - cellranger count --id=module-400 --transcriptome=/project/apps_database/cellranger/refdata-gex-GRCh38-2020-A --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - module-400/outs/web_summary.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+# Smoke test: run downstream_viz.r with the seurat/3.0.0 environment module
+# against the files linked from test_data/output, then record the Seurat version.
+module_seurat_test:
+  stage: module_test
+  only:
+    - branches
+  except:
+    refs:
+      - develop
+      - master
+      - tags
+  script:
+    - module load seurat/3.0.0
+    - ln -sfn test_data/output/* .
+    - seurat-Rscript workflow/scripts/downstream_viz.r --sample sample1 --cellrangerVersion 4.0.0
+    - bash workflow/scripts/versions_seurat.sh > version_seurat.txt
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
+      - version_seurat.txt
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+# Smoke test: run cellranger count inside the bicf/cellranger2.1.1:2.0.0 image
+# via singularity on the hu.v2s1r500 test data; web_summary.html is kept as an artifact.
+container_2.1.1_test:
+  stage: container_test
+  only:
+    - branches
+  except:
+    refs:
      - develop
      - master
-      - test/outs/web_summary.html
+      - tags
+  script:
+    - singularity run 'docker://bicf/cellranger2.1.1:2.0.0' cellranger count --id=container-211 --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - container-211/outs/web_summary.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+# Smoke test: run cellranger count inside the bicf/cellranger3.0.2:2.0.0 image
+# via singularity on the hu.v2s1r500 test data; web_summary.html is kept as an artifact.
+container_3.0.2_test:
+  stage: container_test
+  only:
+    - branches
+  except:
+    refs:
+      - develop
+      - master
+      - tags
+  script:
+    - singularity run 'docker://bicf/cellranger3.0.2:2.0.0' cellranger count --id=container-302 --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - container-302/outs/web_summary.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+# Smoke test: run cellranger count inside the bicf/cellranger3.1.0:2.0.0 image
+# via singularity on the hu.v2s1r500 test data; web_summary.html is kept as an artifact.
+container_3.1.0_test:
+  stage: container_test
+  only:
+    - branches
+  except:
+    refs:
+      - develop
+      - master
+      - tags
+  script:
+    - singularity run 'docker://bicf/cellranger3.1.0:2.0.0' cellranger count --id=container-310 --transcriptome=/project/apps_database/cellranger/refdata-cellranger-GRCh38-3.0.0 --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - container-310/outs/web_summary.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+# Smoke test: run cellranger count inside the bicf/cellranger4.0.0:2.0.0_indev image
+# via singularity on the hu.v2s1r500 test data; web_summary.html is kept as an artifact.
+container_4.0.0_test:
+  stage: container_test
+  only:
+    - branches
+  except:
+    refs:
+      - develop
+      - master
+      - tags
+  script:
+    - singularity run 'docker://bicf/cellranger4.0.0:2.0.0_indev' cellranger count --id=container-400 --transcriptome=/project/apps_database/cellranger/refdata-gex-GRCh38-2020-A --fastqs=./test_data/hu.v2s1r500 --sample=pbmc_1k_v2 --chemistry=SC3Pv2
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - container-400/outs/web_summary.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
+GRCh38-2020A:
+  stage: reference_test
+  only:
+    refs:
+      - develop
+      - master
+  except:
+    - tags
+  script:
+  - nextflow run workflow/main.nf -profile biohpc,cluster --fastq "test_data/hu.v3s1r500/*.fastq.gz" --designFile "test_data/hu.v3s1r500/design.csv" --genome 'GRCh38-2020-A' --kitVersion '3GEXv3' --version '4.0.0' --ci true
+  - pytest -m count400
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - .nextflow.log
+      - workflow/output/count400/sample1/outs/web_summary.html
+      - workflow/output/multiqc/run/multiqc_report.html
    expire_in: 2 days
  retry:
    max: 0
@@ -129,6 +290,30 @@ GRCh38-3.0.0:
    when:
      - always

+# Reference test: run the full workflow on the mu.v3s1r500 test data with genome
+# 'mm10-2020-A' and cellranger version 4.0.0, then run the pytest tests marked
+# count400. Runs only on develop and master (not on tags).
+mm10-2020A:
+  stage: reference_test
+  only:
+    refs:
+      - develop
+      - master
+  except:
+    - tags
+  script:
+  - nextflow run workflow/main.nf -profile biohpc,cluster --fastq "test_data/mu.v3s1r500/*.fastq.gz" --designFile "test_data/mu.v3s1r500/design.csv" --genome 'mm10-2020-A' --kitVersion '3GEXv3' --version '4.0.0' --ci true
+  - pytest -m count400
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - .nextflow.log
+      - workflow/output/count400/sample1/outs/web_summary.html
+      - workflow/output/multiqc/run/multiqc_report.html
+    expire_in: 2 days
+  retry:
+    max: 0
+    when:
+      - always
+
 mm10-3.0.0:
  stage: reference_test
  only:
@@ -160,15 +345,15 @@ mm10-3.0.0:
      - master
      - tags
  script:
-  - nextflow run workflow/main.nf -profile biohpc,cluster --fastq "test_data/hu.v3s2r10k/*.fastq.gz" --designFile "test_data/hu.v3s2r10k/design.csv" --genome 'GRCh38-3.0.0' --kitVersion 'auto' --version '3.1.0' --ci true
-  - pytest -m count310
+  # The --genome value must match a key under params.genomes in workflow/configs/biohpc.config ('GRCh38-2020-A')
+  - nextflow run workflow/main.nf -profile biohpc,cluster --fastq "test_data/hu.v3s2r10k/*.fastq.gz" --designFile "test_data/hu.v3s2r10k/design.csv" --genome 'GRCh38-2020-A' --kitVersion 'auto' --version '4.0.0' --ci true
+  - pytest -m count400
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - .nextflow.log
-      - workflow/output/count310/sample1/outs/web_summary.html
-      - workflow/output/count310/sample2/outs/web_summary.html
+      - workflow/output/count400/sample1/outs/web_summary.html
+      - workflow/output/count400/sample2/outs/web_summary.html
      - workflow/output/multiqc/run/multiqc_report.html
    expire_in: 2 days
  retry:

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
+# v2.2.0-indev
+**User Facing**
+* Add cellranger version 4.0.0
+* Add references version 2020-A (GRCh38, mm10, mix)
+* Create option to create files for downstream viz and analysis (Seurat R-object)
+
+**Background**
+
+*Known Bugs*
+* Vizapp does not yet work for Astrocyte
+* Running in CLI: the --fastq path to the file/s needs to be in quotes
+
+
 # v2.1.1
 **User Facing**
 * Check Design File for spaces in name and file contents

--- a/LICENSE
+++ b/LICENSE
@@ -2,7 +2,7 @@ MIT License

 Copyright (c) 2019 University of Texas Southwestern Medical Center.

-Contributors: Gervaise H. Henry, Jeremy Mathews, and Venkat Malladi
+Contributors: Gervaise H. Henry, Jeremy Mathews, Jon Gesell, and Venkat Malladi

 Department: Bioinformatic Core Facility, Department of Bioinformatics


--- a/README.md
+++ b/README.md
@@ -108,12 +108,17 @@ To Run:
        * *'3.0.2'*
        * *'2.1.1'*
    * eg: **--version '3.1.0'**
+  * **--vizFiles**
+    * create objects which can be used for downstream visualization and analysis of each sample's outputs; currently creates:
+      * Seurat R-objects
+    * true/false
+    * eg: **--vizFiles true**
  * **--outDir**
    * optional output directory for run
    * eg: **--outDir 'test'**
 * FULL EXAMPLE:
  ```
-  nextflow run workflow/main.nf -profile biohpc,cluster --fastq '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v3s2r10k/*.fastq.gz' --designFile '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v3s2r10k/design.csv' --genome 'GRCh38-3.0.0' --kitVersion '3GEXv3' --version '3.1.0' --outDir 'test'
+  nextflow run workflow/main.nf -profile biohpc,cluster --fastq '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v3s2r10k/*.fastq.gz' --designFile '/project/shared/bicf_workflow_ref/workflow_testdata/cellranger/cellranger_count/hu.v3s2r10k/design.csv' --genome 'GRCh38-3.0.0' --kitVersion '3GEXv3' --version '3.1.0' --vizFiles true --outDir 'test'
  ```
 * Design example:


--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -100,12 +100,15 @@ workflow_parameters:
  - id: genome
    type: select
    choices:
+      - ['GRCh38-2020-A', 'Human GRCh38 release 98']
      - ['GRCh38-3.0.0', 'Human GRCh38 release 93']
      - ['GRCh38-1.2.0', 'Human GRCh38 release 84']
      - ['hg19-3.0.0', 'Human GRCh37 (hg19) release 87']
      - ['hg19-1.2.0', 'Human GRCh37 (hg19) release 84']
+      - ['mm10-2020-A', 'Mouse GRCm38 (mm10) release 98']
      - ['mm10-3.0.0', 'Mouse GRCm38 (mm10) release 93']
      - ['mm10-1.2.0', 'Mouse GRCm38 (mm10) release 84']
+      - ['GRCh38_and_mm10-2020-A', 'Human GRCh38 + Mouse GRCm38 (mm10) release 98']
      - ['GRCh38_and_mm10-3.1.0', 'Human GRCh38 + Mouse GRCm38 (mm10) release 93']
      - ['hg19_and_mm10-3.0.0', 'Human GRCh37 (hg19) + Mouse GRCm38 (mm10) release 93']
      - ['hg19_and_mm10-1.2.0', 'Human GRCh37 (hg19) + Mouse GRCm38 (mm10) release 84']
@@ -146,8 +149,9 @@ workflow_parameters:

  - id: version
    type: select
-    default: '3.1.0'
+    default: '4.0.0'
    choices:
+      - ['4.0.0', '4.0.0']
      - ['3.1.0', '3.1.0']
      - ['3.0.2', '3.0.2']
      - ['2.1.1', '2.1.1']
@@ -155,6 +159,17 @@ workflow_parameters:
    description: |
      10x cellranger version.

+  # vizFiles: choose whether downstream visualization/analysis objects are created (default 'true')
+  - id: vizFiles
+    type: select
+    choices:
+      - ['true', 'Yes']
+      - ['false', 'No']
+    default: 'true'
+    required: true
+    description: |
+      Create objects which can be used for downstream visualization and analysis of each sample outputs. Currently created: Seurat R-objects.
+
+
  - id: astrocyte
    type: select
    choices:

--- a/docs/references.md
+++ b/docs/references.md
 ### References

 1. **Nextflow**:
-  * Di Tommaso P., Chatzou M., Floden E. W., Barja P. P., Palumbo E., and Notredame C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology 35(4): 316. doi:[10.1038/nbt.3820](https://doi.org/10.1038/nbt.3820)
+  * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316-319. doi:[10.1038/nbt.3820](https://doi.org/10.1038/nbt.3820)

 2. **cellranger**
  * Cellranger count [https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count)
@@ -9,5 +9,8 @@
 3. **python**:
  * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))

-4. **MultiQc**:
+4. **Seurat**:
+  * Stuart, T., Butler, A., Hoffman, P., Hafemeister, C., Papalexi, E., Mauck III, W. M., ... & Satija, R. (2019). Comprehensive integration of single-cell data. Cell, 177(7), 1888-1902. doi:[10.1016/j.cell.2019.05.031](https://doi.org/10.1016/j.cell.2019.05.031)
+
+5. **MultiQc**:
  * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354)
--- a/workflow/configs/aws.config
+++ b/workflow/configs/aws.config
@@ -12,27 +12,35 @@ process {
  cpus = 1
  memory = '1 GB'

-  withLabel: checkDesignFile {
+  withName: checkDesignFile {
    cpus = 2
    memory = '1 GB'
  }
-  withLabel: count211 {
+  withName: count211 {
    cpus = 2
    memory = '30 GB'
  }
-  withLabel: count302 {
+  withName: count302 {
    cpus = 2
    memory = '30 GB'
  }
-  withLabel: count310 {
+  withName: count310 {
    cpus = 2
    memory = '30 GB'
  }
-  withLabel: versions {
+  withName: count400 {
+    cpus = 2
+    memory = '30 GB'
+  }
+  withName: downstreamViz {
+    cpus = 2
+    memory = '1 GB'
+  }
+  withName: versions {
    cpus = 3
    memory = '1 GB'
  }
-  withLabel: multiqc {
+  withName: multiqc {
    cpus = 1
    memory = '1 GB'
  }

--- a/workflow/configs/biohpc.config
+++ b/workflow/configs/biohpc.config
 params {
  // Reference file paths on BioHPC
  genomes {
+    'GRCh38-2020-A' {
+      loc = '/project/apps_database/cellranger/refdata-gex-'
+    }
    'GRCh38-3.0.0' {
      loc = '/project/apps_database/cellranger/refdata-cellranger-'
    }
@@ -13,12 +16,18 @@ params {
    'hg19-1.2.0' {
      loc = '/project/apps_database/cellranger/refdata-cellranger-'
    }
+    'mm10-2020-A' {
+      loc = '/project/apps_database/cellranger/refdata-gex-'
+    }
    'mm10-3.0.0' {
      loc = '/project/apps_database/cellranger/refdata-cellranger-'
    }
    'mm10-1.2.0' {
      loc = '/project/apps_database/cellranger/refdata-cellranger-'
    }
+    'GRCh38_and_mm10-2020-A' {
+      loc = '/project/apps_database/cellranger/refdata-gex-'
+    }
    'GRCh38_and_mm10-3.1.0' {
      loc = '/project/apps_database/cellranger/refdata-cellranger-'
    }
@@ -52,6 +61,11 @@ params {
  }
 }

+singularity {
+  enabled = true
+  cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
+}
+
 env {
  http_proxy = 'http://proxy.swmed.edu:3128'
  https_proxy = 'http://proxy.swmed.edu:3128'

--- a/workflow/configs/cluster.config
+++ b/workflow/configs/cluster.config
 process {
  executor = 'slurm'
-  queue = 'super'
+  queue = '32GB'
  clusterOptions = '--hold'

-  withName:trackStart {
+  withName: trackStart {
    executor = 'local'
  }
-  withName:checkDesignFile {
+  withName: checkDesignFile {
    executor = 'local'
  }
-  withName:count211 {
+  withName: count211 {
    queue = '128GB,256GB,256GBv1,384GB'
  }
-  withName:count302 {
+  withName: count302 {
    queue = '128GB,256GB,256GBv1,384GB'
  }
-  withName:count310 {
+  withName: count310 {
    queue = '128GB,256GB,256GBv1,384GB'
  }
-  withName:versions {
+  withName: count400 {
+    queue = '128GB,256GB,256GBv1,384GB'
+  }
+  withName: downstreamViz {
+    queue = '32GB'
+  }
+  withName: versions {
    executor = 'local'
  }
-  withName:multiqc {
+  withName: multiqc {
    executor = 'local'
  }
 }
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -21,12 +21,12 @@ main.nf
 params.name = "run"
 params.fastq = "test_data/mu.v3s1r500/*.fastq.gz"
 params.designFile = "test_data/mu.v3s1r500/design.csv"
-params.genome = 'mm10-3.0.0'
-params.genomeLocation = '/project/apps_database/cellranger/refdata-cellranger-'
+params.genome = 'mm10-2020-A'
 params.expectCells = 10000
 params.forceCells = 0
 params.kitVersion = '3GEXv3'
-params.version = '3.1.0'
+params.version = '4.0.0'
+params.vizFiles = true
 params.astrocyte = false
 params.outDir = "${baseDir}/output"

@@ -39,7 +39,11 @@ if (params.kitVersion == "3GEXv3" && params.version == '2.1.1') {
 // Define variables if astrocyte (or from config)
 if (params.astrocyte) {
  print("Running under astrocyte")
-  params.genomeLocation = '/project/apps_database/cellranger/refdata-cellranger-'
+  if (params.version == "4.0.0") {
+    params.genomeLocation = '/project/apps_database/cellranger/refdata-gex-'
+  } else {
+    params.genomeLocation = '/project/apps_database/cellranger/refdata-cellranger-'
+  }
  if (params.kitVersion == "3GEXv1") {
    params.chemistryParam ='SC3Pv1'
  } else if (params.kitVersion == "3GEXv2") {
@@ -60,7 +64,7 @@ if (params.astrocyte) {
 params.genomeLocationFull = params.genomeLocation+params.genome

 // Define variables from input
-pipelineVersion = "2.1.1"
+pipelineVersion = "2.2.0-indev"
 name = params.name
 designLocation = Channel
  .fromPath(params.designFile)
@@ -77,6 +81,7 @@ expectCells = params.expectCells
 forceCells = params.forceCells
 chemistryParam = params.chemistryParam
 version = params.version
+vizFiles = params.vizFiles
 outDir = params.outDir

 // Define script files
@@ -85,6 +90,8 @@ filename_checkScript = Channel.fromPath("$baseDir/scripts/filename_check.sh")
 generate_versionsScript = Channel.fromPath("$baseDir/scripts/generate_versions.py")
 generate_referencesScript = Channel.fromPath("$baseDir/scripts/generate_references.py")
 versions_pythonScript = Channel.fromPath("$baseDir/scripts/versions_python.sh")
+versions_seuratScript = Channel.fromPath("$baseDir/scripts/versions_seurat.sh")
+downstream_vizScript = Channel.fromPath("$baseDir/scripts/downstream_viz.r")

 // Define report files
 multiqcConf = "${baseDir}/configs/multiqc_config.yaml"
@@ -158,21 +165,26 @@ samples.into {
  samples211
  samples302
  samples310
+  samples400
 }
 refLocation.into {
  refLocation211
  refLocation302
  refLocation310
+  refLocation400
 }
 expectCells211 = expectCells
 expectCells302 = expectCells
 expectCells310 = expectCells
+expectCells400 = expectCells
 forceCells211 = forceCells
 forceCells302 = forceCells
 forceCells310 = forceCells
+forceCells400 = forceCells
 chemistryParam211 = chemistryParam
 chemistryParam302 = chemistryParam
 chemistryParam310 = chemistryParam
+chemistryParam400 = chemistryParam


 /*
@@ -192,7 +204,8 @@ process count211 {
    chemistryParam211

  output:
-    file("**/outs/**") into outPaths211
+    set sample, file("**/outs/**") into outPaths211
+    set sample, file("**/outs/filtered_*/**"), file("**/outs/analysis/clustering/graphclust/**"), file("**/outs/analysis/clustering/kmeans_2_clusters/**"), file("**/outs/analysis/clustering/kmeans_3_clusters/**"), file("**/outs/analysis/clustering/kmeans_4_clusters/**"), file("**/outs/analysis/clustering/kmeans_5_clusters/**"), file("**/outs/analysis/clustering/kmeans_6_clusters/**"), file("**/outs/analysis/clustering/kmeans_7_clusters/**"), file("**/outs/analysis/clustering/kmeans_8_clusters/**"), file("**/outs/analysis/clustering/kmeans_9_clusters/**"), file("**/outs/analysis/clustering/kmeans_10_clusters/**"), file("**/outs/analysis/pca/**"), file("**/outs/analysis/tsne/**") into filteredOut211
    file("*_metrics_summary.tsv") into metricsSummary211

  when:
@@ -239,7 +252,8 @@ process count302 {
    chemistryParam302

  output:
-    file("**/outs/**") into outPaths302
+    set sample, file("**/outs/**") into outPaths302
+    set sample, file("**/outs/filtered_*/**"), file("**/outs/analysis/clustering/graphclust/**"), file("**/outs/analysis/clustering/kmeans_2_clusters/**"), file("**/outs/analysis/clustering/kmeans_3_clusters/**"), file("**/outs/analysis/clustering/kmeans_4_clusters/**"), file("**/outs/analysis/clustering/kmeans_5_clusters/**"), file("**/outs/analysis/clustering/kmeans_6_clusters/**"), file("**/outs/analysis/clustering/kmeans_7_clusters/**"), file("**/outs/analysis/clustering/kmeans_8_clusters/**"), file("**/outs/analysis/clustering/kmeans_9_clusters/**"), file("**/outs/analysis/clustering/kmeans_10_clusters/**"), file("**/outs/analysis/pca/**"), file("**/outs/analysis/tsne/**") into filteredOut302
    file("*_metrics_summary.tsv") into metricsSummary302

  when:
@@ -285,7 +299,8 @@ process count310 {
    chemistryParam310

  output:
-    file("**/outs/**") into outPaths310
+    set sample, file("**/outs/**") into outPaths310
+    set sample, file("**/outs/filtered_*/**"), file("**/outs/analysis/clustering/graphclust/**"), file("**/outs/analysis/clustering/kmeans_2_clusters/**"), file("**/outs/analysis/clustering/kmeans_3_clusters/**"), file("**/outs/analysis/clustering/kmeans_4_clusters/**"), file("**/outs/analysis/clustering/kmeans_5_clusters/**"), file("**/outs/analysis/clustering/kmeans_6_clusters/**"), file("**/outs/analysis/clustering/kmeans_7_clusters/**"), file("**/outs/analysis/clustering/kmeans_8_clusters/**"), file("**/outs/analysis/clustering/kmeans_9_clusters/**"), file("**/outs/analysis/clustering/kmeans_10_clusters/**"), file("**/outs/analysis/pca/**"), file("**/outs/analysis/tsne/**"), file("**/outs/analysis/umap/**") into filteredOut310
    file("*_metrics_summary.tsv") into metricsSummary310

  when:
@@ -314,6 +329,87 @@ process count310 {
    }
 }

+/*
+ * count400: run cellranger count version 4.0.0
+ *
+ * Only executes when version == '4.0.0'. For each sample it runs
+ * filename_check.sh on the reference, runs cellranger count (with
+ * --expect-cells when forceCells400 is 0, otherwise with --force-cells),
+ * and converts outs/metrics_summary.csv into <sample>_metrics_summary.tsv.
+ */
+process count400 {
+  tag "${sample}"
+  publishDir "${outDir}/${task.process}", mode: 'copy'
+  queue '128GB,256GB,256GBv1,384GB'
+  module 'cellranger/4.0.0'
+
+  input:
+    set sample, file("${sample}_S?_L001_R1_001.fastq.gz"), file("${sample}_S?_L001_R2_001.fastq.gz"), file(script) from samples400
+    file ref from refLocation400.first()
+    expectCells400
+    forceCells400
+    chemistryParam400
+
+  output:
+    // NOTE(review): unlike count211/count302/count310, this set does not include 'sample' - confirm whether that is intentional
+    set file("**/outs/**") into outPaths400
+    set sample, file("**/outs/filtered_*/**"), file("**/outs/analysis/clustering/graphclust/**"), file("**/outs/analysis/clustering/kmeans_2_clusters/**"), file("**/outs/analysis/clustering/kmeans_3_clusters/**"), file("**/outs/analysis/clustering/kmeans_4_clusters/**"), file("**/outs/analysis/clustering/kmeans_5_clusters/**"), file("**/outs/analysis/clustering/kmeans_6_clusters/**"), file("**/outs/analysis/clustering/kmeans_7_clusters/**"), file("**/outs/analysis/clustering/kmeans_8_clusters/**"), file("**/outs/analysis/clustering/kmeans_9_clusters/**"), file("**/outs/analysis/clustering/kmeans_10_clusters/**"), file("**/outs/analysis/pca/**"), file("**/outs/analysis/tsne/**"), file("**/outs/analysis/umap/**") into filteredOut400
+    file("*_metrics_summary.tsv") into metricsSummary400
+
+  when:
+    version == '4.0.0'
+
+  script:
+    // Use this process's own *400 inputs (not the count310 variables) in the command line
+    if (forceCells400 == 0) {
+      """
+      hostname
+      ulimit -u 16384
+      ulimit -a
+      bash filename_check.sh -r ${ref}
+      cellranger count --id=${sample} --transcriptome=./${ref} --fastqs=. --sample=${sample} --expect-cells=${expectCells400} --chemistry=${chemistryParam400}
+      sed -E 's/("([^"]*)")?(,|\$)/\\2\t/g' ${sample}/outs/metrics_summary.csv | tr -d "," | sed "s/^/${sample}\t/" > ${sample}_metrics_summary.tsv
+      """
+    }
+    else {
+      """
+      hostname
+      ulimit -u 16384
+      ulimit -a
+      bash filename_check.sh -r ${ref}
+      cellranger count --id=${sample} --transcriptome=./${ref} --fastqs=. --sample=${sample} --force-cells=${forceCells400} --chemistry=${chemistryParam400}
+      sed -E 's/("([^"]*)")?(,|\$)/\\2\t/g' ${sample}/outs/metrics_summary.csv | tr -d "," | sed "s/^/${sample}\t/" > ${sample}_metrics_summary.tsv
+      """
+    }
+}
+
+// Collect all outputs regardless of cellranger version
+filteredOut = filteredOut211.mix(filteredOut302, filteredOut310, filteredOut400)
+// Combine all inputs for downstreamViz
+downstreamVizIn = downstream_vizScript.combine(versions_seuratScript).combine(filteredOut)
+
+/*
+ * downstreamViz: create files for downstream use (eg. R Seurat object)
+ *
+ * Only executes when vizFiles is truthy. For each tuple from downstreamVizIn
+ * it runs downstream_viz.r with the sample name and cellranger version,
+ * publishes the resulting .rds files to ${outDir}/seurat, and writes the
+ * Seurat version to version_seurat.txt using versions_seurat.sh.
+ *
+ * NOTE(review): the input set declares a umap element, but filteredOut211
+ * and filteredOut302 are emitted without a umap entry - confirm this process
+ * behaves as intended for cellranger versions 2.1.1 and 3.0.2.
+ * NOTE(review): version_seurat is only produced when this process runs, and
+ * the versions process declares it as an input - confirm versions still
+ * runs when vizFiles is false.
+ */
+process downstreamViz {
+  tag "${sample}"
+  publishDir "${outDir}/seurat", mode: 'copy', pattern: "*.rds"
+  module 'seurat/3.0.0'
+
+  input:
+    // Tuple layout: the two staged scripts, the sample name, then the cellranger output file groups staged into fixed sub-directories
+    set file("*"), file("*"), sample, file("filtered/*"), file("clustering/graphclust/*"), file("clustering/kmeans_2_clusters/*"), file("clustering/kmeans_3_clusters/*"), file("clustering/kmeans_4_clusters/*"), file("clustering/kmeans_5_clusters/*"), file("clustering/kmeans_6_clusters/*"), file("clustering/kmeans_7_clusters/*"), file("clustering/kmeans_8_clusters/*"), file("clustering/kmeans_9_clusters/*"), file("clustering/kmeans_10_clusters/*"), file("pca/*"), file("tsne/*"), file("umap/*") from downstreamVizIn
+    //file downstream_vizScript
+    //file versions_seuratScript
+
+  output:
+    file "*.rds" into seuratPaths
+    file "version_seurat.txt" into version_seurat
+
+  when:
+    vizFiles
+
+  script:
+    """
+    hostname
+    ulimit -a
+    seurat-Rscript downstream_viz.r --sample ${sample} --cellrangerVersion ${version}
+    bash versions_seurat.sh > version_seurat.txt
+    """
+}
+
 /*
 * versions: collect all versions into a single yml
 */
@@ -323,6 +419,7 @@ process versions {

  input:
    file versions_pythonScript
+    file version_seurat
    file generate_versionsScript
    file generate_referencesScript

@@ -343,7 +440,7 @@ process versions {
 }

 // Collect all metrics summaries regardless of cellranger version
-metricsSummary = metricsSummary211.mix(metricsSummary302, metricsSummary310)
+metricsSummary = metricsSummary211.mix(metricsSummary302, metricsSummary310, metricsSummary400)

 /*
 * multiqc: create multiqc report

--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
 profiles {
  standard {
    includeConfig 'configs/biohpc.config'
+    includeConfig 'configs/cluster.config'
  }
  biohpc {
    includeConfig 'configs/biohpc.config'
@@ -47,6 +48,6 @@ manifest {
  homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count'
  description = 'This pipeline is a wrapper for the cellranger count tool from 10x Genomics. It takes fastq files from 10x Genomics Single Cell Gene Expression libraries, performs alignment, filtering, barcode counting, and UMI counting. It uses the Chromium cellular barcodes to generate gene-barcode matrices, determine clusters, and perform gene expression analysis.'
  mainScript = 'main.nf'
-  version = '2.1.1'
+  version = '2.2.0-indev'
  nextflowVersion = '>=0.31.0'
 }
--- a/workflow/scripts/downstream_viz.r
+++ b/workflow/scripts/downstream_viz.r
+# downstream_viz.r
+# Build a Seurat object from cellranger count outputs found in the working
+# directory (filtered/, <reduction>/projection.csv, clustering/<name>/clusters.csv)
+# and save it as <sample>.rds.
+#
+# Options:
+#   --sample             name used for the output .rds file (default "sample1")
+#   --cellrangerVersion  cellranger version that produced the outputs (default "4.0.0")
+if (!require(optparse)) install.packages('optparse',repos='http://cran.us.r-project.org',quiet=TRUE)
+library(optparse)
+library(Seurat)
+
+option_list=list(
+  make_option("--sample",default="sample1",action="store",type='character',help="sample"),
+  make_option("--cellrangerVersion",default="4.0.0",action="store",type='character',help="cellranger Version")
+)
+opt=parse_args(OptionParser(option_list=option_list))
+rm(option_list)
+
+# Load the count matrix from filtered/ into a Seurat object
+data <- Read10X(data.dir="filtered/")
+data <-  CreateSeuratObject(counts=data)
+
+# umap projections are only read for versions other than 2.1.1 and 3.0.2
+dimReductions <- c("pca","tsne")
+if (opt$cellrangerVersion!="2.1.1" && opt$cellrangerVersion!="3.0.2"){
+  dimReductions <- c(dimReductions,"umap")
+}
+for (i in dimReductions){
+  if (i=="tsne"){
+    lab <- "tSNE"
+  } else {
+    lab <- toupper(i)
+  }
+  projection <- read.csv(paste0(i,"/projection.csv"),row.names=1)
+  # Strip only the trailing "-<digits>" suffix from each barcode (anchored at the end);
+  # presumably so the names match the cell names in the Seurat object - confirm
+  rownames(projection) <- gsub("-[0-9]+$","",rownames(projection))
+  data[[i]] <- CreateDimReducObject(embeddings=as.matrix(projection),key=paste0(lab,"_"),assay="RNA")
+}
+
+# Attach every clustering result as a metadata column named after the clustering
+clust <- c("graphclust",paste0("kmeans_",2:10,"_clusters"))
+for (i in clust){
+  clusters <- read.csv(paste0("clustering/",i,"/clusters.csv"),row.names=1)
+  rownames(clusters) <- gsub("-[0-9]+$","",rownames(clusters))
+  data[[i]] <- clusters
+}
+# Drop orig.ident from the metadata once, after all clusterings are attached
+data@meta.data <- data@meta.data[,colnames(data@meta.data)!="orig.ident"]
+
+saveRDS(data,paste0(opt$sample,".rds"))
--- a/workflow/scripts/generate_versions.py
+++ b/workflow/scripts/generate_versions.py
@@ -28,6 +28,7 @@ SOFTWARE_REGEX = {
    'Nextflow': ['version_nextflow.txt', r"(\S+)"],
    'cellranger count': ['version_cellranger.txt', r"(\S+)"],
    'python': ['version_python.txt', r"(\S+)"],
+    'seurat': ['version_seurat.txt', r"(\S+)"],
 }


@@ -77,6 +78,7 @@ def main():
    results['Nextflow'] = '<span style="color:#999999;\">N/A</span>'
    results['cellranger count'] = '<span style="color:#999999;\">N/A</span>'
    results['python'] = '<span style="color:#999999;\">N/A</span>'
+    results['seurat'] = '<span style="color:#999999;\">N/A</span>'

    # Check for version files:
    check_files(files)

--- a/workflow/scripts/versions_seurat.sh
+++ b/workflow/scripts/versions_seurat.sh
+#!/bin/bash
+#versions_seurat.sh
+#*
+#* --------------------------------------------------------------------------
+#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count/blob/develop/LICENSE)
+#* --------------------------------------------------------------------------
+#*
+#* Prints the installed Seurat package version to stdout by running
+#* packageVersion("Seurat") through seurat-Rscript, keeping the "[1] " result
+#* line and removing the "[1] " prefix and the surrounding quote characters.
+#* NOTE(review): the sed/tr filters match typographic quotes only; confirm R
+#* never prints plain ASCII quotes in the environment where this runs.
+
+seurat-Rscript -e 'packageVersion("Seurat")' |& grep '\[1\] ' | sed -n -e 's/^\[1\] ‘//p' | tr -d '’'
--- a/workflow/tests/test_count.py
+++ b/workflow/tests/test_count.py
@@ -33,4 +33,9 @@ def test_count302_count():
 @pytest.mark.count310
 def test_count310_count():
    assert os.path.exists(os.path.join(test_output_path, 'count310', 'sample1_metrics_summary.tsv'))
-    assert os.path.exists(os.path.join(test_output_path, 'count310', 'sample1', 'outs'))
\ No newline at end of file
+    assert os.path.exists(os.path.join(test_output_path, 'count310', 'sample1', 'outs'))
+
+@pytest.mark.count400
+def test_count310_count():
+    assert os.path.exists(os.path.join(test_output_path, 'count400', 'sample1_metrics_summary.tsv'))
+    assert os.path.exists(os.path.join(test_output_path, 'count400', 'sample1', 'outs'))