diff --git a/.gitignore b/.gitignore index 8157465e3cb8b758857989e70e6e461e414de743..a90209aefb30ed1a2316f2e77fae4b420a88fbb5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ workflow/work workflow/*.html* workflow/*.csv +workflow/test.nf test_data/pbmc_1k* test_data/Brain_Tumor* -test_data/hgmm_100_fastqs +test_data/hgmm_100* test_data/fastqs *.DS_Store diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b0f3669e3e16b02d07877a84703f4d7a7e23b6db..6d9feb5d7b5f562d08d298a82476af4611806621 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,7 +21,7 @@ astrocyte_check: # Check for astrocyte validity - module load astrocyte/2.0.1 - astrocyte_cli check . -test-count-human: # Run cellranger count on the human data +test-count-multisample: # Run cellranger count on both human and mouse data stage: test needs: [] tags: @@ -31,24 +31,7 @@ test-count-human: # Run cellranger count on the human data script: - > nextflow run workflow/main.nf - --reference=/data/ref_data/refdata-gex-GRCh38-2020-A + --sample_sheet=/data/test_data/sample_sheet.csv --fastq=/data/test_data/Brain_Tumor_3p_LT_fastqs --noBam=true - nextflow clean -f -keep-logs - -test-count-barnyard: # Run cellranger count on the barnyard data - stage: test - needs: [] - tags: - - vm - before_script: - - export PATH="/opt/nextflow:/opt/cellranger-7.1.0:$PATH" - script: - - > - nextflow run workflow/main.nf - --id=hgmm_100 - --sample=hgmm_100 - --fastq=/data/test_data/hgmm_100_fastqs - --reference=/data/ref_data/refdata-gex-GRCh38-and-mm10-2020-A - --noBam=true - - nextflow clean -f -keep-logs diff --git a/README.md b/README.md index 2183ad257e9e7d8c8d1b72bb2f8fc9571d5cb8d8..dc323062541c810a43f18ae93b78b8c8e95c4372 100755 --- a/README.md +++ b/README.md @@ -15,24 +15,29 @@ file of aligned reads. These outputs can then be used for downstream analysis. ## Parameters -- Sample: The name of the sample. This should match the sample -name of the fastq files. - Fastq: The fastq files for the sample. 
Regardless of the files that are selected, only those matching the sample name will be included. -- Reference: Which reference genome to use. Current choices are -hg38 and mm10. -- expectCells: How many cells are expected to be present. Leave at -the default (0) to automatically estimate. This may be manually set if -the estimate is inaccurate. -- Chemistry: The chemistry used to create the library. Automatic detection -is recommended, but if the library chemistry is **3'v1** or **multiome GEX**, -you must set these manually. -- Introns: Should intronic reads be counted? Recommended to -keep true. Note that this **must** be true to process data from single nucleus -suspensions. -- noBam: Should the pipeline skip generating a bam file? -Bam file generation is recommended, but it may be skipped to reduce file -size output and speed up processing time. + +- sample_sheet: A CSV file with a header row containing the following named columns: + + - **sample**: The name of the sample. This must match the prefix of the associated fastq files. + - **reference**: Which reference genome to use. This workflow currently supports the values "hg38" or "mm10". + - **expectCells**: The number of cells expected from this sample. Set to "0" for auto-detection. + - **chemistry**: The chemistry used to generate libraries. Set to "auto" for auto-detection. + Note that if the chemistry is 3' v1 or you're analyzing GEX data alone generated from + multiome, you must set this explicitly. Possible values: + - "auto": auto detect + - "SC3Pv1": Single cell 3' v1 + - "SC3Pv2": Single cell 3' v2 + - "SC3Pv3": Single cell 3' v3 + - "SC3Pv3LT": Single cell 3' v3 LT + - "SC3Pv3HT": Single cell 3' v3 HT + - "SC5P-PE": Single cell 5' paired-end + - "SC5P-R2": Single cell 5' R2-only + - "ARC-v1": GEX only from multiome + - **introns**: Whether to count intronic reads ("true" or "false"). + - **noBam**: Whether to skip bam file generation ("true" or "false"). 
This will save some time and space, but + bam files may be required for downstream analysis and/or deposition into public databases. ## Questions diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index d594f551a359533ea80e8195ae72840b6dc8c248..6fcc48ff7354f8075ef62e6f93d5aea89ade54c3 100755 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -23,9 +23,9 @@ description: | #### New Features in Astrocyte 0.4.0 and above #### citation: | - Nextflow 22.04.5: https://www.nextflow.io/ + Nextflow 23.04.3: https://www.nextflow.io/ 10x Genomics Cell Ranger 7.1.0: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger - + Please acknowledge support in publications: This research was supported in part by the computational resources provided by the BioHPC supercomputing facility located in the Lyda Hill Department of @@ -104,11 +104,12 @@ workflow_modules: workflow_parameters: - - id: sample - type: string + - id: sample_sheet + type: file required: true + regex: ".*csv$" description: | - The name of the sample. This should match the prefix on the input fastq files. + A sample sheet in CSV format. See documentation below for required formatting. - id: fastq type: files @@ -118,64 +119,11 @@ workflow_parameters: The fastq.gz files to be passed to cellranger count, typically generated by cellranger mkfastq or bcl2fastq. - - id: reference - type: select - required: true - default: "hg38" - description: | - Which reference genome to use to align the reads. - choices: - - [ "hg38", "Human reference GRCh38" ] - - [ "mm10", "Mouse refernece mm10" ] - - - id: expectCells - type: integer - required: true - default: 0 - min: 0 - description: | - An estimate of how many cells you expect to be present. If 0, - cellranger will automatically estimate the number of cells present. - - - id: chemistry + - id: astrocyte type: select - default: "auto" - required: true - description: | - The chemistry used when preparing the library. 
It's recommended to use the default - (auto detect) UNLESS the chemistry is 3' v1, analyzing GEX data alone generated from - multiome, or if the autodetection has failed. choices: - - [ "auto", "Auto detect" ] - - [ "SC3Pv1", "Single cell 3' v1" ] - - [ "SC3Pv2", "Single cell 3' v2" ] - - [ "SC3Pv3", "Single cell 3' v3" ] - - [ "SC3Pv3LT", "Single cell 3' v3 LT" ] - - [ "SC3Pv3HT", "Single cell 3' v3 HT" ] - - [ "SC5P-PE", "Single cell 5' paired-end" ] - - [ "SC5P-R2", "Single cell 5' R2-only" ] - - [ "ARC-v1", "GEX only from multiome" ] - - - id: introns - type: select + - [ 'true', 'true'] required: true - default: "true" + default: 'true' description: | - Include intronic reads in the alignment? This is generally recommended to reduce - the number of reads discarded, and is required for single nucleus preps. - choices: - - [ "true", "Include introns" ] - - [ "false", "Exclude introns" ] - - - id: noBam - type: select - required: true - default: "false" - description: | - Skip bam file generation? This will speed up the process and save space, - but is only recommended for test or comparison runs as bam files are - useful for troubleshooting and may be required when depositing data - in public repositories. - choices: - - [ "true", "Skip bam file generation"] - - [ "false", "Do not skip bam file generation"] + Provide proper configuration for use on Astrocyte. diff --git a/docs/index.md b/docs/index.md index 35425463f61151552b4d7048834dc6f5dfd3651e..39461ed9b7cbe3a531e14ca0b63a03f8a3879960 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,24 +9,43 @@ workflow. To run this workflow, you must supply the following parameters: -- Sample: The name of the sample. This should match the sample -name of the fastq files. - Fastq: The fastq files for the sample. Regardless of the files that are -selected, only those matching the sample name will be included. -- Reference: Which reference genome to use. Current choices are -hg38 and mm10. 
-- expectCells: How many cells are expected to be present. Leave at -the default (0) to automatically estimate. This may be manually set if -the estimate is inaccurate. -- Chemistry: The chemistry used to create the library. Automatic detection -is recommended, but if the library chemistry is **3'v1** or **multiome GEX**, -you must set these manually. -- Introns: Should intronic reads be counted? Recommended to -keep true. Note that this **must** be true to process data from single nucleus -suspensions. -- noBam: Should the pipeline skip generating a bam file? -Bam file generation is recommended, but it may be skipped to reduce file -size output and speed up processing time. +selected, only those with prefixes matching the sample name will be included. + +- sample_sheet: A CSV file with a header row containing the following named columns: + + - **sample**: The name of the sample. This must match the prefix of the associated fastq files. + - **reference**: Which reference genome to use. This workflow currently supports the values "hg38" or "mm10". + - **expectCells**: The number of cells expected from this sample. Set to "0" for auto-detection. + - **chemistry**: The chemistry used to generate libraries. Set to "auto" for auto-detection. + Note that if the chemistry is 3' v1 or you're analyzing GEX data alone generated from + multiome, you must set this explicitly. Possible values: + - "auto": auto detect + - "SC3Pv1": Single cell 3' v1 + - "SC3Pv2": Single cell 3' v2 + - "SC3Pv3": Single cell 3' v3 + - "SC3Pv3LT": Single cell 3' v3 LT + - "SC3Pv3HT": Single cell 3' v3 HT + - "SC5P-PE": Single cell 5' paired-end + - "SC5P-R2": Single cell 5' R2-only + - "ARC-v1": GEX only from multiome + - **introns**: Whether to count intronic reads ("true" or "false"). + - **noBam**: Whether to skip bam file generation ("true" or "false"). This will save some time and space, but + bam files may be required for downstream analysis and/or deposition into public databases. 
+ +## Output + +This workflow will provide the following output: + +- analysis: a directory containing data regarding differential expression, clustering, etc. The files +in this directory can be consumed by downstream analysis tools. +- raw_feature_bc_matrix(.h5): the raw (all droplets included) counts matrix in MTX (or .h5) format. +- filtered_feature_bc_matrix(.h5): the filtered (empty droplets removed) counts matrix in MTX (or .h5) format. +- web_summary.html: an HTML report providing a summary of the run. +- metrics_summary.csv: a summary of the metrics reported in web_summary.html. +- molecule_info.h5: a file containing per-molecule information for all high-quality and assigned reads +- cloupe.cloupe: a file that can be read by 10x Genomics Loupe Cell Browser for interactive visualization +- possorted_genome_bam.bam(.bai): a bam/index file containing alignment. Not generated if `noBam` set to "true". ## Test data diff --git a/test_data/sample_sheet.csv b/test_data/sample_sheet.csv new file mode 100644 index 0000000000000000000000000000000000000000..8f88ade1d3b7fe0f8f08eef0c69c799ad5e20a6a --- /dev/null +++ b/test_data/sample_sheet.csv @@ -0,0 +1,3 @@ +sample,reference,expectCells,chemistry,introns,noBam +Brain_Tumor_3p_LT,hg38,0,auto,true,true +hgmm_100,mm10,0,auto,true,true \ No newline at end of file diff --git a/workflow/configs/biohpc.config b/workflow/configs/biohpc.config index 552c35df837b2d16b077e1feafb148a405de16fd..2fb9ebcfe0159c8519eb9bff1ba5a3f04177aa08 100755 --- a/workflow/configs/biohpc.config +++ b/workflow/configs/biohpc.config @@ -1,7 +1,7 @@ process { executor = 'slurm' clusterOptions = '--hold --no-kill' - queue = '256GB,256GBv1,128GB' + queue = '512GB,256GB,256GBv1,128GB' time = '8h' } diff --git a/workflow/main.nf b/workflow/main.nf index 3ade364267fb268fe22d8d3f38a63ebe7dddbcf7..0144fd1c182f31377690dad678c6acdd77c8e7e5 100755 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,149 +1,108 @@ -/* - * Copyright (c) 2023. 
The University of Texas Southwestern Medical Center - * - * This file is part of the BioHPC Workflow Platform - * - * This is a workflow package to run cellranger count on fastq files from single cell RNA data. - * - * @authors - * John Lafin - * - */ - - -// Parameters for input values -params.sample = "Brain_Tumor_3p_LT" -params.fastq = "${projectDir}/../test_data/Brain_Tumor_3p_LT_fastqs" -params.reference = "hg38" -params.expectCells = 0 -params.chemistry = "auto" -params.introns = true -params.noBam = false -params.outDir = "${projectDir}/output" - -// Run cellranger count -process cr_count { - publishDir "${outDir}", mode: 'copy' - queue "256GB,256GBv1" - module 'cellranger/7.1.0' - module 'singularity/3.9.9' - - input: - val sample - path ref - path fastq - val expectCells - val chemistry - val introns - val noBam - path outDir - - output: - tuple val(sample), path("**/outs/**") - - script: - if( expectCells == 0 ) - if( noBam ) - """ - cellranger count --id=$sample \ - --transcriptome=$ref \ - --fastqs=. \ - --sample=$sample \ - --chemistry=$chemistry \ - --include-introns=$introns \ - --no-bam - """ - else - """ - cellranger count --id=$sample \ - --transcriptome=$ref \ - --fastqs=. \ - --sample=$sample \ - --chemistry=$chemistry \ - --include-introns=$introns - """ - - else if( expectCells > 0) - if( noBam ) - """ - cellranger count --id=$sample \ - --transcriptome=$ref \ - --fastqs=. \ - --sample=$sample \ - --expect-cells=$expectCells \ - --chemistry=$chemistry \ - --include-introns=$introns \ - --no-bam - """ - else - """ - cellranger count --id=$sample \ - --transcriptome=$ref \ - --fastqs=. 
\ - --sample=$sample \ - --expect-cells=$expectCells \ - --chemistry=$chemistry \ - --include-introns=$introns - """ -} - -// Download reference genome if missing -process get_reference { - queue "super" - module 'singularity/3.9.9' - - input: - val ref_name - - output: - path ("refdata*", type: "dir") - - script: - if( ref_name=="hg38" ) { - """ - wget -q https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz - tar -zxf refdata-gex-GRCh38-2020-A.tar.gz - rm refdata-gex-GRCh38-2020-A.tar.gz - """ - } - else if( ref_name=="mm10" ) { - """ - wget -q https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz - tar -zxf refdata-gex-mm10-2020-A.tar.gz - rm refdata-gex-mm10-2020-A.tar.gz - """ - } -} - -workflow { - // Select reference genome - if (file(params.reference).exists()) { - ref = file(params.reference) - } - else if (params.reference == "hg38") { - ref = file("/project/apps_database/cellranger/refdata-gex-GRCh38-2020-A") - } - else if (params.reference == "mm10") { - ref = file("/project/apps_database/cellranger/refdata-gex-mm10-2020-A") - } - else { - ref = file("missing_reference") - } - - // Download reference if missing - if( ref.isEmpty() ) { - ref = get_reference(params.reference) - } - - // Define channels from variables - sample = Channel.value(params.sample) - fastq = Channel.fromPath(params.fastq).collect() - expectCells = Channel.value(params.expectCells) - chemistry = Channel.value(params.chemistry) - introns = Channel.value(params.introns) - noBam = Channel.value(params.noBam) - outDir = Channel.value(params.outDir) - - // Run cellranger count - cr_count(sample, ref, fastq, expectCells, chemistry, introns, noBam, outDir) -} \ No newline at end of file +/* + * Copyright (c) 2023. The University of Texas Southwestern Medical Center + * + * This file is part of the BioHPC Workflow Platform + * + * This is a workflow package to run cellranger count on fastq files from single cell RNA data. 
+ *
+ * @authors
+ *     John Lafin
+ *
+ */
+
+
+// Parameters for input values
+params.sample_sheet = "${projectDir}/../test_data/sample_sheet.csv"
+params.fastq = "${projectDir}/../test_data/fastqs"
+params.outDir = "${projectDir}/output"
+params.astrocyte = true
+
+/*
+ * Run cellranger count for a single row of the sample sheet.
+ *
+ * Inputs:
+ *   - tuple (sample, ref, expectCells, chemistry, introns, noBam) describing one
+ *     sample; the workflow block below converts expectCells to an integer and
+ *     noBam to a boolean before the tuple reaches this process
+ *   - fastq:  the collected fastq paths (cellranger is pointed at "." to find them)
+ *   - outDir: directory that publishDir copies the results into
+ *
+ * Output: a tuple of the sample name and every file under <sample>/outs.
+ * errorStrategy 'ignore' means a failing sample does not stop the other samples.
+ */
+process cr_count {
+    publishDir "${outDir}", mode: 'copy'
+    module 'cellranger/7.1.0'
+    module 'singularity/3.9.9'
+    errorStrategy 'ignore'
+
+    input:
+    tuple val(sample), val(ref), val(expectCells), val(chemistry), val(introns), val(noBam)
+    path(fastq)
+    path(outDir)
+
+    output:
+    tuple val(sample), path("${sample}/outs/**")
+
+    script:
+    // Check reference. With params.astrocyte set, "hg38" and "mm10" map to fixed
+    // paths under /project/apps_database and any other value is treated as a path.
+    def transcriptome = ref
+    if (params.astrocyte) {
+        switch (ref) {
+            case "hg38":
+                transcriptome = file("/project/apps_database/cellranger/refdata-gex-GRCh38-2020-A")
+                break
+            case "mm10":
+                transcriptome = file("/project/apps_database/cellranger/refdata-gex-mm10-2020-A")
+                break
+            default:
+                transcriptome = file(ref)
+        }
+    }
+
+    if ( transcriptome.isEmpty() ) {
+        error "Error: Missing reference. Please provide one of the options outlined in the documentation."
+    }
+
+    // Reject negative counts explicitly instead of producing no command at all.
+    if ( expectCells < 0 ) {
+        error "Error: expectCells must be 0 (auto-detect) or a positive integer."
+    }
+
+    // Optional flags: --expect-cells only for a positive count (0 leaves the
+    // estimate to cellranger), --no-bam only when noBam is true.
+    def expectArg = expectCells > 0 ? "--expect-cells=${expectCells}" : ""
+    def bamArg = noBam ? "--no-bam" : ""
+
+    // Run Cell Ranger count
+    """
+    cellranger count --id=$sample \
+        --transcriptome=$transcriptome \
+        --fastqs=. \
+        --sample=$sample \
+        --chemistry=$chemistry \
+        --include-introns=$introns \
+        $expectArg $bamArg
+    """
+}
+
+workflow {
+    // Parse sample sheet and run cellranger count
+    fastq = channel.fromPath(params.fastq).collect()
+    outDir = channel.value(params.outDir)
+    // splitCsv yields every field as a String. Convert the two fields that
+    // cr_count branches on: the String "0" is not == 0, and the non-empty
+    // String "false" is truthy, which would always add --no-bam.
+    input = channel.fromPath(params.sample_sheet) \
+        | splitCsv(header: true) \
+        | map{ row ->
+            tuple( row.sample,
+                   row.reference,
+                   row.expectCells.toInteger(),
+                   row.chemistry,
+                   row.introns,
+                   row.noBam.toBoolean() )
+        }
+    cr_count(input, fastq, outDir)
+}