Commit 009c4f8c authored by John Lafin

update scripts and config

parent e8c2b0c9
workflow/work
workflow/images/singularity
workflow/*.html*
workflow/*.csv
test_data/pbmc_1k*
test_data/Brain_Tumor*
test_data/hgmm_100*
test_data/fastqs
*.DS_Store
.history
.nextflow.log*
*.img
*.txt*
*.wordcount
.nextflow/
.rlibrary/
dag.dot*
report.html*
output/
work/
#!/usr/bin/env Rscript
# Run basic QC on scRNA data
# Author: John T Lafin
# Updated: 2023-11-30
# Load libraries
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(Seurat)
library(scDblFinder)
library(future)
library(future.apply)
library(harmony)
library(intrinsicDimension)
library(clustree)
})
# Set up parallelization
nworkers <- 8
plan(multisession, workers=nworkers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Define inputs
args <- commandArgs(trailingOnly=TRUE)
metadata_file <- args[1]
cutoffs_file <- args[2]
species <- args[3]
batch_var <- args[4]
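# Example invocation (illustrative paths; the workflow calls this script the same way):
#   Rscript quality_control.R metadata.csv cutoffs.csv Human Sample
# species: "Mouse" and "Barnyard" get species-specific handling below; any other value is
# treated as human. batch_var names the metadata column used for Harmony batch correction;
# omit it to skip batch correction.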
# Read in metadata
# Format:
## Sample: name of sample
## Filename: path to the .h5 count matrix for that sample (read with Read10X_h5)
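## Any additional columns are added to each object's metadata and plotted as UMAP groupings below
## Example header (illustrative): Sample,Filename,Condition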
md <- read.csv(metadata_file)
# Read in cutoffs if provided
# Format:
## Sample: name of sample, matching metadata file
## nCount_RNA_hi: upper bound of nCount_RNA
## nCount_RNA_lo: lower bound of nCount_RNA
## nFeature_RNA_hi: upper bound of nFeature_RNA
## nFeature_RNA_lo: lower bound of nFeature_RNA
## percent_mito: upper bound of percent.mito
## stress_score: upper bound of stress score, given as percentile
## numPCs: number of PCs to use for UMAP
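## Example header (per the format above): Sample,nCount_RNA_hi,nCount_RNA_lo,nFeature_RNA_hi,nFeature_RNA_lo,percent_mito,stress_score,numPCs
## Leave a field empty (NA) to keep the default cutoff computed below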
if (file.exists(cutoffs_file)) {
cutoffs <- read.csv(cutoffs_file)
}
# Load data from cellranger
sc <- list()
for (i in seq_len(nrow(md))) {
  sc[[md[i, 1]]] <- Read10X_h5(md[i, 2])
}
# Create Seurat objects
sc <- future_lapply(sc,
CreateSeuratObject,
min.features = 200,
min.cells = 3)
# Add sample names to metadata
for (i in seq_along(sc)) {
sc[[i]] <- AddMetaData(sc[[i]],
metadata = names(sc)[i],
col.name = "Sample")
}
# Add other metadata
for (i in seq_along(sc)) {
  for (j in 2:ncol(md)) {
    sc[[i]] <- AddMetaData(sc[[i]],
                           metadata = md[i, j],
                           col.name = colnames(md)[j])
  }
}
# Run doublet detection
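## scDblFinder operates on SingleCellExperiment objects, so convert each Seurat object and back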
sc <- future_lapply(sc, function(x) {
x %>%
as.SingleCellExperiment() %>%
scDblFinder(clusters = TRUE) %>%
as.Seurat(data=NULL)},
future.seed = TRUE)
# Calculate QC metrics
## Establish pattern of mitochondrial genes and stress genes
mt.pattern <- "^(MT|mt)-"
stress.genes <- c("JUN", "FOS", "EGR1",
"ATF3", "JUNB", "GADD45B",
"IER2", "ZFP36", "DNAJB1",
"RHOB", "NR4A1", "UBC", "HES1")
## Format stress and mito gene names if these data are from mouse or barnyard
if (species == "Mouse") {
stress.genes <- tools::toTitleCase(tolower(stress.genes))
} else if (species == "Barnyard") {
stress.genes <- c(paste0("GRCh38-",stress.genes), paste0("mm10---",tools::toTitleCase(tolower(stress.genes))))
mt.pattern <- "^(GRCh38-MT|mm10---mt)-"
}
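## Note: the "GRCh38-" and "mm10---" prefixes assume the genome labels used by the barnyard
## reference that produced the count matrix; adjust them if the reference names its genomes differently.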
## Calculate mitochondrial gene percentage and stress score
## Turn off future here as it doesn't play nicely with NormalizeData()
plan(sequential)
sc <- lapply(sc, NormalizeData)
plan(multisession, workers=nworkers)
sc <- future_lapply(sc, function(x) {
x[["percent.mito"]] <- PercentageFeatureSet(x, pattern = mt.pattern)
AddModuleScore(x, features = list(stress.genes), name = "Stress.score")},
future.seed = TRUE)
## Remove the trailing "1" that AddModuleScore appends to the score name
for (i in names(sc)) {
sc[[i]]@meta.data <- sc[[i]]@meta.data %>%
dplyr::rename(Stress.score = Stress.score1)
}
## Collect QC metrics
qc.dt <- tibble()
for (i in sc) {
temp <- tibble("Barcode" = Cells(i),
"Sample" = i$Sample,
"nCount_RNA" = i$nCount_RNA,
"nFeature_RNA" = i$nFeature_RNA,
"percent.mito" = i$percent.mito,
"doublet.score" = i$scDblFinder.score,
"doublet.class" = i$scDblFinder.class,
"stress.score" = i$Stress.score)
qc.dt <- bind_rows(qc.dt, temp)
}
qc.dt$Sample <- as.factor(qc.dt$Sample)
# Calculate QC cutoffs
## Log transform nCount and nFeature, calculate median and MAD
## Default cutoffs for these are 3 MADs in either direction of log-transformed median
## Default cutoff for percent.mito is 10% for human, 5% for mouse
## No stressed cell or doublet removal by default
mads <- qc.dt %>%
group_by(Sample) %>%
mutate(across(.cols = c(nCount_RNA, nFeature_RNA),
.fns = log)) %>%
summarize(across(.cols = c(nCount_RNA, nFeature_RNA, percent.mito, stress.score),
.fns = list(med = median, mad = mad))) %>%
mutate(nCount_threshold_lo = exp(nCount_RNA_med - 3*nCount_RNA_mad),
nCount_threshold_hi = exp(nCount_RNA_med + 3*nCount_RNA_mad),
nFeature_threshold_lo = exp(nFeature_RNA_med - 3*nFeature_RNA_mad),
nFeature_threshold_hi = exp(nFeature_RNA_med + 3*nFeature_RNA_mad),
percent.mito_threshold = ifelse(species == "Mouse", 5, 10))
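## Worked example (illustrative numbers): if log(nCount_RNA) has median 8.5 and MAD 0.6 for a
## sample, the cutoffs are exp(8.5 - 1.8) ≈ 812 and exp(8.5 + 1.8) ≈ 29,733 UMIs.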
# Replace cutoffs with user supplied values if they exist
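## Note: an override is written into the `mads` column of the same name, so it only changes the
## filtering step below if that column name matches one of the *_threshold_* columns created above.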
if (exists("cutoffs")) {
for (metric in colnames(cutoffs)) {
for (s in cutoffs$Sample) {
if (!is.na(cutoffs[cutoffs$Sample == s, metric])) {
mads[mads$Sample == s, metric] <- cutoffs[cutoffs$Sample == s, metric]
}
}
}
}
# Generate pre-filter plots
## Set output directory
outdir <- "plots/pre-filter/"
dir.create(outdir, recursive = TRUE)
p.list <- list()
## nCount_RNA
p.list$nCount <- ggplot(qc.dt, aes(x = Sample, y = nCount_RNA)) +
geom_violin() +
scale_y_log10() +
facet_wrap(~Sample, scales = "free") +
geom_hline(data = mads, aes(yintercept = nCount_threshold_lo), linetype = "dashed") +
geom_hline(data = mads, aes(yintercept = nCount_threshold_hi), linetype = "dashed")
## nFeature_RNA
p.list$nFeature <- ggplot(qc.dt, aes(x = Sample, y = nFeature_RNA)) +
geom_violin() +
scale_y_log10() +
facet_wrap(~Sample, scales = "free") +
geom_hline(data = mads, aes(yintercept = nFeature_threshold_lo), linetype = "dashed") +
geom_hline(data = mads, aes(yintercept = nFeature_threshold_hi), linetype = "dashed")
## percent.mito
p.list$mito <- ggplot(qc.dt, aes(x = Sample, y = percent.mito)) +
geom_violin() +
facet_wrap(~Sample, scales = "free") +
geom_hline(data = mads, aes(yintercept = percent.mito_threshold), linetype = "dashed")
## Stress.score
p.list$stress <- ggplot(qc.dt, aes(x = Sample, y = stress.score)) +
geom_violin() +
facet_wrap(~Sample, scales = "free")
## Doublet scores
dblts.ncount <- tibble()
for (i in sc) {
temp <- tibble("Barcode" = Cells(i),
"Sample" = i$Sample,
"nCount_RNA" = i$nCount_RNA,
"nFeature_RNA" = i$nFeature_RNA,
"doublet.score" = i$scDblFinder.score,
"doublet.class" = i$scDblFinder.class)
dblts.ncount <- bind_rows(dblts.ncount, temp)
}
p.list$dblts <- ggplot(dblts.ncount, aes(x = nCount_RNA, y = doublet.score, color = doublet.class)) +
geom_point() +
facet_wrap(~Sample)
## Trend plots
p.list$trend <- ggplot(qc.dt, aes(x = nCount_RNA, y = nFeature_RNA)) +
facet_wrap(~Sample, scales = "free") +
geom_point(aes(color = percent.mito)) +
stat_smooth() +
scale_x_log10() +
scale_y_log10() +
labs(x = "UMIs",
y = "Genes",
color = "% Mito")
## Save plots
names(p.list) <- c("nCount_RNA", "nFeature_RNA", "percent_mito", "stress_score", "doublet_score", "trend")
for (i in seq_along(p.list)) {
ggsave(filename = paste0(outdir, names(p.list)[i], ".png"),
plot = p.list[[i]])
}
# Perform filtering
sc2 <- list()
# Name each filtered object by sample so add.cell.ids works when merging below
for (i in seq_along(sc)) {
  thr <- mads %>%
    filter(Sample == names(sc)[i])
  sc2[[names(sc)[i]]] <- subset(sc[[i]],
                                subset = nCount_RNA > thr$nCount_threshold_lo &
                                  nCount_RNA < thr$nCount_threshold_hi &
                                  nFeature_RNA > thr$nFeature_threshold_lo &
                                  nFeature_RNA < thr$nFeature_threshold_hi &
                                  percent.mito < thr$percent.mito_threshold)
}
qc.dt2 <- tibble()
for (i in sc2) {
temp <- tibble("Sample" = i$Sample,
"nCount_RNA" = i$nCount_RNA,
"nFeature_RNA" = i$nFeature_RNA,
"percent.mito" = i$percent.mito,
"doublet.score" = i$scDblFinder.score,
"doublet.class" = i$scDblFinder.class,
"stress.score" = i$Stress.score)
qc.dt2 <- bind_rows(qc.dt2, temp)
}
qc.dt2$Sample <- as.factor(qc.dt2$Sample)
# Generate post-filter plots
## Set output directory
outdir <- "plots/post-filter/"
dir.create(outdir, recursive = TRUE)
p.list <- list()
## nCount_RNA
p.list$nCount <- ggplot(qc.dt2, aes(x = Sample, y = nCount_RNA)) +
geom_violin() +
scale_y_log10() +
facet_wrap(~Sample, scales = "free")
## nFeature_RNA
p.list$nFeature <- ggplot(qc.dt2, aes(x = Sample, y = nFeature_RNA)) +
geom_violin() +
scale_y_log10() +
facet_wrap(~Sample, scales = "free")
## percent.mito
p.list$mito <- ggplot(qc.dt2, aes(x = Sample, y = percent.mito)) +
geom_violin() +
facet_wrap(~Sample, scales = "free")
## Stress.score
p.list$stress <- ggplot(qc.dt2, aes(x = Sample, y = stress.score)) +
geom_violin() +
facet_wrap(~Sample, scales = "free")
## Doublet scores
dblts.ncount2 <- tibble()
for (i in sc2) {
temp <- tibble("Barcode" = Cells(i),
"Sample" = i$Sample,
"nCount_RNA" = i$nCount_RNA,
"nFeature_RNA" = i$nFeature_RNA,
"doublet.score" = i$scDblFinder.score,
"doublet.class" = i$scDblFinder.class)
dblts.ncount2 <- bind_rows(dblts.ncount2, temp)
}
p.list$dblts <- ggplot(dblts.ncount2, aes(x = nCount_RNA, y = doublet.score, color = doublet.class)) +
geom_point() +
facet_wrap(~Sample)
## Trend plots
p.list$trend <- ggplot(qc.dt2, aes(x = nCount_RNA, y = nFeature_RNA)) +
facet_wrap(~Sample, scales = "free") +
geom_point(aes(color = percent.mito)) +
stat_smooth() +
scale_x_log10() +
scale_y_log10() +
labs(x = "UMIs",
y = "Genes",
color = "% Mito")
## Save plots
names(p.list) <- c("nCount_RNA", "nFeature_RNA", "percent_mito", "stress_score", "doublet_score", "trend")
for (i in seq_along(p.list)) {
ggsave(filename = paste0(outdir, names(p.list)[i], ".png"),
plot = p.list[[i]])
}
# Save output
outdir <- "data/"
dir.create(outdir, recursive = TRUE)
write.csv(qc.dt, paste0(outdir, "barcode_metrics_pre-filter.csv"))
write.csv(qc.dt2, paste0(outdir, "barcode_metrics_post-filter.csv"))
write.csv(mads, paste0(outdir, "cutoffs_used.csv"))
write.csv(dblts.ncount, paste0(outdir, "doublets_table.csv"))
saveRDS(sc, paste0(outdir, "pre-filtered_list.rds"))
saveRDS(sc2, paste0(outdir, "post-filtered_list.rds"))
# Preprocessing
outdir <- "plots/preprocessing/"
dir.create(outdir, recursive=TRUE)
## If more than one sample, merge them together
if (length(sc2) > 1) {
seu <- merge(sc2[[1]], sc2[-1], add.cell.ids=names(sc2))
multisample <- TRUE
} else {
seu <- sc2[[1]]
multisample <- FALSE
}
## Turn off future for NormalizeData() as it doesn't play nicely
plan(sequential)
seu <- NormalizeData(seu)
plan(multisession, workers=nworkers)
seu <- seu %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA()
## Batch correction if requested
if (multisample && !is.na(batch_var)) {
seu <- RunHarmony(seu, group.by.vars=batch_var)
default_reduction <- "harmony"
} else {
default_reduction <- "pca"
}
# Estimate dimensionality
numPCs <- maxLikGlobalDimEst(data = Embeddings(seu, reduction=default_reduction), k=10)$dim.est %>%
round()
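# numPCs is the maximum-likelihood estimate of intrinsic dimensionality; it sets how many
# PCA/harmony dimensions feed UMAP and the kNN graph below, and is marked on the elbow plot.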
# Create elbow and dim reduc plots
p.list <- list()
p.list$elbow <- ElbowPlot(seu, ndims=50, reduction=default_reduction) +
geom_vline(xintercept=numPCs)
# Create PCA plot
p.list$pca <- DimPlot(seu, reduction="pca", group.by="Sample")
# Create harmony plot (only if batch correction was run)
if (default_reduction == "harmony") {
p.list$harmony <- DimPlot(seu, reduction="harmony", group.by=batch_var)
}
# kNN, clustering, UMAP
res <- seq(0.1,1,0.1)
seu <- RunUMAP(seu,
reduction = default_reduction,
dims = 1:numPCs,
metric = "euclidean") %>%
FindNeighbors(reduction = default_reduction,
dims = 1:numPCs) %>%
FindClusters(resolution = res)
# Ensure idents are in numerical order
for (i in res) {
id <- paste0("RNA_snn_res.", i)
top.level <- length(levels(seu@meta.data[[id]])) - 1
seu@meta.data[[id]] <- factor(seu@meta.data[[id]], levels = seq(0, top.level, 1))
}
# Create and save UMAPs
# One for each resolution, one for sample name, and for each additional column of md
extra_cols <- if (ncol(md) > 2) colnames(md)[3:ncol(md)] else NULL
groups <- c(paste0("RNA_snn_res.", res), "Sample", extra_cols)
umap_dir <- paste0(outdir, "umaps/")
dir.create(umap_dir, recursive=TRUE)
for (grp in groups) {
p <- DimPlot(seu, group.by=grp)
ggsave(paste0(umap_dir, grp, ".png"), plot = p)
}
# Create clustree plot
p <- clustree(seu, prefix = "RNA_snn_res.")
ggsave(paste0(outdir, "clustree.png"),
height=12,
width=12*0.618,
plot = p)
# Create stress and doublet score plots
p.list$doublet_score <- FeaturePlot(seu, "scDblFinder.score")
p.list$stress_score <- FeaturePlot(seu, "Stress.score")
# Save plots
for (p in names(p.list)) {
ggsave(paste0(outdir, p, ".png"),
plot = p.list[[p]])
}
# Create marker lists
markerdir <- "markers/"
dir.create(markerdir)
grps <- paste0("RNA_snn_res.", res)
for (g in grps) {
Idents(seu) <- g
seu %>%
FindAllMarkers(only.pos=TRUE, min.pct=0.1) %>%
write.csv(paste0(markerdir, g, ".csv"))
}
# Save data
outdir <- "data/"
saveRDS(seu, paste0(outdir, "processed_object.rds"))
# Save session info
capture.output(sessionInfo(), file = paste0(outdir, "sessionInfo.txt"))
process {
executor = 'slurm'
module = 'singularity/3.9.9'
clusterOptions = '--no-kill'
queue = '256GBv2,512GB,256GB,256GBv1,128GB'
time = '8h'
withLabel: gpu {
queue = 'GPUv100s,GPUp100'
}
}
singularity {
enabled = true
runOptions = '--nv'
cacheDir = "${projectDir}/images/singularity"
}
// Overwrite execution report files
// Required for nextflow version >22.10.0
trace.overwrite = true
dag.overwrite = true
timeline.overwrite = true
report.overwrite = true
// We are running on Astrocyte
params.astrocyte = true
/*
* Copyright (c) 2023. The University of Texas Southwestern Medical Center
*
* This file is part of the BioHPC Workflow Platform
*
* This is a workflow package to run Cell Ranger count on FASTQ files from single-cell RNA-seq data.
*
* @authors
* John Lafin
*
*/
// Parameters for input values
params.sample_sheet = "${projectDir}/../test_data/sample_sheet_v8.csv"
params.fastq = "${projectDir}/../test_data/fastqs"
params.outDir = "${projectDir}/output"
params.astrocyte = false
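// The sample sheet is expected to provide at least these columns (consumed in the workflow
// block below): sample, reference, expectCells, chemistry, introns, createBam.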
// Run cellranger count
process cr_count {
publishDir "${params.outDir}/cr_count", mode: 'copy'
container 'docker://quay.io/nf-core/cellranger:8.0.0'
input:
tuple val(sample), val(ref), val(expectCells), val(chemistry), val(introns), val(createBam)
path(fastq)
path(sample_sheet)
output:
tuple val(sample), path("${task.index}_${sample}/outs/**")
script:
// Check reference
if (params.astrocyte) {
switch (ref) {
case "hg38":
ref = file("/project/apps_database/cellranger/refdata-gex-GRCh38-2024-A")
break
case "GRCm39":
ref = file("/project/apps_database/cellranger/refdata-gex-GRCm39-2024-A")
break
case "barnyard":
ref = file("/project/apps_database/cellranger/refdata-gex-GRCh38_and_GRCm39-2024-A")
break
default:
ref = file(ref)
}
}
if ( ref.isEmpty() ) {
error "Error: Missing reference. Please provide one of the options outlined in the documentation."
}
// Parse parameters
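// expectCells == "auto" drops the --expect-cells flag so Cell Ranger estimates the cell count itself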
if( expectCells.toLowerCase() == "auto" )
expectCells = ""
else
expectCells = "--expect-cells=$expectCells"
createBam = createBam.toLowerCase()
introns = introns.toLowerCase()
// Run Cell Ranger count
"""
cellranger count --id=${task.index}_${sample} \
--transcriptome=$ref \
--fastqs=. \
--sample=$sample \
--create-bam=$createBam \
--chemistry=$chemistry \
--include-introns=$introns \
$expectCells
"""
}
// Run cellbender
process cellbender {
publishDir "${params.outDir}/cellbender", mode: 'copy'
container 'docker://us.gcr.io/broad-dsde-methods/cellbender:0.3.0'
label 'gpu'
input:
tuple val(sample), file(input), val(expected_cells), val(total_droplets_included), val(fpr)
path(sample_sheet)
output:
tuple val(sample), path("${task.index}_${sample}*")
script:
if( expected_cells == "0" )
expected_cells = ""
else
expected_cells = "--expected-cells=$expected_cells"
if( total_droplets_included == "0")
total_droplets_included = ""
else
total_droplets_included = "--total-droplets-included=$total_droplets_included"
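// Note: a sample-sheet value of "0" above drops the corresponding flag so CellBender estimates it automatically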
"""
mkdir ${task.index}_${sample}
cellbender remove-background \
--input ${input} \
--output ${task.index}_${sample}/${sample}.h5 \
--fpr ${fpr} \
--cuda \
$expected_cells \
$total_droplets_included
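# Repack the CellBender output HDF5 files into a form that Seurat can read (hence the *_seurat.h5 names)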
ptrepack \
--complevel 5 \
${task.index}_${sample}/${sample}.h5:/matrix \
${task.index}_${sample}/${sample}_seurat.h5:/matrix
ptrepack \
--complevel 5 \
${task.index}_${sample}/${sample}_filtered.h5:/matrix \
${task.index}_${sample}/${sample}_filtered_seurat.h5:/matrix
"""
}
// Run QC script
process quality_control {
publishDir "${params.outDir}/qc", mode: 'copy'
container 'git.biohpc.swmed.edu:5050/astrocyte/workflows/strand-lab/scrna-qc/scrna-qc:1.0.0'
input:
path h5_files
path md
path ct
val species
val batch
output:
path("data/**")
path("plots/**")
path("markers/**")
script:
"""
source /entrypoint.sh
quality_control.R $md $ct $species $batch
"""
}
// Prepare cellxgene-VIP
process setup {
publishDir "${projectDir}/output/CELLxGENE", mode: 'copy'
container 'git.biohpc.swmed.edu:5050/astrocyte/workflows/strand-lab/cellxgene/h5ad_convert:0.0.1'
input:
path input
output:
path "cxg_input.h5ad"
script:
// If input is h5ad, just copy it to the location
if (input.getExtension().toLowerCase() == "h5ad") {
"""
cp ${input} cxg_input.h5ad
"""
}
// If input is rds, convert using the R script
else if (input.getExtension().toLowerCase() == "rds") {
"""
source /h5ad_convert/entrypoint.sh
Rscript ${projectDir}/scripts/prepare_h5ad.R $input
"""
}
// If neither, something's gone wrong
else {
"""
echo "Error: Please submit either an .h5ad file, or an .rds file containing a Seurat or SingleCellExperiment object."
exit 1
"""
}
}
workflow {
// Parse sample sheet and run cellranger count
fastq = channel.fromPath(params.fastq).collect()
sample_sheet = file(params.sample_sheet)
input = channel.fromPath(params.sample_sheet) \
| splitCsv(header: true) \
| map{ row -> tuple( row.sample, row.reference, row.expectCells, row.chemistry, row.introns, row.createBam ) }
cr_count(input, fastq, sample_sheet)
}