Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • StrandLab/scrna_urethra
Show changes
Commits on Source (2)
# Initializing single cell data
# Author: John T Lafin
# Updated: 2024-08-13
# Updated: 2024-09-17
print("Script begin at: ")
Sys.time()
......@@ -24,7 +24,7 @@ options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/2024-08-13_initial_analysis/"
outdir <- "data_modified/01_initial_analysis/"
dir.create(outdir, showWarnings = FALSE)
# Load data ---------------------------------------------------------------
......
# Pre-processing single cell data
# Author: John T Lafin
# Updated: 2024-08-13
# Updated: 2024-09-17
print("Script begin at: ")
Sys.time()
......@@ -24,11 +24,11 @@ options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/2024-08-13_preprocess/"
outdir <- "data_modified/02_preprocess/"
dir.create(outdir, showWarnings = FALSE)
# Load data
seu <- readRDS("data_modified/2024-08-13_initial_analysis/merged_object.rds")
seu <- readRDS("data_modified/01_initial_analysis/merged_object.rds")
# Clustering --------------------------------------------------------------
......
# Lineage-dependent filtering
# Author: John T Lafin
# Updated: 2024-09-17
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(intrinsicDimension)
library(ggplot2)
library(harmony)
library(future)
library(future.apply)
library(clustree)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/03_lineage_filtering/"
dir.create(outdir, showWarnings = FALSE)
# Load data ---------------------------------------------------------------
sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds")
# Remove this field or Seurat won't work properly
sc.h$ident <- NULL
default.res <- "RNA_snn_res.0.4"
Idents(sc.h) <- default.res
# Annotate lineages -------------------------------------------------------
# Using information generated from 02_preprocess.R, annotate lineages
cluster_epi <- c(0, 1, 7, 8, 9, 11, 15)
cluster_endo <- c(4, 6, 12)
cluster_leu <- c(10, 14, 16)
cluster_stroma <- c(2, 3, 5, 13)
cluster_doublets <- c(17, 18)
# Make assignments
cluster.ids <- vector("character", length = length(levels(sc.h)))
cluster.ids[cluster_epi+1] <- "Epithelia"
cluster.ids[cluster_endo+1] <- "Endothelia"
cluster.ids[cluster_leu+1] <- "Leukocyte"
cluster.ids[cluster_stroma+1] <- "Stroma"
cluster.ids[cluster_doublets+1] <- "Putative doublet"
names(cluster.ids) <- levels(sc.h)
sc.h <- RenameIdents(sc.h, cluster.ids) %>%
StashIdent("Lineage")
# Generate UMAP with lineage assignments
umapdir <- paste0(outdir, "umaps/")
dir.create(umapdir, showWarnings = FALSE)
Idents(sc.h) <- "Lineage"
DimPlot(sc.h)
ggsave(paste0(umapdir, "global_lineage.png"))
# Dot plot to check marker leakage
genes <- strsplit("EPCAM KRT5 KRT14 MSMB SEMG1 SELE PECAM1 PTPRC SRGN C1QA DCN LUM DES ACTA1 ACTG2 CRYAB CDH19",
split = " ")[[1]]
DotPlot(sc.h, features = genes) +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(outdir, "dotplot_global_lineage.png"))
# Subset and recluster ----------------------------------------------------
lin.list <- SplitObject(sc.h, split.by="Lineage")
lin.list[["Putative doublet"]] <- NULL # Remove doublets
res <- seq(0.1, 1, 0.1)
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
# Save UMAP of each subset
for ( lin in names(lin.list) ){
DimPlot(lin.list[[lin]], label=TRUE)
ggsave(paste0(umapdir, lin, "_prefilter.png"))
}
# Lineage-dependent filtering ---------------------------------------------
## Identify and remove clusters enriched for likely doublets
#### Save doublet plots of each lineage
dblts <- list()
for (lin in names(lin.list)) {
dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
group_by(.data[[default.res]], scDblFinder.class) %>%
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(outdir, "dblts_", lin, ".png"))
}
### Remove clusters with >50% of cells classed as doublets
lin.list.filt <- list()
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Library size filtering
### Calculate thresholds
mads_mult <- 2 # Number of MADs away from median to filter
dt <- data.frame(lineage = character(),
nCount_RNA = numeric())
for (i in names(lin.list.filt)) {
dt <- add_row(dt, lineage = i, nCount_RNA = lin.list.filt[[i]]$nCount_RNA)
}
mt.dt <- dt %>%
group_by(lineage) %>%
mutate(log.nCount_RNA = log(nCount_RNA)) %>%
summarize(median = median(nCount_RNA),
log.median = median(log.nCount_RNA),
log.mad = mad(log.nCount_RNA)) %>%
mutate(cutoff = exp(log.median - mads_mult * log.mad)) %>%
as.data.frame()
rownames(mt.dt) <- mt.dt$lineage
mt.dt$lineage <- NULL
### Save thresholds
write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv"))
### Save violin plots with cutoffs
for (g in names(lin.list.filt)) {
VlnPlot(lin.list.filt[[g]], "nCount_RNA", pt.size = 0) +
geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") +
scale_y_log10() +
NoLegend()
ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png")))
}
### Apply library size filter
for (i in names(lin.list.filt) ) {
lin.list.filt[[i]] <- subset(lin.list.filt[[i]], subset = nCount_RNA > mt.dt[i, "cutoff"])
}
### Record how many cells were removed
cells.old <- lapply(lin.list,
ncol)
cells.new <- lapply(lin.list.filt,
ncol)
tibble("Lineage" = names(cells.old),
"Original" = unlist(cells.old),
"Filtered" = unlist(cells.new)) %>%
mutate("Difference" = Original - Filtered,
"Fraction_removed" = Difference / Original) %>%
write.csv(paste0(outdir, "fraction_removed_nCount.csv"))
# Recluster ---------------------------------------------------------------
lin.list <- lin.list.filt %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
# Doublet removal, second round -------------------------------------------
dblts <- list()
for (lin in names(lin.list)) {
dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
group_by(.data[[default.res]], scDblFinder.class) %>%
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
}
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Recluster again
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
# Save outputs ------------------------------------------------------------
## Save UMAPs and objects
objdir <- paste0(outdir, "objects/")
dir.create(objdir, showWarnings = FALSE)
for (lin in names(lin.list)) {
DimPlot(lin.list[[lin]])
ggsave(paste0(umapdir, lin, "_postfilter.png"))
saveRDS(lin.list[[lin]], paste0(objdir, lin, ".rds"))
}
## Save session info
capture.output(sessionInfo(), file = paste0(outdir, "session_info.txt"))