Skip to content
Snippets Groups Projects
Commit b484c159 authored by John Lafin's avatar John Lafin
Browse files

Add lineage filtering

parent cca00d94
Branches
No related merge requests found
# Lineage-dependent filtering
# Author: John T Lafin
# Updated: 2024-09-17
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(intrinsicDimension)
library(ggplot2)
library(harmony)
library(future)
library(future.apply)
library(clustree)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/03_lineage_filtering/"
dir.create(outdir, showWarnings = FALSE)
# Load data ---------------------------------------------------------------
sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds")
# Remove this field or Seurat won't work properly
sc.h$ident <- NULL
default.res <- "RNA_snn_res.0.4"
Idents(sc.h) <- default.res
# Annotate lineages -------------------------------------------------------
# Using information generated from 02_preprocess.R, annotate lineages
cluster_epi <- c(0, 1, 7, 8, 9, 11, 15)
cluster_endo <- c(4, 6, 12)
cluster_leu <- c(10, 14, 16)
cluster_stroma <- c(2, 3, 5, 13)
cluster_doublets <- c(17, 18)
# Make assignments
cluster.ids <- vector("character", length = length(levels(sc.h)))
cluster.ids[cluster_epi+1] <- "Epithelia"
cluster.ids[cluster_endo+1] <- "Endothelia"
cluster.ids[cluster_leu+1] <- "Leukocyte"
cluster.ids[cluster_stroma+1] <- "Stroma"
cluster.ids[cluster_doublets+1] <- "Putative doublet"
names(cluster.ids) <- levels(sc.h)
sc.h <- RenameIdents(sc.h, cluster.ids) %>%
StashIdent("Lineage")
# Generate UMAP with lineage assignments
umapdir <- paste0(outdir, "umaps/")
dir.create(umapdir, showWarnings = FALSE)
Idents(sc.h) <- "Lineage"
DimPlot(sc.h)
ggsave(paste0(umapdir, "global_lineage.png"))
# Dot plot to check marker leakage
genes <- strsplit("EPCAM KRT5 KRT14 MSMB SEMG1 SELE PECAM1 PTPRC SRGN C1QA DCN LUM DES ACTA1 ACTG2 CRYAB CDH19",
split = " ")[[1]]
DotPlot(sc.h, features = genes) +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(outdir, "dotplot_global_lineage.png"))
# Subset and recluster ----------------------------------------------------
lin.list <- SplitObject(sc.h, split.by="Lineage")
lin.list[["Putative doublet"]] <- NULL # Remove doublets
res <- seq(0.1, 1, 0.1)
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
# Save UMAP of each subset
for ( lin in names(lin.list) ){
DimPlot(lin.list[[lin]], label=TRUE)
ggsave(paste0(umapdir, lin, "_prefilter.png"))
}
# Lineage-dependent filtering ---------------------------------------------
## Identify and remove clusters enriched for likely doublets
#### Save doublet plots of each lineage
dblts <- list()
for (lin in names(lin.list)) {
dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
group_by(.data[[default.res]], scDblFinder.class) %>%
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(outdir, "dblts_", lin, ".png"))
}
### Remove clusters with >50% of cells classed as doublets
lin.list.filt <- list()
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Library size filtering
### Calculate thresholds
mads_mult <- 2 # Number of MADs away from median to filter
dt <- data.frame(lineage = character(),
nCount_RNA = numeric())
for (i in names(lin.list.filt)) {
dt <- add_row(dt, lineage = i, nCount_RNA = lin.list.filt[[i]]$nCount_RNA)
}
mt.dt <- dt %>%
group_by(lineage) %>%
mutate(log.nCount_RNA = log(nCount_RNA)) %>%
summarize(median = median(nCount_RNA),
log.median = median(log.nCount_RNA),
log.mad = mad(log.nCount_RNA)) %>%
mutate(cutoff = exp(log.median - mads_mult * log.mad)) %>%
as.data.frame()
rownames(mt.dt) <- mt.dt$lineage
mt.dt$lineage <- NULL
### Save thresholds
write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv"))
### Save violin plots with cutoffs
for (g in names(lin.list.filt)) {
VlnPlot(lin.list.filt[[g]], "nCount_RNA", pt.size = 0) +
geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") +
scale_y_log10() +
NoLegend()
ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png")))
}
### Apply library size filter
for (i in names(lin.list.filt) ) {
lin.list.filt[[i]] <- subset(lin.list.filt[[i]], subset = nCount_RNA > mt.dt[i, "cutoff"])
}
### Record how many cells were removed
cells.old <- lapply(lin.list,
ncol)
cells.new <- lapply(lin.list.filt,
ncol)
tibble("Lineage" = names(cells.old),
"Original" = unlist(cells.old),
"Filtered" = unlist(cells.new)) %>%
mutate("Difference" = Original - Filtered,
"Fraction_removed" = Difference / Original) %>%
write.csv(paste0(outdir, "fraction_removed_nCount.csv"))
# Recluster ---------------------------------------------------------------
lin.list <- lin.list.filt %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
# Doublet removal, second round -------------------------------------------
dblts <- list()
for (lin in names(lin.list)) {
dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
group_by(.data[[default.res]], scDblFinder.class) %>%
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
}
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Recluster again
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
# Save outputs ------------------------------------------------------------
## Save UMAPs and objects
objdir <- paste0(outdir, "objects/")
dir.create(objdir, showWarnings = FALSE)
for (lin in names(lin.list)) {
DimPlot(lin.list[[lin]])
ggsave(paste0(umapdir, lin, "_postfilter.png"))
saveRDS(lin.list[[lin]], paste0(objdir, lin, ".rds"))
}
## Save session info
capture.output(sessionInfo(), file = paste0(outdir, "session_info.txt"))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment