Skip to content
Snippets Groups Projects
Commit e72babe6 authored by John Lafin's avatar John Lafin
Browse files

Update lineage filtering; move stress filter to annot script

parent e0e9f125
Branches
No related merge requests found
# Lineage-dependent filtering
# Author: John T Lafin
# Updated: 2024-09-17
# Updated: 2024-10-15
print("Script begin at: ")
Sys.time()
......@@ -28,6 +28,33 @@ set.seed(42)
outdir <- "data_modified/03_lineage_filtering/"
dir.create(outdir, showWarnings = FALSE)
# Create pre-processing function
preprocess <- function(x,
cluster.res = seq(0.1,1.5,0.1),
default.res = "RNA_snn_res.0.4",
batch.var = "Sample"){
x <- x %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = batch.var)
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = cluster.res) %>%
SetIdent(value = default.res)
return(x)
}
# Load data ---------------------------------------------------------------
sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds")
......@@ -78,30 +105,8 @@ ggsave(paste0(outdir, "dotplot_global_lineage.png"))
lin.list <- SplitObject(sc.h, split.by="Lineage")
lin.list[["Putative doublet"]] <- NULL # Remove doublets
res <- seq(0.1, 1, 0.1)
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
res <- seq(0.1, 1.5, 0.1)
lin.list <- future_lapply(lin.list, preprocess, future.seed=TRUE)
# Save UMAP of each subset
for ( lin in names(lin.list) ){
......@@ -109,10 +114,11 @@ for ( lin in names(lin.list) ){
ggsave(paste0(umapdir, lin, "_prefilter.png"))
}
# Lineage-dependent filtering ---------------------------------------------
## Identify and remove clusters enriched for likely doublets
dbltdir <- paste0(outdir, "doublets/")
dir.create(dbltdir, showWarnings=FALSE)
#### Save doublet plots of each lineage
dblts <- list()
......@@ -125,20 +131,22 @@ for (lin in names(lin.list)) {
ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(outdir, "dblts_", lin, ".png"))
ggsave(paste0(dbltdir, lin, "_pass1.png"))
}
### Remove clusters with >50% of cells classed as doublets
### Remove clusters with >=50% of cells classed as doublets
lin.list.filt <- list()
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
pull(.data[[default.res]])
lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Library size filtering
ncountdir <- paste0(outdir, "ncount/")
dir.create(ncountdir, showWarnings=FALSE)
### Calculate thresholds
......@@ -161,7 +169,7 @@ rownames(mt.dt) <- mt.dt$lineage
mt.dt$lineage <- NULL
### Save thresholds
write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv"))
write.csv(mt.dt, paste0(ncountdir, "nCount_cutoffs.csv"))
### Save violin plots with cutoffs
for (g in names(lin.list.filt)) {
......@@ -169,7 +177,7 @@ for (g in names(lin.list.filt)) {
geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") +
scale_y_log10() +
NoLegend()
ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png")))
ggsave(paste0(ncountdir, g, "_nCount.png"))
}
### Apply library size filter
......@@ -187,37 +195,16 @@ tibble("Lineage" = names(cells.old),
"Filtered" = unlist(cells.new)) %>%
mutate("Difference" = Original - Filtered,
"Fraction_removed" = Difference / Original) %>%
write.csv(paste0(outdir, "fraction_removed_nCount.csv"))
write.csv(paste0(ncountdir, "fraction_removed_nCount.csv"))
# Recluster ---------------------------------------------------------------
lin.list <- lin.list.filt %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
lin.list <- future_lapply(lin.list.filt, preprocess, future.seed=TRUE)
# Doublet removal, second round -------------------------------------------
## Create plots of doublets
dblts <- list()
for (lin in names(lin.list)) {
dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
......@@ -225,46 +212,28 @@ for (lin in names(lin.list)) {
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(dbltdir, lin, "_pass2.png"))
}
## Remove doublets
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
pull(.data[[default.res]])
lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Recluster again
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
## Only recluster epithelia as the other lineages didn't have more doublets
lin.list$Epithelia <- preprocess(lin.list.filt$Epithelia)
# Save outputs ------------------------------------------------------------
## Save UMAPs and objects
objdir <- paste0(outdir, "objects/")
dir.create(objdir, showWarnings = FALSE)
for (lin in names(lin.list)) {
......
# Annotation and stress score filtering
# Author: John T Lafin
# Updated: 2024-10-15
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(intrinsicDimension)
library(ggplot2)
library(harmony)
library(future)
library(future.apply)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/04_annotation/"
dir.create(outdir, showWarnings = FALSE)
# Create pre-processing function
preprocess <- function(x,
cluster.res = seq(0.1,1.5,0.1),
default.res = "RNA_snn_res.0.4",
batch.var = "Sample"){
x <- x %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = batch.var)
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = cluster.res) %>%
SetIdent(value = default.res)
return(x)
}
# Load data ---------------------------------------------------------------
lin.list <- list()
lins <- list.files('data_modified/03_lineage_filtering/objects/') %>%
sub(".rds", "", .)
for (lin in lins) {
lin.list[[lin]] <- readRDS(paste0("data_modified/03_lineage_filtering/objects/", lin, ".rds"))
}
# Annotation --------------------------------------------------------------
annotdir <- paste0(outdir, "annotations_first_pass/")
dir.create(annotdir, showWarnings=FALSE)
## Create list of labels
annot.list <- list()
annot.list$Endothelia <- c("Venous endothelia",
"Venous endothelia",
"Capillary endothelia",
"Capillary endothelia",
"Arterial endothelia",
"Lymphatic endothelia")
annot.list$Epithelia <- c("transitional basal epithelia of urethra",
"basal epithelia of prostate",
"transitional basal epithelia of urethra",
"squamous intermediate epithelia of urethra",
"basal epithelia of prostate",
"transitional intermediate epithelia of urethra",
"squamous intermediate epithelia of urethra",
"luminal epithelia of prostate",
"club epithelia of urethra",
"transitional intermediate epithelia of urethra",
"club epithelia of urethra",
"stressed cells",
"stressed cells",
"squamous intermediate epithelia of urethra",
"squamous intermediate epithelia of urethra",
"basal epithelia of prostate",
"transitional basal epithelia of urethra",
"basal epithelia of prostate",
"proliferative cells",
"squamous luminal epithelia of urethra",
"stressed cells",
"transitional basal epithelia of urethra",
"basal epithelia of prostate")
annot.list$Leukocyte <- c("T cell",
"Macrophage",
"Mast cell",
"T cell",
"T cell",
"NK cell",
"Proliferating leukocyte",
"B cell",
"Dendritic cell",
"Plasma cell")
annot.list$Stroma <- c("pericyte",
"major interstitial fibroblast",
"smooth muscle",
"minor interstitial fibroblast",
"smooth muscle",
"smooth muscle",
"periepithelial fibroblast of vagina",
"vascular smooth muscle",
"smooth muscle",
"periepithelial fibroblast of urethra",
"stressed cell",
"pericyte",
"glia",
"smooth muscle",
"endoneurial fibroblast",
"pericyte",
"glia")
## Set epithelia and stroma to higher resolutions
Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5"
Idents(lin.list$Stroma) <- "RNA_snn_res.1"
## Add the labels
for (lin in lins) {
names(annot.list[[lin]]) <- levels(lin.list[[lin]])
lin.list[[lin]] <- RenameIdents(lin.list[[lin]], annot.list[[lin]]) %>%
StashIdent("cellannotation")
}
## Add neuroendocrine cells
NE.markers <- strsplit("CHGA GRP CALCA SCG2 TPH1 ASCL1 SCG5 PHGR1 MIAT MAOB", split = " ")
lin.list$Epithelia <- AddModuleScore(lin.list$Epithelia,
features = NE.markers,
name = "NE.score")
lin.list$Epithelia@meta.data <- rename(lin.list$Epithelia@meta.data, NE.score = NE.score1)
VlnPlot(lin.list$Epithelia, "NE.score", group.by = "orig.ident") +
NoLegend() +
geom_hline(yintercept=0.3, linetype="dashed")
ggsave(paste0(annotdir, "NE_score_cutoff.png"))
lin.list$Epithelia@meta.data <- lin.list$Epithelia@meta.data %>%
mutate(cellannotation = if_else(NE.score > 0.3, "Neuroendocrine cell", cellannotation))
## Add ionocytes
VlnPlot(lin.list$Epithelia, "FOXI1", group.by = "orig.ident") +
NoLegend() +
geom_hline(yintercept=0.3, linetype="dashed")
ggsave(paste0(annotdir, "FOXI1_score_cutoff.png"))
ionocyte.barcodes <- WhichCells(lin.list$Epithelia, expression = FOXI1 > 0.3)
lin.list$Epithelia@meta.data[ionocyte.barcodes, "cellannotation"] <- "Ionocyte"
## Save updates to object
lin.list$Epithelia <- StashIdent(lin.list$Epithelia, "cellannotation")
## Split periepithelial fibroblasts by sex
fib_rename <- WhichCells(lin.list$Stroma,
idents="periepithelial fibroblast of vagina",
expression = Sex == "male")
levels(lin.list$Stroma$cellannotation) <- c(levels(lin.list$Stroma$cellannotation), "periepithelial fibroblast of prostate")
lin.list$Stroma@meta.data[fib_rename, "cellannotation"] <- "periepithelial fibroblast of prostate"
Idents(stroma) <- "cellannotation"
## Save labeled UMAP plots
for (lin in lins) {
DimPlot(lin.list[[lin]])
ggsave(paste0(annotdir, lin,"_umap.png"))
}
# Stress score filtering --------------------------------------------------
stressdir <- paste0(outdir, "stress/")
dir.create(stressdir, showWarnings = FALSE)
## Produce violin plots of stress scores
lin.list <- lapply(lin.list, SetIdent, value = "RNA_snn_res.1")
### Check epithelia at higher resolution to capture other populations
Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5"
for (lin in names(lin.list)) {
VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) +
NoLegend()
ggsave(paste0(stressdir, lin, "_pre-filter.png"),
width=11,
height=8.5)
}
## Define stressed clusters based on violin plots
stressed.cells <- list()
stressed.cells$Epithelia <- c(11,12,20)
stressed.cells$Stroma <- c(2,10,15)
stressed.cells$Endothelia <- c(4,10)
stressed.cells$Leukocyte <- c(4,9)
## Remove stressed clusters and reprocess
for (lin in names(lin.list)) {
lin.list[[lin]] <- subset(lin.list[[lin]],
idents = stressed.cells[[lin]],
invert = TRUE)
}
lin.list <- lapply(lin.list, preprocess)
### TO-DO
# Second round of stress filtering on stroma
# Revisit epithelia to relabel
# Make sure that ionocytes and NE cells are present in epi labels
# Produce post-filter stress plots
for (lin in names(lin.list)) {
VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) +
NoLegend()
ggsave(paste0(stressdir, lin, "_first_filter.png"),
width=11,
height=8.5)
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment