diff --git a/scripts/03_lineage_filtering.R b/scripts/03_lineage_filtering.R index 1bfa5df479c17ca9eecc723e68378a4b68964bbb..0dc4d46d68c80d12e7a7a72ef8de7128ace4d049 100644 --- a/scripts/03_lineage_filtering.R +++ b/scripts/03_lineage_filtering.R @@ -1,6 +1,6 @@ # Lineage-dependent filtering # Author: John T Lafin -# Updated: 2024-09-17 +# Updated: 2024-10-15 print("Script begin at: ") Sys.time() @@ -28,6 +28,33 @@ set.seed(42) outdir <- "data_modified/03_lineage_filtering/" dir.create(outdir, showWarnings = FALSE) +# Create pre-processing function +preprocess <- function(x, + cluster.res = seq(0.1,1.5,0.1), + default.res = "RNA_snn_res.0.4", + batch.var = "Sample"){ + x <- x %>% + NormalizeData() %>% + FindVariableFeatures() %>% + ScaleData() %>% + RunPCA() %>% + RunHarmony(group.by.vars = batch.var) + + numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, + k = 10)$dim.est %>% + round() + x <- x %>% + RunUMAP(dims = 1:numPCs, + reduction = "harmony", + metric = "euclidean", + min.dist = 0.1) %>% + FindNeighbors(reduction = "harmony", + dims = 1:numPCs) %>% + FindClusters(resolution = cluster.res) %>% + SetIdent(value = default.res) + return(x) +} + # Load data --------------------------------------------------------------- sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds") @@ -78,30 +105,8 @@ ggsave(paste0(outdir, "dotplot_global_lineage.png")) lin.list <- SplitObject(sc.h, split.by="Lineage") lin.list[["Putative doublet"]] <- NULL # Remove doublets -res <- seq(0.1, 1, 0.1) - -lin.list <- lin.list %>% - future_lapply(function(x){ - x <- NormalizeData(x) %>% - FindVariableFeatures() %>% - ScaleData() %>% - RunPCA() %>% - RunHarmony(group.by.vars = "Sample") - - numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, - k = 10)$dim.est %>% - round() - x <- x %>% - RunUMAP(dims = 1:numPCs, - reduction = "harmony", - metric = "euclidean", - min.dist = 0.1) %>% - FindNeighbors(reduction = "harmony", - dims = 1:numPCs) %>% - FindClusters(resolution = res) %>% - SetIdent(value = default.res) - return(x)}, - future.seed=TRUE) +res <- seq(0.1, 1.5, 0.1) +lin.list <- future_lapply(lin.list, preprocess, future.seed=TRUE) # Save UMAP of each subset for ( lin in names(lin.list) ){ @@ -109,10 +114,11 @@ for ( lin in names(lin.list) ){ ggsave(paste0(umapdir, lin, "_prefilter.png")) } - # Lineage-dependent filtering --------------------------------------------- ## Identify and remove clusters enriched for likely doublets +dbltdir <- paste0(outdir, "doublets/") +dir.create(dbltdir, showWarnings=FALSE) #### Save doublet plots of each lineage dblts <- list() @@ -125,20 +131,22 @@ for (lin in names(lin.list)) { ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) + geom_col() + geom_hline(yintercept = 0.5, linetype = "dashed") - ggsave(paste0(outdir, "dblts_", lin, ".png")) + ggsave(paste0(dbltdir, lin, "_pass1.png")) } -### Remove clusters with >50% of cells classed as doublets +### Remove clusters with >=50% of cells classed as doublets lin.list.filt <- list() keep <- list() for (lin in names(lin.list)) { keep[[lin]] <- dblts[[lin]] %>% filter(scDblFinder.class == "singlet" & fraction > 0.5) %>% - pull("RNA_snn_res.0.4") + pull(.data[[default.res]]) lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]]) } ## Library size filtering +ncountdir <- paste0(outdir, "ncount/") +dir.create(ncountdir, showWarnings=FALSE) ### Calculate thresholds @@ -161,7 +169,7 @@ rownames(mt.dt) <- mt.dt$lineage mt.dt$lineage <- NULL ### Save thresholds -write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv")) +write.csv(mt.dt, paste0(ncountdir, "nCount_cutoffs.csv")) ### Save violin plots with cutoffs for (g in names(lin.list.filt)) { @@ -169,7 +177,7 @@ for (g in names(lin.list.filt)) { geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") + scale_y_log10() + NoLegend() - ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png"))) + ggsave(paste0(ncountdir, g, "_nCount.png")) } ### Apply library size filter @@ -187,37 +195,16 @@ tibble("Lineage" = names(cells.old), "Filtered" = unlist(cells.new)) %>% mutate("Difference" = Original - Filtered, "Fraction_removed" = Difference / Original) %>% - write.csv(paste0(outdir, "fraction_removed_nCount.csv")) + write.csv(paste0(ncountdir, "fraction_removed_nCount.csv")) # Recluster --------------------------------------------------------------- -lin.list <- lin.list.filt %>% - future_lapply(function(x){ - x <- NormalizeData(x) %>% - FindVariableFeatures() %>% - ScaleData() %>% - RunPCA() %>% - RunHarmony(group.by.vars = "Sample") - - numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, - k = 10)$dim.est %>% - round() - x <- x %>% - RunUMAP(dims = 1:numPCs, - reduction = "harmony", - metric = "euclidean", - min.dist = 0.1) %>% - FindNeighbors(reduction = "harmony", - dims = 1:numPCs) %>% - FindClusters(resolution = res) %>% - SetIdent(value = default.res) - return(x)}, - future.seed=TRUE) - +lin.list <- future_lapply(lin.list.filt, preprocess, future.seed=TRUE) # Doublet removal, second round ------------------------------------------- +## Create plots of doublets dblts <- list() for (lin in names(lin.list)) { dblts[[lin]] <- lin.list[[lin]]@meta.data %>% @@ -225,46 +212,28 @@ for (lin in names(lin.list)) { summarize(count = n()) %>% group_by(.data[[default.res]]) %>% mutate(fraction = count/sum(count)) + ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) + + geom_col() + + geom_hline(yintercept = 0.5, linetype = "dashed") + ggsave(paste0(dbltdir, lin, "_pass2.png")) } +## Remove doublets keep <- list() for (lin in names(lin.list)) { keep[[lin]] <- dblts[[lin]] %>% filter(scDblFinder.class == "singlet" & fraction > 0.5) %>% - pull("RNA_snn_res.0.4") - lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]]) + pull(.data[[default.res]]) + lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]]) } ## Recluster again - -lin.list <- lin.list %>% - future_lapply(function(x){ - x <- NormalizeData(x) %>% - FindVariableFeatures() %>% - ScaleData() %>% - RunPCA() %>% - RunHarmony(group.by.vars = "Sample") - - numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, - k = 10)$dim.est %>% - round() - x <- x %>% - RunUMAP(dims = 1:numPCs, - reduction = "harmony", - metric = "euclidean", - min.dist = 0.1) %>% - FindNeighbors(reduction = "harmony", - dims = 1:numPCs) %>% - FindClusters(resolution = res) %>% - SetIdent(value = default.res) - return(x)}, - future.seed=TRUE) - +## Only recluster epithelia as the other lineages didn't have more doublets +lin.list$Epithelia <- preprocess(lin.list.filt$Epithelia) # Save outputs ------------------------------------------------------------ ## Save UMAPs and objects - objdir <- paste0(outdir, "objects/") dir.create(objdir, showWarnings = FALSE) for (lin in names(lin.list)) { diff --git a/scripts/04_annotation.R b/scripts/04_annotation.R new file mode 100644 index 0000000000000000000000000000000000000000..5c1d175c8f0a5c2609bbca99383da55b92ae9228 --- /dev/null +++ b/scripts/04_annotation.R @@ -0,0 +1,232 @@ +# Annotation and stress score filtering +# Author: John T Lafin +# Updated: 2024-10-15 + +print("Script begin at: ") +Sys.time() + +# Setup ------------------------------------------------------------------- + +# Load packages +library(dplyr) +library(Seurat) +library(intrinsicDimension) +library(ggplot2) +library(harmony) +library(future) +library(future.apply) + +# Future initialization +future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows +n_workers <- 4 +plan(future_plan, workers=n_workers) +options(future.globals.maxSize = 8000 * 1024^2) +set.seed(42) + +# Create output dir +outdir <- "data_modified/04_annotation/" +dir.create(outdir, showWarnings = FALSE) + +# Create pre-processing function +preprocess <- function(x, + cluster.res = seq(0.1,1.5,0.1), + default.res = "RNA_snn_res.0.4", + batch.var = "Sample"){ + x <- x %>% + NormalizeData() %>% + FindVariableFeatures() %>% + ScaleData() %>% + RunPCA() %>% + RunHarmony(group.by.vars = batch.var) + + numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, + k = 10)$dim.est %>% + round() + x <- x %>% + RunUMAP(dims = 1:numPCs, + reduction = "harmony", + metric = "euclidean", + min.dist = 0.1) %>% + FindNeighbors(reduction = "harmony", + dims = 1:numPCs) %>% + FindClusters(resolution = cluster.res) %>% + SetIdent(value = default.res) + return(x) +} + +# Load data --------------------------------------------------------------- + +lin.list <- list() +lins <- list.files('data_modified/03_lineage_filtering/objects/') %>% + sub(".rds", "", .) +for (lin in lins) { + lin.list[[lin]] <- readRDS(paste0("data_modified/03_lineage_filtering/objects/", lin, ".rds")) +} + +# Annotation -------------------------------------------------------------- + +annotdir <- paste0(outdir, "annotations_first_pass/") +dir.create(annotdir, showWarnings=FALSE) + +## Create list of labels +annot.list <- list() +annot.list$Endothelia <- c("Venous endothelia", + "Venous endothelia", + "Capillary endothelia", + "Capillary endothelia", + "Arterial endothelia", + "Lymphatic endothelia") +annot.list$Epithelia <- c("transitional basal epithelia of urethra", + "basal epithelia of prostate", + "transitional basal epithelia of urethra", + "squamous intermediate epithelia of urethra", + "basal epithelia of prostate", + "transitional intermediate epithelia of urethra", + "squamous intermediate epithelia of urethra", + "luminal epithelia of prostate", + "club epithelia of urethra", + "transitional intermediate epithelia of urethra", + "club epithelia of urethra", + "stressed cells", + "stressed cells", + "squamous intermediate epithelia of urethra", + "squamous intermediate epithelia of urethra", + "basal epithelia of prostate", + "transitional basal epithelia of urethra", + "basal epithelia of prostate", + "proliferative cells", + "squamous luminal epithelia of urethra", + "stressed cells", + "transitional basal epithelia of urethra", + "basal epithelia of prostate") +annot.list$Leukocyte <- c("T cell", + "Macrophage", + "Mast cell", + "T cell", + "T cell", + "NK cell", + "Proliferating leukocyte", + "B cell", + "Dendritic cell", + "Plasma cell") +annot.list$Stroma <- c("pericyte", + "major interstitial fibroblast", + "smooth muscle", + "minor interstitial fibroblast", + "smooth muscle", + "smooth muscle", + "periepithelial fibroblast of vagina", + "vascular smooth muscle", + "smooth muscle", + "periepithelial fibroblast of urethra", + "stressed cell", + "pericyte", + "glia", + "smooth muscle", + "endoneurial fibroblast", + "pericyte", + "glia") + +## Set epithelia and stroma to higher resolutions +Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5" +Idents(lin.list$Stroma) <- "RNA_snn_res.1" + +## Add the labels +for (lin in lins) { + names(annot.list[[lin]]) <- levels(lin.list[[lin]]) + lin.list[[lin]] <- RenameIdents(lin.list[[lin]], annot.list[[lin]]) %>% + StashIdent("cellannotation") +} + +## Add neuroendocrine cells +NE.markers <- strsplit("CHGA GRP CALCA SCG2 TPH1 ASCL1 SCG5 PHGR1 MIAT MAOB", split = " ") +lin.list$Epithelia <- AddModuleScore(lin.list$Epithelia, + features = NE.markers, + name = "NE.score") +lin.list$Epithelia@meta.data <- rename(lin.list$Epithelia@meta.data, NE.score = NE.score1) +VlnPlot(lin.list$Epithelia, "NE.score", group.by = "orig.ident") + + NoLegend() + + geom_hline(yintercept=0.3, linetype="dashed") +ggsave(paste0(annotdir, "NE_score_cutoff.png")) + +lin.list$Epithelia@meta.data <- lin.list$Epithelia@meta.data %>% + mutate(cellannotation = if_else(NE.score > 0.3, "Neuroendocrine cell", cellannotation)) + +## Add ionocytes +VlnPlot(lin.list$Epithelia, "FOXI1", group.by = "orig.ident") + + NoLegend() + + geom_hline(yintercept=0.3, linetype="dashed") +ggsave(paste0(annotdir, "FOXI1_score_cutoff.png")) + +ionocyte.barcodes <- WhichCells(lin.list$Epithelia, expression = FOXI1 > 0.3) +lin.list$Epithelia@meta.data[ionocyte.barcodes, "cellannotation"] <- "Ionocyte" + +## Save updates to object +lin.list$Epithelia <- StashIdent(lin.list$Epithelia, "cellannotation") + +## Split periepithelial fibroblasts by sex +fib_rename <- WhichCells(lin.list$Stroma, + idents="periepithelial fibroblast of vagina", + expression = Sex == "male") +levels(lin.list$Stroma$cellannotation) <- c(levels(lin.list$Stroma$cellannotation), "periepithelial fibroblast of prostate") +lin.list$Stroma@meta.data[fib_rename, "cellannotation"] <- "periepithelial fibroblast of prostate" +Idents(stroma) <- "cellannotation" + +## Save labeled UMAP plots +for (lin in lins) { + DimPlot(lin.list[[lin]]) + ggsave(paste0(annotdir, lin,"_umap.png")) +} + +# Stress score filtering -------------------------------------------------- + +stressdir <- paste0(outdir, "stress/") +dir.create(stressdir, showWarnings = FALSE) + +## Produce violin plots of stress scores +lin.list <- lapply(lin.list, SetIdent, value = "RNA_snn_res.1") + +### Check epithelia at higher resolution to capture other populations +Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5" + +for (lin in names(lin.list)) { + VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) + + NoLegend() + ggsave(paste0(stressdir, lin, "_pre-filter.png"), + width=11, + height=8.5) +} + +## Define stressed clusters based on violin plots +stressed.cells <- list() +stressed.cells$Epithelia <- c(11,12,20) +stressed.cells$Stroma <- c(2,10,15) +stressed.cells$Endothelia <- c(4,10) +stressed.cells$Leukocyte <- c(4,9) + +## Remove stressed clusters and reprocess +for (lin in names(lin.list)) { + lin.list[[lin]] <- subset(lin.list[[lin]], + idents = stressed.cells[[lin]], + invert = TRUE) +} + +lin.list <- lapply(lin.list, preprocess) + +### TO-DO +# Second round of stress filtering on stroma +# Revisit epithelia to relabel +# Make sure that ionocytes and NE cells are present in epi labels + + + + + +# Produce post-filter stress plots +for (lin in names(lin.list)) { + VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) + + NoLegend() + ggsave(paste0(stressdir, lin, "_first_filter.png"), + width=11, + height=8.5) +}