Update lineage filtering; move stress filter to annot script

e72babe6 · John Lafin · e0e9f125 · e72babe6 · e72babe6
Commit e72babe6 authored 6 months ago by John Lafin
--- a/scripts/03_lineage_filtering.R
+++ b/scripts/03_lineage_filtering.R
 # Lineage-dependent filtering
 # Author: John T Lafin
-# Updated: 2024-09-17
+# Updated: 2024-10-15

 print("Script begin at: ")
 Sys.time()
@@ -28,6 +28,33 @@ set.seed(42)
 outdir <- "data_modified/03_lineage_filtering/"
 dir.create(outdir, showWarnings = FALSE)

+# Create pre-processing function
+preprocess <- function(x, 
+                       cluster.res = seq(0.1,1.5,0.1), 
+                       default.res = "RNA_snn_res.0.4", 
+                       batch.var = "Sample"){
+  x <- x %>%
+    NormalizeData() %>%
+    FindVariableFeatures() %>%
+    ScaleData() %>%
+    RunPCA() %>%
+    RunHarmony(group.by.vars = batch.var)
+  
+  numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
+                               k = 10)$dim.est %>%
+    round()
+  x <- x %>%
+    RunUMAP(dims = 1:numPCs,
+            reduction = "harmony",
+            metric = "euclidean",
+            min.dist = 0.1) %>%
+    FindNeighbors(reduction = "harmony",
+                  dims = 1:numPCs) %>%
+    FindClusters(resolution = cluster.res) %>%
+    SetIdent(value = default.res)
+  return(x)
+}
+
 # Load data ---------------------------------------------------------------

 sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds")
@@ -78,30 +105,8 @@ ggsave(paste0(outdir, "dotplot_global_lineage.png"))
 lin.list <- SplitObject(sc.h, split.by="Lineage")
 lin.list[["Putative doublet"]] <- NULL # Remove doublets

-res <- seq(0.1, 1, 0.1)
-
-lin.list <- lin.list %>%
-  future_lapply(function(x){
-    x <- NormalizeData(x) %>%
-      FindVariableFeatures() %>%
-      ScaleData() %>%
-      RunPCA() %>%
-      RunHarmony(group.by.vars = "Sample")
-    
-    numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
-                                 k = 10)$dim.est %>%
-      round()
-    x <- x %>%
-      RunUMAP(dims = 1:numPCs,
-              reduction = "harmony",
-              metric = "euclidean",
-              min.dist = 0.1) %>%
-      FindNeighbors(reduction = "harmony",
-                    dims = 1:numPCs) %>%
-      FindClusters(resolution = res) %>%
-      SetIdent(value = default.res)
-    return(x)},
-    future.seed=TRUE)
+res <- seq(0.1, 1.5, 0.1)
+lin.list <- future_lapply(lin.list, preprocess, future.seed=TRUE)

 # Save UMAP of each subset
 for ( lin in names(lin.list) ){
@@ -109,10 +114,11 @@ for ( lin in names(lin.list) ){
  ggsave(paste0(umapdir, lin, "_prefilter.png"))
 }

-
 # Lineage-dependent filtering ---------------------------------------------

 ## Identify and remove clusters enriched for likely doublets
+dbltdir <- paste0(outdir, "doublets/")
+dir.create(dbltdir, showWarnings=FALSE)

 #### Save doublet plots of each lineage
 dblts <- list()
@@ -125,20 +131,22 @@ for (lin in names(lin.list)) {
  ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
    geom_col() + 
    geom_hline(yintercept = 0.5, linetype = "dashed")
-  ggsave(paste0(outdir, "dblts_", lin, ".png"))
+  ggsave(paste0(dbltdir, lin, "_pass1.png"))
 }

-### Remove clusters with >50% of cells classed as doublets
+### Remove clusters with >=50% of cells classed as doublets
 lin.list.filt <- list()
 keep <- list()
 for (lin in names(lin.list)) {
  keep[[lin]] <- dblts[[lin]] %>%
    filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
-    pull("RNA_snn_res.0.4")
+    pull(.data[[default.res]])
  lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
 }

 ## Library size filtering
+ncountdir <- paste0(outdir, "ncount/")
+dir.create(ncountdir, showWarnings=FALSE)

 ### Calculate thresholds

@@ -161,7 +169,7 @@ rownames(mt.dt) <- mt.dt$lineage
 mt.dt$lineage <- NULL

 ### Save thresholds
-write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv"))
+write.csv(mt.dt, paste0(ncountdir, "nCount_cutoffs.csv"))

 ### Save violin plots with cutoffs
 for (g in names(lin.list.filt)) {
@@ -169,7 +177,7 @@ for (g in names(lin.list.filt)) {
          geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") +
          scale_y_log10() +
          NoLegend()
-  ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png")))
+  ggsave(paste0(ncountdir, g, "_nCount.png"))
 }

 ### Apply library size filter
@@ -187,37 +195,16 @@ tibble("Lineage" = names(cells.old),
       "Filtered" = unlist(cells.new)) %>%
  mutate("Difference" = Original - Filtered,
         "Fraction_removed" = Difference / Original) %>%
-  write.csv(paste0(outdir, "fraction_removed_nCount.csv"))
+  write.csv(paste0(ncountdir, "fraction_removed_nCount.csv"))


 # Recluster ---------------------------------------------------------------

-lin.list <- lin.list.filt %>%
-  future_lapply(function(x){
-    x <- NormalizeData(x) %>%
-      FindVariableFeatures() %>%
-      ScaleData() %>%
-      RunPCA() %>%
-      RunHarmony(group.by.vars = "Sample")
-    
-    numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
-                                 k = 10)$dim.est %>%
-      round()
-    x <- x %>%
-      RunUMAP(dims = 1:numPCs,
-              reduction = "harmony",
-              metric = "euclidean",
-              min.dist = 0.1) %>%
-      FindNeighbors(reduction = "harmony",
-                    dims = 1:numPCs) %>%
-      FindClusters(resolution = res) %>%
-      SetIdent(value = default.res)
-    return(x)},
-    future.seed=TRUE)
-
+lin.list <- future_lapply(lin.list.filt, preprocess, future.seed=TRUE)

 # Doublet removal, second round -------------------------------------------

+## Create plots of doublets
 dblts <- list()
 for (lin in names(lin.list)) {
  dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
@@ -225,46 +212,28 @@ for (lin in names(lin.list)) {
    summarize(count = n()) %>%
    group_by(.data[[default.res]]) %>%
    mutate(fraction = count/sum(count))
+  ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
+    geom_col() + 
+    geom_hline(yintercept = 0.5, linetype = "dashed")
+  ggsave(paste0(dbltdir, lin, "_pass2.png"))
 }

+## Remove doublets
 keep <- list()
 for (lin in names(lin.list)) {
  keep[[lin]] <- dblts[[lin]] %>%
    filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
-    pull("RNA_snn_res.0.4")
-  lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
+    pull(.data[[default.res]])
+  lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
 }

 ## Recluster again
-
-lin.list <- lin.list %>%
-  future_lapply(function(x){
-    x <- NormalizeData(x) %>%
-      FindVariableFeatures() %>%
-      ScaleData() %>%
-      RunPCA() %>%
-      RunHarmony(group.by.vars = "Sample")
-    
-    numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
-                                 k = 10)$dim.est %>%
-      round()
-    x <- x %>%
-      RunUMAP(dims = 1:numPCs,
-              reduction = "harmony",
-              metric = "euclidean",
-              min.dist = 0.1) %>%
-      FindNeighbors(reduction = "harmony",
-                    dims = 1:numPCs) %>%
-      FindClusters(resolution = res) %>%
-      SetIdent(value = default.res)
-    return(x)},
-    future.seed=TRUE)
-
+## Only recluster epithelia as the other lineages didn't have more doublets
+lin.list$Epithelia <- preprocess(lin.list.filt$Epithelia)

 # Save outputs ------------------------------------------------------------

 ## Save UMAPs and objects
-
 objdir <- paste0(outdir, "objects/")
 dir.create(objdir, showWarnings = FALSE)
 for (lin in names(lin.list)) {

--- a/scripts/04_annotation.R
+++ b/scripts/04_annotation.R
+# Annotation and stress score filtering
+# Author: John T Lafin
+# Updated: 2024-10-15
+
+print("Script begin at: ")
+Sys.time()
+
+# Setup -------------------------------------------------------------------
+
+# Load packages
+library(dplyr)
+library(Seurat)
+library(intrinsicDimension)
+library(ggplot2)
+library(harmony)
+library(future)
+library(future.apply)
+
+# Future initialization
+future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
+n_workers <- 4
+plan(future_plan, workers=n_workers)  
+options(future.globals.maxSize = 8000 * 1024^2)
+set.seed(42)
+
+# Create output dir
+outdir <- "data_modified/04_annotation/"
+dir.create(outdir, showWarnings = FALSE)
+
+# Create pre-processing function
+preprocess <- function(x, 
+                       cluster.res = seq(0.1,1.5,0.1), 
+                       default.res = "RNA_snn_res.0.4", 
+                       batch.var = "Sample"){
+  x <- x %>%
+    NormalizeData() %>%
+    FindVariableFeatures() %>%
+    ScaleData() %>%
+    RunPCA() %>%
+    RunHarmony(group.by.vars = batch.var)
+  
+  numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
+                               k = 10)$dim.est %>%
+    round()
+  x <- x %>%
+    RunUMAP(dims = 1:numPCs,
+            reduction = "harmony",
+            metric = "euclidean",
+            min.dist = 0.1) %>%
+    FindNeighbors(reduction = "harmony",
+                  dims = 1:numPCs) %>%
+    FindClusters(resolution = cluster.res) %>%
+    SetIdent(value = default.res)
+  return(x)
+}
+
+# Load data ---------------------------------------------------------------
+
+lin.list <- list()
+lins <- list.files('data_modified/03_lineage_filtering/objects/') %>%
+  sub(".rds", "", .)
+for (lin in lins) {
+  lin.list[[lin]] <- readRDS(paste0("data_modified/03_lineage_filtering/objects/", lin, ".rds"))
+}
+
+# Annotation --------------------------------------------------------------
+
+annotdir <- paste0(outdir, "annotations_first_pass/")
+dir.create(annotdir, showWarnings=FALSE)
+
+## Create list of labels
+annot.list <- list()
+annot.list$Endothelia <- c("Venous endothelia",
+                           "Venous endothelia",
+                           "Capillary endothelia",
+                           "Capillary endothelia",
+                           "Arterial endothelia",
+                           "Lymphatic endothelia")
+annot.list$Epithelia <- c("transitional basal epithelia of urethra",
+                          "basal epithelia of prostate",
+                          "transitional basal epithelia of urethra",
+                          "squamous intermediate epithelia of urethra",
+                          "basal epithelia of prostate",
+                          "transitional intermediate epithelia of urethra",
+                          "squamous intermediate epithelia of urethra",
+                          "luminal epithelia of prostate",
+                          "club epithelia of urethra",
+                          "transitional intermediate epithelia of urethra",
+                          "club epithelia of urethra",
+                          "stressed cells",
+                          "stressed cells",
+                          "squamous intermediate epithelia of urethra",
+                          "squamous intermediate epithelia of urethra",
+                          "basal epithelia of prostate",
+                          "transitional basal epithelia of urethra",
+                          "basal epithelia of prostate",
+                          "proliferative cells",
+                          "squamous luminal epithelia of urethra",
+                          "stressed cells",
+                          "transitional basal epithelia of urethra",
+                          "basal epithelia of prostate")
+annot.list$Leukocyte <- c("T cell",
+                          "Macrophage",
+                          "Mast cell",
+                          "T cell",
+                          "T cell",
+                          "NK cell",
+                          "Proliferating leukocyte",
+                          "B cell",
+                          "Dendritic cell",
+                          "Plasma cell")
+annot.list$Stroma <- c("pericyte",
+                       "major interstitial fibroblast",
+                       "smooth muscle",
+                       "minor interstitial fibroblast",
+                       "smooth muscle",
+                       "smooth muscle",
+                       "periepithelial fibroblast of vagina",
+                       "vascular smooth muscle",
+                       "smooth muscle",
+                       "periepithelial fibroblast of urethra",
+                       "stressed cell",
+                       "pericyte",
+                       "glia",
+                       "smooth muscle",
+                       "endoneurial fibroblast",
+                       "pericyte",
+                       "glia")
+
+## Set epithelia and stroma to higher resolutions
+Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5"
+Idents(lin.list$Stroma) <- "RNA_snn_res.1"
+
+## Add the labels
+for (lin in lins) {
+  names(annot.list[[lin]]) <- levels(lin.list[[lin]])
+  lin.list[[lin]] <- RenameIdents(lin.list[[lin]], annot.list[[lin]]) %>%
+    StashIdent("cellannotation")
+}
+
+## Add neuroendocrine cells
+NE.markers <- strsplit("CHGA GRP CALCA SCG2 TPH1 ASCL1 SCG5 PHGR1 MIAT MAOB", split = " ")
+lin.list$Epithelia <- AddModuleScore(lin.list$Epithelia, 
+                       features = NE.markers,
+                       name = "NE.score")
+lin.list$Epithelia@meta.data <- rename(lin.list$Epithelia@meta.data, NE.score = NE.score1)
+VlnPlot(lin.list$Epithelia, "NE.score", group.by = "orig.ident") + 
+  NoLegend() + 
+  geom_hline(yintercept=0.3, linetype="dashed")
+ggsave(paste0(annotdir, "NE_score_cutoff.png"))
+
+lin.list$Epithelia@meta.data <- lin.list$Epithelia@meta.data %>%
+  mutate(cellannotation = if_else(NE.score > 0.3, "Neuroendocrine cell", cellannotation))
+
+## Add ionocytes
+VlnPlot(lin.list$Epithelia, "FOXI1", group.by = "orig.ident") + 
+  NoLegend() + 
+  geom_hline(yintercept=0.3, linetype="dashed")
+ggsave(paste0(annotdir, "FOXI1_score_cutoff.png"))
+
+ionocyte.barcodes <- WhichCells(lin.list$Epithelia, expression = FOXI1 > 0.3)
+lin.list$Epithelia@meta.data[ionocyte.barcodes, "cellannotation"] <- "Ionocyte"
+
+## Save updates to object
+lin.list$Epithelia <- StashIdent(lin.list$Epithelia, "cellannotation")
+
+## Split periepithelial fibroblasts by sex
+fib_rename <- WhichCells(lin.list$Stroma, 
+                         idents="periepithelial fibroblast of vagina", 
+                         expression = Sex == "male")
+levels(lin.list$Stroma$cellannotation) <- c(levels(lin.list$Stroma$cellannotation), "periepithelial fibroblast of prostate")
+lin.list$Stroma@meta.data[fib_rename, "cellannotation"] <- "periepithelial fibroblast of prostate"
+Idents(stroma) <- "cellannotation"
+
+## Save labeled UMAP plots
+for (lin in lins) {
+  DimPlot(lin.list[[lin]])
+  ggsave(paste0(annotdir, lin,"_umap.png"))
+}
+
+# Stress score filtering --------------------------------------------------
+
+stressdir <- paste0(outdir, "stress/")
+dir.create(stressdir, showWarnings = FALSE)
+
+## Produce violin plots of stress scores
+lin.list <- lapply(lin.list, SetIdent, value = "RNA_snn_res.1")
+
+### Check epithelia at higher resolution to capture other populations
+Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5"
+
+for (lin in names(lin.list)) {
+  VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) +
+    NoLegend()
+  ggsave(paste0(stressdir, lin, "_pre-filter.png"),
+         width=11,
+         height=8.5)
+}
+
+## Define stressed clusters based on violin plots
+stressed.cells <- list()
+stressed.cells$Epithelia <- c(11,12,20)
+stressed.cells$Stroma <- c(2,10,15)
+stressed.cells$Endothelia <- c(4,10)
+stressed.cells$Leukocyte <- c(4,9)
+
+## Remove stressed clusters and reprocess
+for (lin in names(lin.list)) {
+  lin.list[[lin]] <- subset(lin.list[[lin]], 
+                            idents = stressed.cells[[lin]],
+                            invert = TRUE)
+}
+
+lin.list <- lapply(lin.list, preprocess)
+
+### TO-DO
+# Second round of stress filtering on stroma
+# Revisit epithelia to relabel
+# Make sure that ionocytes and NE cells are present in epi labels
+
+
+
+
+
+# Produce post-filter stress plots
+for (lin in names(lin.list)) {
+  VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) +
+    NoLegend()
+  ggsave(paste0(stressdir, lin, "_first_filter.png"),
+         width=11,
+         height=8.5)
+}