Add lineage filtering

b484c159 · John Lafin · cca00d94 · b484c159
Commit b484c159 authored 7 months ago by John Lafin
--- a/scripts/03_lineage_filtering.R
+++ b/scripts/03_lineage_filtering.R
+# Lineage-dependent filtering
+# Author: John T Lafin
+# Updated: 2024-09-17
+
+print("Script begin at: ")
+Sys.time()
+
+# Setup -------------------------------------------------------------------
+
+# Load packages
+library(dplyr)
+library(Seurat)
+library(intrinsicDimension)
+library(ggplot2)
+library(harmony)
+library(future)
+library(future.apply)
+library(clustree)
+
+# Future initialization
+future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
+n_workers <- 4
+plan(future_plan, workers=n_workers)  
+options(future.globals.maxSize = 8000 * 1024^2)
+set.seed(42)
+
+# Create output dir
+outdir <- "data_modified/03_lineage_filtering/"
+dir.create(outdir, showWarnings = FALSE)
+
+# Load data ---------------------------------------------------------------
+
+sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds")
+
+# Remove this field or Seurat won't work properly
+sc.h$ident <- NULL
+default.res <- "RNA_snn_res.0.4"
+Idents(sc.h) <- default.res
+
+
+# Annotate lineages -------------------------------------------------------
+
+# Using information generated from 02_preprocess.R, annotate lineages
+cluster_epi <- c(0, 1, 7, 8, 9, 11, 15)
+cluster_endo <- c(4, 6, 12)
+cluster_leu <- c(10, 14, 16)
+cluster_stroma <- c(2, 3, 5, 13)
+cluster_doublets <- c(17, 18)
+
+# Make assignments
+cluster.ids <- vector("character", length = length(levels(sc.h)))
+cluster.ids[cluster_epi+1] <- "Epithelia"
+cluster.ids[cluster_endo+1] <- "Endothelia"
+cluster.ids[cluster_leu+1] <- "Leukocyte"
+cluster.ids[cluster_stroma+1] <- "Stroma"
+cluster.ids[cluster_doublets+1] <- "Putative doublet"
+names(cluster.ids) <- levels(sc.h)
+sc.h <- RenameIdents(sc.h, cluster.ids) %>%
+  StashIdent("Lineage")
+
+# Generate UMAP with lineage assignments
+umapdir <- paste0(outdir, "umaps/")
+dir.create(umapdir, showWarnings = FALSE)
+
+Idents(sc.h) <- "Lineage"
+DimPlot(sc.h)
+ggsave(paste0(umapdir, "global_lineage.png"))
+
+# Dot plot to check marker leakage
+genes <- strsplit("EPCAM KRT5 KRT14 MSMB SEMG1 SELE PECAM1 PTPRC SRGN C1QA DCN LUM DES ACTA1 ACTG2 CRYAB CDH19",
+                  split = " ")[[1]]
+DotPlot(sc.h, features = genes) + 
+  theme(axis.text.x = element_text(angle=45, hjust=1))
+ggsave(paste0(outdir, "dotplot_global_lineage.png"))
+
+# Subset and recluster ----------------------------------------------------
+
+lin.list <- SplitObject(sc.h, split.by="Lineage")
+lin.list[["Putative doublet"]] <- NULL # Remove doublets
+
+res <- seq(0.1, 1, 0.1)
+
+lin.list <- lin.list %>%
+  future_lapply(function(x){
+    x <- NormalizeData(x) %>%
+      FindVariableFeatures() %>%
+      ScaleData() %>%
+      RunPCA() %>%
+      RunHarmony(group.by.vars = "Sample")
+    
+    numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
+                                 k = 10)$dim.est %>%
+      round()
+    x <- x %>%
+      RunUMAP(dims = 1:numPCs,
+              reduction = "harmony",
+              metric = "euclidean",
+              min.dist = 0.1) %>%
+      FindNeighbors(reduction = "harmony",
+                    dims = 1:numPCs) %>%
+      FindClusters(resolution = res) %>%
+      SetIdent(value = default.res)
+    return(x)},
+    future.seed=TRUE)
+
+# Save UMAP of each subset
+for ( lin in names(lin.list) ){
+  DimPlot(lin.list[[lin]], label=TRUE)
+  ggsave(paste0(umapdir, lin, "_prefilter.png"))
+}
+
+
+# Lineage-dependent filtering ---------------------------------------------
+
+## Identify and remove clusters enriched for likely doublets
+
+#### Save doublet plots of each lineage
+dblts <- list()
+for (lin in names(lin.list)) {
+  dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
+    group_by(.data[[default.res]], scDblFinder.class) %>%
+    summarize(count = n()) %>%
+    group_by(.data[[default.res]]) %>%
+    mutate(fraction = count/sum(count))
+  ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
+    geom_col() + 
+    geom_hline(yintercept = 0.5, linetype = "dashed")
+  ggsave(paste0(outdir, "dblts_", lin, ".png"))
+}
+
+### Remove clusters with >50% of cells classed as doublets
+lin.list.filt <- list()
+keep <- list()
+for (lin in names(lin.list)) {
+  keep[[lin]] <- dblts[[lin]] %>%
+    filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
+    pull("RNA_snn_res.0.4")
+  lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
+}
+
+## Library size filtering
+
+### Calculate thresholds
+
+mads_mult <- 2 # Number of MADs away from median to filter
+dt <- data.frame(lineage = character(),
+                 nCount_RNA = numeric())
+for (i in names(lin.list.filt)) {
+  dt <- add_row(dt, lineage = i, nCount_RNA = lin.list.filt[[i]]$nCount_RNA)
+}
+mt.dt <- dt %>%
+  group_by(lineage) %>%
+  mutate(log.nCount_RNA = log(nCount_RNA)) %>% 
+  summarize(median = median(nCount_RNA), 
+            log.median = median(log.nCount_RNA),
+            log.mad = mad(log.nCount_RNA)) %>%
+  mutate(cutoff = exp(log.median - mads_mult * log.mad)) %>%
+  as.data.frame()
+
+rownames(mt.dt) <- mt.dt$lineage
+mt.dt$lineage <- NULL
+
+### Save thresholds
+write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv"))
+
+### Save violin plots with cutoffs
+for (g in names(lin.list.filt)) {
+  VlnPlot(lin.list.filt[[g]], "nCount_RNA", pt.size = 0) + 
+          geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") +
+          scale_y_log10() +
+          NoLegend()
+  ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png")))
+}
+
+### Apply library size filter
+for (i in names(lin.list.filt) ) {
+  lin.list.filt[[i]] <- subset(lin.list.filt[[i]], subset = nCount_RNA > mt.dt[i, "cutoff"])
+}
+
+### Record how many cells were removed
+cells.old <- lapply(lin.list,
+                    ncol)
+cells.new <- lapply(lin.list.filt, 
+                    ncol)
+tibble("Lineage" = names(cells.old),
+       "Original" = unlist(cells.old),
+       "Filtered" = unlist(cells.new)) %>%
+  mutate("Difference" = Original - Filtered,
+         "Fraction_removed" = Difference / Original) %>%
+  write.csv(paste0(outdir, "fraction_removed_nCount.csv"))
+
+
+# Recluster ---------------------------------------------------------------
+
+lin.list <- lin.list.filt %>%
+  future_lapply(function(x){
+    x <- NormalizeData(x) %>%
+      FindVariableFeatures() %>%
+      ScaleData() %>%
+      RunPCA() %>%
+      RunHarmony(group.by.vars = "Sample")
+    
+    numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
+                                 k = 10)$dim.est %>%
+      round()
+    x <- x %>%
+      RunUMAP(dims = 1:numPCs,
+              reduction = "harmony",
+              metric = "euclidean",
+              min.dist = 0.1) %>%
+      FindNeighbors(reduction = "harmony",
+                    dims = 1:numPCs) %>%
+      FindClusters(resolution = res) %>%
+      SetIdent(value = default.res)
+    return(x)},
+    future.seed=TRUE)
+
+
+# Doublet removal, second round -------------------------------------------
+
+dblts <- list()
+for (lin in names(lin.list)) {
+  dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
+    group_by(.data[[default.res]], scDblFinder.class) %>%
+    summarize(count = n()) %>%
+    group_by(.data[[default.res]]) %>%
+    mutate(fraction = count/sum(count))
+}
+
+keep <- list()
+for (lin in names(lin.list)) {
+  keep[[lin]] <- dblts[[lin]] %>%
+    filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
+    pull("RNA_snn_res.0.4")
+  lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
+}
+
+## Recluster again
+
+lin.list <- lin.list %>%
+  future_lapply(function(x){
+    x <- NormalizeData(x) %>%
+      FindVariableFeatures() %>%
+      ScaleData() %>%
+      RunPCA() %>%
+      RunHarmony(group.by.vars = "Sample")
+    
+    numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, 
+                                 k = 10)$dim.est %>%
+      round()
+    x <- x %>%
+      RunUMAP(dims = 1:numPCs,
+              reduction = "harmony",
+              metric = "euclidean",
+              min.dist = 0.1) %>%
+      FindNeighbors(reduction = "harmony",
+                    dims = 1:numPCs) %>%
+      FindClusters(resolution = res) %>%
+      SetIdent(value = default.res)
+    return(x)},
+    future.seed=TRUE)
+
+
+# Save outputs ------------------------------------------------------------
+
+## Save UMAPs and objects
+
+objdir <- paste0(outdir, "objects/")
+dir.create(objdir, showWarnings = FALSE)
+for (lin in names(lin.list)) {
+  DimPlot(lin.list[[lin]])
+  ggsave(paste0(umapdir, lin, "_postfilter.png"))
+  saveRDS(lin.list[[lin]], paste0(objdir, lin, ".rds"))
+}
+
+## Save session info
+capture.output(sessionInfo(), file = paste0(outdir, "session_info.txt"))