diff --git a/scripts/03_lineage_filtering.R b/scripts/03_lineage_filtering.R new file mode 100644 index 0000000000000000000000000000000000000000..1bfa5df479c17ca9eecc723e68378a4b68964bbb --- /dev/null +++ b/scripts/03_lineage_filtering.R @@ -0,0 +1,277 @@ +# Lineage-dependent filtering +# Author: John T Lafin +# Updated: 2024-09-17 + +print("Script begin at: ") +Sys.time() + +# Setup ------------------------------------------------------------------- + +# Load packages +library(dplyr) +library(Seurat) +library(intrinsicDimension) +library(ggplot2) +library(harmony) +library(future) +library(future.apply) +library(clustree) + +# Future initialization +future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows +n_workers <- 4 +plan(future_plan, workers=n_workers) +options(future.globals.maxSize = 8000 * 1024^2) +set.seed(42) + +# Create output dir +outdir <- "data_modified/03_lineage_filtering/" +dir.create(outdir, showWarnings = FALSE) + +# Load data --------------------------------------------------------------- + +sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds") + +# Remove this field or Seurat won't work properly +sc.h$ident <- NULL +default.res <- "RNA_snn_res.0.4" +Idents(sc.h) <- default.res + + +# Annotate lineages ------------------------------------------------------- + +# Using information generated from 02_preprocess.R, annotate lineages +cluster_epi <- c(0, 1, 7, 8, 9, 11, 15) +cluster_endo <- c(4, 6, 12) +cluster_leu <- c(10, 14, 16) +cluster_stroma <- c(2, 3, 5, 13) +cluster_doublets <- c(17, 18) + +# Make assignments +cluster.ids <- vector("character", length = length(levels(sc.h))) +cluster.ids[cluster_epi+1] <- "Epithelia" +cluster.ids[cluster_endo+1] <- "Endothelia" +cluster.ids[cluster_leu+1] <- "Leukocyte" +cluster.ids[cluster_stroma+1] <- "Stroma" +cluster.ids[cluster_doublets+1] <- "Putative doublet" +names(cluster.ids) <- levels(sc.h) +sc.h <- RenameIdents(sc.h, cluster.ids) %>% + StashIdent("Lineage") + +# Generate UMAP with lineage assignments +umapdir <- paste0(outdir, "umaps/") +dir.create(umapdir, showWarnings = FALSE) + +Idents(sc.h) <- "Lineage" +DimPlot(sc.h) +ggsave(paste0(umapdir, "global_lineage.png")) + +# Dot plot to check marker leakage +genes <- strsplit("EPCAM KRT5 KRT14 MSMB SEMG1 SELE PECAM1 PTPRC SRGN C1QA DCN LUM DES ACTA1 ACTG2 CRYAB CDH19", + split = " ")[[1]] +DotPlot(sc.h, features = genes) + + theme(axis.text.x = element_text(angle=45, hjust=1)) +ggsave(paste0(outdir, "dotplot_global_lineage.png")) + +# Subset and recluster ---------------------------------------------------- + +lin.list <- SplitObject(sc.h, split.by="Lineage") +lin.list[["Putative doublet"]] <- NULL # Remove doublets + +res <- seq(0.1, 1, 0.1) + +lin.list <- lin.list %>% + future_lapply(function(x){ + x <- NormalizeData(x) %>% + FindVariableFeatures() %>% + ScaleData() %>% + RunPCA() %>% + RunHarmony(group.by.vars = "Sample") + + numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, + k = 10)$dim.est %>% + round() + x <- x %>% + RunUMAP(dims = 1:numPCs, + reduction = "harmony", + metric = "euclidean", + min.dist = 0.1) %>% + FindNeighbors(reduction = "harmony", + dims = 1:numPCs) %>% + FindClusters(resolution = res) %>% + SetIdent(value = default.res) + return(x)}, + future.seed=TRUE) + +# Save UMAP of each subset +for ( lin in names(lin.list) ){ + DimPlot(lin.list[[lin]], label=TRUE) + ggsave(paste0(umapdir, lin, "_prefilter.png")) +} + + +# Lineage-dependent filtering --------------------------------------------- + +## Identify and remove clusters enriched for likely doublets + +#### Save doublet plots of each lineage +dblts <- list() +for (lin in names(lin.list)) { + dblts[[lin]] <- lin.list[[lin]]@meta.data %>% + group_by(.data[[default.res]], scDblFinder.class) %>% + summarize(count = n()) %>% + group_by(.data[[default.res]]) %>% + mutate(fraction = count/sum(count)) + ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) + + geom_col() + + geom_hline(yintercept = 0.5, linetype = "dashed") + ggsave(paste0(outdir, "dblts_", lin, ".png")) +} + +### Remove clusters with >50% of cells classed as doublets +lin.list.filt <- list() +keep <- list() +for (lin in names(lin.list)) { + keep[[lin]] <- dblts[[lin]] %>% + filter(scDblFinder.class == "singlet" & fraction > 0.5) %>% + pull("RNA_snn_res.0.4") + lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]]) +} + +## Library size filtering + +### Calculate thresholds + +mads_mult <- 2 # Number of MADs away from median to filter +dt <- data.frame(lineage = character(), + nCount_RNA = numeric()) +for (i in names(lin.list.filt)) { + dt <- add_row(dt, lineage = i, nCount_RNA = lin.list.filt[[i]]$nCount_RNA) +} +mt.dt <- dt %>% + group_by(lineage) %>% + mutate(log.nCount_RNA = log(nCount_RNA)) %>% + summarize(median = median(nCount_RNA), + log.median = median(log.nCount_RNA), + log.mad = mad(log.nCount_RNA)) %>% + mutate(cutoff = exp(log.median - mads_mult * log.mad)) %>% + as.data.frame() + +rownames(mt.dt) <- mt.dt$lineage +mt.dt$lineage <- NULL + +### Save thresholds +write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv")) + +### Save violin plots with cutoffs +for (g in names(lin.list.filt)) { + VlnPlot(lin.list.filt[[g]], "nCount_RNA", pt.size = 0) + + geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") + + scale_y_log10() + + NoLegend() + ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png"))) +} + +### Apply library size filter +for (i in names(lin.list.filt) ) { + lin.list.filt[[i]] <- subset(lin.list.filt[[i]], subset = nCount_RNA > mt.dt[i, "cutoff"]) +} + +### Record how many cells were removed +cells.old <- lapply(lin.list, + ncol) +cells.new <- lapply(lin.list.filt, + ncol) +tibble("Lineage" = names(cells.old), + "Original" = unlist(cells.old), + "Filtered" = unlist(cells.new)) %>% + mutate("Difference" = Original - Filtered, + "Fraction_removed" = Difference / Original) %>% + write.csv(paste0(outdir, "fraction_removed_nCount.csv")) + + +# Recluster --------------------------------------------------------------- + +lin.list <- lin.list.filt %>% + future_lapply(function(x){ + x <- NormalizeData(x) %>% + FindVariableFeatures() %>% + ScaleData() %>% + RunPCA() %>% + RunHarmony(group.by.vars = "Sample") + + numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, + k = 10)$dim.est %>% + round() + x <- x %>% + RunUMAP(dims = 1:numPCs, + reduction = "harmony", + metric = "euclidean", + min.dist = 0.1) %>% + FindNeighbors(reduction = "harmony", + dims = 1:numPCs) %>% + FindClusters(resolution = res) %>% + SetIdent(value = default.res) + return(x)}, + future.seed=TRUE) + + +# Doublet removal, second round ------------------------------------------- + +dblts <- list() +for (lin in names(lin.list)) { + dblts[[lin]] <- lin.list[[lin]]@meta.data %>% + group_by(.data[[default.res]], scDblFinder.class) %>% + summarize(count = n()) %>% + group_by(.data[[default.res]]) %>% + mutate(fraction = count/sum(count)) +} + +keep <- list() +for (lin in names(lin.list)) { + keep[[lin]] <- dblts[[lin]] %>% + filter(scDblFinder.class == "singlet" & fraction > 0.5) %>% + pull("RNA_snn_res.0.4") + lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]]) +} + +## Recluster again + +lin.list <- lin.list %>% + future_lapply(function(x){ + x <- NormalizeData(x) %>% + FindVariableFeatures() %>% + ScaleData() %>% + RunPCA() %>% + RunHarmony(group.by.vars = "Sample") + + numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings, + k = 10)$dim.est %>% + round() + x <- x %>% + RunUMAP(dims = 1:numPCs, + reduction = "harmony", + metric = "euclidean", + min.dist = 0.1) %>% + FindNeighbors(reduction = "harmony", + dims = 1:numPCs) %>% + FindClusters(resolution = res) %>% + SetIdent(value = default.res) + return(x)}, + future.seed=TRUE) + + +# Save outputs ------------------------------------------------------------ + +## Save UMAPs and objects + +objdir <- paste0(outdir, "objects/") +dir.create(objdir, showWarnings = FALSE) +for (lin in names(lin.list)) { + DimPlot(lin.list[[lin]]) + ggsave(paste0(umapdir, lin, "_postfilter.png")) + saveRDS(lin.list[[lin]], paste0(objdir, lin, ".rds")) +} + +## Save session info +capture.output(sessionInfo(), file = paste0(outdir, "session_info.txt"))