Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • StrandLab/scrna_urethra
Show changes
Commits on Source (4)
# Lineage-dependent filtering
# Author: John T Lafin
# Updated: 2024-09-17
# Updated: 2024-10-15
print("Script begin at: ")
Sys.time()
......@@ -28,6 +28,33 @@ set.seed(42)
outdir <- "data_modified/03_lineage_filtering/"
dir.create(outdir, showWarnings = FALSE)
# Create pre-processing function
preprocess <- function(x,
cluster.res = seq(0.1,1.5,0.1),
default.res = "RNA_snn_res.0.4",
batch.var = "Sample"){
x <- x %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = batch.var)
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = cluster.res) %>%
SetIdent(value = default.res)
return(x)
}
# Load data ---------------------------------------------------------------
sc.h <- readRDS("data_modified/02_preprocess/processed_object.rds")
......@@ -78,30 +105,8 @@ ggsave(paste0(outdir, "dotplot_global_lineage.png"))
lin.list <- SplitObject(sc.h, split.by="Lineage")
lin.list[["Putative doublet"]] <- NULL # Remove doublets
res <- seq(0.1, 1, 0.1)
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
res <- seq(0.1, 1.5, 0.1)
lin.list <- future_lapply(lin.list, preprocess, future.seed=TRUE)
# Save UMAP of each subset
for ( lin in names(lin.list) ){
......@@ -109,10 +114,11 @@ for ( lin in names(lin.list) ){
ggsave(paste0(umapdir, lin, "_prefilter.png"))
}
# Lineage-dependent filtering ---------------------------------------------
## Identify and remove clusters enriched for likely doublets
dbltdir <- paste0(outdir, "doublets/")
dir.create(dbltdir, showWarnings=FALSE)
#### Save doublet plots of each lineage
dblts <- list()
......@@ -125,20 +131,22 @@ for (lin in names(lin.list)) {
ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(outdir, "dblts_", lin, ".png"))
ggsave(paste0(dbltdir, lin, "_pass1.png"))
}
### Remove clusters with >50% of cells classed as doublets
### Remove clusters with >=50% of cells classed as doublets
lin.list.filt <- list()
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
pull(.data[[default.res]])
lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Library size filtering
ncountdir <- paste0(outdir, "ncount/")
dir.create(ncountdir, showWarnings=FALSE)
### Calculate thresholds
......@@ -161,7 +169,7 @@ rownames(mt.dt) <- mt.dt$lineage
mt.dt$lineage <- NULL
### Save thresholds
write.csv(mt.dt, paste0(outdir, "nCount_cutoffs.csv"))
write.csv(mt.dt, paste0(ncountdir, "nCount_cutoffs.csv"))
### Save violin plots with cutoffs
for (g in names(lin.list.filt)) {
......@@ -169,7 +177,7 @@ for (g in names(lin.list.filt)) {
geom_hline(yintercept = mt.dt[g, "cutoff"], linetype="dashed") +
scale_y_log10() +
NoLegend()
ggsave(paste0(outdir, paste0("vln_nCount_cutoffs_", g, ".png")))
ggsave(paste0(ncountdir, g, "_nCount.png"))
}
### Apply library size filter
......@@ -187,37 +195,16 @@ tibble("Lineage" = names(cells.old),
"Filtered" = unlist(cells.new)) %>%
mutate("Difference" = Original - Filtered,
"Fraction_removed" = Difference / Original) %>%
write.csv(paste0(outdir, "fraction_removed_nCount.csv"))
write.csv(paste0(ncountdir, "fraction_removed_nCount.csv"))
# Recluster ---------------------------------------------------------------
lin.list <- lin.list.filt %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
lin.list <- future_lapply(lin.list.filt, preprocess, future.seed=TRUE)
# Doublet removal, second round -------------------------------------------
## Create plots of doublets
dblts <- list()
for (lin in names(lin.list)) {
dblts[[lin]] <- lin.list[[lin]]@meta.data %>%
......@@ -225,46 +212,28 @@ for (lin in names(lin.list)) {
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
ggplot(dblts[[lin]], aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(dbltdir, lin, "_pass2.png"))
}
## Remove doublets
keep <- list()
for (lin in names(lin.list)) {
keep[[lin]] <- dblts[[lin]] %>%
filter(scDblFinder.class == "singlet" & fraction > 0.5) %>%
pull("RNA_snn_res.0.4")
lin.list[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
pull(.data[[default.res]])
lin.list.filt[[lin]] <- subset(lin.list[[lin]], idents=keep[[lin]])
}
## Recluster again
lin.list <- lin.list %>%
future_lapply(function(x){
x <- NormalizeData(x) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res) %>%
SetIdent(value = default.res)
return(x)},
future.seed=TRUE)
## Only recluster epithelia as the other lineages didn't have more doublets
lin.list$Epithelia <- preprocess(lin.list.filt$Epithelia)
# Save outputs ------------------------------------------------------------
## Save UMAPs and objects
objdir <- paste0(outdir, "objects/")
dir.create(objdir, showWarnings = FALSE)
for (lin in names(lin.list)) {
......
# Annotation and stress score filtering
# Author: John T Lafin
# Updated: 2024-10-16
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(intrinsicDimension)
library(ggplot2)
library(harmony)
library(future)
library(future.apply)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/04_annotation/"
dir.create(outdir, showWarnings = FALSE)
# Create pre-processing function
preprocess <- function(x,
cluster.res = seq(0.1,1.5,0.1),
default.res = "RNA_snn_res.0.4",
batch.var = "Sample"){
x <- x %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = batch.var)
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = cluster.res) %>%
SetIdent(value = default.res)
return(x)
}
# Load data ---------------------------------------------------------------
lin.list <- list()
lins <- list.files('data_modified/03_lineage_filtering/objects/') %>%
sub(".rds", "", .)
for (lin in lins) {
lin.list[[lin]] <- readRDS(paste0("data_modified/03_lineage_filtering/objects/", lin, ".rds"))
}
# Annotation --------------------------------------------------------------
annotdir <- paste0(outdir, "annotations_first_pass/")
dir.create(annotdir, showWarnings=FALSE)
## Create list of labels
annot.list <- list()
annot.list$Endothelia <- c("venous endothelia",
"venous endothelia",
"capillary endothelia",
"capillary endothelia",
"arterial endothelia",
"lymphatic endothelia")
annot.list$Epithelia <- c("transitional basal epithelia of urethra",
"basal epithelia of prostate",
"transitional basal epithelia of urethra",
"squamous intermediate epithelia of urethra",
"basal epithelia of prostate",
"transitional intermediate epithelia of urethra",
"squamous intermediate epithelia of urethra",
"luminal epithelia of prostate",
"club epithelia of urethra",
"transitional intermediate epithelia of urethra",
"club epithelia of urethra",
"stressed cells",
"stressed cells",
"squamous intermediate epithelia of urethra",
"squamous intermediate epithelia of urethra",
"basal epithelia of prostate",
"squamous basal epithelia of urethra",
"basal epithelia of prostate",
"proliferative cells",
"squamous luminal epithelia of urethra",
"stressed cells",
"transitional basal epithelia of urethra",
"basal epithelia of prostate")
annot.list$Leukocyte <- c("T cell",
"macrophage",
"mast cell",
"T cell",
"T cell",
"NK cell",
"proliferating leukocyte",
"B cell",
"dendritic cell",
"plasma cell")
annot.list$Stroma <- c("pericyte",
"major interstitial fibroblast",
"smooth muscle cell",
"minor interstitial fibroblast",
"smooth muscle cell",
"smooth muscle cell",
"periepithelial fibroblast of vagina",
"vascular smooth muscle",
"smooth muscle cell",
"periepithelial fibroblast of urethra",
"stressed cell",
"pericyte",
"glia",
"smooth muscle cell",
"endoneurial fibroblast",
"pericyte",
"glia")
## Set epithelia and stroma to higher resolutions
Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5"
Idents(lin.list$Stroma) <- "RNA_snn_res.1"
## Add the labels
for (lin in lins) {
names(annot.list[[lin]]) <- levels(lin.list[[lin]])
lin.list[[lin]] <- RenameIdents(lin.list[[lin]], annot.list[[lin]]) %>%
StashIdent("cellannotation")
}
## Add neuroendocrine cells
NE.markers <- strsplit("CHGA GRP CALCA SCG2 TPH1 ASCL1 SCG5 PHGR1 MIAT MAOB", split = " ")
lin.list$Epithelia <- AddModuleScore(lin.list$Epithelia,
features = NE.markers,
name = "NE.score")
lin.list$Epithelia@meta.data <- rename(lin.list$Epithelia@meta.data, NE.score = NE.score1)
VlnPlot(lin.list$Epithelia, "NE.score", group.by = "orig.ident") +
NoLegend() +
geom_hline(yintercept=0.3, linetype="dashed")
ggsave(paste0(annotdir, "NE_score_cutoff.png"))
lin.list$Epithelia@meta.data <- lin.list$Epithelia@meta.data %>%
mutate(cellannotation = if_else(NE.score > 0.3, "neuroendocrine cell", cellannotation))
## Add ionocytes
VlnPlot(lin.list$Epithelia, "FOXI1", group.by = "orig.ident") +
NoLegend() +
geom_hline(yintercept=0.3, linetype="dashed")
ggsave(paste0(annotdir, "FOXI1_score_cutoff.png"))
ionocyte.barcodes <- WhichCells(lin.list$Epithelia, expression = FOXI1 > 0.3)
lin.list$Epithelia@meta.data[ionocyte.barcodes, "cellannotation"] <- "ionocyte"
## Split periepithelial fibroblasts by sex
fib_rename <- WhichCells(lin.list$Stroma,
idents="periepithelial fibroblast of vagina",
expression = Sex == "male")
levels(lin.list$Stroma$cellannotation) <- c(levels(lin.list$Stroma$cellannotation), "periepithelial fibroblast of prostate")
lin.list$Stroma@meta.data[fib_rename, "cellannotation"] <- "periepithelial fibroblast of prostate"
## Reset idents
Idents(lin.list$Epithelia) <- "cellannotation"
Idents(lin.list$Stroma) <- "cellannotation"
## Save labeled UMAP plots
for (lin in lins) {
DimPlot(lin.list[[lin]])
ggsave(paste0(annotdir, lin,"_umap.png"))
}
# Stress score filtering --------------------------------------------------
stressdir <- paste0(outdir, "stress/")
dir.create(stressdir, showWarnings = FALSE)
## Produce violin plots of stress scores
lin.list <- lapply(lin.list, SetIdent, value = "RNA_snn_res.1")
### Check epithelia at higher resolution to capture other populations
Idents(lin.list$Epithelia) <- "RNA_snn_res.1.5"
for (lin in names(lin.list)) {
VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0) +
NoLegend()
ggsave(paste0(stressdir, lin, "_pre-filter.png"))
}
## Define stressed clusters based on violin plots
stressed.cells <- list()
stressed.cells$Epithelia <- c(11,12,20)
stressed.cells$Stroma <- c(2,10,15)
stressed.cells$Endothelia <- c(4,10)
stressed.cells$Leukocyte <- c(4,9)
## Remove stressed clusters and reprocess
for (lin in names(lin.list)) {
lin.list[[lin]] <- subset(lin.list[[lin]],
idents = stressed.cells[[lin]],
invert = TRUE)
lin.list[[lin]] <- preprocess(lin.list[[lin]], default.res="cellannotation")
}
## Save violin plots after first round of filtering
for (lin in names(lin.list)) {
VlnPlot(lin.list[[lin]], "Stress.score", pt.size=0, group.by="RNA_snn_res.1") +
NoLegend()
ggsave(paste0(stressdir, lin, "_first_filter.png"))
}
## Second round of stressed cell removal in stroma
Idents(lin.list$Stroma) <- "RNA_snn_res.1"
lin.list$Stroma <- subset(lin.list$Stroma,
idents = 8,
invert = TRUE) %>%
preprocess(default.res="cellannotation")
## Save violin plot
VlnPlot(lin.list$Stroma, "Stress.score", pt.size=0, group.by="RNA_snn_res.1") +
NoLegend()
ggsave(paste0(stressdir, lin, "_second_filter.png"))
## Reannotate endothelia
lin.list$Endothelia@meta.data <- lin.list$Endothelia@meta.data %>%
mutate(cellannotation = if_else(RNA_snn_res.1 == 7, "venous endothelia", cellannotation))
Idents(lin.list$Endothelia) <- "cellannotation"
## Save labeled UMAP plots
for (lin in lins) {
DimPlot(lin.list[[lin]])
ggsave(paste0(annotdir, lin,"_umap-2.png"))
}
# Merge -------------------------------------------------------------------
seu <- merge(lin.list[[1]], lin.list[2:length(lin.list)])
plan(sequential)
seu <- preprocess(seu, default.res="cellannotation")
## Save UMAP plot
DimPlot(seu, label=TRUE, repel=TRUE, label.size=3) +
NoLegend()
ggsave(paste0(annotdir, "global_umap.png"))
# Save outputs ------------------------------------------------------------
objdir <- paste0(outdir, "objects/")
dir.create(objdir, showWarnings=FALSE)
saveRDS(seu, paste0(objdir, "global.rds"))
for (lin in names(lin.list)){
saveRDS(lin.list[[lin]], paste0(objdir, lin, ".rds"))
}
## Save session info
capture.output(sessionInfo(), file = paste0(outdir, "session_info.txt"))
# Separation and reprocessing of female data
# Author: John T Lafin
# Updated: 2024-10-18
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(intrinsicDimension)
library(ggplot2)
library(harmony)
library(future)
library(future.apply)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/05_female_only/"
dir.create(outdir, showWarnings = FALSE)
# Create pre-processing function
preprocess <- function(x,
cluster.res = seq(0.1,1.5,0.1),
default.res = "RNA_snn_res.0.4",
batch.var = "Sample"){
x <- x %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = batch.var)
numPCs <- maxLikGlobalDimEst(x@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
x <- x %>%
RunUMAP(dims = 1:numPCs,
reduction = "harmony",
metric = "euclidean",
min.dist = 0.1) %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = cluster.res) %>%
SetIdent(value = default.res)
return(x)
}
# Load data ---------------------------------------------------------------
seu <- readRDS("data_modified/04_annotation/objects/global.rds")
## Subset only female by lineage
lin.list <- subset(seu, subset = Sex == "female") %>%
SplitObject(split.by="Lineage")
## Preprocess each lineage
lin.list <- future_lapply(lin.list, preprocess, default.res="cellannotation")
# Update annotation -------------------------------------------------------
## Epithelia
Idents(lin.list$Epithelia) <- "RNA_snn_res.0.5"
cluster.ids <- c("0" = "intermediate cell of transitional epithelia",
"1" = "basal cell of transitional epithelia",
"2" = "basal cell of squamous epithelia",
"3" = "intermediate cell of squamous epithelia",
"4" = "club cell",
"5" = "intermediate cell of squamous epithelia",
"6" = "intermediate cell of squamous epithelia",
"7" = "luminal cell of squamous epithelia",
"8" = "proliferative cells",
"9" = "basal cell of transitional epithelia")
lin.list$Epithelia <- RenameIdents(lin.list$Epithelia, cluster.ids) %>%
StashIdent("cellannotation")
### Label neuroendocrine cells
lin.list$Epithelia$NE.score <- NULL
NE.markers <- strsplit("CHGA GRP CALCA SCG2 TPH1 ASCL1 SCG5 PHGR1 MIAT MAOB", split = " ")
lin.list$Epithelia <- AddModuleScore(lin.list$Epithelia,
features = NE.markers,
name = "NE.score")
lin.list$Epithelia@meta.data <- rename(lin.list$Epithelia@meta.data, NE.score = NE.score1)
lin.list$Epithelia@meta.data <- lin.list$Epithelia@meta.data %>%
mutate(cellannotation = if_else(NE.score > 0.3, "neuroendocrine cell", cellannotation))
Idents(lin.list$Epithelia) <- "cellannotation"
## Endothelia
Idents(lin.list$Endothelia) <- "RNA_snn_res.0.5"
cluster.ids <- c("0" = "venous endothelia",
"1" = "venous endothelia",
"2" = "venous endothelia",
"3" = "venous endothelia",
"4" = "capillary endothelia",
"5" = "arterial endothelia",
"6" = "lymphatic endothelia",
"7" = "venous endothelia")
lin.list$Endothelia <- RenameIdents(lin.list$Endothelia, cluster.ids) %>%
StashIdent("cellannotation")
## Leukocytes
Idents(lin.list$Leukocyte) <- "RNA_snn_res.0.4"
cluster.ids <- c("0" = "macrophage",
"1" = "T cell",
"2" = "NK cell",
"3" = "mast cell",
"4" = "B cell",
"5" = "T cell")
lin.list$Leukocyte <- RenameIdents(lin.list$Leukocyte, cluster.ids) %>%
StashIdent("cellannotation")
## Stroma
Idents(lin.list$Stroma) <- "RNA_snn_res.1"
cluster.ids <- c("0" = "smooth muscle cell",
"1" = "pericyte",
"2" = "minor interstiital fibroblast",
"3" = "major interstitial fibroblast",
"4" = "smooth muscle cell",
"5" = "smooth muscle cell",
"6" = "vascular smooth muscle cell",
"7" = "periepithelial fibroblast of vagina",
"8" = "periepithelial fibroblast of urethra",
"9" = "smooth muscle cell",
"10" = "endoneurial fibroblast",
"11" = "glial cell",
"12" = "pericyte",
"13" = "major interstitial fibroblast",
"14" = "pericyte",
"15" = "smooth muscle cell",
"16" = "glial cell")
lin.list$Stroma <- RenameIdents(lin.list$Stroma, cluster.ids) %>%
StashIdent("cellannotation")
# Recombine ---------------------------------------------------------------
seu <- merge(lin.list[[1]], lin.list[2:length(lin.list)])
seu <- preprocess(seu, default.res="cellannotation")
# Save outputs ------------------------------------------------------------
## Save dot plots
plotdir <- paste0(outdir, "dot_plots/")
dir.create(plotdir, showWarnings=FALSE)
gene.list <- list(
"Epithelia" = strsplit("AKR1C1 AKR1C2 MKI67 TOP2A PIGR LCN2 KRT6A CHGA KRT5 BCAM", split=" ")[[1]],
"Endothelia" = strsplit("LYVE1 EFNB2 ACKR1 TXNIP VWF", split=" ")[[1]],
"Leukocyte" = strsplit("LYVE1 C1QA CD163 PTPRC IL7R CD8A GZMK HLA-DRA KIT MS4A1 CD79A JCHAIN GNLY GZMB", split = " ")[[1]],
"Stroma" = strsplit("C7 SFRP4 STC1 APOE APOD NGFR THY1 CD36 DES ACTG2 PRUNE2 MCAM BCAM RGS5 DLK1 MYF5 CDH19 GPM6B", split = " ")[[1]]
)
for (lin in names(lin.list)) {
DotPlot(lin.list[[lin]], features=gene.list[[lin]], cluster.idents=TRUE) +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave(paste0(plotdir, lin, ".png"),
height=8.5,
width=11)
}
## Save objects
objdir <- paste0(outdir, "objects/")
dir.create(objdir, showWarnings=FALSE)
saveRDS(seu, paste0(objdir, "global_female.rds"))
for (lin in names(lin.list)) {
saveRDS(lin.list[[lin]], paste0(objdir, lin, "_female.rds"))
}
## Save session info
capture.output(sessionInfo(), file = paste0(outdir, "session_info.txt"))