Skip to content
Snippets Groups Projects
Commit 58e53de1 authored by John Lafin's avatar John Lafin
Browse files

Initial commit

parents
Branches
No related merge requests found
Showing with 532 additions and 0 deletions
options(repos = c(CRAN = "https://p3m.dev/cran/__linux__/centos7/latest"))
source("renv/activate.R")
# Mac config files
.DS_Store
._.DS_Store
# R files
# Keep files necessary for renv bootstrapping
.Rhistory
.RData
.Rproj.user/
renv/*
!renv/activate.R
# Data directories
data_raw/
data_modified/
# Logfiles
logs/
# Testing and temporary files
notebooks/
testing/
This diff is collapsed.
This diff is collapsed.
sample cb_input cb_output
D17PrTzF_Via /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D17PrTzF_Via/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D17PrTzF_Via/D17PrTzF_Via.h5
D27PrTzF_Via /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D27PrTzF_Via/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D27PrTzF_Via/D27PrTzF_Via.h5
D35PrTzF_Via /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D35PrTzF_Via/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D35PrTzF_Via/D35PrTzF_Via.h5
D124VMFcol /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D124VMFcol/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D124VMFcol/D124VMFcol.h5
D148UrFcol data_raw/cellranger/D148UrFcol/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D148UrFcol/D148UrFcol.h5
D149UrFcol data_raw/cellranger/D149UrFcol/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D149UrFcol/D149UrFcol.h5
D151UrFcol data_raw/cellranger/D151UrFcol/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D151UrFcol/D151UrFcol.h5
D158UrFcol data_raw/cellranger/D158UrFcol/outs/raw_feature_bc_matrix.h5 data_raw/cellbender/D158UrFcol/D158UrFcol.h5
Sample,Location,Sex,Cellranger_version,Chemistry_version
D17PrTzF_Via,data_raw/cellbender/D17PrTzF_Via/D17PrTzF_Via_filtered_seurat.h5,male,4.0.0,v2
D27PrTzF_Via,data_raw/cellbender/D27PrTzF_Via/D27PrTzF_Via_filtered_seurat.h5,male,4.0.0,v2
D35PrTzF_Via,data_raw/cellbender/D35PrTzF_Via/D35PrTzF_Via_filtered_seurat.h5,male,4.0.0,v2
D124VMFcol,data_raw/cellbender/D124VMFcol/D124VMFcol_filtered_seurat.h5,male,4.0.0,v3
D148UrFcol,data_raw/cellbender/D148UrFcol/D148UrFcol_filtered_seurat.h5,female,4.0.0,5’ PE
D149UrFcol,data_raw/cellbender/D149UrFcol/D149UrFcol_filtered_seurat.h5,female,4.0.0,5’ PE
D151UrFcol,data_raw/cellbender/D151UrFcol/D151UrFcol_filtered_seurat.h5,female,4.0.0,5’ PE
D158UrFcol,data_raw/cellbender/D158UrFcol/D158UrFcol_filtered_seurat.h5,female,4.0.0,5’ PE
index sample location
0 D17PrTzF_Via /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D17PrTzF_Via/outs/raw_feature_bc_matrix.h5
1 D27PrTzF_Via /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D27PrTzF_Via/outs/raw_feature_bc_matrix.h5
2 D35PrTzF_Via /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D35PrTzF_Via/outs/raw_feature_bc_matrix.h5
3 D124VMFcol /archive/urology/Strand_lab/shared/sequencing/singlecell/count/gex/count612/D124VMFcol/outs/raw_feature_bc_matrix.h5
4 D148UrFcol /project/urology/Strand_lab/s133712/scRNA_urethra/data_raw/cellranger/D148UrFcol/outs/raw_feature_bc_matrix.h5
5 D149UrFcol /project/urology/Strand_lab/s133712/scRNA_urethra/data_raw/cellranger/D149UrFcol/outs/raw_feature_bc_matrix.h5
6 D151UrFcol /project/urology/Strand_lab/s133712/scRNA_urethra/data_raw/cellranger/D151UrFcol/outs/raw_feature_bc_matrix.h5
7 D158UrFcol /project/urology/Strand_lab/s133712/scRNA_urethra/data_raw/cellranger/D158UrFcol/outs/raw_feature_bc_matrix.h5
\ No newline at end of file
index sample fastq_path
0 D148UrFcol /archive/urology/Strand_lab/shared/sequencing/singlecell/fastq/gex+tcr/mkfastq612/H5WHNDSX3/D148UrFcol
1 D149UrFcol /archive/urology/Strand_lab/shared/sequencing/singlecell/fastq/gex+tcr/mkfastq612/H5WHNDSX3/D149UrFcol
2 D151UrFcol /archive/urology/Strand_lab/shared/sequencing/singlecell/fastq/gex+tcr/mkfastq612/H5WHNDSX3/D151UrFcol
3 D158UrFcol /archive/urology/Strand_lab/shared/sequencing/singlecell/fastq/gex+tcr/mkfastq612/H5WHNDSX3/D158UrFcol
\ No newline at end of file
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
# Pre-processing single cell data
# Author: John T Lafin
# Updated: 2024-08-13
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(ggplot2)
library(intrinsicDimension)
library(clustree)
library(future)
library(future.apply)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/2024-08-13_preprocess/"
dir.create(outdir, showWarnings = FALSE)
# Load data
seu <- readRDS("data_modified/2024-08-13_initial_analysis/merged_object.rds")
# Clustering --------------------------------------------------------------
# Evaluate dimensionality
numPCs <- maxLikGlobalDimEst(seu@reductions$harmony@cell.embeddings,
k = 10)$dim.est %>%
round()
ElbowPlot(seu, ndims=50, reduction="harmony") +
geom_vline(xintercept=numPCs)
ggsave(paste0(outdir, "scree_plot.png"))
res <- seq(0.1,1,0.1)
seu <- RunUMAP(seu,
reduction = "harmony",
dims = 1:numPCs,
metric = "euclidean") %>%
FindNeighbors(reduction = "harmony",
dims = 1:numPCs) %>%
FindClusters(resolution = res)
# Ensure idents are in numerical order
for (i in res) {
id <- paste0("RNA_snn_res.", i)
top.level <- length(levels(seu@meta.data[[id]])) - 1
seu@meta.data[[id]] <- factor(seu@meta.data[[id]], levels = seq(0, top.level, 1))
}
# Create clustree plot
clustree(seu, prefix = "RNA_snn_res.")
ggsave(paste0(outdir, "clustree.png"))
# Create UMAP plots at various resolutions
plotdir <- paste0(outdir, "umap/")
dir.create(plotdir, showWarnings = FALSE)
for (i in paste0("RNA_snn_res.", res)) {
DimPlot(seu, group.by=i, label=TRUE, repel=TRUE)
ggsave(paste0(plotdir, i, ".png"))
}
# Summary plots -----------------------------------------------------------
default.res <- "RNA_snn_res.0.4"
Idents(seu) <- default.res
# Fractional composition
cells.per.sample <- seu@meta.data %>%
group_by(Sample, .data[[default.res]]) %>%
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(Fraction = count/sum(count))
ggplot(cells.per.sample, aes(.data[[default.res]], Fraction, fill = Sample)) +
geom_col() +
labs(title = "Fraction composition",
fill = "")
ggsave(paste0(outdir, "frac_comp.png"))
# Doublets per cluster
dblts <- seu@meta.data %>%
group_by(.data[[default.res]], scDblFinder.class) %>%
summarize(count = n()) %>%
group_by(.data[[default.res]]) %>%
mutate(fraction = count/sum(count))
ggplot(dblts, aes(.data[[default.res]], fraction, fill = scDblFinder.class)) +
geom_col() +
geom_hline(yintercept = 0.5, linetype = "dashed")
ggsave(paste0(outdir, "dblts.png"))
# Lineage markers
genes <- strsplit("EPCAM KRT5 KRT14 MSMB SEMG1 SELE PECAM1 PTPRC SRGN C1QA DCN LUM DES ACTA1 ACTG2 CRYAB CDH19",
split = " ")[[1]]
DotPlot(seu, features = genes, cluster.idents = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(paste0(outdir, "lineage_marker_dotplot.png"))
# Marker genes ------------------------------------------------------------
markerdir <- paste0(outdir, "markers/")
dir.create(markerdir, showWarnings=FALSE)
for (i in paste0("RNA_snn_res.", res)) {
Idents(seu) <- i
m <- FindAllMarkers(seu, only.pos=TRUE)
write.csv(m, paste0(markerdir, i, ".csv"))
}
# Wrap up -----------------------------------------------------------------
# Save processed data
saveRDS(seu, paste0(outdir, "processed_object.rds"))
capture.output(sessionInfo(), file = paste0(outdir, "sessionInfo.txt"))
# Initializing single cell data
# Author: John T Lafin
# Updated: 2024-08-13
print("Script begin at: ")
Sys.time()
# Setup -------------------------------------------------------------------
# Load packages
library(dplyr)
library(Seurat)
library(scDblFinder)
library(ggplot2)
library(harmony)
library(future)
library(future.apply)
# Future initialization
future_plan <- "multisession" # Can use multicore if not on Rstudio or Windows
n_workers <- 4
plan(future_plan, workers=n_workers)
options(future.globals.maxSize = 8000 * 1024^2)
set.seed(42)
# Create output dir
outdir <- "data_modified/2024-08-13_initial_analysis/"
dir.create(outdir, showWarnings = FALSE)
# Load data ---------------------------------------------------------------
print("Loading data...")
## Load sample sheet
sample_sheet <- read.csv("sample_sheets/sample_sheet.csv")
## Read cellranger output
h5_files <- sample_sheet$Location
sc <- lapply(h5_files, Read10X_h5)
names(sc) <- sample_sheet$Sample
## Create Seurat objects
sc <- future_lapply(sc, CreateSeuratObject)
meta_add <- c("Sample", "Sex", "Cellranger_version", "Chemistry_version")
## Add metadata from sample sheet
for (i in names(sc)) {
for (j in meta_add) {
sc[[i]] <- AddMetaData(object = sc[[i]],
metadata = filter(sample_sheet, Sample == i)[[j]],
col.name = j)
}
}
# Calculate QC metrics ----------------------------------------------------
# Create QC directory
qcdir <- paste0(outdir, "qc/")
dir.create(qcdir, showWarnings = FALSE)
print("Begin doublet detection at: ")
Sys.time()
# Doublet detection with scDblFinder
sc <- future_lapply(sc,
function(x) {
x %>%
subset(nCount_RNA > 0) %>%
as.SingleCellExperiment() %>%
scDblFinder(clusters = TRUE) %>%
as.Seurat(data=NULL)},
future.seed=TRUE)
## Save doublet metrics
t <- lapply(sc, `[[`, "scDblFinder.class")
num.doublets <- sapply(t, function(x){length(x[x == "doublet"])})
num.cells <- sapply(t, function(x){nrow(x)})
pct.doublets <- sapply(t, function(x){length(x[x == "doublet"])/nrow(x)})
dblts <- tibble("Sample" = names(num.doublets),
"Doublets" = num.doublets,
"Total droplets" = num.cells,
"Doublets (%)" = pct.doublets*100)
write.csv(dblts, paste0(qcdir, "doublets.csv"))
print("Doublet detection complete at: ")
Sys.time()
print("Begin QC metric calculation.")
## Calculate other QC metrics
mt.pattern <- "^(MT|mt)-"
stress.genes <- c("JUN", "FOS", "EGR1",
"ATF3", "JUNB", "GADD45B",
"IER2", "ZFP36", "DNAJB1",
"RHOB", "NR4A1", "UBC", "HES1")
# Run this in sequential mode as NormalizeData() doesn't play nicely with future
plan(sequential)
sc <- lapply(sc, function(x){
x[["percent.mito"]] <- PercentageFeatureSet(x, pattern=mt.pattern)
NormalizeData(x) %>%
AddModuleScore(features=list(stress.genes),
name="Stress.score")})
for (i in names(sc)) {
sc[[i]]@meta.data <- sc[[i]]@meta.data %>%
dplyr::rename(Stress.score = Stress.score1)
}
print("QC metric calculation complete at: ")
Sys.time()
print("Saving output...")
## Checkpoint: save list of objects
saveRDS(sc, paste0(outdir, "pre-processed_list.rds"))
qc.dt <- tibble()
for (i in sc) {
temp <- tibble("Sample" = i$Sample,
"nCount_RNA" = i$nCount_RNA,
"nFeature_RNA" = i$nFeature_RNA,
"percent.mito" = i$percent.mito,
"doublet.score" = i$scDblFinder.score,
"doublet.class" = i$scDblFinder.class,
"stress.score" = i$Stress.score)
qc.dt <- bind_rows(qc.dt, temp)
}
qc.dt$Sample <- as.factor(qc.dt$Sample)
## Calculate QC thresholds
mads_mult <- 3 # How many MADs from the median to set bounds
nCount_floor <- 100 # Minimum allowable floor for nCount_RNA
mads <- qc.dt %>%
group_by(Sample) %>%
mutate(across(.cols = c(nCount_RNA, nFeature_RNA),
.fns = log)) %>%
summarize(across(.cols = c(nCount_RNA, nFeature_RNA, percent.mito),
.fns = list(med = median, mad = mad))) %>%
mutate(nCount_RNA_threshold_lo = exp(nCount_RNA_med - mads_mult * nCount_RNA_mad),
nCount_RNA_threshold_lo = ifelse(nCount_RNA_threshold_lo < nCount_floor, nCount_floor, nCount_RNA_threshold_lo),
nCount_RNA_threshold_hi = exp(nCount_RNA_med + mads_mult * nCount_RNA_mad),
nFeature_RNA_threshold_lo = exp(nFeature_RNA_med - mads_mult * nFeature_RNA_mad),
nFeature_RNA_threshold_hi = exp(nFeature_RNA_med + mads_mult * nFeature_RNA_mad),
percent.mito_threshold = 10)
write.csv(mads, paste0(qcdir, "qc_cutoffs.csv"))
# Pre-filter plots --------------------------------------------------------
plotdir <- paste0(qcdir, "plots_pre-filter/")
dir.create(plotdir, showWarnings = FALSE)
n_row <- 2 # How many rows of plots
# Doublets plot
ggplot(qc.dt, aes(x = nCount_RNA, y = doublet.score, color = doublet.class)) +
geom_point() +
facet_wrap(~Sample, nrow=n_row)
ggsave(paste0(plotdir,"doublets.png"))
# nCount_RNA
ggplot(qc.dt, aes(x = Sample, y = nCount_RNA)) +
geom_violin() +
geom_tile(data = mads, aes(x = Sample,
y = nCount_RNA_threshold_lo,
height = 0.02,
width = 0.5)) +
geom_tile(data = mads, aes(x = Sample,
y = nCount_RNA_threshold_hi,
height = 0.02,
width = 0.5)) +
scale_y_log10() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"nCount_RNA.png"))
# nFeature_RNA
ggplot(qc.dt, aes(x = Sample, y = nFeature_RNA)) +
geom_violin() +
geom_tile(data = mads, aes(x = Sample,
y = nFeature_RNA_threshold_hi,
height = 0.02,
width = 0.5)) +
geom_tile(data = mads, aes(x = Sample,
y = nFeature_RNA_threshold_lo,
height = 0.02,
width = 0.5)) +
scale_y_log10() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"nFeature_RNA.png"))
# Percent mito
ggplot(qc.dt, aes(x = Sample, y = percent.mito)) +
geom_violin() +
geom_hline(yintercept=10, linetype="dashed") +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"percent_mito.png"))
# Stress score
ggplot(qc.dt, aes(x = Sample, y = stress.score)) +
geom_violin() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"stress.png"))
# Trend plots
ggplot(qc.dt, aes(x = nCount_RNA, y = nFeature_RNA)) +
facet_wrap(~Sample, nrow=n_row) +
geom_point(aes(color=percent.mito)) +
stat_smooth() +
scale_x_log10() +
scale_y_log10() +
labs(x = "nCount_RNA",
y = "nFeature_RNA",
color = "% Mito")
ggsave(paste0(plotdir,"trend.png"))
# Apply filters -----------------------------------------------------------
for (i in names(sc)) {
t <- mads %>%
dplyr::filter(Sample == i)
cat("Subsetting sample: ", i, "\n")
sc[[i]] <- subset(sc[[i]],
subset = nCount_RNA > t$nCount_RNA_threshold_lo &
nCount_RNA < t$nCount_RNA_threshold_hi &
nFeature_RNA > t$nFeature_RNA_threshold_lo &
nFeature_RNA < t$nFeature_RNA_threshold_hi &
percent.mito < 10)
}
qc.dt3 <- tibble()
for (i in sc) {
temp <- tibble("Sample" = i$Sample,
"nCount_RNA" = i$nCount_RNA,
"nFeature_RNA" = i$nFeature_RNA,
"percent.mito" = i$percent.mito,
"doublet.score" = i$scDblFinder.score,
"doublet.class" = i$scDblFinder.class,
"stress.score" = i$Stress.score)
qc.dt3 <- bind_rows(qc.dt3, temp)
}
qc.dt3$Sample <- as.factor(qc.dt3$Sample)
# Post-filter plots -------------------------------------------------------
plotdir <- paste0(qcdir, "plots_post-filter/")
dir.create(plotdir, showWarnings = FALSE)
# Doublets plot
ggplot(qc.dt3, aes(x = nCount_RNA, y = doublet.score, color = doublet.class)) +
geom_point() +
facet_wrap(~Sample, nrow=n_row)
ggsave(paste0(plotdir,"doublets.png"))
# nCount_RNA
ggplot(qc.dt3, aes(x = Sample, y = nCount_RNA)) +
geom_violin() +
scale_y_log10() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"nCount_RNA.png"))
# nFeature_RNA
ggplot(qc.dt3, aes(x = Sample, y = nFeature_RNA)) +
geom_violin() +
scale_y_log10() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"nFeature_RNA.png"))
# Percent mito
ggplot(qc.dt3, aes(x = Sample, y = percent.mito)) +
geom_violin() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"percent_mito.png"))
# Stress score
ggplot(qc.dt3, aes(x = Sample, y = stress.score)) +
geom_violin() +
labs(x = "") +
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(paste0(plotdir,"stress.png"))
# Trend plots
ggplot(qc.dt3, aes(x = nCount_RNA, y = nFeature_RNA)) +
facet_wrap(~Sample, nrow=n_row) +
geom_point(aes(color=percent.mito)) +
stat_smooth() +
scale_x_log10() +
scale_y_log10() +
labs(x = "nCount_RNA",
y = "nFeature_RNA",
color = "% Mito")
ggsave(paste0(plotdir,"trend.png"))
# Save metrics ------------------------------------------------------------
write.csv(qc.dt, paste0(qcdir, "per_cell_metrics_pre-filter.csv"))
write.csv(qc.dt3, paste0(qcdir, "per_cell_metrics_post-filter.csv"))
num_cells_pre <- qc.dt %>%
dplyr::count(Sample, name = "pre_filter")
num_cells_post <- qc.dt3 %>%
dplyr::count(Sample, name = "post_filter")
num_cells <- left_join(num_cells_pre, num_cells_post, by = "Sample") %>%
mutate("fraction_cells_removed" = 1 - (post_filter / pre_filter))
write.csv(num_cells, paste0(qcdir, "filter_summary.csv"))
# Merge and pre-process ---------------------------------------------------
seu <- merge(sc[[1]], sc[-1], add.cell.ids = names(sc))
# Future doesn't play nicely with NormalizeData()
seu <- NormalizeData(seu)
plan(future_plan, workers=n_workers)
seu <- seu %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
RunHarmony(group.by.vars = "Sample")
# Save object
saveRDS(seu, paste0(outdir, "merged_object.rds"))
# Save session info
capture.output(sessionInfo(), file = paste0(outdir, "sessionInfo.txt"))
print("Success! Completion time: ")
Sys.time()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment