Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
scRNA_urethra
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Strand Lab
scRNA_urethra
Commits
b484c159
Commit
b484c159
authored
7 months ago
by
John Lafin
Browse files
Options
Downloads
Patches
Plain Diff
Add lineage filtering
parent
cca00d94
Branches
Branches containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
scripts/03_lineage_filtering.R
+277
-0
277 additions, 0 deletions
scripts/03_lineage_filtering.R
with
277 additions
and
0 deletions
scripts/03_lineage_filtering.R
0 → 100644
+
277
−
0
View file @
b484c159
# Lineage-dependent filtering
# Author: John T Lafin
# Updated: 2024-09-17
print
(
"Script begin at: "
)
Sys.time
()
# Setup -------------------------------------------------------------------
# Load packages
library
(
dplyr
)
library
(
Seurat
)
library
(
intrinsicDimension
)
library
(
ggplot2
)
library
(
harmony
)
library
(
future
)
library
(
future.apply
)
library
(
clustree
)
# Future initialization
future_plan
<-
"multisession"
# Can use multicore if not on Rstudio or Windows
n_workers
<-
4
plan
(
future_plan
,
workers
=
n_workers
)
options
(
future.globals.maxSize
=
8000
*
1024
^
2
)
set.seed
(
42
)
# Create output dir
outdir
<-
"data_modified/03_lineage_filtering/"
dir.create
(
outdir
,
showWarnings
=
FALSE
)
# Load data ---------------------------------------------------------------
sc.h
<-
readRDS
(
"data_modified/02_preprocess/processed_object.rds"
)
# Remove this field or Seurat won't work properly
sc.h
$
ident
<-
NULL
default.res
<-
"RNA_snn_res.0.4"
Idents
(
sc.h
)
<-
default.res
# Annotate lineages -------------------------------------------------------
# Using information generated from 02_preprocess.R, annotate lineages
cluster_epi
<-
c
(
0
,
1
,
7
,
8
,
9
,
11
,
15
)
cluster_endo
<-
c
(
4
,
6
,
12
)
cluster_leu
<-
c
(
10
,
14
,
16
)
cluster_stroma
<-
c
(
2
,
3
,
5
,
13
)
cluster_doublets
<-
c
(
17
,
18
)
# Make assignments
cluster.ids
<-
vector
(
"character"
,
length
=
length
(
levels
(
sc.h
)))
cluster.ids
[
cluster_epi
+1
]
<-
"Epithelia"
cluster.ids
[
cluster_endo
+1
]
<-
"Endothelia"
cluster.ids
[
cluster_leu
+1
]
<-
"Leukocyte"
cluster.ids
[
cluster_stroma
+1
]
<-
"Stroma"
cluster.ids
[
cluster_doublets
+1
]
<-
"Putative doublet"
names
(
cluster.ids
)
<-
levels
(
sc.h
)
sc.h
<-
RenameIdents
(
sc.h
,
cluster.ids
)
%>%
StashIdent
(
"Lineage"
)
# Generate UMAP with lineage assignments
umapdir
<-
paste0
(
outdir
,
"umaps/"
)
dir.create
(
umapdir
,
showWarnings
=
FALSE
)
Idents
(
sc.h
)
<-
"Lineage"
DimPlot
(
sc.h
)
ggsave
(
paste0
(
umapdir
,
"global_lineage.png"
))
# Dot plot to check marker leakage
genes
<-
strsplit
(
"EPCAM KRT5 KRT14 MSMB SEMG1 SELE PECAM1 PTPRC SRGN C1QA DCN LUM DES ACTA1 ACTG2 CRYAB CDH19"
,
split
=
" "
)[[
1
]]
DotPlot
(
sc.h
,
features
=
genes
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
45
,
hjust
=
1
))
ggsave
(
paste0
(
outdir
,
"dotplot_global_lineage.png"
))
# Subset and recluster ----------------------------------------------------
lin.list
<-
SplitObject
(
sc.h
,
split.by
=
"Lineage"
)
lin.list
[[
"Putative doublet"
]]
<-
NULL
# Remove doublets
res
<-
seq
(
0.1
,
1
,
0.1
)
lin.list
<-
lin.list
%>%
future_lapply
(
function
(
x
){
x
<-
NormalizeData
(
x
)
%>%
FindVariableFeatures
()
%>%
ScaleData
()
%>%
RunPCA
()
%>%
RunHarmony
(
group.by.vars
=
"Sample"
)
numPCs
<-
maxLikGlobalDimEst
(
x
@
reductions
$
harmony
@
cell.embeddings
,
k
=
10
)
$
dim.est
%>%
round
()
x
<-
x
%>%
RunUMAP
(
dims
=
1
:
numPCs
,
reduction
=
"harmony"
,
metric
=
"euclidean"
,
min.dist
=
0.1
)
%>%
FindNeighbors
(
reduction
=
"harmony"
,
dims
=
1
:
numPCs
)
%>%
FindClusters
(
resolution
=
res
)
%>%
SetIdent
(
value
=
default.res
)
return
(
x
)},
future.seed
=
TRUE
)
# Save UMAP of each subset
for
(
lin
in
names
(
lin.list
)
){
DimPlot
(
lin.list
[[
lin
]],
label
=
TRUE
)
ggsave
(
paste0
(
umapdir
,
lin
,
"_prefilter.png"
))
}
# Lineage-dependent filtering ---------------------------------------------
## Identify and remove clusters enriched for likely doublets
#### Save doublet plots of each lineage
dblts
<-
list
()
for
(
lin
in
names
(
lin.list
))
{
dblts
[[
lin
]]
<-
lin.list
[[
lin
]]
@
meta.data
%>%
group_by
(
.data
[[
default.res
]],
scDblFinder.class
)
%>%
summarize
(
count
=
n
())
%>%
group_by
(
.data
[[
default.res
]])
%>%
mutate
(
fraction
=
count
/
sum
(
count
))
ggplot
(
dblts
[[
lin
]],
aes
(
.data
[[
default.res
]],
fraction
,
fill
=
scDblFinder.class
))
+
geom_col
()
+
geom_hline
(
yintercept
=
0.5
,
linetype
=
"dashed"
)
ggsave
(
paste0
(
outdir
,
"dblts_"
,
lin
,
".png"
))
}
### Remove clusters with >50% of cells classed as doublets
lin.list.filt
<-
list
()
keep
<-
list
()
for
(
lin
in
names
(
lin.list
))
{
keep
[[
lin
]]
<-
dblts
[[
lin
]]
%>%
filter
(
scDblFinder.class
==
"singlet"
&
fraction
>
0.5
)
%>%
pull
(
"RNA_snn_res.0.4"
)
lin.list.filt
[[
lin
]]
<-
subset
(
lin.list
[[
lin
]],
idents
=
keep
[[
lin
]])
}
## Library size filtering
### Calculate thresholds
mads_mult
<-
2
# Number of MADs away from median to filter
dt
<-
data.frame
(
lineage
=
character
(),
nCount_RNA
=
numeric
())
for
(
i
in
names
(
lin.list.filt
))
{
dt
<-
add_row
(
dt
,
lineage
=
i
,
nCount_RNA
=
lin.list.filt
[[
i
]]
$
nCount_RNA
)
}
mt.dt
<-
dt
%>%
group_by
(
lineage
)
%>%
mutate
(
log.nCount_RNA
=
log
(
nCount_RNA
))
%>%
summarize
(
median
=
median
(
nCount_RNA
),
log.median
=
median
(
log.nCount_RNA
),
log.mad
=
mad
(
log.nCount_RNA
))
%>%
mutate
(
cutoff
=
exp
(
log.median
-
mads_mult
*
log.mad
))
%>%
as.data.frame
()
rownames
(
mt.dt
)
<-
mt.dt
$
lineage
mt.dt
$
lineage
<-
NULL
### Save thresholds
write.csv
(
mt.dt
,
paste0
(
outdir
,
"nCount_cutoffs.csv"
))
### Save violin plots with cutoffs
for
(
g
in
names
(
lin.list.filt
))
{
VlnPlot
(
lin.list.filt
[[
g
]],
"nCount_RNA"
,
pt.size
=
0
)
+
geom_hline
(
yintercept
=
mt.dt
[
g
,
"cutoff"
],
linetype
=
"dashed"
)
+
scale_y_log10
()
+
NoLegend
()
ggsave
(
paste0
(
outdir
,
paste0
(
"vln_nCount_cutoffs_"
,
g
,
".png"
)))
}
### Apply library size filter
for
(
i
in
names
(
lin.list.filt
)
)
{
lin.list.filt
[[
i
]]
<-
subset
(
lin.list.filt
[[
i
]],
subset
=
nCount_RNA
>
mt.dt
[
i
,
"cutoff"
])
}
### Record how many cells were removed
cells.old
<-
lapply
(
lin.list
,
ncol
)
cells.new
<-
lapply
(
lin.list.filt
,
ncol
)
tibble
(
"Lineage"
=
names
(
cells.old
),
"Original"
=
unlist
(
cells.old
),
"Filtered"
=
unlist
(
cells.new
))
%>%
mutate
(
"Difference"
=
Original
-
Filtered
,
"Fraction_removed"
=
Difference
/
Original
)
%>%
write.csv
(
paste0
(
outdir
,
"fraction_removed_nCount.csv"
))
# Recluster ---------------------------------------------------------------
lin.list
<-
lin.list.filt
%>%
future_lapply
(
function
(
x
){
x
<-
NormalizeData
(
x
)
%>%
FindVariableFeatures
()
%>%
ScaleData
()
%>%
RunPCA
()
%>%
RunHarmony
(
group.by.vars
=
"Sample"
)
numPCs
<-
maxLikGlobalDimEst
(
x
@
reductions
$
harmony
@
cell.embeddings
,
k
=
10
)
$
dim.est
%>%
round
()
x
<-
x
%>%
RunUMAP
(
dims
=
1
:
numPCs
,
reduction
=
"harmony"
,
metric
=
"euclidean"
,
min.dist
=
0.1
)
%>%
FindNeighbors
(
reduction
=
"harmony"
,
dims
=
1
:
numPCs
)
%>%
FindClusters
(
resolution
=
res
)
%>%
SetIdent
(
value
=
default.res
)
return
(
x
)},
future.seed
=
TRUE
)
# Doublet removal, second round -------------------------------------------
dblts
<-
list
()
for
(
lin
in
names
(
lin.list
))
{
dblts
[[
lin
]]
<-
lin.list
[[
lin
]]
@
meta.data
%>%
group_by
(
.data
[[
default.res
]],
scDblFinder.class
)
%>%
summarize
(
count
=
n
())
%>%
group_by
(
.data
[[
default.res
]])
%>%
mutate
(
fraction
=
count
/
sum
(
count
))
}
keep
<-
list
()
for
(
lin
in
names
(
lin.list
))
{
keep
[[
lin
]]
<-
dblts
[[
lin
]]
%>%
filter
(
scDblFinder.class
==
"singlet"
&
fraction
>
0.5
)
%>%
pull
(
"RNA_snn_res.0.4"
)
lin.list
[[
lin
]]
<-
subset
(
lin.list
[[
lin
]],
idents
=
keep
[[
lin
]])
}
## Recluster again
lin.list
<-
lin.list
%>%
future_lapply
(
function
(
x
){
x
<-
NormalizeData
(
x
)
%>%
FindVariableFeatures
()
%>%
ScaleData
()
%>%
RunPCA
()
%>%
RunHarmony
(
group.by.vars
=
"Sample"
)
numPCs
<-
maxLikGlobalDimEst
(
x
@
reductions
$
harmony
@
cell.embeddings
,
k
=
10
)
$
dim.est
%>%
round
()
x
<-
x
%>%
RunUMAP
(
dims
=
1
:
numPCs
,
reduction
=
"harmony"
,
metric
=
"euclidean"
,
min.dist
=
0.1
)
%>%
FindNeighbors
(
reduction
=
"harmony"
,
dims
=
1
:
numPCs
)
%>%
FindClusters
(
resolution
=
res
)
%>%
SetIdent
(
value
=
default.res
)
return
(
x
)},
future.seed
=
TRUE
)
# Save outputs ------------------------------------------------------------
## Save UMAPs and objects
objdir
<-
paste0
(
outdir
,
"objects/"
)
dir.create
(
objdir
,
showWarnings
=
FALSE
)
for
(
lin
in
names
(
lin.list
))
{
DimPlot
(
lin.list
[[
lin
]])
ggsave
(
paste0
(
umapdir
,
lin
,
"_postfilter.png"
))
saveRDS
(
lin.list
[[
lin
]],
paste0
(
objdir
,
lin
,
".rds"
))
}
## Save session info
capture.output
(
sessionInfo
(),
file
=
paste0
(
outdir
,
"session_info.txt"
))
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment