

本系列讲解 空间转录组学 (Spatial Transcriptomics) 相关基础知识与数据分析教程[1],持续更新,欢迎关注,转发,文末有交流群!
在本文中,我们将分析人类结直肠活检的 Visium 数据。我们的目标不是重述所有可能的分析,而是突出那些在这些数据的背景下可能特别有趣的分析。
library(osfr)
library(scran)
library(scater)
library(igraph)
library(AUCell)
library(scuttle)
library(spacexr)
library(msigdbr)
library(VisiumIO)
library(jsonlite)
library(ggspavis)
library(pheatmap)
library(patchwork)
library(OSTA.data)
library(BiocParallel)
library(DropletUtils)
library(SpatialExperiment)
# specify whether/how to
# perform parallelization
bp <- MulticoreParam(th <- 4)
# set seed for random number generation
# in order to make results reproducible
set.seed(194849)
# retrieve dataset from OSF repo
id <- "Visium_HumanColon_Oliveira"
pa <- OSTA.data_load(id)
dir.create(td <- tempfile())
unzip(pa, exdir=td)
# read into 'SpatialExperiment'
obj <- TENxVisium(
spacerangerOut=file.path(td, "outs"),
format="h5",
images="lowres")
(spe <- import(obj))
## class: SpatialExperiment
## dim: 18085 4269
## metadata(2): resources spatialList
## assays(1): counts
## rownames(18085): ENSG00000187634 ENSG00000188976 ... ENSG00000198695
## ENSG00000198727
## rowData names(3): ID Symbol Type
## colnames(4269): AACAATGTGCTCCGAG-1 AACACCATTCGCATAC-1 ...
## TGTTGGTGCGGAATCA-1 TGTTGGTGGACTCAGG-1
## colData names(4): in_tissue array_row array_col sample_id
## reducedDimNames(0):
## mainExpName: Gene Expression
## altExpNames(0):
## spatialCoords names(2) : pxl_col_in_fullres pxl_row_in_fullres
## imgData names(4): sample_id image_id data scaleFactor

# use gene symbols as feature names
rownames(spe) <- make.unique(rowData(spe)$Symbol)
# add per-cell quality control metrics
sub <- list(mt=grep("^MT-", rownames(spe)))
spe <- addPerCellQCMetrics(spe, subsets=sub)

# determine outliers via thresholding on MAD from the median
ol <- perCellQCFilters(spe, sub.fields="subsets_mt_percent")
# add results as cell metadata
colData(spe)[names(ol)] <- ol
# tabulate # and % of cells that'd
# be discarded for different reasons
data.frame(
check.names=FALSE,
`#`=apply(ol, 2, sum),
`%`=round(100*apply(ol, 2, mean), 2))
## # %
## low_lib_size 3 0.07
## low_n_features 636 14.90
## high_subsets_mt_percent 161 3.77
## discard 779 18.25
让我们看看根据上述标准哪些spots会被排除:
lapply(names(ol), \(.)
plotCoords(spe, annotate=.) + ggtitle(.)) |>
wrap_plots(nrow=1, guides="collect") &
guides(col=guide_legend(override.aes=list(size=3))) &
scale_color_manual("discard", values=c("lavender", "purple")) &
theme(plot.title=element_text(hjust=0.5), legend.key.size=unit(0, "lines"))

看起来低质量 spot 在空间上高度有序,因此暂时不要移除它们。我们将在下方进一步看到,这里使用的质量控制指标,以及被认为应丢弃的 spot,在(基于转录组的)cluster 中是如何分布的。
# log-library size normalization
spe <- logNormCounts(spe)
# highly variable feature selection
tbl <- modelGeneVar(spe)
sel <- getTopHVGs(tbl, n=2e3)
# principal component analysis
spe <- runPCA(spe, subset_row=sel)
作为一种无监督方法,我们使用 Leiden 社区检测算法执行基于共享最近邻 (SNN) 图的聚类。
# build shared nearest-neighbor (SNN) graph
g <- buildSNNGraph(spe, use.dimred="PCA", type="jaccard")
# cluster via Leiden community detection algorithm
k <- cluster_leiden(g, objective_function="modularity", resolution=0.5)
table(spe$Leiden <- factor(k$membership))
## 1 2 3 4 5 6 7 8 9 10
## 326 504 710 432 732 458 315 171 469 152
作为补充方法,我们使用作者提供的(已注释的)参考单细胞数据对 spot 测量结果进行解卷积。让我们首先检索这些数据,以及相应的细胞元数据,其中包含低分辨率(Level1)和高分辨率(Level2)注释,分别划分为 10 个和 32 个子群体。
# retrieve dataset from OSF repo
id <- "Chromium_HumanColon_Oliveira"
pa <- OSTA.data_load(id)
dir.create(td <- tempfile())
unzip(pa, exdir=td)
# read into 'SingleCellExperiment'
fs <- list.files(td, full.names=TRUE)
h5 <- grep("h5$", fs, value=TRUE)
sce <- read10xCounts(h5, col.names=TRUE)
# add cell metadata
csv <- grep("csv$", fs, value=TRUE)
cd <- read.csv(csv, row.names=1)
colData(sce)[names(cd)] <- cd[colnames(sce), ]
# use gene symbols as feature names
rownames(sce) <- make.unique(rowData(sce)$Symbol)
# exclude cells deemed to be of low-quality
sce <- sce[, sce$QCFilter == "Keep"]
# tabulate subpopulations
table(sce$Level1)
##
## B cells Endothelial Fibroblast
## 33611 7969 28653
## Intestinal Epithelial Myeloid Neuronal
## 22763 25105 4199
## Smooth Muscle T cells Tumor
## 43308 29272 65626
接下来,我们使用 spacexr(RCTD)进行解卷积。默认情况下,runRctd() 的 rctd_mode="doublet",即每个像素最多拟合两个亚群;在这里,我们将 rctd_mode 设置为 "full",以允许拟合任意数量的亚群。
# prep reference data (Chromium);
# subset cells from same patient
.sce <- sce[, grepl("P2", sce$Patient)]
# downsample to at most 4,000 cells per cluster
cs <- split(seq_len(ncol(.sce)), .sce$Level1)
cs <- lapply(cs, \(.) sample(., min(length(.), 4e3)))
.sce <- .sce[, unlist(cs)]
# run 'RCTD' deconvolution
rctd_data <- createRctd(spe, .sce, cell_type_col="Level1")
(res <- runRctd(rctd_data, max_cores=th, rctd_mode="full"))
## class: SpatialExperiment
## dim: 9 4269
## metadata(4): spatial_rna config cell_type_info internal_vars
## assays(1): weights
## rownames(9): B cells Endothelial ... T cells Tumor
## rowData names(0):
## colnames(4269): AACAATGTGCTCCGAG-1 AACACCATTCGCATAC-1 ...
## TGTTGGTGCGGAATCA-1 TGTTGGTGGACTCAGG-1
## colData names(1): sample_id
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## spatialCoords names(2) : x y
## imgData names(0):
应将 RCTD 推断的权重标准化,以使每个点中细胞类型的比例总和为 1:
# scale weights such that they sum to 1
ws <- assay(res)
ws <- sweep(ws, 2, colSums(ws), `/`)
# add proportion estimates as metadata
ws <- data.frame(t(as.matrix(ws)))
colData(spe)[names(ws)] <- ws[colnames(spe), ]
为了与无监督聚类(基于 SNN 的 Leiden)进行比较,我们也纳入了另一种分配方式:将每个 spot 直接赋予解卷积估计中最常见的标签
ids <- names(ws)[apply(ws, 1, which.max)]
table(spe$RCTD <- factor(ids), spe$Leiden)
##
## 1 2 3 4 5 6 7 8 9 10
## B.cells 0 42 0 6 0 4 0 0 0 0
## Endothelial 0 113 0 0 0 23 0 0 0 0
## Fibroblast 0 271 0 0 11 57 313 0 0 0
## Intestinal.Epithelial 0 4 0 4 0 0 0 0 466 10
## Myeloid 0 16 0 0 2 3 0 0 0 0
## Smooth.Muscle 0 8 0 0 0 362 0 0 0 0
## T.cells 0 11 0 0 0 3 0 0 0 0
## Tumor 326 39 710 422 719 6 2 171 3 142
我们还可以将组织划分为广泛的生物学区域;在此,通过将基于 RCTD 的亚群分配归纳为四个类别来实现:
lab <- list(
tum="Tumor",
epi="Intestinal.Epithelial",
imm=c("B.cells", "T.cells", "Myeloid"),
str=c("Endothelial", "Fibroblast", "Smooth.Muscle"))
idx <- match(spe$RCTD, unlist(lab))
lab <- rep.int(names(lab), sapply(lab, length))
table(spe$Domain <- factor(lab[idx]))
##
## epi imm str tum
## 484 87 1158 2540
让我们把去卷积权重在空间中可视化,也就是按照“在给定 spot 内被估计属于某一特定细胞类型的比例”来上色:
lapply(names(ws), \(.)
plotCoords(spe, annotate=.)) |>
wrap_plots(nrow=3) & theme(
legend.key.width=unit(0.5, "lines"),
legend.key.height=unit(1, "lines")) &
scale_color_gradientn(colors=pals::jet())

lapply(c("Leiden", "Domain", "RCTD"),
\(.) plotCoords(spe, annotate=.)) |>
wrap_plots(nrow=1) &
theme(legend.key.size=unit(0, "lines")) &
scale_color_manual(values=unname(pals::trubetskoy()))

为了帮助表征来自无监督聚类的亚群,我们可以查看它们在基于去卷积的聚类和广泛区域中的分布;例如,肿瘤 spot 非常多样,而平滑肌 spot 和(正常)上皮几乎完全映射到单一聚类:
cd <- data.frame(colData(spe))
df <- as.data.frame(with(cd, table(RCTD, Leiden)))
fd <- as.data.frame(with(cd, table(Domain, Leiden)))
ggplot(df, aes(Freq, RCTD, fill=Leiden)) + ggtitle("RCTD") +
ggplot(fd, aes(Freq, Domain, fill=Leiden)) + ggtitle("Domain") +
plot_layout(nrow=1, guides="collect") &
labs(x="Proportion", y=NULL) &
coord_cartesian(expand=FALSE) &
geom_col(width=1, col="white", position="fill") &
scale_fill_manual(values=unname(pals::trubetskoy())) &
theme_minimal() & theme(aspect.ratio=1,
legend.key.size=unit(2/3, "lines"),
plot.title=element_text(hjust=0.5))

让我们从 PCs 的角度审视(表达)变异的关键驱动因素。结合上面的聚类和去卷积结果,我们可以看到:
pcs <- reducedDim(spe, "PCA")
colData(spe)[colnames(pcs)] <- pcs
lapply(colnames(pcs)[seq_len(6)],
\(.) plotCoords(spe, annotate=.) +
scale_color_gradientn(., colors=pals::jet())) |>
wrap_plots(nrow=2) & theme(
plot.title=element_blank(),
legend.key.width=unit(0.5, "lines"),
legend.key.height=unit(1, "lines"))

对于特定集群,质量控制指标往往较低。反过来,它们的斑块状图案解释了之前看到的低质量斑点的聚集。
lapply(c("detected", "log_sum", "subsets_mt_percent"), \(.)
plotColData(spe, x=., y="Leiden", color_by="discard", point_size=0.1) +
scale_x_discrete(limits=names(sort(by(spe[[.]], spe$Leiden, median))))) |>
wrap_plots(nrow=1, guides="collect") &
scale_color_manual("discard", values=c("lavender", "purple")) &
guides(col=guide_legend(override.aes=list(alpha=1, size=3))) &
theme_minimal() & theme(
panel.grid.minor=element_blank(),
legend.key.size=unit(0, "lines"))

与其研究单个基因,我们也可以评估一组基因的表达(例如,通路签名);例如,恶性组织可能在糖酵解和脂肪酸代谢等代谢活性上有所不同,或表现出增加的凋亡(细胞死亡)等。在此,我们使用 msigdbr 包,从 MSigDB 获取一些生物现象的标志基因集:
# retrieve hallmark gene sets from 'MSigDB'
db <- msigdbr(species="Homo sapiens", category="H")
# get list of gene symbols, one element per set
gs <- split(db$ensembl_gene, db$gs_name)
# simplify set identifiers (drop prefix, use lower case)
names(gs) <- tolower(gsub("HALLMARK_", "", names(gs)))
# how many sets?
length(gs)
# how many genes in each?
range(sapply(gs, length))
接下来,我们将用 AUCell(Aibar et al. 2017)来打分,它分两步走:(i) 对每个观测(这里是 spot)里的基因进行排序,(ii) 为每个基因集计算 AUC 值。本质上,这些值代表“在排名最靠前(默认前 5 %)的基因里,属于该基因集的比例”;也就是说,数值越高,(协同基因表达所体现的)活性越高。
# realize (sparse) gene expression matrix
mtx <- as(logcounts(spe), "dgCMatrix")
# use ensembl identifiers as feature names
rownames(mtx) <- rowData(spe)$ID
# build per-spot gene rankings
rnk <- AUCell_buildRankings(mtx, BPPARAM=bp, plotStats=FALSE, verbose=FALSE)
# calculate AUC for each gene set in each spot
auc <- AUCell_calcAUC(geneSets=gs, rankings=rnk, nCores=th, verbose=FALSE)
# add results as spot metadata
colData(spe)[rownames(auc)] <- res <- t(assay(auc))
为简单起见,我们将继续仅调查那些跨点得分变异性最高的Signatures:
var <- colVars(res) # variance across spots
top <- names(tail(sort(var), 8)) # top sets)
总之,MYC 信号在基质区域缺失;包围癌灶的成纤维细胞环表现出 EMT、血管生成等;INFa 应答和 TNFa 信号在基质和恶性上皮中均呈斑片状分布。
lapply(top, \(.) {
spe[[.]] <- scale(spe[[.]]) # scaling
plotCoords(spe, annotate=.) # plotting
}) |>
# arrange & prettify
wrap_plots(nrow=2, guides="collect") &
scale_color_gradientn(
colors=pals::jet(),
oob=scales::squish,
limits=c(-2.5, 2.5)) &
theme(
legend.key.width=unit(0.5, "lines"),
legend.key.height=unit(1, "lines"))

为了便于解读,我们可以按 spot 标签对 AUCell 得分进行分层;这些标签既可来自无监督聚类,也可源于去卷积方法:
for (. in c("Leiden", "RCTD")) {
# aggregate AUC values by cluster
mu <- aggregateAcrossCells(auc[top, ], spe[[.]],
use.assay.type="AUC", statistics="mean")
# visualize as (cluster x set) heatmap
pheatmap(
mat=t(assay(mu)), scale="column", col=pals::coolwarm(), main=.,
cellwidth=10, cellheight=10, treeheight_row=5, treeheight_col=5)
}

对于后者,我们也可以把基因集得分与比例估计值做关联(而不是按主要亚群把标签离散化):
# correlate 'AUCell' signature scores with subpopulation
# proportion estimates from deconvolution with 'RCTD'
cm <- cor(as.matrix(ws), t(assay(auc[top, ])))
pheatmap(cm,
col=pals::coolwarm(),
breaks=seq(-1, 1, length=25),
cellwidth=10, cellheight=10,
treeheight_row=5, treeheight_col=5)

未完待续,欢迎关注!
Reference
[1] Ref: https://lmweber.org/OSTA/