RUNX1在B前体急性白血病细胞中的过表达(readMM函数活学活用)

生信技能树

发布于 2021-12-04 14:18:40

65500

代码可运行

文章被收录于专栏：生信技能树生信技能树

运行总次数：0

代码可运行

下面11月份学徒的投稿

本次要介绍的单细胞转录组数据集（GSE48192）目前还没有看到关联的文章发表，摘要写的是：使用ChIP-seq检测活性增强子组蛋白标记物H3K27ac和RUNX1，然后通过单细胞rna测序分析其对基因表达的影响。

链接：https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE148192

数据的样本信息如图：我们目前只看单细胞数据部分：

image-20210904225559157

就两个单细胞样品，而且是标准的10x仪器的数据，下面我们开始复现：

Step1：数据下载和读取

image-20210904231127546

点击custom选择我们要下载的数据：

image-20210904231347539

下载好数据我们会获得以下几个文件：

image-20210904231650429

接下来就可以读取它们啦，有意思的是每个样品都需要独立的读取3个文件，合并成为一个单细胞Seurat对象，操作技巧满满！

# 导入数据构建Seurat对象 ----------------------------------------------------------
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 
suppressPackageStartupMessages(library(stringr) )
suppressPackageStartupMessages(library(patchwork))
library(Seurat)
# https://hbctraining.github.io/scRNA-seq/lessons/readMM_loadData.html
library(Matrix)
library(data.table)
mtx=readMM('../data/GSM4462450_Nalm6_DMSO_matrix.mtx.gz')
mtx
dim(mtx)
cl=fread('../data/GSM4462450_Nalm6_DMSO_barcodes.tsv.gz',
         header = T,data.table = F ) 
head(cl)
rl=fread('../data/GSM4462450_Nalm6_DMSO_features.tsv.gz',
         header = T,data.table = F ) 
head(cl)
head(rl)
dim(mtx)
colnames(mtx) <- rl$V1
rownames(mtx) <- cl$V1
DMSO=CreateSeuratObject(counts = t(mtx),
                        pro='DMSO')
head(DMSO@meta.data)
mtx=readMM('../data/GSM4462451_Nalm6_lvRUNX1_matrix.mtx.gz')
mtx
dim(mtx)
cl=fread('../data/GSM4462451_Nalm6_lvRUNX1_barcodes.tsv.gz',
         header = T,data.table = F ) 
head(cl)
rl=fread('../data/GSM4462451_Nalm6_lvRUNX1_features.tsv.gz',
         header = T,data.table = F ) 
head(cl)
head(rl)
dim(mtx)
colnames(mtx) <- rl$V1
rownames(mtx) <- cl$V1
RUNX1=CreateSeuratObject(counts = t(mtx),
                         pro='RUNX1')
head(RUNX1@meta.data)

sceList=list(
  DMSO=DMSO,
  RUNX1=RUNX1
)
save(sceList,file = 'sceList.Rdata')

Setp2：数据质控过滤

这个地方没什么好说的，就是标准的代码，复制粘贴即可：

sce.all <- merge(sceList[[1]],sceList[[-1]],,add.cell.ids = c("DMSO","RUNX1"),merge.data = T)
# 质控和过滤 -------------------------------------------------------------------
#计算线粒体基因比例
# 人和鼠的基因名字稍微不一样 
mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] 
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
#计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^RP[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
#计算红血细胞基因比例
hb_genes <- rownames(sce.all)[grep("^HB[^(p)]", rownames(sce.all),ignore.case = T)]
hb_genes
sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)

#可视化细胞的上述比例情况
Idents(sce.all)
feats <- c("nFeature_RNA", "nCount_RNA")
p1=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0.01, ncol = 2) + NoLegend()
p1
ggsave(filename="../picture/1_QC/Vlnplot1.png",plot=p1)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0.01, ncol = 3, same.y.lims=T) + 
  scale_y_continuous(breaks=seq(0, 100, 5)) +
  NoLegend()
p2
ggsave(filename="../picture/1_QC/Vlnplot2.png",plot=p2)

p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
p3
ggsave(filename="../picture/1_QC/Scatterplot.pdf",plot=p3)

# 第一次过滤 (无文章采用默认参数）-------------------------------------------------------------------
#根据上述指标，过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA >300) # 每个细胞中基因表达>300
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3] # 至少在3个细胞中表达 
sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
dim(sce.all) 
# [1] 57905 11564
dim(sce.all.filt) 
# [1] 22302 11465
table(sce.all$orig.ident)
# DMSO RUNX1 
# 6073  5491 
table(sce.all.filt$orig.ident)
# DMSO RUNX1 
# 6022  5443 
# 第二次过滤 -------------------------------------------------------------------
#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
dim(sce.all.filt)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 15)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 1)
# 红细胞过多适当放宽阈值这里设置阈值为5
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.05)
length(selected_hb)
length(selected_ribo)
length(selected_mito)

sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
# [1] 22302  9765
table(sce.all$orig.ident)
# DMSO RUNX1 
# 6073  5491 
table(sce.all.filt$orig.ident)
# DMSO RUNX1 
# 5911  3854 
# 保存过滤后的Seurat对象（sce.all.filt) --------------------------------------------
save(sce.all.filt,file = "../data/sce.all.filt.Rdata")

Step3：降维聚类分群

降维分析（检查批次效应）

# 降维聚类分析 ------------------------------------------------------------------
rm(list=ls()) 
load("../data/sce.all.filt.Rdata")
#检查数据
dim(sce.all.filt)
# [1] 22302  9765
table(sce.all.filt$orig.ident)
# DMSO RUNX1 
# 5911  3854 
sce <- sce.all.filt
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
)  #加上vars.to.regress参数后运行so slow!!!
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)

# 检查批次效应 ------------------------------------------------------------------

colnames(sce@meta.data) 
# [1] "orig.ident"   "nCount_RNA"   "nFeature_RNA"
# [4] "percent_mito" "percent_ribo" "percent_hb"       
p1.compare=plot_grid(ncol = 3,
                     DimPlot(sce, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                     DimPlot(sce, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                     DimPlot(sce, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP")
)
p1.compare
ggsave(plot=p1.compare,filename="../picture/2_PCA_UMAP_TSNE/Before_inter_1.pdf")
DimPlot(sce, 
        reduction = "umap", # pca, umap, tsne
        group.by = "orig.ident",
        label = F)
ggsave("../picture/2_PCA_UMAP_TSNE/Dimplot_before_1.pdf",units = "cm", width = 20,height = 18)
save(sce,file = "../subdata/before_sce.Rdata")

image-20210905002232251

由图可以看出有明显的批次效应，其实它也跟样品差异混合在了一起。但是通常情况下，我们认为样品差异不应该是两个样品完全没有任何同类型的单细胞亚群，它们太泾渭分明了，所以这个时候去除这个差异势在必行！

CCA去除批次效应

目前CCA这个方法是主流，我们也采用它！

sce.all=sce
sce.all
# An object of class Seurat 
# 22302 features across 9765 samples within 1 assay 
# Active assay: RNA (22302 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap

#拆分为 个seurat子对象
sce.all.list <- SplitObject(sce.all, split.by = "orig.ident")
sce.all.list
for (i in 1:length(sce.all.list)) {
  print(i)
  sce.all.list[[i]] <- NormalizeData(sce.all.list[[i]], verbose = FALSE)
  sce.all.list[[i]] <- FindVariableFeatures(sce.all.list[[i]], selection.method = "vst", 
                                            nfeatures = 2000, verbose = FALSE)
}
# 看电脑性能
alldata.anchors <- FindIntegrationAnchors(object.list = sce.all.list, dims = 1:30, 
                                          reduction = "cca")
sce.all.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA")
names(sce.all.int@assays)
#[1] "RNA" "CCA"
sce.all.int@active.assay
#[1] "CCA"
sce.all.int=ScaleData(sce.all.int)
sce.all.int=RunPCA(sce.all.int, npcs = 30)
sce.all.int=RunTSNE(sce.all.int, dims = 1:30)
sce.all.int=RunUMAP(sce.all.int, dims = 1:30)
names(sce.all.int@reductions)
# 检查批次效应----
p2.compare=plot_grid(ncol = 3,
                     DimPlot(sce.all.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                     DimPlot(sce.all.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                     DimPlot(sce.all.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP")
)
p2.compare
ggsave(plot=p2.compare,filename="../picture/2_PCA_UMAP_TSNE/Before_inter_CCA_2.pdf")

# CCA 比较去除批次前后  --------------------------------------------------------
p5.compare=plot_grid(ncol = 3,
                     DimPlot(sce, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                     DimPlot(sce, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                     DimPlot(sce, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP"),
                     
                     DimPlot(sce.all.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                     DimPlot(sce.all.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                     DimPlot(sce.all.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP")
)
p5.compare
ggsave(plot=p5.compare,filename="../picture/2_PCA_UMAP_TSNE/umap_by_har_before_after.pdf",units = "cm", width = 40,height = 18)

save(sce.all.int,file = "../data/sce.all.int_by_cca.Rdata")

image-20210905004618475

可以看到，两个样品的差异仍然是存在的，但是它们开始有单细胞亚群的交集了，非常好。说明大概率上我们的批次效应被矫正了，而且样品差异被保留了。

Step3：聚类分群

# CCA 聚类分析------------------------------------------------------------------
rm(list = ls())
load("../data/sce.all.int_by_cca.Rdata")
sce <- sce.all.int
sce <- FindNeighbors(sce, dims = 1:30)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  # res=0.01
  print(res)
  sce <- FindClusters(sce, graph.name = "CCA_snn", resolution = res, algorithm = 1)
}
ls <- apply(sce@meta.data[,grep("CCA_snn_res",colnames(sce@meta.data))],2,table)
ls
# CCA分群数据----
save(sce,file = '../subdata/first_sce_by_CCA.Rdata')

#聚类树可视化
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("CCA_snn_res", colnames(sce@meta.data))]) 

p2_tree <- clustree(sce@meta.data, prefix = "CCA_snn_res.") 
p2_tree
ggsave(p2_tree, filename = "../picture/2_PCA_UMAP_TSNE/cluster_tree_by_CCA.pdf", width = 10, height = 8)

#umap可视化
cluster_umap <- plot_grid(ncol = 4, 
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.01", label = T) & NoAxes(), 
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.05", label = T)& NoAxes(),
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.1", label = T) & NoAxes(),
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.2", label = T)& NoAxes(),
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.3", label = T)& NoAxes(),
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.5", label = T) & NoAxes(),
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.8", label = T) & NoAxes(), 
                          DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.1", label = T) & NoAxes()
)
ggsave(cluster_umap, filename = "../picture/2_PCA_UMAP_TSNE/cluster_umap_all_res_by_CCA.pdf", width = 16, height = 8)

#保存数据
saveRDS(sce, file = "../subdata/sce_all_by_CCA.Rds")

image-20210905005904358

根据分群结果，我们选取resolution=0.5进行细胞亚群注释。

Setp4：细胞亚群注释

# 防止原始数据被改动，重新命名
sce.all <- sce
Idents(sce.all) <- "CCA_snn_res.0.5"
# all markergenes -----------------------------------------------------------
p_umap=DimPlot(sce.all, reduction = "umap",
               group.by = "CCA_snn_res.0.5",label = T,label.size = 4) 
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("MS4A1","IGHD","CD27","NEIL1","MZB1","CD38","TOP2A","MKI67" )
# 该基因名称（大小写转换）----
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p_all_markers <- DotPlot(sce.all, features = genes_to_check,
                         group.by = "CCA_snn_res.0.5",assay='RNA'  )  + coord_flip()

p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/check_all_marker_by_CCA_res_0.8.pdf")

p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))

ggsave(p1,filename = '../picture/Annotation/all_markers_umap_by_CCA_res.0.5.pdf',units = "cm",width = 35,height = 18)

image-20210905010832453

根据marker gene，我们对细胞亚群进行注释：

# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######  
celltype=data.frame(ClusterID=0:8,
                    celltype='NA')
celltype[celltype$ClusterID %in% c( 6 ),2]='Naive B'   
celltype[celltype$ClusterID %in% c( 2 ),2]='Germinal center B'
celltype[celltype$ClusterID %in% c( 5 ),2]='Plasma B'   
celltype[celltype$ClusterID %in% c( 0,1,3,4,7,8 ),2]='Dividing Plasma B' 
head(celltype)
celltype 
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$CCA_snn_res.0.5 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
# umap_by_celltype --------------------------------------------------------

th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5))
p_all_markers=DotPlot(sce.all, features = genes_to_check,
                      group.by = "celltype",assay='RNA'  )  + coord_flip()+th

color <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf','#999999')
p_umap<- DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T) 
p_umap
ps <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
ps
ggsave(ps,filename = "../picture/Annotation/our_markers_dotplot_by_celltype.pdf",units = "cm",width = 40,height = 15)

p_celltype<- DimPlot(sce.all, reduction = "tsne",group.by = "celltype",label = T)  
p_celltype
ggsave(p_celltype,filename = '../picture/Annotation/tsne_by_celltype.pdf',width = 20,height = 16,units = "cm")

image-20210905011351678

image-20210905011251128

可以看到，处于细胞周期的B细胞比例有点多，可能是这个癌症的特性？

这背后应该是仍然会隐藏很多生物学故事，可惜咱们仅仅是学会了单细胞数据分析，并没有学会单细胞数据理解。任重道远啊！

虽然说上面的代码都是复制粘贴即可运行，但是如果要更好地完成上面的图表，通常是需要掌握5个R包，分别是: scater,monocle,Seurat,scran,M3Drop，需要熟练掌握它们的对象，：一些单细胞转录组R包的对象 而且分析流程也大同小异：

step1: 创建对象
step2: 质量控制
step3: 表达量的标准化和归一化
step4: 去除干扰因素(多个样本整合)
step5: 判断重要的基因
step6: 多种降维算法
step7: 可视化降维结果
step8: 多种聚类算法
step9: 聚类后找每个细胞亚群的标志基因
step10: 继续分类

单细胞转录组数据分析的标准降维聚类分群，并且进行生物学注释后的结果。可以参考前面的例子：人人都能学会的单细胞聚类分群注释 ，我们演示了第一层次的分群。

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2021-11-28，如有侵权请联系 cloudcommunity@tencent.com 删除

数据分析

本文分享自生信技能树微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

数据分析

登录后参与评论

0 条评论

热度