单细胞转录组鉴定与骨关节炎相关的关键基因和通路

生信技能树

发布于 2021-12-04 14:17:31

95800

代码可运行

文章被收录于专栏：生信技能树生信技能树

运行总次数：0

代码可运行

下面11月份学徒的投稿

对滑膜成纤维细胞的单细胞 RNA 测序(read.table读取压缩的tsv文本文件)

今天我们复现的文章是2020年发表在Medicine 杂志上的一个单细胞数据挖掘文章，标题是《Identification of the key gene and pathways associated with osteoarthritis via single-cell RNA sequencing on synovial fibroblasts》，文章链接是：https://journals.lww.com/md-journal/Fulltext/2020/08140/Identification_of_the_key_gene_and_pathways.81.aspx

这个数据集主要是对GSE109449数据的挖掘，主要研究目标通过scRNA-seq测序手段，探究骨滑膜成纤维细胞与骨关节炎的关键基因和相关通路，本次针对这篇文章的Figure 4进行复现。如下图：

image-20210828212537989

数据链接为：https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109449

作者提供的数据为：

GSM2943158 single fibroblast from donor OA4 (OA4_A1)
GSM2943159 single fibroblast from donor OA4 (OA4_A10)
GSM2943160 single fibroblast from donor OA4 (OA4_A11)
GSM2943161 single fibroblast from donor OA4 (OA4_A12)
GSM2943162 single fibroblast from donor OA4 (OA4_A2)
GSM2943163 single fibroblast from donor OA4 (OA4_A3)
GSM2943164 single fibroblast from donor OA4 (OA4_A4)
GSM2943165 single fibroblast from donor OA4 (OA4_A5)
GSM2943166 single fibroblast from donor OA4 (OA4_A6)

image-20210828214028760

本篇文章的数据来源：

192 scRNA-seq files of SFs from 2 OA patients were accessed from microarray GSE109449 included in Gene Expression Omnibus (GEO) atabase (https://www.ncbi.nlm.nih.gov/geo/). Then statistical analysis was conducted with respect to gene number and expression levels.

下面我们开始复现：

在运行环境下线创建文件夹data，其中包含两个文件：

image-20210829153150132

Set1 读取表达矩阵，构建Seurat对象

# 导入数据构建Seurat对象 ----------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 
meta <- read.table("../data/GSE109449_singlecell_rnaseq_metadata.tsv.gz",
                   sep = "\t",header = T,row.names = 1)
data <- read.table("../data/GSE109449_singlecell_rnaseq_gene_counts.tsv.gz",
                       sep = "\t",header = T,row.names = 1)
#tsv格式的特殊读取方式，table可以读取任何
#形式的文件，用seq来区分，ENSG0000000003等为另外一种形式的基因表达方式,gene count的文件一般就是我们所要的文件
#这个文件要同时具有基因、Barcode和数据
data[1:4, 1:4]
#                    OA4_A1   OA4_A10 OA4_A11 OA4_A12
# ENSG00000000003   2.76277 380.18400 0.00000 1.76691
# ENSG00000000005   0.00000   0.00000 0.00000 0.00000
# ENSG00000000419 271.00000   0.00000 1.01144 0.00000
# ENSG00000000457   3.93664  16.66799 4.71615 2.33886
#
fivenum(data[,1])
# Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input data.
# [1] 0.000000e+00 0.000000e+00 0.000000e+00 9.216565e+00 2.170270e+05

这个数据比较狡猾，ID需要进行转换：

library(clusterProfiler)
library(org.Hs.eg.db)
ids=bitr(rownames(data),'ENSEMBL','SYMBOL','org.Hs.eg.db') # 将ENSEMBL转成SYMBOL
head(ids)
same_name <- intersect(rownames(data),ids$ENSEMBL)#取交集

data_f <- data[which(rownames(data) %in% same_name),]#从大的数据框提取有交集的数据
ids_f <- ids[which(ids$ENSEMBL %in% same_name),]#从大的数据框提取有交集的数据
ids_u <- ids_f[!duplicated(ids_f$ENSEMBL),]#ids_f比data_f 大，说明需要去重复
ids_u <- ids_u[!duplicated(ids_u$SYMBOL),]#再次去重复
data_o <- data_f[rownames(data_f) %in% ids_u$ENSEMBL,]#data_o与data_f数量相同，说明data_o无重复
rownames(data_o) <- ids_u$SYMBOL#改变行名
head(data_o)
data=data_o
data<- data[,grep("OA*",colnames(data))]#截取列明为OA开头的
data[1:4,1:4]
dim(data)
data['ACTB',] # 检查数据
data['GAPDH',] # 检查数据

整理好表达矩阵之后，构建Seurat对象

#建立对象---------------------------------------------------------
sce.all=CreateSeuratObject(counts = data)
sce.all
table(sce.all@meta.data$orig.ident)
dim(sce.all) 
#[1] 24404   192

Set2 降维聚类分群

sce <- sce.all
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
               )  
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# 聚类分群
sce <- FindNeighbors(sce, dims = 1:30)
sce
# An object of class Seurat 
# 24404 features across 192 samples within 1 assay 
# Active assay: RNA (24404 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap
res <-  c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)
sce <- FindClusters(sce, graph.name = "RNA_snn", resolution = res, algorithm = 1)
#聚类树可视化
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("RNA_snn_res", colnames(sce@meta.data))]) 

p2_tree <- clustree(sce@meta.data, prefix = "RNA_snn_res.") 
p2_tree
# RNA分群数据----
save(sce,file = '../data/first_sce_by_RNA.Rdata')

Set3 差异基因展示

pro='../picture/DEG/'
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25, 
                              thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'DEG_sce.markers_by_cluster.csv'))
library(dplyr) 
top10 <- sce.markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)+scale_color_discrete(name = "Identity", na.translate = FALSE)
ggsave(filename=paste0(pro,'DEG_sce.markers_10_heatmap.pdf'))

image-20210829165623379

genes <- c("VCAN","MMP2","DCN","FGF7","CXCL12","CP","IGF1","C4B","C7","C3")
FeaturePlot(sce,features = genes,cols = c("green","red"))
ggsave(filename = paste0(pro,'DEG_sce.markers_10_featureplot.pdf'))

image-20210829170919113

VlnPlot(sce,features = genes)
ggsave(filename = paste0(pro,'DEG_sce.markers_10_VlnPlot.pdf'))

image-20210829171829273

到此我们就完成了《Identification of the key gene and pathways associated with osteoarthritis via single-cell RNA sequencing on synovial fibroblasts》文章Figure4的绘制。最后，总结给大家一句话，纸上得来终觉浅，绝知此事要躬行。

对滑膜成纤维细胞的单细胞 RNA 测序(read.table读取压缩的tsv文本文件)

image-20210828212537989

数据链接为：https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109449

作者提供的数据为：

GSM2943158 single fibroblast from donor OA4 (OA4_A1)
GSM2943159 single fibroblast from donor OA4 (OA4_A10)
GSM2943160 single fibroblast from donor OA4 (OA4_A11)
GSM2943161 single fibroblast from donor OA4 (OA4_A12)
GSM2943162 single fibroblast from donor OA4 (OA4_A2)
GSM2943163 single fibroblast from donor OA4 (OA4_A3)
GSM2943164 single fibroblast from donor OA4 (OA4_A4)
GSM2943165 single fibroblast from donor OA4 (OA4_A5)
GSM2943166 single fibroblast from donor OA4 (OA4_A6)

image-20210828214028760

本篇文章的数据来源：

192 scRNA-seq files of SFs from 2 OA patients were accessed from microarray GSE109449 included in Gene Expression Omnibus (GEO) atabase (https://www.ncbi.nlm.nih.gov/geo/). Then statistical analysis was conducted with respect to gene number and expression levels.

下面我们开始复现：

在运行环境下线创建文件夹data，其中包含两个文件：

image-20210829153150132

Set1 读取表达矩阵，构建Seurat对象

# 导入数据构建Seurat对象 ----------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 
meta <- read.table("../data/GSE109449_singlecell_rnaseq_metadata.tsv.gz",
                   sep = "\t",header = T,row.names = 1)
data <- read.table("../data/GSE109449_singlecell_rnaseq_gene_counts.tsv.gz",
                       sep = "\t",header = T,row.names = 1)
#tsv格式的特殊读取方式，table可以读取任何
#形式的文件，用seq来区分，ENSG0000000003等为另外一种形式的基因表达方式,gene count的文件一般就是我们所要的文件
#这个文件要同时具有基因、Barcode和数据
data[1:4, 1:4]
#                    OA4_A1   OA4_A10 OA4_A11 OA4_A12
# ENSG00000000003   2.76277 380.18400 0.00000 1.76691
# ENSG00000000005   0.00000   0.00000 0.00000 0.00000
# ENSG00000000419 271.00000   0.00000 1.01144 0.00000
# ENSG00000000457   3.93664  16.66799 4.71615 2.33886
#
fivenum(data[,1])
# Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input data.
# [1] 0.000000e+00 0.000000e+00 0.000000e+00 9.216565e+00 2.170270e+05

这个数据比较狡猾，ID需要进行转换：

library(clusterProfiler)
library(org.Hs.eg.db)
ids=bitr(rownames(data),'ENSEMBL','SYMBOL','org.Hs.eg.db') # 将ENSEMBL转成SYMBOL
head(ids)
same_name <- intersect(rownames(data),ids$ENSEMBL)#取交集

data_f <- data[which(rownames(data) %in% same_name),]#从大的数据框提取有交集的数据
ids_f <- ids[which(ids$ENSEMBL %in% same_name),]#从大的数据框提取有交集的数据
ids_u <- ids_f[!duplicated(ids_f$ENSEMBL),]#ids_f比data_f 大，说明需要去重复
ids_u <- ids_u[!duplicated(ids_u$SYMBOL),]#再次去重复
data_o <- data_f[rownames(data_f) %in% ids_u$ENSEMBL,]#data_o与data_f数量相同，说明data_o无重复
rownames(data_o) <- ids_u$SYMBOL#改变行名
head(data_o)
data=data_o
data<- data[,grep("OA*",colnames(data))]#截取列明为OA开头的
data[1:4,1:4]
dim(data)
data['ACTB',] # 检查数据
data['GAPDH',] # 检查数据

整理好表达矩阵之后，构建Seurat对象

#建立对象---------------------------------------------------------
sce.all=CreateSeuratObject(counts = data)
sce.all
table(sce.all@meta.data$orig.ident)
dim(sce.all) 
#[1] 24404   192

Set2 降维聚类分群

sce <- sce.all
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
               )  
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# 聚类分群
sce <- FindNeighbors(sce, dims = 1:30)
sce
# An object of class Seurat 
# 24404 features across 192 samples within 1 assay 
# Active assay: RNA (24404 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap
res <-  c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)
sce <- FindClusters(sce, graph.name = "RNA_snn", resolution = res, algorithm = 1)
#聚类树可视化
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("RNA_snn_res", colnames(sce@meta.data))]) 

p2_tree <- clustree(sce@meta.data, prefix = "RNA_snn_res.") 
p2_tree
# RNA分群数据----
save(sce,file = '../data/first_sce_by_RNA.Rdata')

Set3 差异基因展示

pro='../picture/DEG/'
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25, 
                              thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'DEG_sce.markers_by_cluster.csv'))
library(dplyr) 
top10 <- sce.markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)+scale_color_discrete(name = "Identity", na.translate = FALSE)
ggsave(filename=paste0(pro,'DEG_sce.markers_10_heatmap.pdf'))

image-20210829165623379

genes <- c("VCAN","MMP2","DCN","FGF7","CXCL12","CP","IGF1","C4B","C7","C3")
FeaturePlot(sce,features = genes,cols = c("green","red"))
ggsave(filename = paste0(pro,'DEG_sce.markers_10_featureplot.pdf'))

image-20210829170919113

VlnPlot(sce,features = genes)
ggsave(filename = paste0(pro,'DEG_sce.markers_10_VlnPlot.pdf'))

image-20210829171829273

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2021-11-29，如有侵权请联系 cloudcommunity@tencent.com 删除

https

网络安全

腾讯云开发者社区

本文分享自生信技能树微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

https

网络安全

腾讯云开发者社区

登录后参与评论

0 条评论

热度