下面11月份学徒的投稿
对滑膜成纤维细胞的单细胞 RNA 测序(read.table读取压缩的tsv文本文件)
今天我们复现的文章是2020年发表在Medicine 杂志上的一个单细胞数据挖掘文章,标题是《Identification of the key gene and pathways associated with osteoarthritis via single-cell RNA sequencing on synovial fibroblasts》,文章链接是:https://journals.lww.com/md-journal/Fulltext/2020/08140/Identification_of_the_key_gene_and_pathways.81.aspx
这个数据集主要是对GSE109449数据的挖掘,主要研究目标通过scRNA-seq测序手段,探究骨滑膜成纤维细胞与骨关节炎的关键基因和相关通路,本次针对这篇文章的Figure 4进行复现。如下图:
image-20210828212537989
数据链接为:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109449
作者提供的数据为:
GSM2943158 single fibroblast from donor OA4 (OA4_A1)
GSM2943159 single fibroblast from donor OA4 (OA4_A10)
GSM2943160 single fibroblast from donor OA4 (OA4_A11)
GSM2943161 single fibroblast from donor OA4 (OA4_A12)
GSM2943162 single fibroblast from donor OA4 (OA4_A2)
GSM2943163 single fibroblast from donor OA4 (OA4_A3)
GSM2943164 single fibroblast from donor OA4 (OA4_A4)
GSM2943165 single fibroblast from donor OA4 (OA4_A5)
GSM2943166 single fibroblast from donor OA4 (OA4_A6)
image-20210828214028760
本篇文章的数据来源:
192 scRNA-seq files of SFs from 2 OA patients were accessed from microarray GSE109449 included in Gene Expression Omnibus (GEO) atabase (https://www.ncbi.nlm.nih.gov/geo/). Then statistical analysis was conducted with respect to gene number and expression levels.
在运行环境下线创建文件夹data,其中包含两个文件:
image-20210829153150132
# 导入数据构建Seurat对象 ----------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(data.table))
meta <- read.table("../data/GSE109449_singlecell_rnaseq_metadata.tsv.gz",
sep = "\t",header = T,row.names = 1)
data <- read.table("../data/GSE109449_singlecell_rnaseq_gene_counts.tsv.gz",
sep = "\t",header = T,row.names = 1)
#tsv格式的特殊读取方式,table可以读取任何
#形式的文件,用seq来区分,ENSG0000000003等为另外一种形式的基因表达方式,gene count的文件一般就是我们所要的文件
#这个文件要同时具有基因、Barcode和数据
data[1:4, 1:4]
# OA4_A1 OA4_A10 OA4_A11 OA4_A12
# ENSG00000000003 2.76277 380.18400 0.00000 1.76691
# ENSG00000000005 0.00000 0.00000 0.00000 0.00000
# ENSG00000000419 271.00000 0.00000 1.01144 0.00000
# ENSG00000000457 3.93664 16.66799 4.71615 2.33886
#
fivenum(data[,1])
# Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input data.
# [1] 0.000000e+00 0.000000e+00 0.000000e+00 9.216565e+00 2.170270e+05
这个数据比较狡猾,ID需要进行转换:
library(clusterProfiler)
library(org.Hs.eg.db)
ids=bitr(rownames(data),'ENSEMBL','SYMBOL','org.Hs.eg.db') # 将ENSEMBL转成SYMBOL
head(ids)
same_name <- intersect(rownames(data),ids$ENSEMBL)#取交集
data_f <- data[which(rownames(data) %in% same_name),]#从大的数据框提取有交集的数据
ids_f <- ids[which(ids$ENSEMBL %in% same_name),]#从大的数据框提取有交集的数据
ids_u <- ids_f[!duplicated(ids_f$ENSEMBL),]#ids_f比data_f 大,说明需要去重复
ids_u <- ids_u[!duplicated(ids_u$SYMBOL),]#再次去重复
data_o <- data_f[rownames(data_f) %in% ids_u$ENSEMBL,]#data_o与data_f数量相同,说明data_o无重复
rownames(data_o) <- ids_u$SYMBOL#改变行名
head(data_o)
data=data_o
data<- data[,grep("OA*",colnames(data))]#截取列明为OA开头的
data[1:4,1:4]
dim(data)
data['ACTB',] # 检查数据
data['GAPDH',] # 检查数据
整理好表达矩阵之后,构建Seurat对象
#建立对象---------------------------------------------------------
sce.all=CreateSeuratObject(counts = data)
sce.all
table(sce.all@meta.data$orig.ident)
dim(sce.all)
#[1] 24404 192
sce <- sce.all
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce,
features = rownames(sce),
# vars.to.regress = c("nFeature_RNA", "percent_mito")
)
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# 聚类分群
sce <- FindNeighbors(sce, dims = 1:30)
sce
# An object of class Seurat
# 24404 features across 192 samples within 1 assay
# Active assay: RNA (24404 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap
res <- c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)
sce <- FindClusters(sce, graph.name = "RNA_snn", resolution = res, algorithm = 1)
#聚类树可视化
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("RNA_snn_res", colnames(sce@meta.data))])
p2_tree <- clustree(sce@meta.data, prefix = "RNA_snn_res.")
p2_tree
# RNA分群数据----
save(sce,file = '../data/first_sce_by_RNA.Rdata')
pro='../picture/DEG/'
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25,
thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'DEG_sce.markers_by_cluster.csv'))
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)+scale_color_discrete(name = "Identity", na.translate = FALSE)
ggsave(filename=paste0(pro,'DEG_sce.markers_10_heatmap.pdf'))
image-20210829165623379
genes <- c("VCAN","MMP2","DCN","FGF7","CXCL12","CP","IGF1","C4B","C7","C3")
FeaturePlot(sce,features = genes,cols = c("green","red"))
ggsave(filename = paste0(pro,'DEG_sce.markers_10_featureplot.pdf'))
image-20210829170919113
VlnPlot(sce,features = genes)
ggsave(filename = paste0(pro,'DEG_sce.markers_10_VlnPlot.pdf'))
image-20210829171829273
到此我们就完成了《Identification of the key gene and pathways associated with osteoarthritis via single-cell RNA sequencing on synovial fibroblasts》文章Figure4的绘制。最后,总结给大家一句话,纸上得来终觉浅,绝知此事要躬行。
对滑膜成纤维细胞的单细胞 RNA 测序(read.table读取压缩的tsv文本文件)
今天我们复现的文章是2020年发表在Medicine 杂志上的一个单细胞数据挖掘文章,标题是《Identification of the key gene and pathways associated with osteoarthritis via single-cell RNA sequencing on synovial fibroblasts》,文章链接是:https://journals.lww.com/md-journal/Fulltext/2020/08140/Identification_of_the_key_gene_and_pathways.81.aspx
这个数据集主要是对GSE109449数据的挖掘,主要研究目标通过scRNA-seq测序手段,探究骨滑膜成纤维细胞与骨关节炎的关键基因和相关通路,本次针对这篇文章的Figure 4进行复现。如下图:
image-20210828212537989
数据链接为:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109449
作者提供的数据为:
GSM2943158 single fibroblast from donor OA4 (OA4_A1)
GSM2943159 single fibroblast from donor OA4 (OA4_A10)
GSM2943160 single fibroblast from donor OA4 (OA4_A11)
GSM2943161 single fibroblast from donor OA4 (OA4_A12)
GSM2943162 single fibroblast from donor OA4 (OA4_A2)
GSM2943163 single fibroblast from donor OA4 (OA4_A3)
GSM2943164 single fibroblast from donor OA4 (OA4_A4)
GSM2943165 single fibroblast from donor OA4 (OA4_A5)
GSM2943166 single fibroblast from donor OA4 (OA4_A6)
image-20210828214028760
本篇文章的数据来源:
192 scRNA-seq files of SFs from 2 OA patients were accessed from microarray GSE109449 included in Gene Expression Omnibus (GEO) atabase (https://www.ncbi.nlm.nih.gov/geo/). Then statistical analysis was conducted with respect to gene number and expression levels.
在运行环境下线创建文件夹data,其中包含两个文件:
image-20210829153150132
# 导入数据构建Seurat对象 ----------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(data.table))
meta <- read.table("../data/GSE109449_singlecell_rnaseq_metadata.tsv.gz",
sep = "\t",header = T,row.names = 1)
data <- read.table("../data/GSE109449_singlecell_rnaseq_gene_counts.tsv.gz",
sep = "\t",header = T,row.names = 1)
#tsv格式的特殊读取方式,table可以读取任何
#形式的文件,用seq来区分,ENSG0000000003等为另外一种形式的基因表达方式,gene count的文件一般就是我们所要的文件
#这个文件要同时具有基因、Barcode和数据
data[1:4, 1:4]
# OA4_A1 OA4_A10 OA4_A11 OA4_A12
# ENSG00000000003 2.76277 380.18400 0.00000 1.76691
# ENSG00000000005 0.00000 0.00000 0.00000 0.00000
# ENSG00000000419 271.00000 0.00000 1.01144 0.00000
# ENSG00000000457 3.93664 16.66799 4.71615 2.33886
#
fivenum(data[,1])
# Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input data.
# [1] 0.000000e+00 0.000000e+00 0.000000e+00 9.216565e+00 2.170270e+05
这个数据比较狡猾,ID需要进行转换:
library(clusterProfiler)
library(org.Hs.eg.db)
ids=bitr(rownames(data),'ENSEMBL','SYMBOL','org.Hs.eg.db') # 将ENSEMBL转成SYMBOL
head(ids)
same_name <- intersect(rownames(data),ids$ENSEMBL)#取交集
data_f <- data[which(rownames(data) %in% same_name),]#从大的数据框提取有交集的数据
ids_f <- ids[which(ids$ENSEMBL %in% same_name),]#从大的数据框提取有交集的数据
ids_u <- ids_f[!duplicated(ids_f$ENSEMBL),]#ids_f比data_f 大,说明需要去重复
ids_u <- ids_u[!duplicated(ids_u$SYMBOL),]#再次去重复
data_o <- data_f[rownames(data_f) %in% ids_u$ENSEMBL,]#data_o与data_f数量相同,说明data_o无重复
rownames(data_o) <- ids_u$SYMBOL#改变行名
head(data_o)
data=data_o
data<- data[,grep("OA*",colnames(data))]#截取列明为OA开头的
data[1:4,1:4]
dim(data)
data['ACTB',] # 检查数据
data['GAPDH',] # 检查数据
整理好表达矩阵之后,构建Seurat对象
#建立对象---------------------------------------------------------
sce.all=CreateSeuratObject(counts = data)
sce.all
table(sce.all@meta.data$orig.ident)
dim(sce.all)
#[1] 24404 192
sce <- sce.all
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce,
features = rownames(sce),
# vars.to.regress = c("nFeature_RNA", "percent_mito")
)
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# 聚类分群
sce <- FindNeighbors(sce, dims = 1:30)
sce
# An object of class Seurat
# 24404 features across 192 samples within 1 assay
# Active assay: RNA (24404 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap
res <- c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)
sce <- FindClusters(sce, graph.name = "RNA_snn", resolution = res, algorithm = 1)
#聚类树可视化
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("RNA_snn_res", colnames(sce@meta.data))])
p2_tree <- clustree(sce@meta.data, prefix = "RNA_snn_res.")
p2_tree
# RNA分群数据----
save(sce,file = '../data/first_sce_by_RNA.Rdata')
pro='../picture/DEG/'
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25,
thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'DEG_sce.markers_by_cluster.csv'))
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)+scale_color_discrete(name = "Identity", na.translate = FALSE)
ggsave(filename=paste0(pro,'DEG_sce.markers_10_heatmap.pdf'))
image-20210829165623379
genes <- c("VCAN","MMP2","DCN","FGF7","CXCL12","CP","IGF1","C4B","C7","C3")
FeaturePlot(sce,features = genes,cols = c("green","red"))
ggsave(filename = paste0(pro,'DEG_sce.markers_10_featureplot.pdf'))
image-20210829170919113
VlnPlot(sce,features = genes)
ggsave(filename = paste0(pro,'DEG_sce.markers_10_VlnPlot.pdf'))
image-20210829171829273
到此我们就完成了《Identification of the key gene and pathways associated with osteoarthritis via single-cell RNA sequencing on synovial fibroblasts》文章Figure4的绘制。最后,总结给大家一句话,纸上得来终觉浅,绝知此事要躬行。