我们生信马拉松培训班的最后一周是单细胞学习,在上完这周的课后,有个学员做了一个GEO数据集的单细胞分析,但是他画出来的umap图像是为了庆祝新年,像烟花一样绚烂,如下:
GSE125527数据:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE125527。
样本为 健康状态和溃疡性结肠炎(UC)中胃肠道黏膜和外周免疫系统的组织,文献中的结果如下:
学院给出的数据如下:共15个样本
GSM3576396_HC-11.tsv.gz
GSM3576397_HC-12.tsv.gz
GSM3576398_HC-13.tsv.gz
GSM3576399_UC-12.tsv.gz
GSM3576400_UC-13.tsv.gz
GSM3576401_UC-14.tsv.gz
GSM3576402_UC-15.tsv.gz
GSM3576403_UC-16.tsv.gz
GSM3576404_UC-17.tsv.gz
GSM3576405_UC-18.tsv.gz
GSM3576406_HC-14.tsv.gz
GSM3576407_HC-15.tsv.gz
GSM3576408_HC-16.tsv.gz
GSM3576409_HC-17.tsv.gz
GSM3576410_HC-18.tsv.gz
批量读取并创建Seuart对象:
###
### Create: Jianming Zeng
### Date: 2023-12-31
### Email: jmzeng1314@163.com
### Blog: http://www.bio-info-trainee.com/
### Forum: http://www.biotrainee.com/thread-1376-1-1.html
### CAFS/SUSTC/Eli Lilly/University of Macau
### Update Log: 2023-12-31 First version
### Update Log: 2024-12-09 by juan zhang (492482942@qq.com)
###
rm(list=ls())
options(stringsAsFactors = F)
library(ggsci)
library(dplyr)
library(future)
library(Seurat)
library(clustree)
library(cowplot)
library(data.table)
library(ggplot2)
library(patchwork)
library(stringr)
library(qs)
library(Matrix)
getwd()
###### step1: 导入数据 ######
samples <- list.files("GSE125527_RAW/")
samples
sceList <- lapply(samples, function(pro){
#pro <- samples[1]
print(pro)
ct <- fread(file.path("GSE125527_RAW/" ,pro),data.table = F)
ct[1:4,1:4]
rownames(ct) <- ct[,1]
ct<- ct[,-1]
ct <- t(ct)
ct[1:4,1:4]
sce <- CreateSeuratObject(counts=ct , project = gsub(".tsv.gz","",pro))
return(sce)
})
sce.all <- merge(x=sceList[[1]], y=sceList[ -1 ])
sce.all <- JoinLayers(sce.all)
sce.all
names(sce.all@assays$RNA@layers)
dim(sce.all[["RNA"]]$counts )
sce.all[["RNA"]]$counts[1:5,1:5]
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:5])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident)
temp <- str_split(sce.all$orig.ident, "_", n=2, simplify = T)
head(temp)
# add group and sample
sce.all$sample1 <- temp[,1]
sce.all$sample2 <- temp[,2]
sce.all$group <- str_split(temp[,2], pattern ="-" ,n = 2,simplify = T)[,1]
head(sce.all@meta.data)
table(sce.all$sample1)
table(sce.all$sample2)
table(sce.all$group,sce.all$sample1)
table(sce.all$orig.ident)
sce.all$orig.ident <- sce.all$sample2
library(qs)
qsave(sce.all, file="GSE125527/sce.all.qs")
sce.all
共得到:32154个细胞,10105个基因。
> sce.all
An object of class Seurat
10105 features across 32154 samples within 1 assay
Active assay: RNA (10105 features, 0 variable features)
1 layer present: counts
###### step2: QC质控 ######
dir.create("./1-QC")
# 1.计算线粒体基因比例
mito_genes <- grep("^MT-", rownames(sce.all),ignore.case = T, value = T)
# 可能是13个线粒体基因
print(mito_genes)
sce.all <- PercentageFeatureSet(sce.all, features = mito_genes, col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
# 2.计算核糖体基因比例
ribo_genes <- grep("^Rp[sl]", rownames(sce.all),ignore.case = T, value = T)
sce.all <- PercentageFeatureSet(sce.all, features = ribo_genes, col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
# 3.计算红血细胞基因比例
Hb_genes <- grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T,value = T)
print(Hb_genes)
sce.all <- PercentageFeatureSet(sce.all, features=Hb_genes, col.name="percent_hb")
fivenum(sce.all@meta.data$percent_hb)
# 可视化细胞的上述比例情况
# pic1
p1 <- VlnPlot(sce.all, group.by = "orig.ident", features = c("nFeature_RNA", "nCount_RNA"),
pt.size = 0.02, ncol = 2) + NoLegend()
w <- length(unique(sce.all$orig.ident))/3+5;w
ggsave(filename="1-QC/Vlnplot1.pdf",plot=p1,width = w,height = 5)
# pic2
p2 <- VlnPlot(sce.all, group.by = "orig.ident", features = c("percent_mito", "percent_ribo", "percent_hb"),
pt.size = 0.0, ncol = 3) + NoLegend()
w <- length(unique(sce.all$orig.ident))/2+5;w
ggsave(filename="1-QC/Vlnplot2.pdf",plot=p2,width = w,height = 5)
## 根据上述指标,过滤低质量细胞/基因
# 过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
# 一般来说,在CreateSeuratObject的时候已经是进行了这个过滤操作
# 如果后期看到了自己的单细胞降维聚类分群结果很诡异,就可以回过头来看质量控制环节
# 先走默认流程即可
dim(sce.all)
# 过滤细胞:细胞中基因表达数少于多少基因 以及 大于多少基因
summary(sce.all$nFeature_RNA)
sce.all.filt <- subset(sce.all, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
dim(sce.all.filt)
# 过滤细胞:细胞中基因表达count数少于多少 以及 大于多少
sce.all.filt <- subset(sce.all.filt, subset = nCount_RNA > 3 )
dim(sce.all.filt)
# 过滤细胞指标2:线粒体/核糖体基因比例/红细胞(根据上面的violin图)
sce.all.filt <- subset(sce.all.filt, subset = percent_mito < 5)
dim(sce.all.filt)
# sce.all.filt <- subset(sce.all.filt, subset = percent_ribo > 3)
# dim(sce.all.filt)
sce.all.filt <- subset(sce.all.filt, subset = percent_hb < 1)
dim(sce.all.filt)
# 过滤后每个样本中的细胞数
stat_raw <- as.data.frame(table(sce.all$orig.ident))
colnames(stat_raw) <- c("sampleid","cellnum_raw")
stat_filter <- as.data.frame(table(sce.all.filt$orig.ident))
colnames(stat_filter) <- c("sampleid","cellnum_filter")
stat <- merge(stat_raw, stat_filter, by="sampleid")
stat$filtered <- stat$cellnum_raw - stat$cellnum_filter
head(stat)
write.table(stat, "1-QC/stat.xls",row.names = F,sep = "\t",quote = F)
# 可视化过滤后的情况
p1_filtered <- VlnPlot(sce.all.filt, group.by = "orig.ident", features = c("nFeature_RNA", "nCount_RNA"),
pt.size = 0, ncol = 2) + NoLegend()
w <- length(unique(sce.all.filt$orig.ident))/3+5;w
ggsave(filename="1-QC/Vlnplot1_filtered.pdf",plot=p1_filtered,width = w,height = 5)
p2_filtered <- VlnPlot(sce.all.filt, group.by = "orig.ident", features = c("percent_mito", "percent_ribo", "percent_hb"), pt.size = 0, ncol = 3) + NoLegend()
w <- length(unique(sce.all.filt$orig.ident))/2+5;w
ggsave(filename = "1-QC/Vlnplot2_filtered.pdf",plot=p2_filtered,width = w,height = 5)
然后走标准的降维聚类流程,并进行harmony,得到的umap图如下:
既然学员的代码都给了,那回头看看呀,初看没什么特别的,然后一步一步运行一下:
#####-----step1:读取数据,创建合并Seurat对象
dir='1step_LOAD/GSE125527_RAW'
samples=list.files( dir )
samples
sceList = lapply(samples,function(pro){
print(pro)
ct=fread(file.path( dir ,pro),data.table = F)
ct[1:4,1:4]
rownames(ct)=ct[,1]
colnames(ct) = paste(gsub('_filtered_gene_bc_matrices.csv.gz','',pro),
colnames(ct) ,sep = '_')
ct=ct[,-1]
sce =CreateSeuratObject(counts = ct ,
project = pro ,
min.cells = 5,
min.features = 300 )
return(sce)
})
do.call(rbind,lapply(sceList, dim))
sce.all=merge(x=sceList[[1]],
y=sceList[ -1 ])
names(sce.all@assays$RNA@layers)
sce.all[["RNA"]]$counts
# Alternate accessor function with the same result
LayerData(sce.all, assay = "RNA", layer = "counts")
sce.all <- JoinLayers(sce.all)
dim(sce.all[["RNA"]]$counts )
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
其实学员的代码有很多次机会他能发现他运行过程中的结果不对,但是从头跑到尾就是没发现!