前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
圈层
工具
发布
首页
学习
活动
专区
圈层
工具
MCP广场
社区首页 >专栏 >一篇单细胞文献复现及代码部分更新

一篇单细胞文献复现及代码部分更新

作者头像
生信菜鸟团
发布2023-12-14 16:13:23
发布2023-12-14 16:13:23
1.8K00
代码可运行
举报
文章被收录于专栏:生信菜鸟团生信菜鸟团
运行总次数:0
代码可运行

❝群里小伙伴提出来可否复现一篇scientific reports 杂志的文章,文章题目为:综合单细胞和空间转录组学揭示了银屑病成纤维细胞和关键基因的异质性。说是复现文章,除非文章给了代码和比较准确的数据可以做到准确复现外,我现在做的复现只是按照常规流程降维分群,然后根据文章给出的一些marker gene,结合文章和自我理解人工定义细胞亚群。 也正好借今天复现的这篇文献把代码稍微更新一下,是曾老师今年年初的时候整理的。整体变化不大,只是把代码都汇总到不同.R文件中了,qc.R, harmony.R 和 check-all-marker.R。 ❞

文章
复现的图:

文章给的图太糊了

数据集:

下载数据,然后解压数据,整理数据

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE151177

代码语言:javascript
代码运行次数:0
运行
复制
library(stringr)
fs = list.files('./GSE151177_raw/',pattern = '^GSM')
#执行这一步需要解压tar -xvf
samples=str_split(fs,'_',simplify = T)[,1]

#dir.create("./GSE151177_raw")
fs
lapply(unique(samples),function(x){
  y=fs[grepl(x,fs)]
  folder=paste0("GSE151177_raw/", str_split(y[1],'_',simplify = T)[,1])
  dir.create(folder,recursive = T)
  #为每个样本创建子文件夹
  file.rename(paste0("GSE151177_raw/",y[1]),file.path(folder,"barcodes.tsv.gz"))
  #重命名文件,并移动到相应的子文件夹里
  file.rename(paste0("GSE151177_raw/",y[2]),file.path(folder,"features.tsv.gz"))
  file.rename(paste0("GSE151177_raw/",y[3]),file.path(folder,"matrix.mtx.gz"))
})
复现代码:
代码语言:javascript
代码运行次数:0
运行
复制
library(harmony)
library(ggsci)
library(dplyr) 
library(future)
library(Seurat)
library(clustree)
library(cowplot)
library(data.table)
library(dplyr)
library(ggplot2)
library(patchwork)
library(stringr)

dir='./GSE151177_raw/' 
samples=list.files( dir )
samples 
sceList = lapply(samples,function(pro){ 
  # pro=samples[1] 
  print(pro)  
  sce = CreateSeuratObject(counts =  Read10X(file.path(dir,pro )) ,
                          project = gsub('^GSM[0-9]*_','',
                         gsub('filtered_feature_bc_matrix','',pro) )  ,# pro, #
                          min.cells = 5,
                          min.features = 500 )
  print(sce)
  cid = tail(colnames(sce)[order(sce$nFeature_RNA)],8000)
  sce$nFeature_RNA[cid]
  sce=sce[,colnames(sce) %in% cid]
  print(sce)
  return(sce)
})

sce.all=merge(x=sceList[[1]],
              y=sceList[ -1 ],
              add.cell.ids =  gsub('^GSM[0-9]*_','',
gsub('filtered_feature_bc_matrix','',samples))    )


as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident) 

table(sce.all@meta.data$orig.ident)
sce.all$group<-ifelse(grepl("GSM4567877|GSM4567878|GSM4567879|GSM4567880|GSM4567881|GSM4567882",sce.all$orig.ident),
                       "Cons","Pso")

table(sce.all@meta.data$group)
QC质控
代码语言:javascript
代码运行次数:0
运行
复制
###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
# 如果过滤的太狠,就需要去修改这个过滤代码
source('../scRNA_scripts/qc.R')
sce.all.filt = basic_qc(sce.all)
print(dim(sce.all))
print(dim(sce.all.filt))
setwd('../')

「以下为qc.R代码」

代码语言:javascript
代码运行次数:0
运行
复制
basic_qc <- function(input_sce){
  #计算线粒体基因比例
  mito_genes=rownames(input_sce)[grep("^MT-", rownames(input_sce),ignore.case = T)] 
  print(mito_genes) #可能是13个线粒体基因
  #input_sce=PercentageFeatureSet(input_sce, "^MT-", col.name = "percent_mito")
  input_sce=PercentageFeatureSet(input_sce, features = mito_genes, col.name = "percent_mito")
  fivenum(input_sce@meta.data$percent_mito)
  
  #计算核糖体基因比例
  ribo_genes=rownames(input_sce)[grep("^Rp[sl]", rownames(input_sce),ignore.case = T)]
  print(ribo_genes)
  input_sce=PercentageFeatureSet(input_sce,  features = ribo_genes, col.name = "percent_ribo")
  fivenum(input_sce@meta.data$percent_ribo)
  
  #计算红血细胞基因比例
  Hb_genes=rownames(input_sce)[grep("^Hb[^(p)]", rownames(input_sce),ignore.case = T)]
  print(Hb_genes)
  input_sce=PercentageFeatureSet(input_sce,  features = Hb_genes,col.name = "percent_hb")
  fivenum(input_sce@meta.data$percent_hb)
  
  #可视化细胞的上述比例情况
  feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
  feats <- c("nFeature_RNA", "nCount_RNA")
  p1=VlnPlot(input_sce, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) + 
    NoLegend()
  p1 
  w=length(unique(input_sce$orig.ident))/3+5;w
  ggsave(filename="Vlnplot1.pdf",plot=p1,width = w,height = 5)
  
  feats <- c("percent_mito", "percent_ribo", "percent_hb")
  p2=VlnPlot(input_sce, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3, same.y.lims=T) + 
    scale_y_continuous(breaks=seq(0, 100, 5)) +
    NoLegend()
  p2 
  w=length(unique(input_sce$orig.ident))/2+5;w
  ggsave(filename="Vlnplot2.pdf",plot=p2,width = w,height = 5)
  
  p3=FeatureScatter(input_sce, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
  ggsave(filename="Scatterplot.pdf",plot=p3)
  
  #根据上述指标,过滤低质量细胞/基因
  #过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
  # 一般来说,在CreateSeuratObject的时候已经是进行了这个过滤操作
  # 如果后期看到了自己的单细胞降维聚类分群结果很诡异,就可以回过头来看质量控制环节
  # 先走默认流程即可
  if(F){
    selected_c <- WhichCells(input_sce, expression = nFeature_RNA > 200)
    selected_f <- rownames(input_sce)[Matrix::rowSums(input_sce@assays$RNA@counts > 0 ) > 3]
    input_sce.filt <- subset(input_sce, features = selected_f, cells = selected_c)
    dim(input_sce) 
    dim(input_sce.filt) 
  }
  
  input_sce.filt =  input_sce
  # par(mar = c(4, 8, 2, 1))
  # 这里的C 这个矩阵,有一点大,可以考虑随抽样 
  C=subset(input_sce.filt,downsample=100)@assays$RNA@counts
  dim(C)
  C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100

  most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]
  pdf("TOP50_most_expressed_gene.pdf",width=14)
  boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
          cex = 0.1, las = 1, 
          xlab = "% total count per cell", 
          col = (scales::hue_pal())(50)[50:1], 
          horizontal = TRUE)
  dev.off()
  rm(C)
  
  #过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
  selected_mito <- WhichCells(input_sce.filt, expression = percent_mito < 30)
  selected_ribo <- WhichCells(input_sce.filt, expression = percent_ribo > 3)
  selected_hb <- WhichCells(input_sce.filt, expression = percent_hb < 1 )
  length(selected_hb)
  length(selected_ribo)
  length(selected_mito)
  
  input_sce.filt <- subset(input_sce.filt, cells = selected_mito)
  input_sce.filt <- subset(input_sce.filt, cells = selected_ribo)
  input_sce.filt <- subset(input_sce.filt, cells = selected_hb)
  dim(input_sce.filt)
  
  table(input_sce.filt$orig.ident) 
  
  #可视化过滤后的情况
  feats <- c("nFeature_RNA", "nCount_RNA")
  p1_filtered=VlnPlot(input_sce.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) + 
    NoLegend()
  w=length(unique(input_sce.filt$orig.ident))/3+5;w 
  ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered,width = w,height = 5)
  
  feats <- c("percent_mito", "percent_ribo", "percent_hb")
  p2_filtered=VlnPlot(input_sce.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3) + 
    NoLegend()
  w=length(unique(input_sce.filt$orig.ident))/2+5;w 
  ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered,width = w,height = 5) 
  return(input_sce.filt) 
  
}
harmony整合多个单细胞样品
代码语言:javascript
代码运行次数:0
运行
复制
sp='human'
###### step3: harmony整合多个单细胞样品 ######
dir.create("2-harmony")
getwd()
setwd("2-harmony")
source('../scRNA_scripts/harmony.R')
sce.all.int = run_harmony(sce.all.filt)
setwd('../')

「其中harmony.R 如下」

代码语言:javascript
代码运行次数:0
运行
复制
run_harmony <- function(input_sce){
  
  print(dim(input_sce))
  input_sce <- NormalizeData(input_sce, 
                             normalization.method = "LogNormalize",
                             scale.factor = 1e4) 
  input_sce <- FindVariableFeatures(input_sce)
  input_sce <- ScaleData(input_sce)
  input_sce <- RunPCA(input_sce, features = VariableFeatures(object = input_sce))
  seuratObj <- RunHarmony(input_sce, "orig.ident")
  names(seuratObj@reductions)
  seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                       reduction = "harmony")
  # p = DimPlot(seuratObj,reduction = "umap",label=T ) 
  # ggsave(filename='umap-by-orig.ident-after-harmony',plot = p)
  input_sce=seuratObj
  input_sce <- FindNeighbors(input_sce, reduction = "harmony",
                             dims = 1:15) 
  input_sce.all=input_sce
  
  #设置不同的分辨率,观察分群效果(选择哪一个?)
  for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
    input_sce.all=FindClusters(input_sce.all, #graph.name = "CCA_snn", 
                               resolution = res, algorithm = 1)
  }
  colnames(input_sce.all@meta.data)
  apply(input_sce.all@meta.data[,grep("RNA_snn",colnames(input_sce.all@meta.data))],2,table)
  
  p1_dim=plot_grid(ncol = 3, DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.01") + 
                     ggtitle("louvain_0.01"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1") + 
                     ggtitle("louvain_0.1"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.2") + 
                     ggtitle("louvain_0.2"))
  ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 14)
  
  p1_dim=plot_grid(ncol = 3, DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.8") + 
                     ggtitle("louvain_0.8"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.1") + 
                     ggtitle("louvain_1"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3") + 
                     ggtitle("louvain_0.3"))
  ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 18)
  
  p2_tree=clustree(input_sce.all@meta.data, prefix = "RNA_snn_res.")
  ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf")
  table(input_sce.all@active.ident) 
  saveRDS(input_sce.all, "sce.all_int.rds")
  return(input_sce.all)
}
降维分群和maker gene
代码语言:javascript
代码运行次数:0
运行
复制
###### step4: 降维聚类分群和看标记基因库 ######
# 原则上分辨率是需要自己肉眼判断,取决于个人经验
# 为了省力,我们直接看 0.1和0.8即可。
#由于篇幅有限,我这里只展示了0.1的,下面的分群也都是根据0.1的来的。
table(Idents(sce.all.int))
table(sce.all.int$seurat_clusters)
table(sce.all.int$RNA_snn_res.0.1) 
table(sce.all.int$RNA_snn_res.0.8) 
#remotes::install_github(repo = 'genecell/COSGR')
library(COSG)
getwd()
dir.create('check-by-0.1')
setwd('check-by-0.1')
sel.clust = "RNA_snn_res.0.1"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident) 
source('../scRNA_scripts/check-all-markers.R')
setwd('../') 
getwd()

check-all-markers.R代码所包含的marker gene比较多,我这里就只展示一部分。

只需在harmony的时候定义好sp='human'或者sp='mouse'

代码语言:javascript
代码运行次数:0
运行
复制
last_markers = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                   'CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'FCGR3A',
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2', ## human  fibo 
                 'GJB2', 'RGS5',
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'MKI67' , 'TOP2A', 
                   'PECAM1', 'VWF',  ## endo 
                 "PLVAP",'PROX1','ACKR1','CA4','HEY1',
                   'EPCAM' , 'KRT19','KRT7', # epi 
                   'FYXD2', 'TM4SF4', 'ANXA4',# cholangiocytes
                   'APOC3', 'FABP1',  'APOA1',  # hepatocytes
                   'Serpina1c',
                   'PROM1', 'ALDH1A1' )
genes_to_check=c("SPINK5", "FLG",    
                 "LDR", "KRT1",    
                 "KRT10", "FABP5",   
                 "KRT5", "KRT14",   
                 "TYRP1", "MLANA",   
                 "CD40",           
                 "LY75","CD207",
                 "CD52","CD3D",   
                 "TRAC", "CCL1A1",      
                 "LUM", "CLDN5",   
                 "PECAM1")

last_markers 
genes_to_check

markers = c('last_markers',"GSE_gene","all_gene")

p_umap=DimPlot(sce.all.int, reduction = "umap",raster = F,label = T,repel = T) 
p_umap 

if(sp=='human'){
   lapply(markers, function(x){
     #x=markers[1]
     genes_to_check=str_to_upper(get(x)) 
     DotPlot(sce.all.int , features = genes_to_check )  + 
       coord_flip() + 
       theme(axis.text.x=element_text(angle=45,hjust = 1))
     
     h=length( genes_to_check )/6+3;h
     ggsave(paste('check_for_',x,'.pdf'),height = h)
   })
  lapply(markers_list, function(x){
    # x=markers_list[1]
    genes_to_check = lapply(get(x), str_to_upper)
    dup=names(table(unlist(genes_to_check)))[table(unlist(genes_to_check))>1]
    genes_to_check = lapply(genes_to_check, function(x) x[!x %in% dup])
  
    DotPlot(sce.all.int , features = genes_to_check )  + 
     # coord_flip() + 
      theme(axis.text.x=element_text(angle=45,hjust = 1))
    
    w=length( unique(unlist(genes_to_check)) )/5+6;w
    ggsave(paste('check_for_',x,'.pdf'),width  = w)
  })
  
  last_markers_to_check <<- str_to_upper(last_markers ) 

 }else if(sp=='mouse'){
   lapply(markers, function(x){
     #x=markers[1]
     genes_to_check=str_to_title(get(x)) 
     DotPlot(sce.all.int , features = genes_to_check )  + 
       coord_flip() + 
       theme(axis.text.x=element_text(angle=45,hjust = 1))
     
     h=length( genes_to_check )/6+3;h
     w=length( unique(unlist(genes_to_check)) )/5+6;w
     ggsave(paste('check_for_',x,'.pdf'),height = h,width = w)
   })
   lapply(markers_list, function(x){
     # x=markers_list[1]
     genes_to_check = lapply(get(x), str_to_title)
     dup=names(table(unlist(genes_to_check)))[table(unlist(genes_to_check))>1]
     genes_to_check = lapply(genes_to_check, function(x) x[!x %in% dup])
     
     DotPlot(sce.all.int , features = genes_to_check )  + 
       # coord_flip() + 
       theme(axis.text.x=element_text(angle=45,hjust = 1))
     
     w=length( unique(unlist(genes_to_check)) )/5+6;w
     ggsave(paste('check_for_',x,'.pdf'),width  = w)
   })
   
   last_markers_to_check <<- str_to_title(last_markers ) 
}else {
  print('we only accept human or mouse')
} 

p_all_markers = DotPlot(sce.all.int , features = last_markers_to_check )  + 
  coord_flip() + 
  theme(axis.text.x=element_text(angle=45,hjust = 1)) 
p_all_markers+p_umap
h=length( last_markers_to_check )/6+3;h
w=length( unique( Idents(sce.all.int)) )/5+10;w
ggsave(paste('last_markers_and_umap.pdf'),width  = w,height = h)
确定单细胞亚群生物学名字
代码语言:javascript
代码运行次数:0
运行
复制
###### step5: 确定单细胞亚群生物学名字 ######
# 一般来说,为了节省工作量,我们选择0.1的分辨率进行命名
# 因为命名这个步骤是纯人工操作,除非0.1确实分群太粗狂了,我们就选择0.8。
source('scRNA_scripts/lib.R')
#sce.all.int = readRDS('2-harmony/sce.all_int.rds')
# sp='human'
colnames(sce.all.int@meta.data) 
table(sce.all.int$RNA_snn_res.0.1)
 # 7:Fibro
  # 6:Endo
  # 4: macro
  # 1: granu
  # 0: T
  # 2: Mono
  # 3:basal
  # 5:melan
  sce.all.int
  celltype=data.frame(ClusterID=0:7 ,
                      celltype= 0:7) 
  #定义细胞亚群        
  celltype[celltype$ClusterID %in% c( 0),2]='T' 
  celltype[celltype$ClusterID %in% c( 1),2]='granu' 
  celltype[celltype$ClusterID %in% c( 2  ),2]= 'Mono' # 'myeloids'   
  celltype[celltype$ClusterID %in% c( 3  ),2]='basal' # 'myeloids'  
  celltype[celltype$ClusterID %in% c( 4 ),2]='macro' # 'myeloids' 
  celltype[celltype$ClusterID %in% c( 5 ),2]='melan'
  celltype[celltype$ClusterID %in% c( 6),2]='Endo'
  celltype[celltype$ClusterID %in% c(7),2]='fibro'  
   
  head(celltype)
  table(celltype$celltype)
  sce.all.int@meta.data$celltype = "NA"

  for(i in 1:nrow(celltype)){
    sce.all.int@meta.data[which(sce.all.int@meta.data$RNA_snn_res.0.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
  Idents(sce.all.int)=sce.all.int$celltype
  
  sel.clust = "celltype"
  sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
  table(sce.all.int@active.ident) 
一些可视化
代码语言:javascript
代码运行次数:0
运行
复制
last_markers = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                 'CD19', 'CD79A', 'MS4A1' ,
                 'IGHG1', 'MZB1', 'SDC1',
                 'CD68', 'CD163', 'CD14', 
                 'C1QA',  'C1QB',  # mac
                 'S100A9', 'S100A8', 'MMP19',# monocyte
                 'FCGR3A',
                 'LAMP3', 'IDO1','IDO2',## DC3 
                 'CD1E','CD1C', # DC2
                 'FGF7','MME', 'ACTA2', ## human  fibo 
                 'GJB2', 'RGS5',
                 'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                 'MKI67' , 'TOP2A', 
                 'PECAM1', 'VWF',  ## endo
                 'EPCAM' , 'KRT19','KRT7', # epi 
                 'FYXD2', 'TM4SF4', 'ANXA4',# cholangiocytes
                 'APOC3', 'FABP1',  'APOA1',  # hepatocytes
                 'Serpina1c',
                 'PROM1', 'ALDH1A1' )
library(patchwork)
p_all_markers=DotPlot(sce.all.int, features = str_to_upper(last_markers),
                      assay='RNA' ,group.by = 'celltype' )  + coord_flip()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5))
p_umap=DimPlot(sce.all.int, reduction = "umap", group.by = "celltype",label = T,label.box = T)
p_all_markers+p_umap
ggsave('markers_umap_by_celltype.pdf',width = 16,height = 8)
代码语言:javascript
代码运行次数:0
运行
复制
library(patchwork)
p_all_markers=DotPlot(sce.all.int, features = str_to_upper(genes_to_check),
                      assay='RNA' ,group.by = 'celltype' )  + coord_flip()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5))
p_umap=DimPlot(sce.all.int, reduction = "umap", group.by = "celltype",label = T,label.box = T)
p_all_markers+p_umap
ggsave('GSE151177_umap_by_celltype.pdf',width = 16,height = 8)
代码语言:javascript
代码运行次数:0
运行
复制
DimPlot(sce.all.int,group.by = 'group',raster = T)
ggsave('umap-by-group.pdf',width = 7,height = 6)
pdf('celltype-vs-orig.ident.pdf',width = 10)
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2023-12-14,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信菜鸟团 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 文章
  • 复现的图:
  • 数据集:
  • 复现代码:
  • QC质控
  • harmony整合多个单细胞样品
  • 降维分群和maker gene
  • 确定单细胞亚群生物学名字
  • 一些可视化
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档