拷贝数变异(Copy number variation, CNV):基因组发生重排而导致的,一般指长度1 kb 以上的基因组片段的拷贝数增加或者减少, 主要表现为亚显微水平的重复或者缺失。因此称为“微”缺失或重复变异。
人类基因组含有约31.6亿个DNA碱基对,组成了23对染色体,每对染色体中一条遗传自父亲,一条遗传自母亲,2条染色体亦称为同源染色体,大小和基因组成基本一致,22对为常染色体,还有一对性染色体,女性为XX,男性为XY。正常⼈类基因组成分通常是以2个拷贝存在,分别来⾃⽗母。常染色体和女性X染色体正常拷贝数值为2,男性X和Y染色体正常拷贝数值为1,当拷贝数检测数值大于正常值时即为重复,小于正常值时为缺失。
CNV在基因组中的存在形式主要有以下⼏种:
异常的DNA拷贝数变异(CNV)是许多⼈类疾病(如癌症、遗传性疾病、⼼⾎管疾病)的⼀种重要分⼦机制。作为疾病的⼀项⽣物标志,染⾊体⽔平的缺失、扩增等变化已成为许多疾病研究的热点,然⽽传统的⽅法(⽐如G显带,FISH,CGH等)存在操作繁琐,分辨率低等问题,难以提供变异区段的具体信息,单细胞测序为我们提供了一种新的工具和视野去分析CNV。
#加载需要的包和数据
library(Seurat)
# devtools::install_github('satijalab/seurat-data')
library(SeuratData)
library(ggplot2)
library(patchwork)
library(dplyr)
#以之前pbmc的seurat标准流程为基础,进行分析
DimPlot(pbmc)
sce=pbmc
table( Idents(sce ))
table(sce@meta.data$seurat_clusters)
table(sce@meta.data$orig.ident)
#由于CNV分析要用到infercnv包,其CreateInfercnvObject函数需要三个文件创建对象:The raw_counts_matrix;the gene_order_file, contains chromosome, start, and stop position for each gene;The annotations_file, containing the cell name and the cell type classification
#获取counts矩阵
dat=GetAssayData(sce,slot='counts',assay='RNA')
dat[1:4,1:4]
#获取细胞亚群信息
groupinfo=data.frame(v1=colnames(dat),v2= Idents(sce ) )
head(groupinfo)
#这里使用了AnnoProbe包 https://github.com/jmzeng1314/AnnoProbe/
#AnnoProbe是生信技能树健明老师开发的包,用于表达芯片数据分析,但也可以下载GEO数据,进行进行基因注释等功能,可以注释基因并标记其在染色体上的位置
library(AnnoProbe)
geneInfor=annoGene(rownames(dat), "SYMBOL",'human')
colnames(geneInfor)
head(geneInfor)
#安装染色体和基因位置排序,并将SYMBOL,chr,start,end这几列留下
geneInfor=geneInfor[with(geneInfor,order(chr, start)),c(1,4:6)]
#去重(这里也可以操作去除性染色体,也可以把染色体排序方式改变)
geneInfor=geneInfor[!duplicated(geneInfor[,1]),]
length(unique(geneInfor[,1]))
head(geneInfor)
#整理数据
dat=dat[rownames(dat) %in% geneInfor[,1],]
dat=dat[match( geneInfor[,1], rownames(dat) ),]
dim(dat)
head(groupinfo)
dat[1:4,1:4]
table(groupinfo$v2)
head(groupinfo)
# 为了节约计算机资源,直接抽样
kp=sample(1:nrow(groupinfo),500)
groupinfo=groupinfo[kp,]
dat=dat[,kp]
# 如果是真实项目,且计算资源足够,忽略这个抽样的操作
#保存文件
expFile='expFile.txt'
write.table(dat,file =expFile,sep = '\t',quote = F)
groupFiles='groupFiles.txt'
write.table(groupinfo,file = groupFiles,sep = '\t',quote = F,col.names = F,row.names = F)
head(geneInfor)
geneFile='geneFile.txt'
write.table(geneInfor,file = geneFile,sep = '\t',quote = F,col.names = F,row.names = F)
options(stringsAsFactors = F)
expFile='expFile.txt'
groupFiles='groupFiles.txt'
geneFile='geneFile.txt'
infercnv_obj = CreateInfercnvObject(raw_counts_matrix=expFile, annotations_file=groupFiles,delim="\t",gene_order_file= geneFile,ref_group_names=c("NK",'DC','Platelet'))
#这里出现了报错,经查看,是expFile和groupFiles保存为txt时,列名出现了不一致,一个保存成了TCGAGCCTTGTGAC-1,一个保存成了TCGAGCCTTGTGAC.1,即expFile所有的-都变成了.,经过查找,并不清楚write.table函数那个参数导致的
#所以,干脆将groupFiles中的-先变成.,再保存
library(do)
head(groupinfo$v1)
head(Replace(data=groupinfo$v1,from='-1',to='.1'))
groupinfo$v1<-Replace(data=groupinfo$v1,from='-1',to='.1')
head(groupinfo)
groupFiles1='groupFiles1.txt'
write.table(groupinfo,file = groupFiles1,sep = '\t',quote = F,col.names = F,row.names = F)
head(geneInfor)
infercnv_obj = CreateInfercnvObject(raw_counts_matrix=expFile, annotations_file=groupFiles,delim="\t",gene_order_file= geneFile,ref_group_names=c("NK",'DC','Platelet'))
#ref_group_names认为这些亚群的细胞是正常细胞
## 这个取决于自己的分组信息里面的
# cutoff=1 works well for Smart-seq2, and
# cutoff=0.1 works well for 10x Genomics
# dir is auto-created for storing outputs
#cluster_by_groups: If observations are defined according to groups (ie. patients), each group of cells will be clustered separately. (default=FALSE, instead will use k_obs_groups setting)
infercnv_obj2 = infercnv::run(infercnv_obj,cutoff=0.1, out_dir= "infercnv_output", cluster_by_groups=F, hclust_method="ward.D2", plot_steps=F)
运行结束后,会在infercnv_output文件夹下生成一系列文件,如下图
#加载需要的包
options(stringsAsFactors = F)
library(phylogram)
library(gridExtra)
library(grid)
require(dendextend)
require(ggthemes)
library(tidyverse)
library(Seurat)
library(infercnv)
library(miscTools)
# 导入 inferCNV dendrogram
#infercnv.observations_dendrogram.txt是除去之前设定的参考组(正常组)外的其他组的数据
infercnv.dend <- read.dendrogram(file = "infercnv_output/infercnv.observations_dendrogram.txt")
# Cut tree,k可设置成任意数,可以理解为细胞亚群
infercnv.labels <- cutree(infercnv.dend, k = 6, order_clusters_as_data = FALSE)
table(infercnv.labels)
# 给组别加颜色
the_bars <- as.data.frame(tableau_color_pal("Tableau 20")(20)[infercnv.labels])
colnames(the_bars) <- "inferCNV_tree"
the_bars$inferCNV_tree <- as.character(the_bars$inferCNV_tree)
infercnv.dend %>% set("labels",rep("", nobs(infercnv.dend)) ) %>% plot(main="inferCNV dendrogram") %>%
colored_bars(colors = as.data.frame(the_bars), dend = infercnv.dend, sort_by_labels_order = FALSE, add = T, y_scale=10, y_shift = 0)
infercnv.labels=as.data.frame(infercnv.labels)
groupFiles='groupFiles.txt'
meta=read.table(groupFiles,sep = '\t')
infercnv.labels$V1=rownames(infercnv.labels)
meta=merge(meta,infercnv.labels,by='V1')
table(meta[,2:3])
infercnv.labels
V2 1 2 3 4 5 6
B 0 0 6 36 5 18
CD14+ Mono 31 54 3 3 0 2
CD8 T 0 0 7 2 26 23
FCGR3A+ Mono 27 5 0 0 0 1
Memory CD4 T 0 0 12 2 41 27
Naive CD4 T 1 1 18 2 41 59
#可以查看拷贝数变异分组和细胞亚群间的关系
查看每个细胞有无拷贝数变异
#代码来自生信技能树曾老师
if( ! file.exists( "cnv_scores.csv")){
tmp=read.table("infercnv_output/infercnv.references.txt", header=T)
down=mean(rowMeans(tmp)) - 2 * mean( apply(tmp, 1, sd))
up=mean(rowMeans(tmp)) + 2 * mean( apply(tmp, 1, sd))
oneCopy=up-down
oneCopy
a1= down- 2*oneCopy
a2= down- 1*oneCopy
down;up
a3= up + 1*oneCopy
a4= up + 2*oneCopy
cnv_table <- read.table("infercnv_output/infercnv.observations.txt", header=T)
# Score cells based on their CNV scores
# Replicate the table
cnv_score_table <- as.matrix(cnv_table)
cnv_score_mat <- as.matrix(cnv_table)
# Scoring
cnv_score_table[cnv_score_mat > 0 & cnv_score_mat < a2] <- "A" #complete loss. 2pts
cnv_score_table[cnv_score_mat >= a2 & cnv_score_mat < down] <- "B" #loss of one copy. 1pts
cnv_score_table[cnv_score_mat >= down & cnv_score_mat < up ] <- "C" #Neutral. 0pts
cnv_score_table[cnv_score_mat >= up & cnv_score_mat <= a3] <- "D" #addition of one copy. 1pts
cnv_score_table[cnv_score_mat > a3 & cnv_score_mat <= a4 ] <- "E" #addition of two copies. 2pts
cnv_score_table[cnv_score_mat > a4] <- "F" #addition of more than two copies. 2pts
# Check
table(cnv_score_table[,1])
# B C D
# 49 1908 253
# 将ABCD用数字代替,以得到各细胞拷贝数变异矩阵
cnv_score_table_pts <- cnv_table
rm(cnv_score_mat)
#
cnv_score_table_pts[cnv_score_table == "A"] <- 2
cnv_score_table_pts[cnv_score_table == "B"] <- 1
cnv_score_table_pts[cnv_score_table == "C"] <- 0
cnv_score_table_pts[cnv_score_table == "D"] <- 1
cnv_score_table_pts[cnv_score_table == "E"] <- 2
cnv_score_table_pts[cnv_score_table == "F"] <- 2
# Scores are stored in “cnv_score_table_pts”. Use colSums to add up scores for each cell and store as vector
cell_scores_CNV <- as.data.frame(colSums(cnv_score_table_pts))
colnames(cell_scores_CNV) <- "cnv_score"
head(cell_scores_CNV)
write.csv(x = cell_scores_CNV, file = "cnv_scores.csv")
}
查看各亚群间拷贝数变异情况
# 除去了reference后的走inferCNV的细胞
cell_scores_CNV=read.csv('cnv_scores.csv',row.names = 1)
head(cell_scores_CNV)
load(file = '../section-01-cluster/basic.sce.pbmc.Rdata')
sce=pbmc
phe=sce@meta.data
phe$celltype=Idents(sce)
head(rownames(phe))
head(rownames(cell_scores_CNV))
#rownames(phe)=paste0('X',rownames(phe))
rownames(phe)=gsub('-','.',rownames(phe))
head(rownames(phe))
head(rownames(cell_scores_CNV))
head(rownames(phe))
phe=phe[rownames(phe) %in% rownames(cell_scores_CNV),]
identical(rownames(phe),rownames(cell_scores_CNV))
infercnv.labels <- cutree(infercnv.dend, k = 6, order_clusters_as_data = FALSE)
phe$inferCNV= infercnv.labels[match(rownames(phe), names(infercnv.labels) )]
phe$cnv_scores = cell_scores_CNV[rownames(phe),]
table(phe$celltype,phe$inferCNV)
head(rownames(phe))
dim(phe)
library(ggpubr)
p1=ggboxplot(phe,'celltype','cnv_scores', fill = "celltype")
p1= p1+ theme(axis.text.x = element_text(angle = 45,
vjust = 0.5, hjust=0.5))
p2=ggboxplot(phe,'inferCNV','cnv_scores', fill = "inferCNV")
library(patchwork)
p1+p2
ggsave(filename = 'anno_CNVscore.pdf')
table(phe$celltype,phe$inferCNV)
#将拷贝数变异信息注释到sce中
#rownames(phe)=gsub('X','',rownames(phe))
rownames(phe)=gsub('[.]','-',rownames(phe))
head(rownames(phe))
sce
tail(rownames(sce@meta.data))
head(rownames(phe))
sce$celltype=Idents(sce)
table(sce$celltype)
sce=subset(sce,celltype %in% c('epithelial' ))
sce
kp=rownames(sce@meta.data) %in% rownames(phe)
table(kp)
sce=sce[,kp]
phe=phe[rownames(sce@meta.data),]
sce@meta.data=phe
head(phe)
save(sce, file = 'epi_sce_annoCNV.Rdata')
参考来源
#section 3已更新#「生信技能树」单细胞公开课2021_哔哩哔哩_bilibili
致谢
I thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.
THE END
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。