差异分析的起点:counts矩阵—reads计数
拿不到count数据如何做差异分析:
• tpm:用limma做差异分析(迫不得已)
• fpkm、rpkm:转换为tpm,用limma做差异分析(迫不得已)
https://mp.weixin.qq.com/s/_DtkxSfLGQHcRju66J4yTQ
• RSEM:三大R包都可 https://www.jianshu.com/p/46b048220b88
其他来源的转录组数据和TCGA的转录组数据的差别
整理输入数据的过程不同,差异分析无差别
示例数据:GSE150392
使用数据前的要点:
下载数据
下载表达矩阵
将下面三个文件放在同一个目录下
代码如下
proj = "cov"#1.获取表达矩阵dat = data.table::fread("GSE150392_Cov_Mock_Raw_COUNTS.csv.gz", data.table = F)# 保留symbol ,去重复,再设为行名library(stringr)b = dat$V1 %>% str_split("_",simplify = T)#24行是异常数据,检查它dat$V1[24]#解决办法:删除PAR_Y_dat$V1 = str_remove(dat$V1,"PAR_Y_")dat$V1[24]b = dat$V1 %>% str_split("_",simplify = T)#36850以后是异常数据,检查dat$V1[36850]# 删除ERCC开头的行k = !str_starts(dat$V1,"ERCC-");table(k)dat = dat[k,]b = dat$V1 %>% str_split("_",simplify = T)# 按照symbol去重复dat = cbind(b[,2], dat[,-1])library(dplyr)dat = distinct(dat,V1,.keep_all = T)# 把symbol设为行名#方法1:exp = dat[,-1]rownames(exp) = dat$V1exp = as.matrix(exp)# 方法2:library(tibble)exp2 = column_to_rownames(dat,"V1") |
---|
另外一种方法,二选一即可
rm(list = ls())proj = "cov"#1.获取表达矩阵dat = data.table::fread("GSE150392_Cov_Mock_Raw_COUNTS.csv.gz")# 保留ensemblid ,行名转换# 删除ERCC开头的行k = !str_starts(dat$V1,"ERCC-");table(k)dat = dat[k,]library(stringr)b = dat$V1 %>% str_split("_",simplify = T)exp = as.matrix(dat[,-1])rownames(exp) = b[,1]library(tinyarray)exp = trans_exp_new(exp)# 分组信息获取colnames(exp)Group = str_remove(colnames(exp),"\\d")Group = factor(Group,levels = c("Mock","Cov")) exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]nrow(exp)# 临床信息获取library(GEOquery)eSet = getGEO("GSE150392",destdir = ".",getGPL = F)eSet = eSet[[1]]clinical = pData(eSet)# 顺便看下表达矩阵,空的dim(exprs(eSet))save(exp,Group,proj,clinical,file = paste0(proj,".Rdata")) |
---|