对于FPKM表达数据时,Edger,limma,和deseq等算法并不合适。而ballgown是针对于FPKM数据开发的差异基因算法,可以尝试。 示例数据如下所示:
# --------------------------------------------------------
#
#
#
# --------------------------------------------------------
setwd("D:\\SCIwork\\F5\\DEG")
library(tidyr)
library('ballgown')
load("mRNA_exprSet.Rda")
mRNA_exprSet[1:4,1:4]
# --------------------------------------------------------
#
#
#
# --------------------------------------------------------
mRNA_exprSet <- mRNA_exprSet %>%
tidyr::separate(gene_id, c("gene_name",
"gene_id",
"gene_biotype"),
sep = " \\| ")
mRNA_exprSet <- mRNA_exprSet[,-(2:3)]
index <- duplicated(mRNA_exprSet$gene_name)
mRNA.data <- mRNA_exprSet[!index,]
dim(mRNA.data)
# --------------------------------------------------------
#
#
#
# --------------------------------------------------------
#包含基因名的第一列转为行名
BLCA_fpkm_data = mRNA.data
rownames(BLCA_fpkm_data) = BLCA_fpkm_data[,1]
BLCA_fpkm_data = BLCA_fpkm_data[c(-1)]
#生成分组文件
load("mRNA_exprSet.Rda")
metadata <- data.frame(names(mRNA_exprSet)[-1])
for (i in 1:length(metadata[,1])) {
num <- as.numeric(substring(metadata[i,1],14,15))
if (num %in% seq(1,9)) {metadata[i,2] <- "T"}
if (num %in% seq(10,29)) {metadata[i,2] <- "N"}
}
names(metadata) <- c("TCGA_id","group")
metadata$group <- as.factor(metadata$group)
# --------------------------------------------------------
#
#
#
# --------------------------------------------------------
result_diff = stattest(gowntable = BLCA_fpkm_data ,
pData = metadata ,
covariate = "group" ,
getFC = TRUE ,
log =TRUE,
meas='FPKM',
feature="gene")
result_diff$LogFC <- log2(result_diff$fc)
result_diff$LogFC_abs <- abs(result_diff$LogFC)
write.csv (result_diff, "mRNA_BLCA_fpkm_diff.csv", row.names = F)
# --------------------------------------------------------
#
#
#
# --------------------------------------------------------
foldChange =0.5
padj=0.05
diffSig=result_diff[which(result_diff$pval< padj & result_diff$LogFC_abs > foldChange),]
dim(diffSig)
write.csv(diffSig, file="diffSig_mRNA_BLCA.csv")