#=======================================================
#=======================================================
rm(list=ls())
library(dplyr)
library(tidyr)
library(Biobase)
library(limma)
library(VennDiagram)
setwd('D:\\SCIwork\\UCEC\\000000\\1mrna\\UCEC')
sur1 <- read.csv("sur_gene_diff.csv", header = T, row.names = 1)
sur1$gene <- sub("\\_", "-", sur1$gene)
rownames(sur1) <- sur1$gene
sur1 <- subset(sur1, sur1$pValue < 0.01)
sur2 <- read.csv("sur_gene_cox.csv", header = T, row.names = 1)
rownames(sur2) <- sub("\\_", "-", rownames(sur2))
sur2$gene <- rownames(sur2)
sur2 <- subset(sur2, sur2$coef < 0)
A=rownames(sur1)
B=rownames(sur2)
inter <- intersect(A, B)
a <- 2380
b <- 2313
inter <- 345
这里的a为A数据集的基因数,b为B数据集的基因数,inter为两者交集的基因数。 计算韦恩图P值的代码为
> phyper(inter-1, a, 20000-a, b, lower.tail = F)
[1] 2.098632e-06
可以看到P值小于0.05,因此该overlap的基因不是随机生成的,是可以被接纳的。 这里需要解释的是代码phyper(inter-1, a, 20000-a, b, lower.tail = F)中的20000代表的是背景基因总数,如果是mRNA我这里设置为20000。
计算venn图P值的具体资料大家可以检索:超几何分布检验(hypergeometric test)与费歇尔精确检验(fisher's exact test); Statistical significance of the overlap between two groups of genes; Calculate venn diagram hypergeometric p value using R等。
categrory1 <- c("DEG", "PRG")
lty1 <- rep("blank", 2)
fill1 <- c("light blue", "pink")
alpha1 <- rep(0.5, 2)
catpos1 = c(0, 0)
catdist1 = rep(0.025, 2)
grid.newpage()
p <- draw.pairwise.venn(a, b, inter,
category = categrory1,
lty = lty1,
fill =fill1,
alpha = alpha1,
cat.pos = catpos1,
cat.dist = catdist1,
scaled = FALSE)
pdf(file = 'venn_anti_gene.pdf', height = 5, width = 5)
p
dev.off()