# GEO数据下载ID转换
# 加载包
#===============================================================================
# r.proxy::proxy()
library(GEOquery)
library(stringr)
library(limma)
library(tidyverse)
options('download.file.method.GEOquery' = 'libcurl' )
options(max.print = 40)
#===============================================================================
# 设置参数
#===============================================================================
gse_id = "GSE44861"
out_path = "./"
# title_matched_pattern = "cancer|normal"
title_matched_pattern =NULL
#===============================================================================
# 开始
#===============================================================================
if (!is.null(out_path)) {
if (!dir.exists(paste0(out_path,"/",gse_id,"_data/"))) {
dir.create(paste0(out_path,"/",gse_id,"_data/"),recursive = T)
}
}
out_path = paste0(out_path,"/",gse_id,"_data/")
# 下载
gse <- getGEO(gse_id, GSEMatrix =TRUE, AnnotGPL=T,getGPL = T,destdir = ".")
class(gse)
# 获取表达矩阵
a = gse[[1]] # 获取第一个元素
dat = exprs(a) %>% as.data.frame() # 将对象a 转化为表达矩阵
head(dat)
dat <- na.omit(dat) %>% as.data.frame()
#获取样本全部信息
meta = pData(a)
# 获取分组信息
if (!is.null(title_matched_pattern)) {
group_data <- meta %>% select(title) %>%
mutate(group = str_extract_all(title,title_matched_pattern,simplify = T) %>% as.vector()) %>% select(-title)
}else{
group_data = meta %>% select(title)
}
# 获取GPL文件信息,包含探针注释信息
GPL_name <- gse[[1]]@annotation
## GPL列名的描述信息
gse[[1]]@featureData@varMetadata %>% head()
GPL_anno_data <- gse[[1]]@featureData@data
dim(GPL_anno_data)
GPL_ids_cols = c("ID","Gene symbol","Gene Symbol","GENE_SYMBOL","Symbol","GeneSymbol")
match_id_cols <- colnames(GPL_anno_data)[colnames(GPL_anno_data) %in% GPL_ids_cols]
# 获取ids 探针和基因名
# 有gene symbol列的处理
if (length(match_id_cols) == 2) {
ids <- GPL_anno_data[,match_id_cols] %>% `colnames<-`(c("prob_id","gene_symbol")) %>%
filter(!gene_symbol %in% c("","---")) %>%
filter(!is.na(gene_symbol))%>%
separate_rows(gene_symbol,sep = "\\/\\/\\/") # 一个探针对应多个基因 保留所有对应情况
}
# 没有gene symbol 只有gene_assignment的情况(基因名在gene_assignment列里面)
if (length(match_id_cols) == 1 && "gene_assignment" %in% colnames(GPL_anno_data)) {
GPL_anno_data<- GPL_anno_data[GPL_anno_data$gene_assignment != "---",] # 去除gene_assignment列中的--- 20231208
GPL_anno_data <- GPL_anno_data[GPL_anno_data$gene_assignment %>% str_detect("\\/\\/"),]
ids <-
GPL_anno_data %>% mutate(gene_symbol = str_split(gene_assignment,pattern = "\\/\\/") %>% map(2) %>% unlist() %>% str_remove_all(" ")) %>%
filter(gene_symbol != "NULL") %>%
filter(!gene_symbol %in% c("","---"," --- ")) %>%
filter(!is.na(gene_symbol)) %>%
select(c(.data[[match_id_cols]],gene_symbol)) %>%
`colnames<-`(c("prob_id","gene_symbol"))
}
# ID转换
rownames(dat) <- as.character(rownames(dat))
ids$prob_id <- as.character(ids$prob_id)
dat[,"prob_id"] <- rownames(dat)
exp_data <- inner_join(ids,dat) %>%
dplyr::select(-prob_id) %>%
as.data.frame()
dim(exp_data)
exp_data$gene_symbol %>% table() %>% sort(decreasing = T) %>% head()
gene_symbol = exp_data$gene_symbol
exp_data = exp_data[,-1]
# exp_data[1:4,1:4]
# str(exp_data)
# exp_data = exp_data %>% mutate_all(as.numeric)
exp_data = as.matrix(exp_data)
rownames(exp_data) = gene_symbol
# 去重取平均
exp_data <- limma::avereps(exp_data,rownames(exp_data)) %>%
as.data.frame()
# 保存数据
write.csv(exp_data,file = paste0(out_path,"/","exp_data.csv"),row.names = T)
write.csv(meta,file = paste0(out_path,"/","meta_data.csv"),row.names = T)
write.csv(group_data,file = paste0(out_path,"/","group_data.csv"),row.names = T)
write.csv(ids,file = paste0(out_path,"/","ids.csv"),row.names = F)
write.csv(GPL_anno_data,file = paste0(out_path,"/","GPL_anno_data.csv"),row.names = T)
#===============================================================================
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。