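# (1) This first part is a code-only workflow for analysing the differential
#     expression of a given gene across the 33 TCGA tumour types.
# (2) TCGA is combined with the GTEx database. The benefit is that tumour and
#     normal samples in TCGA are unbalanced, and some tumour types have no
#     adjacent normal tissue at all, so adding GTEx greatly increases the
#     number of normal samples.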
#=======================================================
#=======================================================
# Session setup: attach the required packages and pin the working directory.
# NOTE(review): GenomicDataCommons and DT are attached here but are not
# referenced in the visible part of the script - confirm they are needed.
library(GenomicDataCommons)
# All relative output paths used later resolve against this directory.
# NOTE(review): this is a hard-coded, machine-specific Windows path; the
# script fails here on any machine where the directory does not exist.
setwd('D:\\SCIwork\\F33\\TCGA')
# Removes every object from the current environment (the global workspace
# when run at top level). Packages that are already attached stay attached.
rm(list=ls())
library(dplyr)
library(TCGAbiolinks)
# dplyr is attached a second time here; the repeated call has no extra effect.
library(dplyr)
library(DT)
library(SummarizedExperiment)
library(stringr)
#=======================================================
#=======================================================
# Download gene-level count matrices for every TCGA project listed by GDC.
#
# For each project the loop:
#   1. queries GDC for "Gene Expression Quantification" files,
#   2. downloads the matching files through the GDC API,
#   3. assembles them with GDCprepare(), which is also asked to save the
#      prepared object as "<project>.rda",
#   4. writes the assay matrix to "<project>-Counts.csv".
# All files are written relative to the current working directory.

# getGDCprojects() is an exported function, so `::` is sufficient; `:::` is
# only needed for non-exported internals.
cancer <- TCGAbiolinks::getGDCprojects()$project_id
# Keep only project ids containing "TCGA", in alphabetical order.
cancer <- sort(str_subset(cancer, "TCGA"))

# Iterate over the projects actually returned instead of a hard-coded 1:33,
# so a shorter list never yields NA ids and a longer one is not truncated.
for (i in seq_along(cancer)) {
  cancer_select <- cancer[i]
  print(cancer_select)

  # Query the RNA-seq count files for this project.
  # NOTE(review): GDCquery() appears to signal an error rather than return
  # NULL when nothing matches, which would make the is.null() branch below
  # unreachable - confirm. The tryCatch turns such an error into a warning
  # plus NULL so one failing project does not abort the remaining ones.
  # NOTE(review): GDC has replaced the "HTSeq - Counts" workflow with
  # "STAR - Counts" in its harmonized data - confirm this workflow.type is
  # still served by the GDC release / TCGAbiolinks version in use.
  query <- tryCatch(
    suppressMessages(
      GDCquery(
        project = cancer_select,
        data.category = "Transcriptome Profiling",
        data.type = "Gene Expression Quantification",
        workflow.type = "HTSeq - Counts"
      )
    ),
    error = function(e) {
      warning(
        "GDCquery failed for ", cancer_select, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(query)) {
    print(paste0("No Counts data of solid normal tissue for ", cancer_select))
    next
  }

  # Fetch the files in chunks of 300 per API request.
  GDCdownload(query, method = "api", files.per.chunk = 300)

  # Build the experiment object and keep a copy on disk as "<project>.rda".
  expdat <- GDCprepare(
    query = query,
    save = TRUE,
    save.filename = paste0(cancer_select, ".rda")
  )

  # Extract the assay matrix and export it as "<project>-Counts.csv".
  count_matrix <- assay(expdat)
  write.csv(
    count_matrix,
    file = paste(cancer_select, "Counts.csv", sep = "-")
  )
}