TCGA数据下载
By Kaiyi Fu
获取mRNAseq表达矩阵等
1、从GDC官网下载
2、用TCGAbiolinks下载
3、从cBioPortal下载
获取临床信息
获取生存信息等
TCGA-PAN CDR
Last updated
By Kaiyi Fu
Last updated
# 1. 加载R包
library(TCGAbiolinks)
library(dplyr)
library(SummarizedExperiment)
library(data.table)
# 2. Choose the GDC project and the folder that receives every output file
project_name <- "TCGA-STAD"
output_dir <- file.path(".", project_name)

# Create the output folder (including missing parents) if it is not there yet
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}
# 3. Build the GDC query.
# Filters: transcriptome profiling -> gene expression quantification ->
# STAR - Counts workflow.
query <- GDCquery(
  project = project_name,
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts"
)

# 4. Download the files matched by the query.
# method = "api" suits smaller data sets; for large data sets, setting up
# gdc-client is the recommended route.
GDCdownload(query = query, method = "api", files.per.chunk = 10)

# 5. Prepare and merge the data.
# GDCprepare combines the scattered per-sample files into a single
# SummarizedExperiment object.
data_se <- GDCprepare(query = query)
# 6. Extract and save the gene annotation.
# Keeps the Ensembl ID, gene symbol, gene type and genomic position columns.
# select() is called as dplyr::select() so this step keeps working when a
# package attached later in the session (e.g. AnnotationDbi or MASS) masks
# the unqualified name with its own select().
ann_df <- as.data.frame(rowRanges(data_se)) %>%
  dplyr::select(gene_id, gene_name, gene_type, seqnames, start, end)
fwrite(ann_df, file = file.path(output_dir, "gene_annotation.csv"))
# 7. Extract the expression matrices.
# fwrite() does not write row names, and one gene symbol can be shared by
# several Ensembl IDs, so the Ensembl gene_id is stored as an explicit column.
# Without it, rows sharing a symbol cannot be told apart in the CSV files.
# Symbol stays the first column; gene_id is inserted right after it.

# Raw counts (input for DESeq2 / edgeR differential expression analysis)
counts_mat <- assay(data_se, "unstranded")
# Guard against silent recycling if ann_df is out of sync with the assay rows
stopifnot(nrow(ann_df) == nrow(counts_mat))
counts_df <- cbind(
  Symbol = ann_df$gene_name,
  gene_id = ann_df$gene_id,
  as.data.frame(counts_mat)
)
fwrite(counts_df, file = file.path(output_dir, "counts_matrix.csv"))

# TPM (for clustering, immune infiltration and similar analyses)
tpm_mat <- assay(data_se, "tpm_unstrand")
stopifnot(nrow(ann_df) == nrow(tpm_mat))
tpm_df <- cbind(
  Symbol = ann_df$gene_name,
  gene_id = ann_df$gene_id,
  as.data.frame(tpm_mat)
)
fwrite(tpm_df, file = file.path(output_dir, "tpm_matrix.csv"))
# 8. Extract the clinical information and save it next to the other outputs.
# NOTE: the original author flagged that this request may raise an error.
clinical <- GDCquery_clinic(
  project = project_name,
  type = "clinical"
)
fwrite(
  clinical,
  file = file.path(output_dir, "clinical_info.csv")
)