🧬bulk RNAseq
By Kaiyi Fu
主要包含RNA测序数据下载与预处理、差异分析、富集分析(GO、KEGG、GSEA)等步骤。
R包加载
library(DESeq2) #差异分析
library(msigdbr) #GSEA
library(clusterProfiler) #GSEA
library(GseaVis) #GSEA
library(limma) #差异分析
library(ggplot2)
library(stringr)
library(tidyverse)
library(immunedeconv) #免疫浸润
library(GSVA) #ssGSEA
library(EnhancedVolcano)#火山图
library(ComplexHeatmap)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
一、测序数据的获取与预处理
GEO数据库的下载与处理
#方式一:从GEO直接获取RNAseq数据
if(! require("GEOquery")) BiocManager::install("GEOquery",update=F,ask=F)
library(GEOquery)
##下载并且了解你的数据
gset <- getGEO('GSE14323',destdir = '.',getGPL = F)
exprSet=exprs(gset[[1]])
pdata=pData(gset[[1]]) #获取临床信息
#问题解决:如何生成group_list,如何选择需要的样本??
#group_list=
#exprSet=
pdata1=pdata[pdata$source_name_ch1 == 'Normal liver tissue',]
pdata2=pdata[pdata$source_name_ch1=='liver tissue with HCC',]
exprSet=as.data.frame(exprSet)
exprSet1=exprSet[,rownames(pdata1)]
exprSet2=exprSet[,rownames(pdata2)]
#按列合并exprSet1,2
exprSet=cbind(exprSet1,exprSet2)
pdata=rbind(pdata1,pdata2)
group_list=c(rep('normal',19),rep('tumor',38))
group_list <- factor(group_list,levels = c('normal','tumor'))
group_list
save(exprSet,pdata,group_list,file ='lab0113.Rdata')
load('lab0113.Rdata')
#microarray等格式的数据需要进行ID转换
##这个方法直接下载GEO GPL处的注释文件来做转换,肯定是可行的
#GSE14323是用GPL96和GPL571一起的,两个平台注释文件非常相似
#我们选择样本数多的GPL571
#https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL571 Download full table
#开始转换
#载入注释文件,在你自己的文件夹里
#没data.table,geoquery啥的先装一下
#install.package('data.table')
#BiocManager::install('GEOquery',update = F,ask = F)
library(data.table)
anno = fread("D:/R学习/platform/GPL571-17391.txt",data.table = F,sep = '\t')
#看一下列名
colnames(anno)
#一定选ID探针列和gene symbol列,!!!此处是可变的!!!
anno = anno[,c(1,11)]
anno = anno[!anno$`Gene Symbol`== "",]
#无脑跑代码即可
tmp = rownames(exprSet)%in% anno[,1]
exprSet = exprSet[tmp,]
dim(exprSet)
match(rownames(exprSet),anno$ID)
anno = anno[match(rownames(exprSet),anno$ID),]
match(rownames(exprSet),anno$ID)
dim(exprSet)
dim(anno)
tail(sort(table(anno[,2])), n = 12L)
#注释到相同基因的探针,保留表达量最大的那个探针
{
MAX = by(exprSet, anno[,2],
function(x) rownames(x)[ which.max(rowMeans(x))])
MAX = as.character(MAX)
exprSet = exprSet[rownames(exprSet) %in% MAX,]
rownames(exprSet) = anno[ match(rownames(exprSet), anno[,1] ),2]
}
dim(exprSet)
#cel.gz文件合并
library(affy)
library(dplyr)
library(tidyverse)
data.raw <- ReadAffy()
eset <- rma(data.raw)
write.exprs(eset,"probe_exp.txt")
ann <- read.delim2("./GPL11180.txt")##修改注释文件的工作路径,指定到GPL所在文件位置,记得从GEO下载下来删除前几行
ann <- ann[,c("ID","Gene.Symbol")]
ann$Gene.symbol <-str_split_fixed(ann$Gene.Symbol,pattern = "///",3)[,1]
rt <- read.table("./probe_exp.txt",sep = "\t",header = T)
symbol <- merge(ann,rt,by.x = 1,by.y = 1)
symbol <- symbol[-1]
colnames(symbol)[1] <- c("ID")
symbol <- distinct(symbol,ID,.keep_all = T)
rownames(symbol) <- symbol$ID
symbol <- symbol[-1]
colnames(symbol) <- str_split_fixed(colnames(symbol),pattern = "_",2)[,1]
symbol <- cbind(ID=rownames(symbol),symbol)
write.table(symbol,"symbol.txt",sep = "\t",quote = F,row.names = F)
数据读入+基因去重
##方式二:读入自己的测序矩阵
#读入表达矩阵
mrna_expr <- read.table("E:/Research/12.GYY/RNAseq/data/data_CvsS.txt",header = T)
#处理重复的基因(挑选行平均值大的那一行)
#计算行平均值,按降序排列
index=order(rowMeans(mrna_expr[,-1]),decreasing = T)
#调整表达谱的基因顺序
expr_ordered=mrna_expr[index,]
#对于有重复的基因,保留第一次出现的那个,即行平均值大的那个
keep=!duplicated(expr_ordered$gene_name)
#得到最后处理之后的表达谱矩阵
mrna_expr=expr_ordered[keep,]
rownames(mrna_expr) <- mrna_expr$gene_name
mrna_expr <- mrna_expr[,-1]
#读入样本分组信息
#注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的
mrna_sample <- read.csv("E:/Research/12.GYY/RNAseq/data/group_CvsS.csv",header = T,row.names = 1,check.names=F)
人鼠同源基因转换
#HomoloGene包
# install.packages('homologene')
library(homologene)
# 只保留编码基因
coding_gene <- data.table::fread("./input_data/protein-coding-gene.csv") %>% as.data.frame() %>% filter(gene_type == "protein_coding")
genelist<-coding_gene$gene_name
# 随便取点基因看看
genes = genelist[1:10]
genes
#使用homologene函进行转换
#inTax是输入的基因列表所属的物种号,10090是小鼠
#outTax是要转换成的物种号,9606是人
trans <- homologene(genes = genes, #需要转换的基因列表
inTax = 9606, #人
outTax = 10090 #小鼠
)
二、数据清理
删除低表达基因
#对表达量矩阵取整数(DESEQ2处理非整数rawcount时需要)
mrna_expr <- round(mrna_expr)
#根据样本情况选择不同的低表达基因过滤方法
#1、删除表达量全为0的基因
#mrna_expr <- mrna_expr[which(rowSums(mrna_expr)!=0),]
#2、或保留至少在75%的样本中都有表达的基因
#keep_data <- rowSums(mrna_expr> 0) >= floor(0.75*ncol(mrna_expr))
#table(keep_data)
#3、 或保留每个样本中都有表达的基因(可能会删除低表达基因)
keep_data <- rowSums(mrna_expr> 0) >= floor(ncol(mrna_expr))
#table(keep_data)
mrna_expr <- mrna_expr[keep_data,]
标准化
#先看一下样本间齐不齐
boxplot(log2(mrna_expr))
#不齐的话样本间normalize一下(做热图的话可以不normalize)
mrna_expr=normalizeBetweenArrays(mrna_expr)
boxplot(mrna_expr,outline=FALSE, notch=T,col=mrna_sample, las=2)
#log化处理
mrna_expr = log2(mrna_expr+1)
boxplot(log2(mrna_expr))
#将log化后的负无穷值替换为0
#mrna_expr[mrna_expr == -Inf] = 0
三、PCA
library(ggplot2)
library(ggpubr)
library(cowplot)
#如果数据是原始 counts,建议先做log转换或归一化(如DESeq2的vst/rlog)再做 PCA
#这里数据已经log化了
expr <- read.csv("patient_merge_count_normed.csv",header = T,row.names = 1,check.names=F)
mrna_sample <- read.csv("./group.csv",header = T,row.names = 1,check.names=F)
# 转置矩阵:PCA需要样本作为行
expr <- t(expr)
# PCA分析 (这里用prcomp)
pca_res <- prcomp(expr, scale. = TRUE) # scale=TRUE会标准化数据
# 查看方差解释度
summary(pca_res)
library(FactoMineR) # PCA函数
library(factoextra) # 画PCA图
# 带分组颜色的PCA图
fviz_pca_ind(pca_res,
geom.ind = c("point","text"), # 点 + 样本名
col.ind = mrna_sample$GROUP, # 按分组上色
palette = "jco", # 颜色主题
addEllipses = TRUE, # 画置信椭圆
mean.point = FALSE, # 不画均值点
repel = TRUE,
pointsize = 3, # 固定点的大小
label = "ind"
)
#绘制PC1、PC2、PC3三者关系,并列出三张图
.pca <- function(data, is.log) {
if (is.log)
data <- data
else
data <- log2(data + 1)
svd <- base::svd(scale(
x = t(data),
center = TRUE,
scale = FALSE
))
percent <- svd$d ^ 2 / sum(svd$d ^ 2) * 100
percent <-
sapply(seq_along(percent),
function(i) {
round(percent[i], 1)
})
return(list(
sing.val = svd,
variation = percent))
}
.scatter.density.pc <- function(
pcs,
pc.var,
group.name,
group,
color,
strokeSize,
pointSize,
strokeColor,
alpha,
title
){
pair.pcs <- utils::combn(ncol(pcs), 2)
pList <- list()
for(i in 1:ncol(pair.pcs)){
if(i == 1){
x <- pair.pcs[1,i]
y <- pair.pcs[2,i]
p <- ggplot(mapping = aes(
x = pcs[,x],
y = pcs[,y],
fill = group)) +
xlab(paste0('PC', x, ' (', pc.var[x], '%)')) +
ylab(paste0('PC', y, ' (', pc.var[y], '%)')) +
geom_point(
aes(fill = group),
pch = 21,
color = strokeColor,
stroke = strokeSize,
size = pointSize,
alpha = alpha) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
ggtitle(title) +
theme(
legend.position = "right",
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 1.1),
legend.background = element_blank(),
legend.text = element_text(size = 12),
legend.title = element_text(size = 14),
legend.key = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)) +
guides(fill = guide_legend(override.aes = list(size = 4))) +
scale_fill_manual(name = group.name, values = color)
le <- ggpubr::get_legend(p)
}else{
x <- pair.pcs[1,i]
y <- pair.pcs[2,i]
p <- ggplot(mapping = aes(
x = pcs[,x],
y = pcs[,y],
fill = group)) +
xlab(paste0('PC', x, ' (',pc.var[x], '%)')) +
ylab(paste0('PC', y, ' (',pc.var[y], '%)')) +
geom_point(
aes(fill = group),
pch = 21,
color = strokeColor,
stroke = strokeSize,
size = pointSize,
alpha = alpha) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
theme(
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 1.1),
legend.position = "none",
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)) +
scale_fill_manual(values = color, name = group.name)
}
p <- p + theme(legend.position = "none")
xdens <- cowplot::axis_canvas(p, axis = "x")+
geom_density(
mapping = aes(
x = pcs[,x],
fill = group),
alpha = 0.7,
size = 0.2
) +
theme(legend.position = "none") +
scale_fill_manual(values = color)
ydens <- cowplot::axis_canvas(
p,
axis = "y",
coord_flip = TRUE) +
geom_density(
mapping = aes(
x = pcs[,y],
fill = group),
alpha = 0.7,
size = 0.2) +
theme(legend.position = "none") +
scale_fill_manual(name = group.name, values = color) +
coord_flip()
p1 <- insert_xaxis_grob(
p,
xdens,
grid::unit(.2, "null"),
position = "top"
)
p2 <- insert_yaxis_grob(
p1,
ydens,
grid::unit(.2, "null"),
position = "right"
)
pList[[i]] <- ggdraw(p2)
}
pList[[i+1]] <- le
return(pList)
}
pca.ncg<-.pca(data = expr,
is.log = TRUE)
.scatter.density.pc(pcs = pca.ncg$sing.val$v[, 1:3],
pc.var = pca.ncg$variation,
strokeColor = 'gray30',
strokeSize = 0.2,
pointSize = 2,
alpha = 0.6,
title = "A",
group.name = "B",
group=mrna_sample$GROUP,
color=c("red","blue","green")) -> p
do.call(
gridExtra::grid.arrange,
c(p,ncol=4))
四、差异分析
DESeq2差异分析(适用于rawcount)
获取差异基因
dds <- DESeqDataSetFromMatrix(countData = expr_data,
colData = sample_data,
design = ~ group)
dds <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
nrDEG <- results(dds, contrast =c("group","DP","DSS")) #指定实验/对照组
## 转化数据框
DEG_DESeq2 <- as.data.frame(nrDEG)
DEG_DESeq2 = DEG_DESeq2[order(DEG_DESeq2$log2FoldChange),] # 按照logFC排序
DEG_DESeq2[1:3,1:4]
#如果一行包含一个具有极端计数异常值的样本,则p值和调整后的p值将被设置为NA(https://cloud.tencent.com/developer/article/2327198)
#可以删除缺失行进行过滤
DEG_DESeq2 <- na.omit(DEG_DESeq2)
write.csv(nrDEG,file = "./DEG_DESeq2.csv")
#筛选差异基因(标准为adj.P<0.05,logFC绝对值大于1)
allDiffsig=DEG_DESeq2[DEG_DESeq2$padj<0.05,]
allDiffsig=allDiffsig[allDiffsig$log2FoldChange >1 | allDiffsig$log2FoldChange < -1,]
#显著上调的基因
up_gene=allDiffsig[allDiffsig$log2FoldChange>0,]
#显著下调的基因
down_gene=allDiffsig[allDiffsig$log2FoldChange<0,]
获取标化后的表达矩阵
#1、提取归一化数据
norm_data <- as.data.frame(counts(dds,normalized=TRUE))
head(norm_data)
#文件保存,得到标准化的结果
write.csv(norm_data,file = "./DEseq2_normalized.csv",row.names = FALSE)
#注意:DESeq2的标准化主要是为了消除样本间文库大小的差异,输出的标准化计数可用于样本间的表达量比较,但不能直接用于计算基因的表达相关性
#2、如要绘制热图,可以对整个表达矩阵进行VST转换,再对感兴趣的基因按行进行Z-score标准化
# 1) 对整个表达矩阵进行VST转换
vsd <- vst(dds, blind = FALSE)
all_vst_data <- assay(vsd)
# 2) 从转换后的数据中提取感兴趣的基因
interest_genes <- c("Cd80","Cd274","CD2","Ptgs2","Il1rn","Il1b","Vegfa","Ccl3","Cxcl2","Il16","Tnf")
subset_vst <- all_vst_data[rownames(all_vst_data) %in% interest_genes, ]
# 3) 对提取的基因子集进行按行Z-score标准化
subset_zscore <- t(scale(t(subset_vst)))
# 4) 绘制热图
pheatmap(subset_zscore,
color = colorRampPalette(c("#2166AC", "#f7f7f7", "#B2182B"))(100),
breaks = seq(-2, 2, length.out = 100),
fontsize_row = 8,
border_color = NA)
limma差异分析
#构建分组矩阵
design <- model.matrix(~factor(mrna_sample$GROUP) + 0)
colnames(design) <- c("CART","GSCART")
#构建比较矩阵
contrast.matrix <- makeContrasts(GSCART - CART, levels = factor(mrna_sample$GROUP))#实验组vs对照组
#线性混合模拟
fit <- lmFit(mrna_expr,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异分析结果
RNA_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
#筛选差异基因
allDiffsig=RNA_DEG[RNA_DEG$P.Val<0.05,]
allDiffsig=allDiffsig[allDiffsig$logFC >1 | allDiffsig$logFC < -1,]
#显著上调的基因
up=allDiffsig[allDiffsig$logFC>0,]
#显著下调的基因
down=allDiffsig[allDiffsig$logFC<0,]
#RNA_DEG = RNA_DEG[order(RNA_DEG$logFC),] # 按照logFC排序
write.csv(RNA_DEG,"./DEG_limma.csv")
#导出标化后的表达矩阵
write.csv(mrna_expr,"./RNA_norm.csv")
特殊情况:配对样本差异分析
rm(list=ls())
setwd("E:/Research/5.CRC_liver/Analysis-202304/data/Diff_analysis")#设置工作路径
#读取总数据
data = read.csv('Input data.csv',header=TRUE,row.names=1)
group = read.csv('group.csv', header=TRUE, row.names=1)
individuals = read.csv('individual.csv', header=TRUE, row.names=1)
#提取MO+MM
data_MMMO <- data[,c(1:47,125:171)]
group_MMMO <- group[c(1:47,125:171),]
individuals_MMMO <- individuals[c(1:47,125:171),]
groupID_MMMO <- factor(group_MMMO,levels = unique(group_MMMO))
individualsID_MMMO <- factor(individuals_MMMO,levels = unique(individuals_MMMO))
#整理分组矩阵
design_MMMO_paried <- model.matrix(~ individualsID_MMMO + groupID_MMMO)
fit_MMMO <- lmFit(data_MMMO,design_MMMO_paried)
fit_MMMO <- eBayes(fit_MMMO)
#差异分析
allDiff_MMMO_paired <- topTable(fit_MMMO,
adjust = 'BH',
coef = ncol(design_MMMO_paried),
n = Inf)
p_MMMO <- EnhancedVolcano(allDiff_MMMO_paired,
lab = rownames(allDiff_MMMO_paired),
x = 'logFC',
y = 'P.Value',
title = 'paired MM VS MO',
pointSize = 3.0,
labSize = 6.0,
legendPosition = 'right',
pCutoff = 0.05,
FCcutoff = 1)
p_MMMO
write.csv(allDiff_MMMO_paired,"allDiff_MMMO_paired.csv")
五、差异基因展示
火山图
p_MONO <- EnhancedVolcano(RNA_DEG,
lab = rownames(RNA_DEG),
x = 'logFC',
y = 'P.Value',
xlim = c(-5, 5), #x轴范围
ylim = c(0, 6.5), #y轴范围
title = 'GS_CART vs CART',
#selectLab = c("CCR5"), #填写你想标注的基因
col = c("grey30", "#458c45", "#3c61a3", "#d4372a"),
pointSize = 3.0,
labSize = 6.0,
legendPosition = 'right',
pCutoff = 0.05,
FCcutoff = 1,
gridlines.major = FALSE, # 背景网格
gridlines.minor = FALSE,
legendLabSize = 8,
legendIconSize = 8,
titleLabSize =12,subtitle = NULL)
p_MONO
热图1:pheatmap
rt=mrna_expr[c('MAP2K3','FOS','LIF','TBX21',"TNFRSF9","CD40LG",
"IL2","IFNG","TNF","IL1A","IL3","IL4","IL9","IL22",
"GZMB","CRTAM","IL18RAP","CD160","CAMK2D",
"XCL1","XCL2","CCL1","CCL4"),] #"IL21","IL17A","IL31"不在里面
rt=na.omit(rt)
write.csv(rt,"E:/Research/12.GYY/RNAseq/data/heatmap_gene.csv")
anno=data.frame(row.names = colnames(rt),group=mrna_sample)
library(pheatmap)
pheatmap(rt,annotation_col = anno,scale = 'row',#针对行进行标化
cluster_cols = F,
show_rownames = T, show_colnames = F,
color = colorRampPalette(c(rep("blue",1), "white", rep("red",1)))(100),
breaks = seq(-3,3,length.out = 100),border_color = NA)
热图2:ComplexHeatmap
#ComplexHeatmap包并不会对数据进行标准化,为了让图形更好看,先手动进行标准化
dt <- apply(rt, 1, scale)
rownames(dt) <- colnames(rt)
dt <- t(dt)
#scale函数是按列归一化,对于我们一般习惯基因名为行,样本名为列的数据框,就需要进行转置
#配色自定义:
mycol <- colorRamp2(c(-2, 0, 2), c("#0da9ce", "white", "#e74a32"))
#格子大小设置(按个人需求):
cellwidth = 1
cellheight = 0.35
cn = dim(dt)[2]
rn = dim(dt)[1]
w = cellwidth*cn
h = cellheight*rn
#注释配色自定义:
c4a_gui()
cols <- c4a('pastel1',6)
df <- data.frame(colnames(dt))
colnames(df) <- 'cell'
cols;df
#注释颜色添加:
row_cols <- setNames(c("#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#CCEBC5","#CCEBC5", "#CCEBC5", "#CCEBC5", "#CCEBC5", "#DECBE4", "#DECBE4" ,"#DECBE4" ,"#DECBE4"), rownames(dt))
row_cols
#生成注释信息:
col_anno <- HeatmapAnnotation(df = df,
show_annotation_name = F,
gp = gpar(col = 'white', lwd = 2),
col = list(cell = c(
'P1_C' = "#FBB4AE",
'P2_C' = "#B3CDE3",
'P3_C' = "#CCEBC5",
'P1_S' = "#DECBE4",
'P2_S' = "#FED9A6",
'P3_S' = "#FFFFCC"
)))
#生成注释信息:
row_anno <- rowAnnotation(foo = anno_text(rownames(dt),
location = 0,
just = "left",
gp = gpar(fill = row_cols,
col = "black",
fontface = 'italic'),
width = max_text_width(rownames(dt))*1.25))
#绘图:
Heatmap(as.matrix(dt),
width = unit(w, "cm"),
height = unit(h, "cm"),
name = 'expression',
col = mycol,
cluster_columns = F,
cluster_rows = F,
column_names_side = c('top'),
column_names_rot = 60,
row_title = 'T cell key markers',
rect_gp = gpar(col = "white", lwd = 1.5),
heatmap_legend_param = list(legend_height = unit(2.8, "cm"),
grid_width = unit(0.4, "cm"),
labels_gp = gpar(fontsize = 10)),
top_annotation = col_anno, #添加列注释色块
row_split = c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","C","C", "C", "C", "C", "D", "D" ,"D" ,"D"), #对行切片
row_gap = unit(2, "mm"),
border = T,
border_gp = gpar(col = "black", lwd = 1.2)) +
row_anno #热图中添加背景注释色块
六、富集分析
KEGG富集分析
#取出显著上调的基因名(下调同理)
up_gene <- rownames(up_gene)
#ID转化
up_gene_entrezid <- bitr(up_gene, fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Mm.eg.db) #鼠是org.Mm.eg.db,人是org.Hs.eg.db
#KEGG富集分析
KEGGenrich_up <- enrichKEGG(gene = up_gene_entrezid$ENTREZID,
organism = 'mmu', #鼠是mmu,人是hsa
#universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff = 1)
#画图:KEGG-上调
library(ggsci)
kegg_up=KEGGenrich_up[KEGGenrich_up$pvalue<0.05,]
#柱状图
ggplot(data=kegg_up, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("KEGG_up") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
#气泡图
dotplot(KEGGenrich_up)+scale_color_continuous(low='#bc3c29', high='#dadada')
GO富集分析
#上调基因
GOenrich_up <- enrichGO(gene = up_gene_entrezid$ENTREZID,
OrgDb = 'org.Hs.eg.db',
ont = 'All',
#universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff = 1)
GOenrich_up=DOSE::setReadable(GOenrich_up,OrgDb='org.Hs.eg.db',keyType='ENTREZID')
#下调基因
GOenrich_down <- enrichGO(gene = down_gene_entrezid$ENTREZID,
OrgDb = 'org.Hs.eg.db',
ont = 'All',
#universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff = 1)
GOenrich_down=DOSE::setReadable(GOenrich_down,OrgDb='org.Hs.eg.db',keyType='ENTREZID')
GO_up=GOenrich_up@result
GO_down=GOenrich_down@result
write.csv(GO_up,'./GO_up.csv',quote = F)
write.csv(GO_down,'./GO_down.csv',quote = F)
#画图:GO上调
GO_up=GO_up[GO_up$pvalue<0.05,]
#柱状图
ggplot(data=GO_up, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("GO_up") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
dotplot(GOenrich_up)+scale_color_continuous(low='#bc3c29', high='#dadada')
#画图:GO下调
GO_down=GO_down[GO_down$pvalue<0.05,]
#柱状图
ggplot(data=GO_down, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("GO_up") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
dotplot(GOenrich_down)+scale_color_continuous(low='#bc3c29', high='#dadada')
GSEA富集分析
读懂GSEA富集曲线:
https://mp.weixin.qq.com/s/_zoBcLUQ3NKKfSKHld9HVA
library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
library(ggplot2)
library(aplot)
GSEA需要两个文件,一个是基因列表(包括基因名及其logFC),另一个是基因通路gmt文件
#先从之前的差异基因列表提取基因名及logFC
#1)从差异基因列表导入基因(DESeq2)
DEG_DESeq2 <- read.csv("DEG_DESeq2.csv",header = T,row.names = 1,check.names=F)
DEG_DESeq2$gene_name <- rownames(DEG_DESeq2)
gene <- DEG_DESeq2[,c("gene_name","log2FoldChange")]
gene <- arrange(gene,desc(log2FoldChange))
#2)从差异基因列表导入基因(limma)
RNA_DEG <- read.csv("DEG_limma.csv",header = T,row.names = 1,check.names=F)
RNA_DEG$gene_name <- rownames(RNA_DEG)
RNA_DEG$log2FoldChange <- RNA_DEG$logFC
gene <- RNA_DEG[,-c(1:6)]
gene <- arrange(gene,desc(log2FoldChange)) #按照LogFC降序排列
#从txt文件读入
#gene <- read.table('./limma_DEG.txt',header = T) %>% arrange(desc(log2FoldChange))
# check
head(gene,3)
#生成genelist
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# check
head(geneList,3)
#接下来选择不同的数据库,获取通路基因集
GO_BP数据库
######### 1、选择GO_BP数据库 ##############
# choose C5 BP gene sets
genesets <- msigdbr(species = "Homo sapiens",#Mus musculus
category = "C5",
subcategory = "BP")
# check
head(genesets,3)
# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
# GO enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH", #一般选择 "BH" 或 "fdr",BH较严格,fdr较温和(计算的q小些)
by = "fgsea", #"fgsea"或“DOSE”,传统的GSEA方法很慢,可以通过对随机的基因集抽样解决这个问题,也就是 a fast gene set enrichment analysis (FGSEA)
#nPerm = 2000,
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 0.05)
View(data_ga)
write.csv(data_ga, file = "./gsea_result_gobp.csv")
KEGG_BP数据库
######### 2、选择KEGG_BP数据库 ##############
#h <- msigdbr(species = "Mus musculus")
#table(h$gs_subcat) #查看GO;KEGG等基因集
#h2 <- h[grep(pattern = "TP53",x = h$gs_name),] #pattern参数内输入想要寻找的关键词
#获取KEGG基因集
genesets <- msigdbr(species = "Homo sapiens",#Mus musculus
category = "C2",
subcategory = "KEGG")
# check
head(genesets,3)
# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
# KEGG enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH",
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 0.05)
write.csv(data_ga, file = "E:/Research/12.GYY/RNAseq/result/gsea_result_kegg.csv")
HALLMARK数据库或自建基因集
######### 3、选择HALLMARK数据库或自建基因集 ##############
#HALLMARK数据库
genesets <- msigdbr(species = "Homo sapiens",
category = "H")
# 1)从HALLMARK获取gmt
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
# 2)导入自己构建的gmt
gmt <- read.table("./AEos_gene.txt")
# load gene
gene <- read.table('./limma_DEG.txt',header = T) %>%
arrange(desc(log2FoldChange))
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# KEGG enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH",
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 1)
data_ga <- write.csv(data_ga, file = "./AEos_gene_result.csv")#
GSEA画图
#### GseaVis 可视化
# install.packages("devtools")
#devtools::install_github("junjunlab/GseaVis")
library(GseaVis)
library(jjAnno)
# retain curve and heatmap
#一次性显示多条通路
geneSetID = c('GOBP_LEUKOCYTE_MIGRATION_INVOLVED_IN_INFLAMMATORY_RESPONSE',
'GOBP_FC_GAMMA_RECEPTOR_SIGNALING_PATHWAY',
'GOBP_POSITIVE_REGULATION_OF_CELL_KILLING',
'GOBP_INTERFERON_GAMMA_PRODUCTION')
# all plot
gseaNb(object = gseaRes,
geneSetID = geneSetID,
subPlot = 2,
termWidth = 35,
#legend.position = c(0.8,0.8),
#addGene = gene,
addPval = T,
pvalX = 0.05,pvalY = 0.05,
curveCol = jjAnno::useMyCol('ironMan',10))
#### 单个通路
gseaNb(object = gseaRes,
geneSetID = 'GOBP_DNA_REPLICATION_DEPENDENT_CHROMATIN_ORGANIZATION',
subPlot = 2,
termWidth = 90,
addPval = T,
pvalX = 0.75,pvalY = 0.8,
pCol = 'black',
pHjust = 0.5,
nesDigit = 3,
pDigit = 4)
#MDSC
gseaNb(object = gseaRes,
geneSetID = 'AEos',
subPlot = 2,
addPval = T,
pvalX = 0.75,pvalY = 0.8,
pCol = 'black',
pHjust = 0)
gseaNb(object = gseaRes,
geneSetID = 'MDSC',
subPlot = 2,
addPval = T,
pvalX = 0.75,pvalY = 0.8,
pCol = 'black',
pHjust = 0,
addGene = T,
markTopgene= T,
topGeneN = 10,
geneCol = '#4d4d4d')
七、GSVA打分
library(GSVA)
# 导入自己构建的gmt(可以从GSEA官网下载gmt,转为txt)
Eos_gmt <- read.table("./EOSINOPHIL.txt",header = T)
#来自GSEA:NAKAJIMA_EOSINOPHIL
CD8_gmt <- read.table("./CD8.txt",header = T)
#来自GSEA:GOBP_T_CELL_MEDIATED_CYTOTOXICITY.v2025.1.Hs
#读入表达矩阵
mrna_expr <- read.csv("./patient_merge_count_normed.csv",header = T,row.names = 1)
mrna_expr <- as.matrix(mrna_expr)
#GSVA
Eos_ssGSEA = gsva(mrna_expr, Eos_gmt, method="ssgsea",
min.sz=1, max.sz=Inf,
kcdf="Gaussian"
)
CD8_ssGSEA = gsva(mrna_expr, CD8_gmt, method="ssgsea",
min.sz=1, max.sz=Inf,
kcdf="Gaussian"
)
#注意:使用tpm值或fpkm值时需要将kcdf设置为Gaussian,使用counts输入时需要设置为Poisson
八、免疫浸润分析
#使用IOBR包进行分析
library(IOBR)
mrna_expr <- read.csv("./patient_merge_count_normed.csv",header = T,row.names = 1)
#MCPcounter
im_mcpcounter<-deconvo_tme(eset=mrna_expr,
method="mcpcounter"
)
#EPIC
im_epic<-deconvo_tme(eset=mrna_expr,
method="epic",
arrays=F
)
#xcell
im_xcell<-deconvo_tme(eset=mrna_expr,
method="xcell",
arrays=F
)
write.csv(im_xcell,file = "./tme_xcell_merge.csv")
#CIBERSORT
im_cibersort<-deconvo_tme(eset=mrna_expr,
method="cibersort",
arrays=F,
perm=1000
)
#IPS
im_ips<-deconvo_tme(eset=mrna_expr,
method="ips",
plot=F
)
#quanTIseq
im_quantiseq<-deconvo_tme(eset=mrna_expr,
method="quantiseq",
scale_mrna=T
)
#ESTIMATE
im_estimate<-deconvo_tme(eset=mrna_expr,
method="estimate"
)
##基于ssGSEA计算免疫浸润分数
load(file="./ssGSEA28.Rdata")
im_ssgsea<-calculate_sig_score(eset=mrna_expr
,signature=cellMarker#这个28种细胞的文件需要自己准备
,method="ssgsea"#选这个就好了
)
tme_combine<-im_mcpcounter%>%
inner_join(im_epic,by="ID")%>%
inner_join(im_xcell,by="ID")%>%
inner_join(im_cibersort,by="ID")%>%
inner_join(im_ips,by="ID")%>%
inner_join(im_quantiseq,by="ID")%>%
inner_join(im_estimate,by="ID")%>%
inner_join(im_ssgsea,by="ID")
write.csv(tme_combine,file = "./tme_combine.csv")
Last updated