Page cover

🧬bulk RNAseq

By Kaiyi Fu

主要包含RNA测序数据下载与预处理、差异分析、富集分析(GO、KEGG、GSEA)等步骤。

R包加载

library(DESeq2) #差异分析
library(msigdbr) #GSEA
library(clusterProfiler) #GSEA
library(GseaVis) #GSEA
library(limma) #差异分析
library(ggplot2)
library(stringr)
library(tidyverse)
library(immunedeconv) #免疫浸润
library(GSVA) #ssGSEA
library(EnhancedVolcano)#火山图
library(ComplexHeatmap)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)

一、测序数据的获取与预处理

GEO数据库的下载与处理

#方式一:从GEO直接获取RNAseq数据
if(! require("GEOquery")) BiocManager::install("GEOquery",update=F,ask=F)
library(GEOquery)
##下载并且了解你的数据
gset <- getGEO('GSE14323',destdir = '.',getGPL = F)

exprSet=exprs(gset[[1]])
pdata=pData(gset[[1]]) #获取临床信息

#问题解决:如何生成group_list,如何选择需要的样本??
#group_list=
#exprSet=
pdata1=pdata[pdata$source_name_ch1 == 'Normal liver tissue',]
pdata2=pdata[pdata$source_name_ch1=='liver tissue with HCC',]
exprSet=as.data.frame(exprSet)
exprSet1=exprSet[,rownames(pdata1)]
exprSet2=exprSet[,rownames(pdata2)]
#按列合并exprSet1,2
exprSet=cbind(exprSet1,exprSet2)
pdata=rbind(pdata1,pdata2)
group_list=c(rep('normal',19),rep('tumor',38))
group_list <- factor(group_list,levels = c('normal','tumor'))
group_list
save(exprSet,pdata,group_list,file ='lab0113.Rdata')
load('lab0113.Rdata')
#microarray等格式的数据需要进行ID转换
##这个方法直接下载GEO GPL处的注释文件来做转换,肯定是可行的

#GSE14323是用GPL96和GPL571一起的,两个平台注释文件非常相似
#我们选择样本数多的GPL571
#https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL571 Download full table
#开始转换
#载入注释文件,在你自己的文件夹里
#没data.table,geoquery啥的先装一下
#install.package('data.table')
#BiocManager::install('GEOquery',update = F,ask = F)

library(data.table)
anno = fread("D:/R学习/platform/GPL571-17391.txt",data.table = F,sep = '\t')
#看一下列名
colnames(anno)
#一定选ID探针列和gene symbol列,!!!此处是可变的!!!
anno = anno[,c(1,11)]
anno = anno[!anno$`Gene Symbol`== "",]
#无脑跑代码即可
tmp = rownames(exprSet)%in% anno[,1]
exprSet = exprSet[tmp,]
dim(exprSet)
match(rownames(exprSet),anno$ID)
anno = anno[match(rownames(exprSet),anno$ID),]
match(rownames(exprSet),anno$ID)
dim(exprSet)
dim(anno)
tail(sort(table(anno[,2])), n = 12L)
#注释到相同基因的探针,保留表达量最大的那个探针
{
MAX = by(exprSet, anno[,2],
function(x) rownames(x)[ which.max(rowMeans(x))])
MAX = as.character(MAX)
exprSet = exprSet[rownames(exprSet) %in% MAX,]
rownames(exprSet) = anno[ match(rownames(exprSet), anno[,1] ),2]
}
dim(exprSet)
#cel.gz文件合并

library(affy)
library(dplyr)
library(tidyverse)

data.raw <- ReadAffy()
eset <- rma(data.raw)
write.exprs(eset,"probe_exp.txt")

ann <- read.delim2("./GPL11180.txt")##修改注释文件的工作路径,指定到GPL所在文件位置,记得从GEO下载下来删除前几行
ann <- ann[,c("ID","Gene.Symbol")]
ann$Gene.symbol <-str_split_fixed(ann$Gene.Symbol,pattern = "///",3)[,1]
rt <- read.table("./probe_exp.txt",sep = "\t",header = T)
symbol <- merge(ann,rt,by.x = 1,by.y = 1)
symbol <- symbol[-1]
colnames(symbol)[1] <- c("ID")
symbol <- distinct(symbol,ID,.keep_all = T)
rownames(symbol) <- symbol$ID
symbol <- symbol[-1]
colnames(symbol) <- str_split_fixed(colnames(symbol),pattern = "_",2)[,1]
symbol <- cbind(ID=rownames(symbol),symbol)
write.table(symbol,"symbol.txt",sep = "\t",quote = F,row.names = F)

数据读入+基因去重

##方式二:读入自己的测序矩阵

#读入表达矩阵
mrna_expr <- read.table("E:/Research/12.GYY/RNAseq/data/data_CvsS.txt",header = T)

#处理重复的基因(挑选行平均值大的那一行)

#计算行平均值,按降序排列
index=order(rowMeans(mrna_expr[,-1]),decreasing = T)
#调整表达谱的基因顺序
expr_ordered=mrna_expr[index,]
#对于有重复的基因,保留第一次出现的那个,即行平均值大的那个
keep=!duplicated(expr_ordered$gene_name)
#得到最后处理之后的表达谱矩阵
mrna_expr=expr_ordered[keep,]
rownames(mrna_expr) <- mrna_expr$gene_name
mrna_expr <- mrna_expr[,-1]
#读入样本分组信息
#注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的
mrna_sample <- read.csv("E:/Research/12.GYY/RNAseq/data/group_CvsS.csv",header = T,row.names = 1,check.names=F)

人鼠同源基因转换

#HomoloGene包

# install.packages('homologene')
library(homologene)

# 只保留编码基因
coding_gene <- data.table::fread("./input_data/protein-coding-gene.csv") %>% as.data.frame() %>% filter(gene_type == "protein_coding")
genelist<-coding_gene$gene_name

# 随便取点基因看看
genes = genelist[1:10]
genes

#使用homologene函进行转换

#inTax是输入的基因列表所属的物种号,10090是小鼠
#outTax是要转换成的物种号,9606是人
trans <- homologene(genes = genes, #需要转换的基因列表
                    inTax = 9606, #人
                    outTax = 10090 #小鼠
                    )

二、数据清理

删除低表达基因

#对表达量矩阵取整数(DESEQ2处理非整数rawcount时需要)
mrna_expr <- round(mrna_expr)

#根据样本情况选择不同的低表达基因过滤方法

#1、删除表达量全为0的基因
#mrna_expr <- mrna_expr[which(rowSums(mrna_expr)!=0),] 

#2、或保留至少在75%的样本中都有表达的基因
#keep_data <- rowSums(mrna_expr> 0) >= floor(0.75*ncol(mrna_expr))
#table(keep_data)

#3、 或保留每个样本中都有表达的基因(可能会删除低表达基因)
keep_data <- rowSums(mrna_expr> 0) >= floor(ncol(mrna_expr))
#table(keep_data)
mrna_expr <- mrna_expr[keep_data,]

标准化

如后续使用DESeq2进行差异分析,可以跳过此步,用limma差异分析则需要此步log标化

#先看一下样本间齐不齐
boxplot(log2(mrna_expr))
#不齐的话样本间normalize一下(做热图的话可以不normalize)
mrna_expr=normalizeBetweenArrays(mrna_expr)
boxplot(mrna_expr,outline=FALSE, notch=T,col=mrna_sample, las=2)
#log化处理
mrna_expr = log2(mrna_expr+1) 
boxplot(log2(mrna_expr))
#将log化后的负无穷值替换为0
#mrna_expr[mrna_expr == -Inf] = 0 

三、PCA

library(ggplot2)
library(ggpubr)
library(cowplot)

#如果数据是原始 counts,建议先做log转换或归一化(如DESeq2的vst/rlog)再做 PCA
#这里数据已经log化了

expr <- read.csv("patient_merge_count_normed.csv",header = T,row.names = 1,check.names=F)

mrna_sample <- read.csv("./group.csv",header = T,row.names = 1,check.names=F)

# 转置矩阵:PCA需要样本作为行
expr <- t(expr)


# PCA分析 (这里用prcomp)
pca_res <- prcomp(expr, scale. = TRUE)  # scale=TRUE会标准化数据

# 查看方差解释度
summary(pca_res)


library(FactoMineR)   # PCA函数
library(factoextra)   # 画PCA图

# 带分组颜色的PCA图
fviz_pca_ind(pca_res,
             geom.ind = c("point","text"),  # 点 + 样本名
             col.ind = mrna_sample$GROUP, # 按分组上色
             palette = "jco",      # 颜色主题
             addEllipses = TRUE,   # 画置信椭圆
             mean.point = FALSE, # 不画均值点
             repel = TRUE,
             pointsize = 3,     # 固定点的大小
             label = "ind" 
)


#绘制PC1、PC2、PC3三者关系,并列出三张图
.pca <- function(data, is.log) {
  if (is.log)
    data <- data
  else
    data <- log2(data + 1)
  svd <- base::svd(scale(
    x = t(data),
    center = TRUE,
    scale = FALSE
  ))
  percent <- svd$d ^ 2 / sum(svd$d ^ 2) * 100
  percent <-
    sapply(seq_along(percent),
           function(i) {
             round(percent[i], 1)
           })
  return(list(
    sing.val = svd,
    variation = percent))
}

.scatter.density.pc <- function(
    pcs, 
    pc.var, 
    group.name, 
    group, 
    color, 
    strokeSize, 
    pointSize, 
    strokeColor,
    alpha,
    title
){
  pair.pcs <- utils::combn(ncol(pcs), 2)
  pList <- list()
  for(i in 1:ncol(pair.pcs)){
    if(i == 1){
      x <- pair.pcs[1,i]
      y <- pair.pcs[2,i]
      p <- ggplot(mapping = aes(
        x = pcs[,x], 
        y = pcs[,y], 
        fill = group)) +
        xlab(paste0('PC', x, ' (', pc.var[x], '%)')) +
        ylab(paste0('PC', y, ' (', pc.var[y], '%)')) +
        geom_point(
          aes(fill = group), 
          pch = 21, 
          color = strokeColor, 
          stroke = strokeSize, 
          size = pointSize,
          alpha = alpha) +
        scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
        scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
        ggtitle(title) +
        theme(
          legend.position = "right",
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black", size = 1.1),
          legend.background = element_blank(),
          legend.text = element_text(size = 12),
          legend.title = element_text(size = 14),
          legend.key = element_blank(),
          axis.text.x = element_text(size = 10),
          axis.text.y = element_text(size = 10),
          axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14)) +
        guides(fill = guide_legend(override.aes = list(size = 4))) + 
        scale_fill_manual(name = group.name, values = color)
      
      le <- ggpubr::get_legend(p)
    }else{
      x <- pair.pcs[1,i]
      y <- pair.pcs[2,i]
      p <- ggplot(mapping = aes(
        x = pcs[,x], 
        y = pcs[,y], 
        fill = group)) +
        xlab(paste0('PC', x, ' (',pc.var[x], '%)')) +
        ylab(paste0('PC', y, ' (',pc.var[y], '%)')) +
        geom_point(
          aes(fill = group), 
          pch = 21, 
          color = strokeColor, 
          stroke = strokeSize,
          size = pointSize,
          alpha = alpha) +
        scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
        scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
        theme(
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black", size = 1.1),
          legend.position = "none",
          axis.text.x = element_text(size = 10),
          axis.text.y = element_text(size = 10),
          axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14)) +
        scale_fill_manual(values = color, name = group.name)
    }
    p <- p + theme(legend.position = "none")
    xdens <- cowplot::axis_canvas(p, axis = "x")+
      geom_density(
        mapping = aes(
          x = pcs[,x], 
          fill = group),
        alpha = 0.7, 
        size = 0.2
      ) +
      theme(legend.position = "none") +
      scale_fill_manual(values = color)
    
    ydens <- cowplot::axis_canvas(
      p, 
      axis = "y", 
      coord_flip = TRUE) +
      geom_density(
        mapping = aes(
          x = pcs[,y],
          fill = group),
        alpha = 0.7,
        size = 0.2) +
      theme(legend.position = "none") +
      scale_fill_manual(name = group.name, values = color) +
      coord_flip()
    
    p1 <- insert_xaxis_grob(
      p,
      xdens,
      grid::unit(.2, "null"),
      position = "top"
    )
    p2 <- insert_yaxis_grob(
      p1,
      ydens,
      grid::unit(.2, "null"),
      position = "right"
    )
    pList[[i]] <- ggdraw(p2)
  }
  pList[[i+1]] <- le
  return(pList)
}



pca.ncg<-.pca(data = expr,
              is.log = TRUE)

.scatter.density.pc(pcs = pca.ncg$sing.val$v[, 1:3],
                    pc.var = pca.ncg$variation,
                    strokeColor = 'gray30',
                    strokeSize = 0.2,
                    pointSize = 2,
                    alpha = 0.6,
                    title = "A",
                    group.name = "B",
                    group=mrna_sample$GROUP,
                    color=c("red","blue","green")) -> p

do.call(
  gridExtra::grid.arrange,
  c(p,ncol=4))

四、差异分析

DESeq2差异分析(适用于rawcount)

获取差异基因

dds <- DESeqDataSetFromMatrix(countData = expr_data,
                                colData = sample_data,
                                design = ~ group)
dds <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)

nrDEG <- results(dds, contrast =c("group","DP","DSS")) #指定实验/对照组

## 转化数据框
DEG_DESeq2 <- as.data.frame(nrDEG)

DEG_DESeq2 = DEG_DESeq2[order(DEG_DESeq2$log2FoldChange),] # 按照logFC排序
DEG_DESeq2[1:3,1:4]

#如果一行包含一个具有极端计数异常值的样本,则p值和调整后的p值将被设置为NA(https://cloud.tencent.com/developer/article/2327198)
#可以删除缺失行进行过滤
DEG_DESeq2 <- na.omit(DEG_DESeq2)

write.csv(nrDEG,file = "./DEG_DESeq2.csv")

#筛选差异基因(标准为adj.P<0.05,logFC绝对值大于1)
allDiffsig=DEG_DESeq2[DEG_DESeq2$padj<0.05,]
allDiffsig=allDiffsig[allDiffsig$log2FoldChange >1 | allDiffsig$log2FoldChange < -1,]
#显著上调的基因
up_gene=allDiffsig[allDiffsig$log2FoldChange>0,]
#显著下调的基因
down_gene=allDiffsig[allDiffsig$log2FoldChange<0,]

获取标化后的表达矩阵

#1、提取归一化数据
norm_data <- as.data.frame(counts(dds,normalized=TRUE))
head(norm_data)
#文件保存,得到标准化的结果
write.csv(norm_data,file = "./DEseq2_normalized.csv",row.names = FALSE)
#注意:DESeq2的标准化主要是为了消除样本间文库大小的差异,输出的标准化计数可用于样本间的表达量比较,但不能直接用于计算基因的表达相关性


#2、如要绘制热图,可以对整个表达矩阵进行VST转换,再对感兴趣的基因按行进行Z-score标准化

# 1) 对整个表达矩阵进行VST转换
vsd <- vst(dds, blind = FALSE)
all_vst_data <- assay(vsd)

# 2) 从转换后的数据中提取感兴趣的基因
interest_genes <- c("Cd80","Cd274","CD2","Ptgs2","Il1rn","Il1b","Vegfa","Ccl3","Cxcl2","Il16","Tnf")  
subset_vst <- all_vst_data[rownames(all_vst_data) %in% interest_genes, ]

# 3) 对提取的基因子集进行按行Z-score标准化
subset_zscore <- t(scale(t(subset_vst)))

# 4) 绘制热图
pheatmap(subset_zscore,
         color = colorRampPalette(c("#2166AC", "#f7f7f7", "#B2182B"))(100),
         breaks = seq(-2, 2, length.out = 100),
         fontsize_row = 8,
         border_color = NA)

limma差异分析

#构建分组矩阵
design <- model.matrix(~factor(mrna_sample$GROUP) + 0)
colnames(design) <- c("CART","GSCART")
#构建比较矩阵
contrast.matrix <- makeContrasts(GSCART - CART, levels = factor(mrna_sample$GROUP))#实验组vs对照组
#线性混合模拟
fit <- lmFit(mrna_expr,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异分析结果
RNA_DEG <- topTable(fit2,
                    #adjust = 'BH',
                    coef = 1,
                    n = Inf,
                    sort.by="logFC"
                    #p.value = 0.05
)
#筛选差异基因
allDiffsig=RNA_DEG[RNA_DEG$P.Val<0.05,]
allDiffsig=allDiffsig[allDiffsig$logFC >1 | allDiffsig$logFC < -1,]
#显著上调的基因
up=allDiffsig[allDiffsig$logFC>0,]
#显著下调的基因
down=allDiffsig[allDiffsig$logFC<0,]

#RNA_DEG = RNA_DEG[order(RNA_DEG$logFC),] # 按照logFC排序
write.csv(RNA_DEG,"./DEG_limma.csv")

#导出标化后的表达矩阵
write.csv(mrna_expr,"./RNA_norm.csv")

特殊情况:配对样本差异分析

rm(list=ls())
setwd("E:/Research/5.CRC_liver/Analysis-202304/data/Diff_analysis")#设置工作路径

#读取总数据
data = read.csv('Input data.csv',header=TRUE,row.names=1)
group = read.csv('group.csv', header=TRUE, row.names=1) 
individuals = read.csv('individual.csv', header=TRUE, row.names=1) 
#提取MO+MM
data_MMMO <- data[,c(1:47,125:171)]
group_MMMO <- group[c(1:47,125:171),]
individuals_MMMO <- individuals[c(1:47,125:171),]
groupID_MMMO <- factor(group_MMMO,levels = unique(group_MMMO))
individualsID_MMMO <- factor(individuals_MMMO,levels = unique(individuals_MMMO))

#整理分组矩阵
design_MMMO_paried <- model.matrix(~ individualsID_MMMO + groupID_MMMO)

fit_MMMO <- lmFit(data_MMMO,design_MMMO_paried)
fit_MMMO <- eBayes(fit_MMMO)
#差异分析
allDiff_MMMO_paired <- topTable(fit_MMMO,
                           adjust = 'BH',
                           coef = ncol(design_MMMO_paried),
                           n = Inf)
p_MMMO <- EnhancedVolcano(allDiff_MMMO_paired,
                          lab = rownames(allDiff_MMMO_paired),
                          x = 'logFC',
                          y =  'P.Value',
                          title = 'paired MM VS MO',
                          pointSize = 3.0,
                          labSize = 6.0,
                          legendPosition = 'right',
                          pCutoff = 0.05,
                          FCcutoff = 1)
p_MMMO
write.csv(allDiff_MMMO_paired,"allDiff_MMMO_paired.csv")

五、差异基因展示

火山图

p_MONO <- EnhancedVolcano(RNA_DEG,
                          lab = rownames(RNA_DEG),
                          x = 'logFC',
                          y =  'P.Value',
                          xlim = c(-5, 5), #x轴范围
                          ylim = c(0, 6.5), #y轴范围
                          title = 'GS_CART vs CART',
                          #selectLab = c("CCR5"), #填写你想标注的基因
                          col = c("grey30", "#458c45", "#3c61a3", "#d4372a"),
                          pointSize = 3.0,
                          labSize = 6.0,
                          legendPosition = 'right',
                          pCutoff = 0.05,
                          FCcutoff = 1,
                          gridlines.major = FALSE,     # 背景网格
                          gridlines.minor = FALSE,
                          legendLabSize = 8,
                          legendIconSize = 8,
                          titleLabSize =12,subtitle = NULL)

p_MONO

热图1:pheatmap

rt=mrna_expr[c('MAP2K3','FOS','LIF','TBX21',"TNFRSF9","CD40LG",
               "IL2","IFNG","TNF","IL1A","IL3","IL4","IL9","IL22",
               "GZMB","CRTAM","IL18RAP","CD160","CAMK2D",
               "XCL1","XCL2","CCL1","CCL4"),] #"IL21","IL17A","IL31"不在里面
rt=na.omit(rt)

write.csv(rt,"E:/Research/12.GYY/RNAseq/data/heatmap_gene.csv")

anno=data.frame(row.names = colnames(rt),group=mrna_sample)

library(pheatmap)
pheatmap(rt,annotation_col = anno,scale = 'row',#针对行进行标化
         cluster_cols = F,
         show_rownames = T, show_colnames = F,
color = colorRampPalette(c(rep("blue",1), "white", rep("red",1)))(100),
breaks = seq(-3,3,length.out = 100),border_color = NA)

热图2:ComplexHeatmap

#ComplexHeatmap包并不会对数据进行标准化,为了让图形更好看,先手动进行标准化
dt <- apply(rt, 1, scale)
rownames(dt) <- colnames(rt)
dt <- t(dt)
#scale函数是按列归一化,对于我们一般习惯基因名为行,样本名为列的数据框,就需要进行转置
#配色自定义:
mycol <- colorRamp2(c(-2, 0, 2), c("#0da9ce", "white", "#e74a32"))

#格子大小设置(按个人需求):
cellwidth = 1
cellheight = 0.35
cn = dim(dt)[2]
rn = dim(dt)[1]
w = cellwidth*cn
h = cellheight*rn

#注释配色自定义:
c4a_gui()
cols <- c4a('pastel1',6)
df <- data.frame(colnames(dt))
colnames(df) <- 'cell'
cols;df
#注释颜色添加:
row_cols <- setNames(c("#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#CCEBC5","#CCEBC5", "#CCEBC5", "#CCEBC5", "#CCEBC5", "#DECBE4", "#DECBE4" ,"#DECBE4" ,"#DECBE4"), rownames(dt))
row_cols
#生成注释信息:
col_anno <- HeatmapAnnotation(df = df,
                             show_annotation_name = F,
                             gp = gpar(col = 'white', lwd = 2),
                             col = list(cell = c(
                               'P1_C' = "#FBB4AE",
                               'P2_C' = "#B3CDE3",
                               'P3_C' = "#CCEBC5",
                               'P1_S' = "#DECBE4",
                               'P2_S' = "#FED9A6",
                               'P3_S' = "#FFFFCC" 
                              )))
#生成注释信息:
row_anno <- rowAnnotation(foo = anno_text(rownames(dt),
                                location = 0,
                                just = "left",
                                gp = gpar(fill = row_cols,
                                            col = "black",
                                            fontface = 'italic'),
                                width = max_text_width(rownames(dt))*1.25))

#绘图:
Heatmap(as.matrix(dt),
        width = unit(w, "cm"),
        height = unit(h, "cm"),
        name = 'expression',
        col = mycol,
        cluster_columns = F,
        cluster_rows = F,
        column_names_side = c('top'),
        column_names_rot = 60,
        row_title = 'T cell key markers',
        rect_gp = gpar(col = "white", lwd = 1.5),
        heatmap_legend_param = list(legend_height = unit(2.8, "cm"),
                            grid_width = unit(0.4, "cm"),
                            labels_gp = gpar(fontsize = 10)),
        top_annotation = col_anno, #添加列注释色块
        row_split = c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","C","C", "C", "C", "C", "D", "D" ,"D" ,"D"), #对行切片
        row_gap = unit(2, "mm"),
        border = T,
        border_gp = gpar(col = "black", lwd = 1.2)) +
  row_anno #热图中添加背景注释色块

六、富集分析

KEGG富集分析

#取出显著上调的基因名(下调同理)
up_gene <- rownames(up_gene)

#ID转化
up_gene_entrezid <- bitr(up_gene, fromType = "SYMBOL",
                           toType = c( "ENTREZID"),
                           OrgDb = org.Mm.eg.db) #鼠是org.Mm.eg.db,人是org.Hs.eg.db

#KEGG富集分析
KEGGenrich_up <- enrichKEGG(gene = up_gene_entrezid$ENTREZID,
                              organism = 'mmu', #鼠是mmu,人是hsa
                              #universe = gene_all,
                              pvalueCutoff = 1,
                              qvalueCutoff = 1)
#画图:KEGG-上调
library(ggsci)
kegg_up=KEGGenrich_up[KEGGenrich_up$pvalue<0.05,]
#柱状图
ggplot(data=kegg_up, aes(x=Description,y=GeneRatio)) +
  geom_bar(stat="identity",fill='#BC3C29') +
  xlab("KEGG_up") + ylab("GeneRatio") + theme_bw()+
  theme(axis.text.x=element_text(angle=0)) +
  scale_fill_nejm()+scale_color_nejm()+coord_flip()

#气泡图
dotplot(KEGGenrich_up)+scale_color_continuous(low='#bc3c29', high='#dadada')

GO富集分析

#上调基因
GOenrich_up <- enrichGO(gene = up_gene_entrezid$ENTREZID,
            OrgDb = 'org.Hs.eg.db',
            ont = 'All',
            #universe = gene_all,
            pvalueCutoff = 1,
            qvalueCutoff = 1)
GOenrich_up=DOSE::setReadable(GOenrich_up,OrgDb='org.Hs.eg.db',keyType='ENTREZID')

#下调基因
GOenrich_down <- enrichGO(gene = down_gene_entrezid$ENTREZID,
            OrgDb = 'org.Hs.eg.db',
            ont = 'All',
            #universe = gene_all,
            pvalueCutoff = 1,
            qvalueCutoff = 1)
GOenrich_down=DOSE::setReadable(GOenrich_down,OrgDb='org.Hs.eg.db',keyType='ENTREZID')


GO_up=GOenrich_up@result
GO_down=GOenrich_down@result


write.csv(GO_up,'./GO_up.csv',quote = F)
write.csv(GO_down,'./GO_down.csv',quote = F)
#画图:GO上调
GO_up=GO_up[GO_up$pvalue<0.05,]
#柱状图
ggplot(data=GO_up, aes(x=Description,y=GeneRatio)) +
  geom_bar(stat="identity",fill='#BC3C29') +
  xlab("GO_up") + ylab("GeneRatio") + theme_bw()+
  theme(axis.text.x=element_text(angle=0)) +
  scale_fill_nejm()+scale_color_nejm()+coord_flip()

dotplot(GOenrich_up)+scale_color_continuous(low='#bc3c29', high='#dadada')
#画图:GO下调

GO_down=GO_down[GO_down$pvalue<0.05,]
#柱状图
ggplot(data=GO_down, aes(x=Description,y=GeneRatio)) +
  geom_bar(stat="identity",fill='#BC3C29') +
  xlab("GO_up") + ylab("GeneRatio") + theme_bw()+
  theme(axis.text.x=element_text(angle=0)) +
  scale_fill_nejm()+scale_color_nejm()+coord_flip()

dotplot(GOenrich_down)+scale_color_continuous(low='#bc3c29', high='#dadada')

GSEA富集分析

读懂GSEA富集曲线:

https://mp.weixin.qq.com/s/_zoBcLUQ3NKKfSKHld9HVA

library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
library(ggplot2)
library(aplot)

GSEA需要两个文件,一个是基因列表(包括基因名及其logFC),另一个是基因通路gmt文件

#先从之前的差异基因列表提取基因名及logFC

#1)从差异基因列表导入基因(DESeq2)

DEG_DESeq2 <- read.csv("DEG_DESeq2.csv",header = T,row.names = 1,check.names=F)

DEG_DESeq2$gene_name <- rownames(DEG_DESeq2)
gene <- DEG_DESeq2[,c("gene_name","log2FoldChange")]
gene <- arrange(gene,desc(log2FoldChange)) 


#2)从差异基因列表导入基因(limma)
RNA_DEG <- read.csv("DEG_limma.csv",header = T,row.names = 1,check.names=F)

RNA_DEG$gene_name <- rownames(RNA_DEG)
RNA_DEG$log2FoldChange <- RNA_DEG$logFC
gene <- RNA_DEG[,-c(1:6)]
gene <- arrange(gene,desc(log2FoldChange)) #按照LogFC降序排列

#从txt文件读入
#gene <- read.table('./limma_DEG.txt',header = T) %>%  arrange(desc(log2FoldChange))

# check
head(gene,3)

#生成genelist
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# check
head(geneList,3)

#接下来选择不同的数据库,获取通路基因集

GO_BP数据库

######### 1、选择GO_BP数据库 ##############
# choose C5 BP gene sets
genesets <-  msigdbr(species = "Homo sapiens",#Mus musculus
                     category = "C5",
                     subcategory = "BP")
# check
head(genesets,3)

# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
# GO enrich
gseaRes <- GSEA(geneList = geneList,
                TERM2GENE = gmt,
                minGSSize    = 10,
                maxGSSize    = 500,
                pvalueCutoff = 1,
                pAdjustMethod = "BH", #一般选择 "BH" 或 "fdr",BH较严格,fdr较温和(计算的q小些)
                by = "fgsea", #"fgsea"或“DOSE”,传统的GSEA方法很慢,可以通过对随机的基因集抽样解决这个问题,也就是 a fast gene set enrichment analysis (FGSEA) 
                #nPerm = 2000,
                verbose      = FALSE)

# to dataframe
data_ga <- data.frame(gseaRes) %>%
  filter(pvalue < 0.05)

View(data_ga)

write.csv(data_ga, file = "./gsea_result_gobp.csv")

KEGG_BP数据库

######### 2、选择KEGG_BP数据库 ##############
#h <- msigdbr(species = "Mus musculus")
#table(h$gs_subcat) #查看GO;KEGG等基因集
#h2 <- h[grep(pattern = "TP53",x = h$gs_name),]  #pattern参数内输入想要寻找的关键词

#获取KEGG基因集
genesets <-  msigdbr(species = "Homo sapiens",#Mus musculus
                     category = "C2",
                     subcategory = "KEGG")

# check
head(genesets,3)

# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)


# KEGG enrich
gseaRes <- GSEA(geneList = geneList,
                TERM2GENE = gmt,
                minGSSize    = 10,
                maxGSSize    = 500,
                pvalueCutoff = 1,
                pAdjustMethod = "BH",
                verbose      = FALSE)

# to dataframe
data_ga <- data.frame(gseaRes) %>%
  filter(pvalue < 0.05)

write.csv(data_ga, file = "E:/Research/12.GYY/RNAseq/result/gsea_result_kegg.csv")

HALLMARK数据库或自建基因集

######### 3、选择HALLMARK数据库或自建基因集 ##############

#HALLMARK数据库
genesets <-  msigdbr(species = "Homo sapiens",
                     category = "H")

# 1)从HALLMARK获取gmt
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)


# 2)导入自己构建的gmt
gmt <- read.table("./AEos_gene.txt")


# load gene
gene <- read.table('./limma_DEG.txt',header = T) %>%
  arrange(desc(log2FoldChange))

geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name

# KEGG enrich
gseaRes <- GSEA(geneList = geneList,
                TERM2GENE = gmt,
                minGSSize    = 10,
                maxGSSize    = 500,
                pvalueCutoff = 1,
                pAdjustMethod = "BH",
                verbose      = FALSE)



# to dataframe
data_ga <- data.frame(gseaRes) %>%
  filter(pvalue < 1)

data_ga <- write.csv(data_ga, file = "./AEos_gene_result.csv")#

GSEA画图

#### GseaVis 可视化 

# install.packages("devtools")
#devtools::install_github("junjunlab/GseaVis")
library(GseaVis)
library(jjAnno)
# retain curve and heatmap

#一次性显示多条通路
geneSetID = c('GOBP_LEUKOCYTE_MIGRATION_INVOLVED_IN_INFLAMMATORY_RESPONSE',
              'GOBP_FC_GAMMA_RECEPTOR_SIGNALING_PATHWAY',
              'GOBP_POSITIVE_REGULATION_OF_CELL_KILLING',
              'GOBP_INTERFERON_GAMMA_PRODUCTION')

# all plot
gseaNb(object = gseaRes,
       geneSetID = geneSetID,
       subPlot = 2,
       termWidth = 35,
       #legend.position = c(0.8,0.8),
       #addGene = gene,
       addPval = T,
       pvalX = 0.05,pvalY = 0.05,
       curveCol = jjAnno::useMyCol('ironMan',10))



#### 单个通路
gseaNb(object = gseaRes,
       geneSetID = 'GOBP_DNA_REPLICATION_DEPENDENT_CHROMATIN_ORGANIZATION',
       subPlot = 2,
       termWidth = 90,
       addPval = T,
       pvalX = 0.75,pvalY = 0.8,
       pCol = 'black',
       pHjust = 0.5,
       nesDigit = 3,
       pDigit = 4)


#MDSC
gseaNb(object = gseaRes,
       geneSetID = 'AEos',
       subPlot = 2,
       addPval = T,
       pvalX = 0.75,pvalY = 0.8,
       pCol = 'black',
       pHjust = 0)



gseaNb(object = gseaRes,
       geneSetID = 'MDSC',
       subPlot = 2,
       addPval = T,
       pvalX = 0.75,pvalY = 0.8,
       pCol = 'black',
       pHjust = 0,
       addGene = T,
       markTopgene= T,
       topGeneN = 10,
       geneCol = '#4d4d4d')

七、GSVA打分

library(GSVA)

# 导入自己构建的gmt(可以从GSEA官网下载gmt,转为txt)
Eos_gmt <- read.table("./EOSINOPHIL.txt",header = T)
#来自GSEA:NAKAJIMA_EOSINOPHIL
CD8_gmt <- read.table("./CD8.txt",header = T)
#来自GSEA:GOBP_T_CELL_MEDIATED_CYTOTOXICITY.v2025.1.Hs

#读入表达矩阵
mrna_expr <- read.csv("./patient_merge_count_normed.csv",header = T,row.names = 1)
mrna_expr <- as.matrix(mrna_expr)

#GSVA
Eos_ssGSEA = gsva(mrna_expr, Eos_gmt, method="ssgsea",
                       min.sz=1, max.sz=Inf,
                       kcdf="Gaussian"
)

CD8_ssGSEA = gsva(mrna_expr, CD8_gmt, method="ssgsea",
                  min.sz=1, max.sz=Inf,
                  kcdf="Gaussian"
)

#注意:使用tpm值或fpkm值时需要将kcdf设置为Gaussian,使用counts输入时需要设置为Poisson

八、免疫浸润分析

#使用IOBR包进行分析


library(IOBR)
mrna_expr <- read.csv("./patient_merge_count_normed.csv",header = T,row.names = 1)


#MCPcounter
im_mcpcounter<-deconvo_tme(eset=mrna_expr,
                           method="mcpcounter"
)


#EPIC
im_epic<-deconvo_tme(eset=mrna_expr,
                       method="epic",
                       arrays=F
)

#xcell
im_xcell<-deconvo_tme(eset=mrna_expr,
                        method="xcell",
                        arrays=F
)
write.csv(im_xcell,file = "./tme_xcell_merge.csv")

#CIBERSORT
im_cibersort<-deconvo_tme(eset=mrna_expr,
                            method="cibersort",
                            arrays=F,
                            perm=1000
)

#IPS
im_ips<-deconvo_tme(eset=mrna_expr,
                      method="ips",
                      plot=F
)

#quanTIseq
im_quantiseq<-deconvo_tme(eset=mrna_expr,
                            method="quantiseq",
                            scale_mrna=T
)

#ESTIMATE
im_estimate<-deconvo_tme(eset=mrna_expr,
                           method="estimate"
)




##基于ssGSEA计算免疫浸润分数
load(file="./ssGSEA28.Rdata")
im_ssgsea<-calculate_sig_score(eset=mrna_expr
                               ,signature=cellMarker#这个28种细胞的文件需要自己准备
                               ,method="ssgsea"#选这个就好了
)



tme_combine<-im_mcpcounter%>%
  inner_join(im_epic,by="ID")%>%
  inner_join(im_xcell,by="ID")%>%
  inner_join(im_cibersort,by="ID")%>%
  inner_join(im_ips,by="ID")%>%
  inner_join(im_quantiseq,by="ID")%>%
  inner_join(im_estimate,by="ID")%>%
  inner_join(im_ssgsea,by="ID")


write.csv(tme_combine,file = "./tme_combine.csv")

Last updated