bulk RNAseq
By Kaiyi Fu
主要包含RNA测序数据下载与预处理、差异分析、富集分析(GO、KEGG、GSEA)等步骤。
R包加载
library(survival) #生存曲线
library(survminer) #生存曲线
library(forestmodel) #森林图
library(DESeq2) #差异分析
library(msigdbr) #GSEA
library(clusterProfiler) #GSEA
library(GseaVis) #GSEA
library(limma) #差异分析
library(ggplot2)
library(stringr)
library(tidyverse)
library(mice) #填补缺失值
library(maftools) #CNV分析
library(immunedeconv) #免疫浸润
library(CancerSubtypes) #癌症分型
library(GenVisR) #Maf可视化
library(GSVA) #ssGSEA
library(EnhancedVolcano)#火山图
library(ComplexHeatmap)
library(circlize)
library(cols4all)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
library(aplot)
一、测序数据的获取与预处理
GEO数据库的下载与处理
#方式一:从GEO直接获取RNAseq数据
if(! require("GEOquery")) BiocManager::install("GEOquery",update=F,ask=F)
library(GEOquery)
##下载并且了解你的数据
gset <- getGEO('GSE14323',destdir = '.',getGPL = F)
exprSet=exprs(gset[[1]])
pdata=pData(gset[[1]]) #获取临床信息
#最重要!!!
#问题解决:如何生成group_list,如何选择需要的样本??
#group_list=
#exprSet=
pdata1=pdata[pdata$source_name_ch1 == 'Normal liver tissue',]
pdata2=pdata[pdata$source_name_ch1=='liver tissue with HCC',]
exprSet=as.data.frame(exprSet)
exprSet1=exprSet[,rownames(pdata1)]
exprSet2=exprSet[,rownames(pdata2)]
#按列合并exprSet1,2
exprSet=cbind(exprSet1,exprSet2)
pdata=rbind(pdata1,pdata2)
group_list=c(rep('normal',19),rep('tumor',38))
group_list <- factor(group_list,levels = c('normal','tumor'))
group_list
save(exprSet,pdata,group_list,file ='lab0113.Rdata')
load('lab0113.Rdata')
#microarray等格式的数据需要进行ID转换
##这个方法直接下载GEO GPL处的注释文件来做转换,肯定是可行的
#GSE14323是用GPL96和GPL571一起的,两个平台注释文件非常相似
#我们选择样本数多的GPL571
#https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL571 Download full table
#开始转换
#载入注释文件,在你自己的文件夹里
#没data.table,geoquery啥的先装一下
#install.package('data.table')
#BiocManager::install('GEOquery',update = F,ask = F)
library(data.table)
anno = fread("D:/R学习/platform/GPL571-17391.txt",data.table = F,sep = '\t')
#看一下列名
colnames(anno)
#一定选ID探针列和gene symbol列,!!!此处是可变的!!!
anno = anno[,c(1,11)]
anno = anno[!anno$`Gene Symbol`== "",]
#无脑跑代码即可
tmp = rownames(exprSet)%in% anno[,1]
exprSet = exprSet[tmp,]
dim(exprSet)
match(rownames(exprSet),anno$ID)
anno = anno[match(rownames(exprSet),anno$ID),]
match(rownames(exprSet),anno$ID)
dim(exprSet)
dim(anno)
tail(sort(table(anno[,2])), n = 12L)
#注释到相同基因的探针,保留表达量最大的那个探针
{
MAX = by(exprSet, anno[,2],
function(x) rownames(x)[ which.max(rowMeans(x))])
MAX = as.character(MAX)
exprSet = exprSet[rownames(exprSet) %in% MAX,]
rownames(exprSet) = anno[ match(rownames(exprSet), anno[,1] ),2]
}
dim(exprSet)
#cel.gz文件合并
library(affy)
library(dplyr)
library(tidyverse)
data.raw <- ReadAffy()
eset <- rma(data.raw)
write.exprs(eset,"probe_exp.txt")
ann <- read.delim2("./GPL11180.txt")##修改注释文件的工作路径,指定到GPL所在文件位置,记得从GEO下载下来删除前几行
ann <- ann[,c("ID","Gene.Symbol")]
ann$Gene.symbol <-str_split_fixed(ann$Gene.Symbol,pattern = "///",3)[,1]
rt <- read.table("./probe_exp.txt",sep = "\t",header = T)
symbol <- merge(ann,rt,by.x = 1,by.y = 1)
symbol <- symbol[-1]
colnames(symbol)[1] <- c("ID")
symbol <- distinct(symbol,ID,.keep_all = T)
rownames(symbol) <- symbol$ID
symbol <- symbol[-1]
colnames(symbol) <- str_split_fixed(colnames(symbol),pattern = "_",2)[,1]
symbol <- cbind(ID=rownames(symbol),symbol)
write.table(symbol,"symbol.txt",sep = "\t",quote = F,row.names = F)
自测序数据
##方式二:读入自己的测序矩阵
#读入表达矩阵
mrna_expr <- read.table("E:/Research/12.GYY/RNAseq/data/data_CvsS.txt",header = T)
#处理重复的基因:挑选行平均值大的那一行
#计算行平均值,按降序排列
index=order(rowMeans(mrna_expr[,-1]),decreasing = T)
#调整表达谱的基因顺序
expr_ordered=mrna_expr[index,]
#对于有重复的基因,保留第一次出现的那个,即行平均值大的那个
keep=!duplicated(expr_ordered$gene_name)
#得到最后处理之后的表达谱矩阵
mrna_expr=expr_ordered[keep,]
rownames(mrna_expr) <- mrna_expr$gene_name
mrna_expr <- mrna_expr[,-1]
#读入样本分组信息
#注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的
mrna_sample <- read.csv("E:/Research/12.GYY/RNAseq/data/group_CvsS.csv",header = T,row.names = 1,check.names=F)
二、数据清理
删除低表达基因
#对表达量矩阵取整数(DESEQ2处理非整数rawcount时需要)
mrna_expr <- round(mrna_expr)
# 保留每个样本中都有表达的基因
keep_data <- rowSums(mrna_expr> 0) >= floor(ncol(mrna_expr))
#table(keep_data)
mrna_expr <- mrna_expr[keep_data,]
#删除表达量为0的基因
#mrna_expr <- mrna_expr[which(rowSums(mrna_expr)!=0),]
# 或保留至少在75%的样本中都有表达的基因
#keep_data <- rowSums(mrna_expr> 0) >= floor(0.75*ncol(mrna_expr))
#table(keep_data)
标准化
#先看一下样本间齐不齐
boxplot(log2(mrna_expr))
#不齐的话样本间normalize一下(做热图的话可以不normalize)
mrna_expr=normalizeBetweenArrays(mrna_expr)
boxplot(mrna_expr,outline=FALSE, notch=T,col=mrna_sample, las=2)
#log化处理
mrna_expr = log2(mrna_expr+1)
boxplot(log2(mrna_expr))
#将log化后的负无穷值替换为0
#mrna_expr[mrna_expr == -Inf] = 0
三、差异分析
DESeq2差异分析(适用于rawcount)
dds_2 <- DESeqDataSetFromMatrix(countData = expr_data,
colData = sample_data,
design = ~ group)
dds_2$condition <- relevel(dds_2$group, ref = "DSS") # 指定哪一组作为对照
dds_2 <- DESeq(dds_2, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
nrDEG_2 <- as.data.frame(results(dds_2))
nrDEG_2 = nrDEG_2[order(nrDEG_2$log2FoldChange),] # 按照logFC排序
nrDEG_2[1:3,1:4]
write.csv(nrDEG_2,file = "./DESeq2_DEG.csv")
limma差异分析
#构建分组矩阵
design <- model.matrix(~factor(mrna_sample$GROUP) + 0)
colnames(design) <- c("CART","GSCART")
#构建比较矩阵
contrast.matrix <- makeContrasts(GSCART - CART, levels = factor(mrna_sample$GROUP))#实验组vs对照组
#线性混合模拟
fit <- lmFit(mrna_expr,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异分析结果
RNA_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
#筛选差异基因
allDiffsig=RNA_DEG[RNA_DEG$P.Val<0.05,]
allDiffsig=allDiffsig[allDiffsig$logFC >1 | allDiffsig$logFC < -1,]
#显著上调的基因
up=allDiffsig[allDiffsig$logFC>0,]
#显著下调的基因
down=allDiffsig[allDiffsig$logFC<0,]
#RNA_DEG = RNA_DEG[order(RNA_DEG$logFC),] # 按照logFC排序
write.csv(RNA_DEG,"E:/Research/12.GYY/RNAseq/result/GvsC_DEG_limma.csv")
write.csv(mrna_expr,"E:/Research/12.GYY/RNAseq/result/GvsC_RNA_norm.csv")
特殊情况:配对样本差异分析
rm(list=ls())
setwd("E:/Research/5.CRC_liver/Analysis-202304/data/Diff_analysis")#设置工作路径
#读取总数据
data = read.csv('Input data.csv',header=TRUE,row.names=1)
group = read.csv('group.csv', header=TRUE, row.names=1)
individuals = read.csv('individual.csv', header=TRUE, row.names=1)
#提取MO+MM
data_MMMO <- data[,c(1:47,125:171)]
group_MMMO <- group[c(1:47,125:171),]
individuals_MMMO <- individuals[c(1:47,125:171),]
groupID_MMMO <- factor(group_MMMO,levels = unique(group_MMMO))
individualsID_MMMO <- factor(individuals_MMMO,levels = unique(individuals_MMMO))
#整理分组矩阵
design_MMMO_paried <- model.matrix(~ individualsID_MMMO + groupID_MMMO)
fit_MMMO <- lmFit(data_MMMO,design_MMMO_paried)
fit_MMMO <- eBayes(fit_MMMO)
#差异分析
allDiff_MMMO_paired <- topTable(fit_MMMO,
adjust = 'BH',
coef = ncol(design_MMMO_paried),
n = Inf)
p_MMMO <- EnhancedVolcano(allDiff_MMMO_paired,
lab = rownames(allDiff_MMMO_paired),
x = 'logFC',
y = 'P.Value',
title = 'paired MM VS MO',
pointSize = 3.0,
labSize = 6.0,
legendPosition = 'right',
pCutoff = 0.05,
FCcutoff = 1)
p_MMMO
write.csv(allDiff_MMMO_paired,"allDiff_MMMO_paired.csv")
四、差异基因展示
火山图的绘制
p_MONO <- EnhancedVolcano(RNA_DEG,
lab = rownames(RNA_DEG),
x = 'logFC',
y = 'P.Value',
xlim = c(-5, 5), #x轴范围
ylim = c(0, 6.5), #y轴范围
title = 'GS_CART vs CART',
#selectLab = c("CCR5"), #填写你想标注的基因
col = c("grey30", "#458c45", "#3c61a3", "#d4372a"),
pointSize = 3.0,
labSize = 6.0,
legendPosition = 'right',
pCutoff = 0.05,
FCcutoff = 1,
gridlines.major = FALSE, # 背景网格
gridlines.minor = FALSE,
legendLabSize = 8,
legendIconSize = 8,
titleLabSize =12,subtitle = NULL)
p_MONO
热图1:pheatmap
rt=mrna_expr[c('MAP2K3','FOS','LIF','TBX21',"TNFRSF9","CD40LG",
"IL2","IFNG","TNF","IL1A","IL3","IL4","IL9","IL22",
"GZMB","CRTAM","IL18RAP","CD160","CAMK2D",
"XCL1","XCL2","CCL1","CCL4"),] #"IL21","IL17A","IL31"不在里面
rt=na.omit(rt)
write.csv(rt,"E:/Research/12.GYY/RNAseq/data/heatmap_gene.csv")
anno=data.frame(row.names = colnames(rt),group=mrna_sample)
library(pheatmap)
pheatmap(rt,annotation_col = anno,scale = 'row',#针对行进行标化
cluster_cols = F,
show_rownames = T, show_colnames = F,
color = colorRampPalette(c(rep("blue",1), "white", rep("red",1)))(100),
breaks = seq(-3,3,length.out = 100),border_color = NA)
热图2:ComplexHeatmap
#ComplexHeatmap包并不会对数据进行标准化,为了让图形更好看,先手动进行标准化
dt <- apply(rt, 1, scale)
rownames(dt) <- colnames(rt)
dt <- t(dt)
#scale函数是按列归一化,对于我们一般习惯基因名为行,样本名为列的数据框,就需要进行转置
#配色自定义:
mycol <- colorRamp2(c(-2, 0, 2), c("#0da9ce", "white", "#e74a32"))
#格子大小设置(按个人需求):
cellwidth = 1
cellheight = 0.35
cn = dim(dt)[2]
rn = dim(dt)[1]
w = cellwidth*cn
h = cellheight*rn
#注释配色自定义:
c4a_gui()
cols <- c4a('pastel1',6)
df <- data.frame(colnames(dt))
colnames(df) <- 'cell'
cols;df
#注释颜色添加:
row_cols <- setNames(c("#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#FBB4AE","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#B3CDE3","#CCEBC5","#CCEBC5", "#CCEBC5", "#CCEBC5", "#CCEBC5", "#DECBE4", "#DECBE4" ,"#DECBE4" ,"#DECBE4"), rownames(dt))
row_cols
#生成注释信息:
col_anno <- HeatmapAnnotation(df = df,
show_annotation_name = F,
gp = gpar(col = 'white', lwd = 2),
col = list(cell = c(
'P1_C' = "#FBB4AE",
'P2_C' = "#B3CDE3",
'P3_C' = "#CCEBC5",
'P1_S' = "#DECBE4",
'P2_S' = "#FED9A6",
'P3_S' = "#FFFFCC"
)))
#生成注释信息:
row_anno <- rowAnnotation(foo = anno_text(rownames(dt),
location = 0,
just = "left",
gp = gpar(fill = row_cols,
col = "black",
fontface = 'italic'),
width = max_text_width(rownames(dt))*1.25))
#绘图:
Heatmap(as.matrix(dt),
width = unit(w, "cm"),
height = unit(h, "cm"),
name = 'expression',
col = mycol,
cluster_columns = F,
cluster_rows = F,
column_names_side = c('top'),
column_names_rot = 60,
row_title = 'T cell key markers',
rect_gp = gpar(col = "white", lwd = 1.5),
heatmap_legend_param = list(legend_height = unit(2.8, "cm"),
grid_width = unit(0.4, "cm"),
labels_gp = gpar(fontsize = 10)),
top_annotation = col_anno, #添加列注释色块
row_split = c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","C","C", "C", "C", "C", "D", "D" ,"D" ,"D"), #对行切片
row_gap = unit(2, "mm"),
border = T,
border_gp = gpar(col = "black", lwd = 1.2)) +
row_anno #热图中添加背景注释色块
四、富集分析
KEGG富集分析
down_gene_entrezid <- bitr(down_gene, fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
KEGGenrich_down <- enrichKEGG(gene = down_gene_entrezid$ENTREZID,
organism = 'hsa', #物种是人
#universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff = 1)
#画图:KEGG-上调
library(ggsci)
kegg_up=kegg_up[kegg_up$pvalue<0.05,]
#柱状图
ggplot(data=kegg_up, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("KEGG_up") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
#气泡图
dotplot(KEGGenrich_up)+scale_color_continuous(low='#bc3c29', high='#dadada')
#画图:KEGG-下调
library(ggsci)
kegg_down=kegg_down[kegg_down$pvalue<0.05,]
ggplot(data=kegg_down, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("KEGG_down") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
dotplot(KEGGenrich_down)+scale_color_continuous(low='#bc3c29', high='#dadada')
GO富集分析
#上调基因
GOenrich_up <- enrichGO(gene = up_gene_entrezid$ENTREZID,
OrgDb = 'org.Hs.eg.db',
ont = 'All',
#universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff = 1)
GOenrich_up=DOSE::setReadable(GOenrich_up,OrgDb='org.Hs.eg.db',keyType='ENTREZID')
#下调基因
GOenrich_down <- enrichGO(gene = down_gene_entrezid$ENTREZID,
OrgDb = 'org.Hs.eg.db',
ont = 'All',
#universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff = 1)
GOenrich_down=DOSE::setReadable(GOenrich_down,OrgDb='org.Hs.eg.db',keyType='ENTREZID')
GO_up=GOenrich_up@result
GO_down=GOenrich_down@result
write.csv(GO_up,'E:/Research/12.GYY/RNAseq/result/GO_up.csv',quote = F)
write.csv(GO_down,'E:/Research/12.GYY/RNAseq/result/GO_down.csv',quote = F)
#画图:GO上调
GO_up=GO_up[GO_up$pvalue<0.05,]
#柱状图
ggplot(data=GO_up, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("GO_up") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
dotplot(GOenrich_up)+scale_color_continuous(low='#bc3c29', high='#dadada')
#画图:GO下调
GO_down=GO_down[GO_down$pvalue<0.05,]
#柱状图
ggplot(data=GO_down, aes(x=Description,y=GeneRatio)) +
geom_bar(stat="identity",fill='#BC3C29') +
xlab("GO_up") + ylab("GeneRatio") + theme_bw()+
theme(axis.text.x=element_text(angle=0)) +
scale_fill_nejm()+scale_color_nejm()+coord_flip()
dotplot(GOenrich_down)+scale_color_continuous(low='#bc3c29', high='#dadada')
GSEA富集分析
library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
library(ggplot2)
library(aplot)
GO_BP数据库
######### 1、选择GO_BP数据库 ##############
# choose C5 BP gene sets
genesets <- msigdbr(species = "Homo sapiens",#Mus musculus
category = "C5",
subcategory = "BP")
# check
head(genesets,3)
# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
#从差异基因列表导入基因
RNA_DEG$gene_name <- rownames(RNA_DEG)
RNA_DEG$log2FoldChange <- RNA_DEG$logFC
gene <- RNA_DEG[,-c(1:6)]
gene <- arrange(gene,desc(log2FoldChange)) #按照LogFC降序排列
#从txt文件读入
#gene <- read.table('./limma_DEG.txt',header = T) %>% arrange(desc(log2FoldChange))
# check
head(gene,3)
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# check
head(geneList,3)
# GO enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH", #一般选择 "BH" 或 "fdr",BH较严格,fdr较温和(计算的q小些)
by = "fgsea", #"fgsea"或“DOSE”,传统的GSEA方法很慢,可以通过对随机的基因集抽样解决这个问题,也就是 a fast gene set enrichment analysis (FGSEA)
#nPerm = 2000,
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 0.05)
View(data_ga)
write.csv(data_ga, file = "E:/Research/12.GYY/RNAseq/result/gsea_result_gobp.csv")
KEGG_BP数据库
######### 2、选择KEGG_BP数据库 ##############
#h <- msigdbr(species = "Mus musculus")
#table(h$gs_subcat) #查看GO;KEGG等基因集
#h2 <- h[grep(pattern = "TP53",x = h$gs_name),] #pattern参数内输入想要寻找的关键词
#获取KEGG基因集
genesets <- msigdbr(species = "Homo sapiens",#Mus musculus
category = "C2",
subcategory = "KEGG")
# check
head(genesets,3)
# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
#从差异基因列表导入基因
RNA_DEG$gene_name <- rownames(RNA_DEG)
RNA_DEG$log2FoldChange <- RNA_DEG$logFC
gene <- RNA_DEG[,-c(1:6)]
gene <- arrange(gene,desc(log2FoldChange)) #按照LogFC降序排列
#从txt文件读入
#gene <- read.table('./limma_DEG.txt',header = T) %>%
# arrange(desc(log2FoldChange))
# check
head(gene,3)
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# check
head(geneList,3)
# KEGG enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH",
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 0.05)
write.csv(data_ga, file = "E:/Research/12.GYY/RNAseq/result/gsea_result_kegg.csv")
HALLMARK数据库或自建基因集
######### 3、选择HALLMARK数据库或自建基因集 ##############
#HALLMARK数据库
genesets <- msigdbr(species = "Homo sapiens",
category = "H")
# 1)从HALLMARK获取gmt
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
# 2)导入自己构建的gmt
gmt <- read.table("./AEos_gene.txt")
# load gene
gene <- read.table('./limma_DEG.txt',header = T) %>%
arrange(desc(log2FoldChange))
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# KEGG enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH",
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 1)
data_ga <- write.csv(data_ga, file = "./AEos_gene_result.csv")#
#可视化
library(GseaVis)
#MDSC
gseaNb(object = gseaRes,
geneSetID = 'AEos',
subPlot = 2,
addPval = T,
pvalX = 0.75,pvalY = 0.8,
pCol = 'black',
pHjust = 0)
gseaNb(object = gseaRes,
geneSetID = 'MDSC',
subPlot = 2,
addPval = T,
pvalX = 0.75,pvalY = 0.8,
pCol = 'black',
pHjust = 0,
addGene = T,
markTopgene= T,
topGeneN = 10,
geneCol = '#4d4d4d')
Last updated