TCGA/CTPAC分析
By Kaiyi
1、前期准备
rm(list = ls())
#确立工作目录
setwd("D:/Lessons/Bioinformatics/Pj_data/TCGA")
#加载R包
library(survival) #生存曲线
library(survminer) #生存曲线
library(forestmodel) #森林图
library(DESeq2) #差异分析
library(msigdbr) #GSEA
library(clusterProfiler) #GSEA
library(GseaVis) #GSEA
library(limma) #差异分析
library(ggplot2)
library(stringr)
library(tidyverse)
library(mice) #填补缺失值
library(maftools) #CNV分析
library(immunedeconv) #免疫浸润
library(CancerSubtypes) #癌症分型
library(GenVisR) #Maf可视化
library(GSVA) #ssGSEA
library(EnhancedVolcano)#火山图
2、Kaplan-Meier生存分析
#读入TCGA临床数据【至少包含三列信息,生存状态(0代表存活,1代表死亡)、生存时长(此处为PFS)、分类变量(KRAS突变型)】
survival_data <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/KRAS_survival.csv",header = T)
#拟合曲线
fit<- survfit(Surv(time, status) ~ KRAS_subtype, data = survival_data, type = 'kaplan-meier')
#绘制生存曲线
ggsurvplot(fit, data = survival_data,
#surv.median.line = "hv", #在图中添加中位生存时间
conf.int = TRUE, #给生存曲线添加上置信区间
pval = TRUE, #添加P值
pval.size = 5,#文字大小
legend.labs = c("KRAS-Mut", "KRAS-WT"), #在图中添加图例
risk.table = TRUE,#添加上风险表
ggtheme = theme_bw(), #theme_light(),
palette = c("#c68f3f","#01665e"), #"grey","npg","aaas","lancet"
title = "Colorectal Adenocarcinoma Cohort (TCGA, PanCancer Atlas)",
xlab = "Time (Months)")
#ggsurvplot的更多参数可见https://mp.weixin.qq.com/s/N6J6TifLq4SGfLJ9lqOykg
3、Cox回归分析
#读入临床数据
clinical_data <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/TCGA_clinical.csv",header = T)
#因子转换
clinical_data$Sex <- factor(clinical_data$Sex, levels = unique(clinical_data$Sex))
clinical_data$KRAS <- factor(clinical_data$KRAS, levels = unique(clinical_data$KRAS))
clinical_data$MSI <- factor(clinical_data$MSI, levels = unique(clinical_data$MSI))
clinical_data$Stage <- factor(clinical_data$Stage, levels = unique(clinical_data$Stage))
#建模
fit1 <- coxph(Surv(PFI.time, PFI) ~ Age + Sex + Stage + MSI + KRAS,
data = clinical_data)
summary(fit1)
#制作森林图
forest_model(fit1)
4、mRNA转录组差异分析
### 1)读入表达矩阵与样本分组信息
#读入前对重复的基因行进行去重
mrna_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem.csv",header = T,row.names = 1,check.names=F)
#注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的
mrna_sample <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem_group.csv",header = T,row.names = 1,check.names=F)
### 2、数据清洗
#对表达量矩阵取整
mrna_expr <- round(mrna_expr)
#去除负值
mrna_expr[mrna_expr==-1] <- 0
# 保留每个样本中都有表达的基因
mrna_expr <- mrna_expr[which(rowSums(mrna_expr)!=0),]
# 或保留至少在75%的样本中都有表达的基因
#mrna_expr <- mrna_expr[which(rowSums(mrna_expr> 0) >= floor(0.75*ncol(mrna_expr))),]
###3、DEseq2差异分析
dds <- DESeqDataSetFromMatrix(countData = mrna_expr,
colData = mrna_sample,
design = ~ group)
dds$group <- relevel(dds$group, ref = "WT") # 指定哪一组作为对照组
dds <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
mrna_DEG <- as.data.frame(results(dds))
mrna_DEG = mrna_DEG[order(mrna_DEG$log2FoldChange),] # 按照logFC排序
write.csv(mrna_DEG,file = "D:/Lessons/Bioinformatics/Pj_result/mrna_DEG.csv")
5.1、GSEA富集分析
###1.1 选择GO_BP数据库
# choose C5 BP gene sets
genesets_GOBP <- msigdbr(species = "Homo sapiens",
category = "C5",
subcategory = "BP")
# TERM2GENE
gmt_GOBP <- genesets_GOBP %>% dplyr::select(gs_name,gene_symbol)
###1.2 选择KEGG数据库
# choose C2 KEGG gene sets
genesets_KEGG <- msigdbr(species = "Homo sapiens",
category = "C2",
subcategory = "KEGG")
# TERM2GENE
gmt_KEGG <- genesets_KEGG %>% dplyr::select(gs_name,gene_symbol)
###2)加载基因与foldchange值
gene <- read.csv('D:/Lessons/Bioinformatics/Pj_result/mrna_DEG.csv',header = T) %>%
arrange(desc(log2FoldChange))
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# GO enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt_GOBP,#gmt_KEGG
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH", #一般选择 "BH" 或 "fdr",BH较严格,fdr较温和(计算的q小些)
by = "fgsea", #"fgsea"或“DOSE”,传统的GSEA方法很慢,可以通过对随机的基因集抽样解决这个问题,也就是 a fast gene set enrichment analysis (FGSEA)
#nPerm = 2000,
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 0.05)
#data_ga <- write.csv(data_ga, file = "D:/Lessons/Bioinformatics/Pj_result/GSEAresult_kegg.csv")
#data_ga <- write.csv(data_ga, file = "D:/Lessons/Bioinformatics/Pj_result/GSEAresult_gobp.csv")
###3)可视化
#一次性显示多条通路
geneSetID = c('GOBP_KERATINIZATION',
'GOBP_REGULATION_OF_KETONE_BIOSYNTHETIC_PROCESS',
'GOBP_PROTEIN_AUTOPROCESSING',
'GOBP_EPIDERMIS_DEVELOPMENT',
'GOBP_CELL_FATE_DETERMINATION',
'GOBP_T_CELL_ACTIVATION',
'GOBP_ADAPTIVE_IMMUNE_RESPONSE',
'GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION',
'GOBP_B_CELL_MEDIATED_IMMUNITY',
'GOBP_DENDRITIC_CELL_CHEMOTAXIS')
# all plot
library(RColorBrewer)
gseaNb(object = gseaRes,
geneSetID = geneSetID,
subPlot = 2,
termWidth = 35,
#legend.position = c(0.8,0.8),
#addGene = gene,
#addPval = T,
#pvalX = 0.05,pvalY = 0.05,
curveCol = brewer.pal(10, 'BrBG'))
5.2 单基因GSEA分析(两种思路)
// R
https://mp.weixin.qq.com/s/q6nkujgTYlbOQpkENjyyxA
6、蛋白质组学/磷酸化蛋白质学组差异分析
### 1)读入表达矩阵与样本分组信息
setwd("D:/Lessons/Bioinformatics/Pj_data/CPTAC")
#此处将样本名改成了A1:A97
#磷酸化蛋白组
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Phosphoproteome.csv",header = T,row.names = 1,check.names=F)
#蛋白质组
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Proteome.csv",header = T,row.names = 1,check.names=F)
#转录组
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_RNAseq.csv",header = T,row.names = 1,check.names=F)
#分组文件(注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的)
cptac_sample <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group.csv",header = T,row.names = 1,check.names=F)
### 2)数据清洗
## 磷酸蛋白质组数据NA较多,去除75%以上样本都是NA的基因
phos_expr <- phos_expr[-which(rowMeans(is.na(phos_expr)) > 0.75),] #17408 obs in 31339
#缺失值处理:mice包多重插补(或者用impute包)
#mice包不支持复杂的列名,如包含-/()数字开头等,因此将列名改成了A1:A97
imp_phos <- mice(phos_expr, m = 5, method = "pmm")
phos_expr_pmm <- complete(imp_phos,5)
imp_prot <- mice(prot_expr, m = 5, method = "pmm")
prot_expr_pmm <- complete(imp_prot,5)
imp_RNA <- mice(RNA_expr, m = 5, method = "pmm")
RNA_expr_pmm <- complete(imp_RNA,5)
###3.1)limma差异分析(磷酸化蛋白质组)
#构建分组矩阵
design <- model.matrix(~factor(cptac_sample$group) + 0)
colnames(design) <- c("Mut","WT")
#构建比较矩阵
contrast.matrix <- makeContrasts(Mut - WT, levels = factor(cptac_sample$group))#实验组vs对照组
#线性混合模拟
fit <- lmFit(phos_expr_pmm,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
phos_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
write.csv(phos_DEG,"D:/Lessons/Bioinformatics/Pj_result/phos_DEG_limma.csv")
write.csv(phos_expr_pmm,"D:/Lessons/Bioinformatics/Pj_result/phos_expr_pmm.csv")
###3.2)limma差异分析(蛋白质组)
#构建分组矩阵
design <- model.matrix(~factor(cptac_sample$group) + 0)
colnames(design) <- c("Mut","WT")
#构建比较矩阵
contrast.matrix <- makeContrasts(Mut - WT, levels = factor(cptac_sample$group))#实验组vs对照组
#线性混合模拟
fit <- lmFit(prot_expr_pmm,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
prot_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
write.csv(prot_DEG,"D:/Lessons/Bioinformatics/Pj_result/prot_DEG_limma.csv")
write.csv(prot_expr_pmm,"D:/Lessons/Bioinformatics/Pj_result/prot_expr_pmm.csv")
###3.3)limma差异分析(转录组)
#构建分组矩阵
design <- model.matrix(~factor(cptac_sample$group) + 0)
colnames(design) <- c("Mut","WT")
#构建比较矩阵
contrast.matrix <- makeContrasts(Mut - WT, levels = factor(cptac_sample$group))#实验组vs对照组
#线性混合模拟
fit <- lmFit(RNA_expr_pmm,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
RNA_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
write.csv(RNA_DEG,"D:/Lessons/Bioinformatics/Pj_result/RNA_DEG_limma.csv")
write.csv(RNA_expr_pmm,"D:/Lessons/Bioinformatics/Pj_result/RNA_expr_pmm.csv")
###4)九象限散点图
#4.1读入数据
library(dplyr)
rna = read.csv('D:/Lessons/Bioinformatics/Pj_result/RNA_DEG_limma.csv')
prot = read.csv('D:/Lessons/Bioinformatics/Pj_result/prot_DEG_limma.csv')
phos = read.csv('D:/Lessons/Bioinformatics/Pj_result/phos_DEG_limma.csv')
prot$Group = 'Proteome'
phos$Group = 'Phosphoproteome'
rna$Group = 'rna'
prot$sample = prot[,1]
phos$sample = phos[,1]
rna$sample = rna[,1]
#4.2数据处理
library(stringr)
phos$X = str_split(phos$X,'_',simplify = T)[,1]
colnames(prot)[1] = 'X'
phr = rbind(prot,phos)
all = merge(x = phr,y=rna,by.x = 1,by.y = 1)
all$Significance = ifelse(all$P.Value.x & all$P.Value.y < 0.05,'P.value < 0.05','P.value ≥ 0.05')
table(all$Significance)
head(all)
colnames(all)
#4.3作图
library(ggplot2)
size=10
themes <- theme(
title = element_text(size=size,color="black"),
axis.text.x = element_text(size=size,color="black"),
axis.text.y = element_text(size=size,color="black"),
axis.title.x= element_text(size=size,hjust=0.5,color="black"),
axis.title.y= element_text(size=size,hjust=0.5,color="black"),
legend.text=element_text(color="black",size=size),
legend.title = element_text(color="black", size=size),
plot.title = element_text(hjust = 0.5),
panel.background = element_blank(),
axis.ticks = element_line(colour = "black", linewidth =1),
axis.ticks.length = unit(.2, "cm"),
axis.line = element_line(linewidth=1, colour = "black"),legend.key = element_blank())
library(ggrepel)
sig = subset(all,subset = c(all$P.Value.x & all$P.Value.y < 0.05))
p=ggplot(data=all,aes(x=logFC.y,y=logFC.x))+
geom_point(aes(color = Group.x,shape = Significance))+
labs(x='Singed logfc RNA', y='Singed logfc Proteomics',color = 'Group')+
themes+
scale_color_manual(values = c('#244689','red'))+
scale_shape_manual(values = c(17,16))+
geom_text_repel(data = sig,aes(label=sample.x),size=3)
p
#保存为pdf
ggsave('D:/Lessons/Bioinformatics/Pj_result/Fig1E_dotplot.pdf',width = 8,height = 6)
7、臂级体细胞拷贝数改变 (SCNA) 事件分析
###1) 数据下载与读入(TCGA-COAD/READ)
#TCGA的MAF文件下载:https://mp.weixin.qq.com/s/oU_bgGg1vQqhwrkPHjzPwg
#https://www.jianshu.com/p/40bbb29d355d
#批量读入所有maf文件名:
mafFilePath <- dir(path = "D:/Lessons/Bioinformatics/Pj_data/TCGA/SCNV",
pattern = "masked.maf.gz$",
full.names = T,
recursive=T)
head(mafFilePath)
###2) 数据读入
#读入临床信息
#laml.clin =system.file('extdata',
# 'tcga_laml_annot.tsv', package = 'maftools')
#read.maf函数读取每一个maf文件(会遇到Error in read.maf(x, isTCGA = TRUE) : No non-synonymous mutations found Check `vc_nonSyn`` argumet in `read.maf` for details的报错,删除该未能读入的文件即可)
mafdata <- lapply(mafFilePath,
function(x){
read.maf(x,
isTCGA=TRUE) # clinicalData = laml.clin
})
###3) 数据合并
#使用merge_mafs函数合并maf数据:
maf <- merge_mafs(mafdata)
##maf文件摘要图:
plotmafSummary(maf = maf,
rmOutlier = TRUE,
addStat = "median",
dashboard = TRUE,
titvRaw = FALSE)
###4)可视化(可参考https://mp.weixin.qq.com/s/pBDue6hOf0DrTZMN0rB2sA)
##瀑布图(Oncoplots):
oncoplot(maf = maf,
#draw_titv = FALSE,
#anno_height = 1,#样本注释区高度
#legend_height = 4, #图例绘图区高度
#drawRowBar = T, #是否显示右侧条形图
#drawColBar = T, #是否显示顶部条形图
top = 10) #高频突变的Top10基因
#体细胞突变相关性热图
somaticInteractions(
maf,
top = 20,
genes = NULL,
pvalue = c(0.05, 0.1),
returnAll = TRUE,
geneOrder = NULL,
fontSize = 0.6,
showSigSymbols = TRUE,
showCounts = FALSE,
countStats = "all",
countType = "all",
countsFontSize = 0.6,
countsFontColor = "red",
colPal = "BrBG",
showSum = TRUE,
plotPadj = FALSE,
colNC = 9,
nShiftSymbols = 5,
sigSymbolsSize = 2,
sigSymbolsFontSize = 0.8,
pvSymbols = c(46, 42),
limitColorBreaks = TRUE
)
8、非整倍体评分分析
#加载TCGA aneuploidy score数据
aneuploidy_data <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/Aneuploidy_score.csv",header=T,row.names=1)
# 因子化
aneuploidy_data$KRAS <- factor(aneuploidy_data$KRAS,levels = c("WT","Mut"))
#箱线图
library(ggpubr)
color <- c("#01665e","#c68f3f")
my_comparisons <- list(c("WT", "Mut"))
#可视化
pb1<-ggboxplot(aneuploidy_data,
x="KRAS",
y="Aneuploidy_Score",
color="KRAS",
fill=NULL,
ylab = "Aneuploidy Score",
add = "jitter",
bxp.errorbar.width = 0.6,
width = 0.4,
size=0.5,
font.label = list(size=30),
palette = color)+
theme(panel.background =element_blank())
pb1<-pb1+theme(axis.line=element_line(colour="black"))+theme(axis.title.x = element_blank())
pb1<-pb1+theme(axis.text.x = element_text(size = 15,angle = 45,vjust = 1,hjust = 1))
pb1<-pb1+theme(axis.text.y = element_text(size = 12))+theme(plot.title = element_text(hjust = 0.5,size=15,face="bold"))
pb1<-pb1+theme(legend.position = "NA")
pb1<-pb1+stat_compare_means(method="wilcox.test",hide.ns = F,comparisons =my_comparisons,label="p.signif")
pb1
9、免疫浸润分析
#矩阵读入
gene_expression_matrix <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem.csv",header = T,row.names = 1,check.names=F)
#反卷积方法(人):quantiseq/timer/cibersort/cibersort_abs/mcp_counter/xcell/epic/abis/consensus_tme/estimate
#反卷积方法(鼠): mmcp_counter/seqimmucc/dcq/base
#反卷积计算
xcell_TCGA <- immunedeconv::deconvolute(gene_expression_matrix, "xcell")
write.csv(xcell_TCGA, "D:/Lessons/Bioinformatics/Pj_result/xcell_TCGA.csv")
#读入xcell结果
xcell_TCGA <- read.csv("D:/Lessons/Bioinformatics/Pj_result/xcell_TCGA.csv",header = T,row.names = 1, check.names = F)
#读入KRAS分组信息
kras_group <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem_group.csv",header = T,row.names = 1, check.names = F)
colnames(xcell_TCGA) <- c("cell_type",kras_group$group)
#宽数据转成长数据
library(reshape2)
dd=melt(xcell_TCGA)
colnames(dd)=c("cell_type", "KRAS", "Fraction")
#因子转换
dd$KRAS <- factor(dd$KRAS, levels = c("WT","Mut"))
#箱线图
p1 <- ggplot(dd,aes(x=cell_type, y=Fraction,
fill = KRAS, color = KRAS)) +
geom_boxplot(notch = F, alpha = 0.15,
outlier.shape = 1,
outlier.colour = "black", #outlier点用黑色
outlier.size = 0.05) +
scale_fill_manual(values= c("#01665e","#c68f3f")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 60, hjust = 1,size = 10),
axis.text.y = element_text(angle = 90, size = 12),
axis.title.y = element_text(angle = 90, size = 15)) +
theme(legend.position = "top")+
stat_compare_means(symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1),
symbols = c("***", "**", "*", "ns")),
label = "p.signif")
p1
ggsave(p1,file='D:/Lessons/Bioinformatics/Pj_result/Fig2A_xcell.pdf',width = 12,height = 5)
10、多组学分子亚型聚类
#读入数据
#磷酸化蛋白组
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Phosphoproteome.csv",header = T,row.names = 1,check.names=F)
#蛋白质组
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Proteome.csv",header = T,row.names = 1,check.names=F)
#转录组
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_RNAseq.csv",header = T,row.names = 1,check.names=F)
#分组文件(注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的)
cptac_sample <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group.csv",header = T)
#看一下数据分布
data.checkDistribution(phos_expr)
data.checkDistribution(prot_expr)
data.checkDistribution(RNA_expr)
###磷酸化蛋白组
#处理缺失值
index = which(is.na(phos_expr))
res_phos = data.imputation(phos_expr,fun="median")
#数据 Normalization
res_phos = data.normalization(res_phos,type="feature_Median",log2=FALSE)
#特征选择——方差、MAD、PCA、COX模型
res_phos = FSbyVar(res_phos, cut.type="topk",value=1000)
###蛋白组
#处理缺失值
index = which(is.na(prot_expr))
res_prot = data.imputation(prot_expr,fun="median")
#数据 Normalization
res_prot = data.normalization(res_prot,type="feature_Median",log2=FALSE)
#特征选择——方差、MAD、PCA、COX模型
res_prot = FSbyVar(res_prot, cut.type="topk",value=1000)
###转录组
#处理缺失值
index = which(is.na(RNA_expr))
res_RNA = data.imputation(RNA_expr,fun="median")
#数据 Normalization
res_RNA = data.normalization(res_RNA,type="feature_Median",log2=FALSE)
#特征选择——方差、MAD、PCA、COX模型
res_RNA = FSbyVar(res_RNA, cut.type="topk",value=1000)
#选出KRAS突变型的样本数据
cptac_sample <- cptac_sample[cptac_sample$group == "Mut",]
res_phos <- res_phos[,cptac_sample$sample]
res_prot <- res_prot[,cptac_sample$sample]
res_RNA <- res_RNA[,cptac_sample$sample]
#从97个样本中得到了34个
#NMF
CRC=list(RNAExp=res_RNA, ProtExp=res_prot, PhosExp = res_phos)
result=ExecuteSNF(CRC, clusterNum=2, K=20, alpha=0.5, t=20)
#癌症亚型的结果验证、解释和可视化
###Similarity smaple matrix
sil=silhouette_SimilarityMatrix(result$group, result$distanceMatrix)
plot(sil)
#生存分析
result=ExecuteSNF(CRC, clusterNum=2, K=20, alpha=0.5, t=20,plot = FALSE)
group=result$group
distanceMatrix=result$distanceMatrix
data(time) #事实上并没有找到CPTAC生存数据
data(status)
time = time[1:34]
status = status[1:34]
p_value=survAnalysis(mainTitle="CRC",time,status,group,
distanceMatrix,similarity=TRUE)
11、随机森林筛选变量
#读入表达矩阵与样本分组信息
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/phos_expr.csv",header = T,row.names = 1,check.names=F)
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/prot_expr.csv",header = T,row.names = 1,check.names=F)
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/RNA_expr.csv",header = T,row.names = 1,check.names=F)
#随机森林
library(caret)
set.seed(202)
phos_expr <- t(phos_expr)
prot_expr <- t(prot_expr)
prot_expr <-cbind(prot_expr,cptac_sample)
predictors<-colnames(phos_expr)[1:10]
model <- train(x = dat[,predictors],
y = dat$Cmic,
method = "rf",
importance = TRUE,
tuneGrid = expand.grid(mtry = c(2:4)), # length(predictors) or 2:6
trControl = trainControl(method = "cv",
number = 20,
p = 0.75,
savePredictions = TRUE))
#输出模型的RSEM和R方
model$results %>% as_tibble %>% filter(mtry == model$bestTune %>% unlist) %>% select(RMSE, Rsquared)
#棒棒糖图展示模型重要性
varImp(model, scale = FALSE) %>% plot
12、SMG分析
#GenvisR绘制瀑布图
library(FField)
library(GenVisR)
library(reshape2)
#读取maf矩阵
maf_file= read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/final_oncotator_all.csv",header = T)
#读取临床信息
maf_clin = read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Clinical.csv",header = T)
maf_clin = maf_clin[,c(1:4,7,9,16)]
colnames(maf_clin) <- c('sample', 'Age', 'Gender',"Survival_Status","Stage","MSI Status","KRAS")
clinical <- melt(maf_clin, id.vars = c("sample"))
waterfall(maf_file, fileType = "MAF") #会报错
factor(maf_file[,"Variant_Classification"])
#删去GenviR不能识别的类型
maf_file = subset(maf_file, Variant_Classification != "Splice_Region")
maf_file = subset(maf_file, Variant_Classification != "De_novo_Start_InFrame")
maf_file = subset(maf_file, Variant_Classification != "De_novo_Start_OutOfFrame")
maf_file = subset(maf_file, Variant_Classification != "Stop_Codon_Ins")
maf_file = subset(maf_file, Variant_Classification != "Stop_Codon_Del")
maf_file = subset(maf_file, Variant_Classification != "Start_Codon_SNP")
maf_file = subset(maf_file, Variant_Classification != "Start_Codon_Ins")
maf_file = subset(maf_file, Variant_Classification != "Start_Codon_Del")
maf_file = subset(maf_file, Variant_Classification != "lincRNA")
#瀑布图
waterfall(maf_file, fileType = "MAF", clinDat = clinical,
clinVarCol = c(Live = "white", Dead = "black",Female = "firebrick2",Male = "lightblue",
KM1 = "blue",KM2="red",WT="yellow",
"65+" ="green2","65-" ="green4",
"I" = "#ddd1e7", "II" = "#bba3d0", "III" = "#9975b9", "IV" = "#7647a2"),
plotGenes = c("KRAS","APC","TP53","ARID1A","PIK3CA","TCF7L2","FBXW7","BRAF",
"SMAD4","SMAD2","MSH6","CTNNB1","AMER1","ERBB2","CDC27","ERBB3",
"B2M","AXIN2","ARID2","NRAS","MAP2K4","ACVR1B","PCBP1"))
#maftools包绘制顺反式图
#读入MAF文件
maf = read.maf(maf = "D:/Lessons/Bioinformatics/Pj_data/CPTAC/final_oncotator_all.maf", clinicalData = maf_file)
#绘制顺反式图
titv = titv(maf = maf, plot = FALSE, useSyn = TRUE)
# plot titv summary
plotTiTv(res = titv)
library(deconstructSigs)
library(BSgenome.Hsapiens.UCSC.hg19)
#提取碱基突变数据
sample.mut.ref <- maf_file[,c("Tumor_Sample_Barcode","Chromosome","Start_position","Reference_Allele","Tumor_Seq_Allele2")]
#将染色体列加上"Chr"
a <- paste("chr",maf_file[,"Chromosome"],sep = "")
sample.mut.ref$chr <- a
#使用mut.to.sigs.input构建输入文件
sigs.input <- mut.to.sigs.input(mut.ref = sample.mut.ref,
sample.id = "Tumor_Sample_Barcode",
chr = "chr",
pos = "Start_position",
ref = "Reference_Allele",
alt = "Tumor_Seq_Allele2",
bsg = BSgenome.Hsapiens.UCSC.hg19)
#查看结果信息
dim(sigs.input)
# 使用whichSignatures推断signature的组成
sample_1 = whichSignatures(tumor.ref = sigs.input,
signatures.ref = signatures.cosmic,
sample.id = "01CO005",
contexts.needed = TRUE,
tri.counts.method = 'default')
# Plot example
plot_example <- whichSignatures(tumor.ref = sigs.input,
signatures.ref = signatures.cosmic,
sample.id = "01CO005",
contexts.needed = TRUE)
# Plot output
plotSignatures(plot_example, sub = 'example')
#乐高图
library(barplot3d)
# Read in COSMIC signature probabilities
x=system.file("extdata", "signature_probabilities.txt", package = "barplot3d")
sigdata=read.table(x,header=TRUE,stringsAsFactors = FALSE)
# 输入文件的顺序必须与此一致
cat(sigdata$Somatic_mutation_type,sep="\n")
#使用自己的数据绘制乐高图
legoplot3d(contextdata=sample_1$tumor,labels=FALSE,scalexy=0.03)
#参数调整
legoplot3d(contextdata=sample_1$tumor,labels=FALSE,scalexy=0.01,sixcolors="broad",alpha=0.4)
13、KRAS亚型的功能注释
#选择Hallmark数据库
genesets_Hallmark <- msigdbr(species = "Homo sapiens",
category = "H")
Hallmark_df <- dplyr::select(genesets_Hallmark,gs_name,gs_exact_source,gene_symbol)
Hallmark_list <- split(Hallmark_df$gene_symbol, Hallmark_df$gs_name) ##按照gs_name给gene_symbol分组
#读取经过缺失值插补的转录组和蛋白组表达矩阵
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/prot_expr_pmm.csv",header = T,row.names = 1,check.names=F)
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/RNA_expr_pmm.csv",header = T,row.names = 1,check.names=F)
cptac_clinical <- cptac_clinical[,cptac_clinical$KRAS_mutation_subtype%in% c("KM1","KM2")]
Error in `[.data.frame`(cptac_clinical, , cptac_clinical$KRAS_mutation_subtype %in% :
选择了未定义的列
#导入分组
cptac_clinical <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group_original.csv",header = T,row.names = 1,check.names=F)
colnames(RNA_expr) <- cptac_clinical$KRAS_mutation_subtype
#保留KRAS突变的蛋白组数据
RNA_KRAS <- RNA_expr[,colnames(RNA_expr) %in% c("KM1","KM2")]
cptac_clinical <- cptac_clinical[cptac_clinical$KRAS_mutation_subtype%in% c("KM1","KM2"),]
colnames(RNA_KRAS) <- cptac_clinical$code
#ssGSEA: 蛋白组
prot_KRAS <- as.matrix(prot_KRAS)
prot_ssGSEA = gsva(prot_KRAS, Hallmark_list, method="ssgsea",
min.sz=1, max.sz=Inf,
kcdf="Gaussian"
)
# 去掉"HALLMARK_"
row.names(prot_ssGSEA)=substring(row.names(prot_ssGSEA), 10)
#ssGSEA: 转录组
RNA_KRAS <- as.matrix(RNA_KRAS)
RNA_ssGSEA = gsva(RNA_KRAS, Hallmark_list, method="ssgsea",
min.sz=1, max.sz=Inf,
kcdf="Gaussian"
)
# 去掉"HALLMARK_"
library(stringr)
row.names(RNA_ssGSEA)=substring(row.names(RNA_ssGSEA), 10)
#limma差异分析:蛋白组
# 设置或导入分组
design <- model.matrix(~factor(cptac_clinical$KRAS_mutation_subtype) + 0)
colnames(design) <- c("KM1","KM2")
#构建比较矩阵
contrast.matrix <- makeContrasts(KM1 - KM2, levels = factor(cptac_clinical$KRAS_mutation_subtype))#实验组vs对照组
#线性混合模拟
fit <- lmFit(prot_ssGSEA,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
prot_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
sig_prot_DEG = subset(prot_DEG, P.Value < 0.05)
prot_sig=prot_ssGSEA[row.names(sig_prot_DEG),]
#limma差异分析:转录组
# 设置或导入分组
design <- model.matrix(~factor(cptac_clinical$KRAS_mutation_subtype) + 0)
colnames(design) <- c("KM1","KM2")
#构建比较矩阵
contrast.matrix <- makeContrasts(KM1 - KM2, levels = factor(cptac_clinical$KRAS_mutation_subtype))#实验组vs对照组
#线性混合模拟
fit <- lmFit(RNA_ssGSEA,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
RNA_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
sig_RNA_DEG = subset(RNA_DEG, P.Value < 0.05)
RNA_sig=RNA_ssGSEA[row.names(sig_RNA_DEG),]
#差异分析可视化:蛋白组差异通路分组热图
library("pheatmap")
annotation_col= data.frame(Group = factor(cptac_clinical$KRAS_mutation_subtype))
row.names(annotation_col) = colnames(prot_sig)
annotation_colors=list(Group = c("KM1" = "#00C094", "KM2" = "#f58220"))
pheatmap(prot_sig, show_rownames = T,scale = "row",
annotation_col = annotation_col,
annotation_colors=annotation_colors,
gaps_col = c(17),
fontsize = 10,
cluster_cols = F,
cluster_rows = T,
show_colnames = F,
cutree_rows =2
)
library("pheatmap")
annotation_col= data.frame(Group = factor(cptac_clinical$KRAS_mutation_subtype))
row.names(annotation_col) = colnames(RNA_sig)
annotation_colors=list(Group = c("KM1" = "#00C094", "KM2" = "#f58220"))
pheatmap(prot_sig, show_rownames = T,scale = "row",
annotation_col = annotation_col,
annotation_colors=annotation_colors,
gaps_col = c(17),
fontsize = 10,
cluster_cols = F,
cluster_rows = T,
show_colnames = F,
cutree_rows =2
)
#磷酸化位点差异分析
#读取经过缺失值插补的磷酸蛋白组表达矩阵
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/phos_expr_pmm.csv",header = T,row.names = 1,check.names=F)
#导入分组
cptac_clinical <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group_original.csv",header = T,row.names = 1,check.names=F)
colnames(phos_expr) <- cptac_clinical$KRAS_mutation_subtype
#保留KRAS突变的蛋白组数据
phos_KRAS <- phos_expr[,colnames(phos_expr) %in% c("KM1","KM2")]
cptac_clinical <- cptac_clinical[cptac_clinical$KRAS_mutation_subtype%in% c("KM1","KM2"),]
colnames(phos_KRAS) <- cptac_clinical$code
#limma差异分析:转录组
# 设置或导入分组
design <- model.matrix(~factor(cptac_clinical$KRAS_mutation_subtype) + 0)
colnames(design) <- c("KM1","KM2")
#构建比较矩阵
contrast.matrix <- makeContrasts(KM1 - KM2, levels = factor(cptac_clinical$KRAS_mutation_subtype))#实验组vs对照组
#线性混合模拟
fit <- lmFit(phos_KRAS,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
phos_DEG <- topTable(fit2,
#adjust = 'BH',
coef = 1,
n = Inf,
sort.by="logFC"
#p.value = 0.05
)
# 绘制火山图
p_MONO <- EnhancedVolcano(phos_DEG,
lab = rownames(phos_DEG),
x = 'logFC',
y = 'P.Value',
title = 'KM1 vs KM2',
pointSize = 3.0,
labSize = 6.0,
legendPosition = 'right',
pCutoff = 0.05,
FCcutoff = 1)
p_MONO
#KSEA富集分析
library(KSEAapp)
KSData=read.csv('PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv',
header = T)#注释文件
PX=read.csv('Diff_phos.csv',
header = T)#磷酸化数据文件
KSEA.Barplot(KSData,#注释文件
PX,#磷酸化数据文件
NetworKIN = T,#是否包括NetworKIN预测结果
NetworKIN.cutoff = 5,#NetworKIN预测打分阈值
m.cutoff = 5,#激酶对应底物的最小数量
p.cutoff = 0.05,#p值
export = F)#是否将图导出
14、KRAS亚型的免疫浸润分析
###xCell免疫浸润分析
#矩阵读入
gene_expression_matrix <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_RNAseq.csv",header = T,row.names = 1,check.names=F)
#反卷积方法(人):quantiseq/timer/cibersort/cibersort_abs/mcp_counter/xcell/epic/abis/consensus_tme/estimate
#反卷积方法(鼠): mmcp_counter/seqimmucc/dcq/base
#反卷积计算
xcell_TCGA <- immunedeconv::deconvolute(gene_expression_matrix, "xcell")
write.csv(xcell_TCGA, "D:/Lessons/Bioinformatics/Pj_result/xcell_TCGA.csv")
#读入KRAS分组信息
kras_group <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group_original.csv",header = T)
kras_group <- kras_group[,-c(2,4)]
colnames(xcell_TCGA) <- c("cell_type",kras_group$KRAS_mutation_subtype)
#宽数据转成长数据
library(reshape2)
dd=melt(xcell_TCGA)
colnames(dd)=c("cell_type", "KRAS", "Fraction")
#因子转换
dd$KRAS <- factor(dd$KRAS, levels = c("WT","KM1","KM2"))
#箱线图
p1 <- ggplot(dd,aes(x=cell_type, y=Fraction,
fill = KRAS, color = KRAS)) +
geom_boxplot(notch = F, alpha = 0.15,
outlier.shape = 1,
outlier.colour = "black", #outlier点用黑色
outlier.size = 0.05) +
scale_fill_manual(values= c("#01665e","#c68f3f","black")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 60, hjust = 1,size = 10),
axis.text.y = element_text(angle = 90, size = 12),
axis.title.y = element_text(angle = 90, size = 15)) +
theme(legend.position = "top")+
stat_compare_means(symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1),
symbols = c("***", "**", "*", "ns")),
label = "p.signif")
p1
ggsave(p1,file='D:/Lessons/Bioinformatics/Pj_result/Fig2A_xcell.pdf',width = 12,height = 5)
Last updated