📉TCGA/CTPAC分析

By Kaiyi

1、前期准备

rm(list = ls())
#确立工作目录
setwd("D:/Lessons/Bioinformatics/Pj_data/TCGA")
#加载R包
library(survival) #生存曲线
library(survminer) #生存曲线
library(forestmodel) #森林图
library(DESeq2) #差异分析
library(msigdbr) #GSEA
library(clusterProfiler) #GSEA
library(GseaVis) #GSEA
library(limma) #差异分析
library(ggplot2)
library(stringr)
library(tidyverse)
library(mice) #填补缺失值
library(maftools) #CNV分析
library(immunedeconv) #免疫浸润
library(CancerSubtypes) #癌症分型
library(GenVisR) #Maf可视化
library(GSVA) #ssGSEA
library(EnhancedVolcano)#火山图

2、Kaplan-Meier生存分析

#读入TCGA临床数据【至少包含三列信息,生存状态(0代表存活,1代表死亡)、生存时长(此处为PFS)、分类变量(KRAS突变型)】
survival_data <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/KRAS_survival.csv",header = T)
#拟合曲线
fit<- survfit(Surv(time, status) ~ KRAS_subtype, data = survival_data, type = 'kaplan-meier')
#绘制生存曲线
ggsurvplot(fit, data = survival_data,
           #surv.median.line = "hv", #在图中添加中位生存时间
           conf.int = TRUE, #给生存曲线添加上置信区间
           pval = TRUE,  #添加P值
           pval.size = 5,#文字大小
           legend.labs = c("KRAS-Mut", "KRAS-WT"), #在图中添加图例
           risk.table = TRUE,#添加上风险表
           ggtheme = theme_bw(), #theme_light(),  
           palette = c("#c68f3f","#01665e"), #"grey","npg","aaas","lancet"
           title = "Colorectal Adenocarcinoma Cohort (TCGA, PanCancer Atlas)",
           xlab = "Time (Months)")
#ggsurvplot的更多参数可见https://mp.weixin.qq.com/s/N6J6TifLq4SGfLJ9lqOykg

3、Cox回归分析

#读入临床数据
clinical_data <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/TCGA_clinical.csv",header = T)
#因子转换
clinical_data$Sex <- factor(clinical_data$Sex, levels = unique(clinical_data$Sex))
clinical_data$KRAS <- factor(clinical_data$KRAS, levels = unique(clinical_data$KRAS))
clinical_data$MSI <- factor(clinical_data$MSI, levels = unique(clinical_data$MSI))
clinical_data$Stage <- factor(clinical_data$Stage, levels = unique(clinical_data$Stage))

#建模
fit1 <- coxph(Surv(PFI.time, PFI) ~ Age + Sex + Stage + MSI + KRAS,
           data = clinical_data)
summary(fit1)

#制作森林图
forest_model(fit1)

4、mRNA转录组差异分析

### 1)读入表达矩阵与样本分组信息
#读入前对重复的基因行进行去重
mrna_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem.csv",header = T,row.names = 1,check.names=F)
#注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的
mrna_sample <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem_group.csv",header = T,row.names = 1,check.names=F)
### 2、数据清洗
#对表达量矩阵取整
mrna_expr <- round(mrna_expr) 
#去除负值
mrna_expr[mrna_expr==-1] <- 0 
# 保留每个样本中都有表达的基因
mrna_expr <- mrna_expr[which(rowSums(mrna_expr)!=0),]
# 或保留至少在75%的样本中都有表达的基因
#mrna_expr <- mrna_expr[which(rowSums(mrna_expr> 0) >= floor(0.75*ncol(mrna_expr))),]
###3、DEseq2差异分析
dds <- DESeqDataSetFromMatrix(countData = mrna_expr,
                                colData = mrna_sample,
                                design = ~ group)
dds$group <- relevel(dds$group, ref = "WT") # 指定哪一组作为对照组

dds <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
mrna_DEG <- as.data.frame(results(dds))
mrna_DEG = mrna_DEG[order(mrna_DEG$log2FoldChange),] # 按照logFC排序
write.csv(mrna_DEG,file = "D:/Lessons/Bioinformatics/Pj_result/mrna_DEG.csv")

5.1、GSEA富集分析

###1.1 选择GO_BP数据库
# choose C5 BP gene sets
genesets_GOBP <-  msigdbr(species = "Homo sapiens",
                     category = "C5",
                     subcategory = "BP")
# TERM2GENE
gmt_GOBP <- genesets_GOBP %>% dplyr::select(gs_name,gene_symbol)
###1.2 选择KEGG数据库
# choose C2 KEGG gene sets
genesets_KEGG <-  msigdbr(species = "Homo sapiens",
                     category = "C2",
                     subcategory = "KEGG")
# TERM2GENE
gmt_KEGG <- genesets_KEGG %>% dplyr::select(gs_name,gene_symbol)
###2)加载基因与foldchange值
gene <- read.csv('D:/Lessons/Bioinformatics/Pj_result/mrna_DEG.csv',header = T) %>%
  arrange(desc(log2FoldChange))

geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name

# GO enrich
gseaRes <- GSEA(geneList = geneList,
                TERM2GENE = gmt_GOBP,#gmt_KEGG
                minGSSize    = 10,
                maxGSSize    = 500,
                pvalueCutoff = 1,
                pAdjustMethod = "BH", #一般选择 "BH" 或 "fdr",BH较严格,fdr较温和(计算的q小些)
                by = "fgsea", #"fgsea"或“DOSE”,传统的GSEA方法很慢,可以通过对随机的基因集抽样解决这个问题,也就是 a fast gene set enrichment analysis (FGSEA) 
                #nPerm = 2000,
                verbose      = FALSE)

# to dataframe
data_ga <- data.frame(gseaRes) %>%
  filter(pvalue < 0.05)
#data_ga <- write.csv(data_ga, file = "D:/Lessons/Bioinformatics/Pj_result/GSEAresult_kegg.csv")
#data_ga <- write.csv(data_ga, file = "D:/Lessons/Bioinformatics/Pj_result/GSEAresult_gobp.csv")
###3)可视化

#一次性显示多条通路
geneSetID = c('GOBP_KERATINIZATION',
              'GOBP_REGULATION_OF_KETONE_BIOSYNTHETIC_PROCESS',
              'GOBP_PROTEIN_AUTOPROCESSING',
              'GOBP_EPIDERMIS_DEVELOPMENT',
              'GOBP_CELL_FATE_DETERMINATION',
              'GOBP_T_CELL_ACTIVATION',
              'GOBP_ADAPTIVE_IMMUNE_RESPONSE',
              'GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION',
              'GOBP_B_CELL_MEDIATED_IMMUNITY',
              'GOBP_DENDRITIC_CELL_CHEMOTAXIS')

# all plot
library(RColorBrewer)
gseaNb(object = gseaRes,
       geneSetID = geneSetID,
       subPlot = 2,
       termWidth = 35,
       #legend.position = c(0.8,0.8),
       #addGene = gene,
       #addPval = T,
       #pvalX = 0.05,pvalY = 0.05,
       curveCol = brewer.pal(10, 'BrBG'))

5.2 单基因GSEA分析(两种思路)

// R

https://mp.weixin.qq.com/s/q6nkujgTYlbOQpkENjyyxA

6、蛋白质组学/磷酸化蛋白质学组差异分析

### 1)读入表达矩阵与样本分组信息
setwd("D:/Lessons/Bioinformatics/Pj_data/CPTAC")
#此处将样本名改成了A1:A97
#磷酸化蛋白组
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Phosphoproteome.csv",header = T,row.names = 1,check.names=F)
#蛋白质组
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Proteome.csv",header = T,row.names = 1,check.names=F)
#转录组
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_RNAseq.csv",header = T,row.names = 1,check.names=F)
#分组文件(注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的)
cptac_sample <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group.csv",header = T,row.names = 1,check.names=F)
### 2)数据清洗
## 磷酸蛋白质组数据NA较多,去除75%以上样本都是NA的基因
phos_expr <- phos_expr[-which(rowMeans(is.na(phos_expr)) > 0.75),] #17408 obs in 31339

#缺失值处理:mice包多重插补(或者用impute包)
#mice包不支持复杂的列名,如包含-/()数字开头等,因此将列名改成了A1:A97

imp_phos <- mice(phos_expr, m = 5, method = "pmm")
phos_expr_pmm <- complete(imp_phos,5)

imp_prot <- mice(prot_expr, m = 5, method = "pmm")
prot_expr_pmm <- complete(imp_prot,5)

imp_RNA <- mice(RNA_expr, m = 5, method = "pmm")
RNA_expr_pmm <- complete(imp_RNA,5)
###3.1)limma差异分析(磷酸化蛋白质组)

#构建分组矩阵
design <- model.matrix(~factor(cptac_sample$group) + 0)
colnames(design) <- c("Mut","WT")
#构建比较矩阵
contrast.matrix <- makeContrasts(Mut - WT, levels = factor(cptac_sample$group))#实验组vs对照组
#线性混合模拟
fit <- lmFit(phos_expr_pmm,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
phos_DEG <- topTable(fit2,
                #adjust = 'BH',
                coef = 1,
                n = Inf,
                sort.by="logFC"
                #p.value = 0.05
)
write.csv(phos_DEG,"D:/Lessons/Bioinformatics/Pj_result/phos_DEG_limma.csv")
write.csv(phos_expr_pmm,"D:/Lessons/Bioinformatics/Pj_result/phos_expr_pmm.csv")
###3.2)limma差异分析(蛋白质组)

#构建分组矩阵
design <- model.matrix(~factor(cptac_sample$group) + 0)
colnames(design) <- c("Mut","WT")
#构建比较矩阵
contrast.matrix <- makeContrasts(Mut - WT, levels = factor(cptac_sample$group))#实验组vs对照组
#线性混合模拟
fit <- lmFit(prot_expr_pmm,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
prot_DEG <- topTable(fit2,
                #adjust = 'BH',
                coef = 1,
                n = Inf,
                sort.by="logFC"
                #p.value = 0.05
)
write.csv(prot_DEG,"D:/Lessons/Bioinformatics/Pj_result/prot_DEG_limma.csv")
write.csv(prot_expr_pmm,"D:/Lessons/Bioinformatics/Pj_result/prot_expr_pmm.csv")
###3.3)limma差异分析(转录组)

#构建分组矩阵
design <- model.matrix(~factor(cptac_sample$group) + 0)
colnames(design) <- c("Mut","WT")
#构建比较矩阵
contrast.matrix <- makeContrasts(Mut - WT, levels = factor(cptac_sample$group))#实验组vs对照组
#线性混合模拟
fit <- lmFit(RNA_expr_pmm,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
RNA_DEG <- topTable(fit2,
                #adjust = 'BH',
                coef = 1,
                n = Inf,
                sort.by="logFC"
                #p.value = 0.05
)
write.csv(RNA_DEG,"D:/Lessons/Bioinformatics/Pj_result/RNA_DEG_limma.csv")
write.csv(RNA_expr_pmm,"D:/Lessons/Bioinformatics/Pj_result/RNA_expr_pmm.csv")
###4)九象限散点图
#4.1读入数据
library(dplyr)
rna = read.csv('D:/Lessons/Bioinformatics/Pj_result/RNA_DEG_limma.csv')
prot = read.csv('D:/Lessons/Bioinformatics/Pj_result/prot_DEG_limma.csv')
phos = read.csv('D:/Lessons/Bioinformatics/Pj_result/phos_DEG_limma.csv')

prot$Group = 'Proteome'
phos$Group = 'Phosphoproteome'
rna$Group = 'rna'

prot$sample = prot[,1]
phos$sample = phos[,1]
rna$sample = rna[,1]
#4.2数据处理
library(stringr)
phos$X = str_split(phos$X,'_',simplify = T)[,1]
colnames(prot)[1] = 'X'

phr = rbind(prot,phos)

all = merge(x = phr,y=rna,by.x = 1,by.y = 1)
all$Significance = ifelse(all$P.Value.x & all$P.Value.y < 0.05,'P.value < 0.05','P.value ≥ 0.05')
table(all$Significance)
head(all)
colnames(all)
#4.3作图
library(ggplot2)
size=10
themes <-  theme(
  title = element_text(size=size,color="black"),
  axis.text.x = element_text(size=size,color="black"),
  axis.text.y = element_text(size=size,color="black"),
  axis.title.x= element_text(size=size,hjust=0.5,color="black"),
  axis.title.y= element_text(size=size,hjust=0.5,color="black"),
  legend.text=element_text(color="black",size=size),
  legend.title = element_text(color="black", size=size),
  plot.title = element_text(hjust = 0.5),
  panel.background = element_blank(),
  axis.ticks = element_line(colour = "black", linewidth =1),
  axis.ticks.length = unit(.2, "cm"),
  axis.line = element_line(linewidth=1, colour = "black"),legend.key = element_blank())

library(ggrepel)

sig = subset(all,subset = c(all$P.Value.x & all$P.Value.y < 0.05))

p=ggplot(data=all,aes(x=logFC.y,y=logFC.x))+
  geom_point(aes(color = Group.x,shape = Significance))+
  labs(x='Singed logfc RNA', y='Singed logfc Proteomics',color = 'Group')+
  themes+
  scale_color_manual(values = c('#244689','red'))+
  scale_shape_manual(values = c(17,16))+
  geom_text_repel(data = sig,aes(label=sample.x),size=3)
p
#保存为pdf
ggsave('D:/Lessons/Bioinformatics/Pj_result/Fig1E_dotplot.pdf',width = 8,height = 6)

7、臂级体细胞拷贝数改变 (SCNA) 事件分析

###1) 数据下载与读入(TCGA-COAD/READ)

#TCGA的MAF文件下载:https://mp.weixin.qq.com/s/oU_bgGg1vQqhwrkPHjzPwg
#https://www.jianshu.com/p/40bbb29d355d

#批量读入所有maf文件名:
mafFilePath <- dir(path = "D:/Lessons/Bioinformatics/Pj_data/TCGA/SCNV",
pattern = "masked.maf.gz$",
full.names = T,
recursive=T)
head(mafFilePath)
###2) 数据读入

#读入临床信息
#laml.clin =system.file('extdata',
#     'tcga_laml_annot.tsv', package = 'maftools') 

#read.maf函数读取每一个maf文件(会遇到Error in read.maf(x, isTCGA = TRUE) : No non-synonymous mutations found Check `vc_nonSyn`` argumet in `read.maf` for details的报错,删除该未能读入的文件即可)

mafdata <- lapply(mafFilePath,
function(x){
read.maf(x,
isTCGA=TRUE) # clinicalData = laml.clin
})
###3) 数据合并
#使用merge_mafs函数合并maf数据:
maf <- merge_mafs(mafdata)
##maf文件摘要图:
plotmafSummary(maf = maf,
rmOutlier = TRUE,
addStat = "median",
dashboard = TRUE,
titvRaw = FALSE)
###4)可视化(可参考https://mp.weixin.qq.com/s/pBDue6hOf0DrTZMN0rB2sA)
##瀑布图(Oncoplots):
oncoplot(maf = maf,
#draw_titv = FALSE,
#anno_height = 1,#样本注释区高度
#legend_height = 4, #图例绘图区高度
#drawRowBar = T, #是否显示右侧条形图
#drawColBar = T, #是否显示顶部条形图
top = 10) #高频突变的Top10基因
#体细胞突变相关性热图
somaticInteractions(
  maf,
  top = 20,
  genes = NULL,
  pvalue = c(0.05, 0.1),
  returnAll = TRUE,
  geneOrder = NULL,
  fontSize = 0.6,
  showSigSymbols = TRUE,
  showCounts = FALSE,
  countStats = "all",
  countType = "all",
  countsFontSize = 0.6,
  countsFontColor = "red",
  colPal = "BrBG",
  showSum = TRUE,
  plotPadj = FALSE,
  colNC = 9,
  nShiftSymbols = 5,
  sigSymbolsSize = 2,
  sigSymbolsFontSize = 0.8,
  pvSymbols = c(46, 42),
  limitColorBreaks = TRUE
)

8、非整倍体评分分析

#加载TCGA aneuploidy score数据
aneuploidy_data <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/Aneuploidy_score.csv",header=T,row.names=1)
# 因子化
aneuploidy_data$KRAS <- factor(aneuploidy_data$KRAS,levels = c("WT","Mut"))
#箱线图
library(ggpubr)
color <- c("#01665e","#c68f3f")
my_comparisons <- list(c("WT", "Mut"))
#可视化
  pb1<-ggboxplot(aneuploidy_data,
                 x="KRAS",
                 y="Aneuploidy_Score",
                 color="KRAS",
                 fill=NULL,
                 ylab = "Aneuploidy Score",
                 add = "jitter",
                 bxp.errorbar.width = 0.6,
                 width = 0.4,
                 size=0.5,
                 font.label = list(size=30), 
                 palette = color)+
  theme(panel.background =element_blank())
  pb1<-pb1+theme(axis.line=element_line(colour="black"))+theme(axis.title.x = element_blank())
  pb1<-pb1+theme(axis.text.x = element_text(size = 15,angle = 45,vjust = 1,hjust = 1))
  pb1<-pb1+theme(axis.text.y = element_text(size = 12))+theme(plot.title = element_text(hjust = 0.5,size=15,face="bold"))
  pb1<-pb1+theme(legend.position = "NA")
  pb1<-pb1+stat_compare_means(method="wilcox.test",hide.ns = F,comparisons =my_comparisons,label="p.signif")
pb1

9、免疫浸润分析

#矩阵读入
gene_expression_matrix <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem.csv",header = T,row.names = 1,check.names=F)

#反卷积方法(人):quantiseq/timer/cibersort/cibersort_abs/mcp_counter/xcell/epic/abis/consensus_tme/estimate
#反卷积方法(鼠): mmcp_counter/seqimmucc/dcq/base

#反卷积计算
xcell_TCGA <- immunedeconv::deconvolute(gene_expression_matrix, "xcell")
write.csv(xcell_TCGA, "D:/Lessons/Bioinformatics/Pj_result/xcell_TCGA.csv")
#读入xcell结果
xcell_TCGA <- read.csv("D:/Lessons/Bioinformatics/Pj_result/xcell_TCGA.csv",header = T,row.names = 1, check.names = F)
#读入KRAS分组信息
kras_group <- read.csv("D:/Lessons/Bioinformatics/Pj_data/TCGA/data_mrna_seq_v2_rsem_group.csv",header = T,row.names = 1, check.names = F)
colnames(xcell_TCGA) <- c("cell_type",kras_group$group)
#宽数据转成长数据
library(reshape2)
dd=melt(xcell_TCGA)
colnames(dd)=c("cell_type", "KRAS", "Fraction")
#因子转换
dd$KRAS <- factor(dd$KRAS, levels = c("WT","Mut"))
#箱线图
p1 <- ggplot(dd,aes(x=cell_type, y=Fraction, 
                      fill = KRAS, color = KRAS)) +
  geom_boxplot(notch = F, alpha = 0.15, 
               outlier.shape = 1,
               outlier.colour = "black", #outlier点用黑色
               outlier.size = 0.05) +
  scale_fill_manual(values= c("#01665e","#c68f3f")) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1,size = 10), 
        axis.text.y = element_text(angle = 90, size = 12),
        axis.title.y = element_text(angle = 90, size = 15)) +
  theme(legend.position = "top")+
  stat_compare_means(symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), 
                                      symbols = c("***", "**", "*", "ns")),
                     label = "p.signif")
p1
ggsave(p1,file='D:/Lessons/Bioinformatics/Pj_result/Fig2A_xcell.pdf',width = 12,height = 5)

10、多组学分子亚型聚类

#读入数据
#磷酸化蛋白组
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Phosphoproteome.csv",header = T,row.names = 1,check.names=F)
#蛋白质组
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Proteome.csv",header = T,row.names = 1,check.names=F)
#转录组
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_RNAseq.csv",header = T,row.names = 1,check.names=F)
#分组文件(注意保证表达矩阵中的样本顺序和group文件分组顺序是一一对应的)
cptac_sample <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group.csv",header = T)
#看一下数据分布
data.checkDistribution(phos_expr)
data.checkDistribution(prot_expr)
data.checkDistribution(RNA_expr)
###磷酸化蛋白组
#处理缺失值
index = which(is.na(phos_expr))
res_phos = data.imputation(phos_expr,fun="median")
#数据 Normalization
res_phos = data.normalization(res_phos,type="feature_Median",log2=FALSE)
#特征选择——方差、MAD、PCA、COX模型
res_phos = FSbyVar(res_phos, cut.type="topk",value=1000)
###蛋白组
#处理缺失值
index = which(is.na(prot_expr))
res_prot = data.imputation(prot_expr,fun="median")
#数据 Normalization
res_prot = data.normalization(res_prot,type="feature_Median",log2=FALSE)
#特征选择——方差、MAD、PCA、COX模型
res_prot = FSbyVar(res_prot, cut.type="topk",value=1000)
###转录组
#处理缺失值
index = which(is.na(RNA_expr))
res_RNA = data.imputation(RNA_expr,fun="median")
#数据 Normalization
res_RNA = data.normalization(res_RNA,type="feature_Median",log2=FALSE)
#特征选择——方差、MAD、PCA、COX模型
res_RNA = FSbyVar(res_RNA, cut.type="topk",value=1000)
#选出KRAS突变型的样本数据
cptac_sample <- cptac_sample[cptac_sample$group == "Mut",]
res_phos <- res_phos[,cptac_sample$sample]
res_prot <- res_prot[,cptac_sample$sample]
res_RNA <- res_RNA[,cptac_sample$sample]
#从97个样本中得到了34个
#NMF
CRC=list(RNAExp=res_RNA, ProtExp=res_prot, PhosExp = res_phos)
result=ExecuteSNF(CRC, clusterNum=2, K=20, alpha=0.5, t=20)
#癌症亚型的结果验证、解释和可视化
###Similarity smaple matrix
sil=silhouette_SimilarityMatrix(result$group, result$distanceMatrix)
plot(sil)
#生存分析
result=ExecuteSNF(CRC, clusterNum=2, K=20, alpha=0.5, t=20,plot = FALSE)
group=result$group
distanceMatrix=result$distanceMatrix
data(time) #事实上并没有找到CPTAC生存数据
data(status)
time = time[1:34]
status = status[1:34]
p_value=survAnalysis(mainTitle="CRC",time,status,group,
                     distanceMatrix,similarity=TRUE)

11、随机森林筛选变量

#读入表达矩阵与样本分组信息
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/phos_expr.csv",header = T,row.names = 1,check.names=F)
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/prot_expr.csv",header = T,row.names = 1,check.names=F)
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/RNA_expr.csv",header = T,row.names = 1,check.names=F)
#随机森林
library(caret)
set.seed(202)
phos_expr <- t(phos_expr)

prot_expr <- t(prot_expr)
prot_expr <-cbind(prot_expr,cptac_sample)
predictors<-colnames(phos_expr)[1:10]
model <- train(x = dat[,predictors], 
               y = dat$Cmic,
               method = "rf",
               importance = TRUE,
               tuneGrid = expand.grid(mtry = c(2:4)), # length(predictors) or 2:6
               trControl = trainControl(method = "cv", 
                                        number = 20,
                                        p = 0.75,
                                        savePredictions = TRUE))
#输出模型的RSEM和R方
model$results %>% as_tibble %>% filter(mtry == model$bestTune %>% unlist) %>% select(RMSE, Rsquared)
#棒棒糖图展示模型重要性
varImp(model, scale = FALSE) %>% plot

12、SMG分析

#GenvisR绘制瀑布图
library(FField)
library(GenVisR)
library(reshape2)
#读取maf矩阵
maf_file= read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/final_oncotator_all.csv",header = T)
#读取临床信息
maf_clin = read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_Clinical.csv",header = T)
maf_clin = maf_clin[,c(1:4,7,9,16)]
colnames(maf_clin) <- c('sample', 'Age', 'Gender',"Survival_Status","Stage","MSI Status","KRAS")
clinical <- melt(maf_clin, id.vars = c("sample"))

waterfall(maf_file, fileType = "MAF") #会报错
factor(maf_file[,"Variant_Classification"])
#删去GenviR不能识别的类型
maf_file = subset(maf_file, Variant_Classification != "Splice_Region")
maf_file = subset(maf_file, Variant_Classification != "De_novo_Start_InFrame")
maf_file = subset(maf_file, Variant_Classification != "De_novo_Start_OutOfFrame")
maf_file = subset(maf_file, Variant_Classification != "Stop_Codon_Ins")
maf_file = subset(maf_file, Variant_Classification != "Stop_Codon_Del")
maf_file = subset(maf_file, Variant_Classification != "Start_Codon_SNP")
maf_file = subset(maf_file, Variant_Classification != "Start_Codon_Ins")
maf_file = subset(maf_file, Variant_Classification != "Start_Codon_Del")
maf_file = subset(maf_file, Variant_Classification != "lincRNA")

#瀑布图
waterfall(maf_file, fileType = "MAF", clinDat = clinical, 
          clinVarCol = c(Live = "white", Dead = "black",Female = "firebrick2",Male = "lightblue",
                         KM1 = "blue",KM2="red",WT="yellow",
                         "65+" ="green2","65-" ="green4",
                         "I" = "#ddd1e7", "II" = "#bba3d0", "III" = "#9975b9", "IV" = "#7647a2"),
          plotGenes = c("KRAS","APC","TP53","ARID1A","PIK3CA","TCF7L2","FBXW7","BRAF",
                        "SMAD4","SMAD2","MSH6","CTNNB1","AMER1","ERBB2","CDC27","ERBB3",
                        "B2M","AXIN2","ARID2","NRAS","MAP2K4","ACVR1B","PCBP1")) 
#maftools包绘制顺反式图

#读入MAF文件
maf = read.maf(maf = "D:/Lessons/Bioinformatics/Pj_data/CPTAC/final_oncotator_all.maf", clinicalData = maf_file)
#绘制顺反式图
titv = titv(maf = maf, plot = FALSE, useSyn = TRUE)
# plot titv summary
plotTiTv(res = titv)
library(deconstructSigs)
library(BSgenome.Hsapiens.UCSC.hg19)
#提取碱基突变数据
sample.mut.ref <- maf_file[,c("Tumor_Sample_Barcode","Chromosome","Start_position","Reference_Allele","Tumor_Seq_Allele2")]
#将染色体列加上"Chr"
a <- paste("chr",maf_file[,"Chromosome"],sep = "")
sample.mut.ref$chr <- a

#使用mut.to.sigs.input构建输入文件
sigs.input <- mut.to.sigs.input(mut.ref = sample.mut.ref,
                               sample.id = "Tumor_Sample_Barcode",
                               chr = "chr",
                               pos = "Start_position",
                               ref = "Reference_Allele",
                               alt = "Tumor_Seq_Allele2",
                               bsg = BSgenome.Hsapiens.UCSC.hg19)
#查看结果信息
dim(sigs.input)

# 使用whichSignatures推断signature的组成
sample_1 = whichSignatures(tumor.ref = sigs.input,
                         signatures.ref = signatures.cosmic,
                         sample.id = "01CO005",
                         contexts.needed = TRUE,
                         tri.counts.method = 'default')

# Plot example
plot_example <- whichSignatures(tumor.ref = sigs.input,
                      signatures.ref = signatures.cosmic,
                      sample.id = "01CO005",
                      contexts.needed = TRUE)
# Plot output
plotSignatures(plot_example, sub = 'example')
#乐高图
library(barplot3d)
# Read in COSMIC signature probabilities
x=system.file("extdata", "signature_probabilities.txt", package = "barplot3d")
sigdata=read.table(x,header=TRUE,stringsAsFactors = FALSE)
# 输入文件的顺序必须与此一致
cat(sigdata$Somatic_mutation_type,sep="\n")

#使用自己的数据绘制乐高图
legoplot3d(contextdata=sample_1$tumor,labels=FALSE,scalexy=0.03)
#参数调整
legoplot3d(contextdata=sample_1$tumor,labels=FALSE,scalexy=0.01,sixcolors="broad",alpha=0.4)

13、KRAS亚型的功能注释

#选择Hallmark数据库
genesets_Hallmark <-  msigdbr(species = "Homo sapiens",
                     category = "H")
Hallmark_df <- dplyr::select(genesets_Hallmark,gs_name,gs_exact_source,gene_symbol)
Hallmark_list <- split(Hallmark_df$gene_symbol, Hallmark_df$gs_name) ##按照gs_name给gene_symbol分组
#读取经过缺失值插补的转录组和蛋白组表达矩阵
prot_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/prot_expr_pmm.csv",header = T,row.names = 1,check.names=F)
RNA_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/RNA_expr_pmm.csv",header = T,row.names = 1,check.names=F)
cptac_clinical <- cptac_clinical[,cptac_clinical$KRAS_mutation_subtype%in% c("KM1","KM2")]
Error in `[.data.frame`(cptac_clinical, , cptac_clinical$KRAS_mutation_subtype %in%  : 
  选择了未定义的列
#导入分组
cptac_clinical <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group_original.csv",header = T,row.names = 1,check.names=F)
colnames(RNA_expr) <- cptac_clinical$KRAS_mutation_subtype
#保留KRAS突变的蛋白组数据
RNA_KRAS <- RNA_expr[,colnames(RNA_expr) %in% c("KM1","KM2")]
cptac_clinical <- cptac_clinical[cptac_clinical$KRAS_mutation_subtype%in% c("KM1","KM2"),]
colnames(RNA_KRAS) <- cptac_clinical$code
#ssGSEA: 蛋白组
prot_KRAS <- as.matrix(prot_KRAS)
prot_ssGSEA = gsva(prot_KRAS, Hallmark_list, method="ssgsea",
               min.sz=1, max.sz=Inf,
               kcdf="Gaussian"
)
# 去掉"HALLMARK_"
row.names(prot_ssGSEA)=substring(row.names(prot_ssGSEA), 10)
#ssGSEA: 转录组
RNA_KRAS <- as.matrix(RNA_KRAS)
RNA_ssGSEA = gsva(RNA_KRAS, Hallmark_list, method="ssgsea",
               min.sz=1, max.sz=Inf,
               kcdf="Gaussian"
)
# 去掉"HALLMARK_"
library(stringr)
row.names(RNA_ssGSEA)=substring(row.names(RNA_ssGSEA), 10)
#limma差异分析:蛋白组
# 设置或导入分组
design <- model.matrix(~factor(cptac_clinical$KRAS_mutation_subtype) + 0)
colnames(design) <- c("KM1","KM2")
#构建比较矩阵
contrast.matrix <- makeContrasts(KM1 - KM2, levels = factor(cptac_clinical$KRAS_mutation_subtype))#实验组vs对照组
#线性混合模拟
fit <- lmFit(prot_ssGSEA,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
prot_DEG <- topTable(fit2,
                #adjust = 'BH',
                coef = 1,
                n = Inf,
                sort.by="logFC"
                #p.value = 0.05
)
sig_prot_DEG = subset(prot_DEG, P.Value < 0.05)
prot_sig=prot_ssGSEA[row.names(sig_prot_DEG),]
#limma差异分析:转录组
# 设置或导入分组
design <- model.matrix(~factor(cptac_clinical$KRAS_mutation_subtype) + 0)
colnames(design) <- c("KM1","KM2")
#构建比较矩阵
contrast.matrix <- makeContrasts(KM1 - KM2, levels = factor(cptac_clinical$KRAS_mutation_subtype))#实验组vs对照组
#线性混合模拟
fit <- lmFit(RNA_ssGSEA,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
RNA_DEG <- topTable(fit2,
                #adjust = 'BH',
                coef = 1,
                n = Inf,
                sort.by="logFC"
                #p.value = 0.05
)
sig_RNA_DEG = subset(RNA_DEG, P.Value < 0.05)
RNA_sig=RNA_ssGSEA[row.names(sig_RNA_DEG),]
#差异分析可视化:蛋白组差异通路分组热图
library("pheatmap")
annotation_col= data.frame(Group = factor(cptac_clinical$KRAS_mutation_subtype))
row.names(annotation_col) = colnames(prot_sig)
annotation_colors=list(Group = c("KM1" = "#00C094", "KM2" = "#f58220"))
pheatmap(prot_sig, show_rownames = T,scale = "row",
         annotation_col = annotation_col,
         annotation_colors=annotation_colors,
         gaps_col = c(17),
         fontsize = 10,
         cluster_cols = F,
         cluster_rows = T,
         show_colnames = F,
         cutree_rows =2
         )
library("pheatmap")
annotation_col= data.frame(Group = factor(cptac_clinical$KRAS_mutation_subtype))
row.names(annotation_col) = colnames(RNA_sig)
annotation_colors=list(Group = c("KM1" = "#00C094", "KM2" = "#f58220"))
pheatmap(prot_sig, show_rownames = T,scale = "row",
         annotation_col = annotation_col,
         annotation_colors=annotation_colors,
         gaps_col = c(17),
         fontsize = 10,
         cluster_cols = F,
         cluster_rows = T,
         show_colnames = F,
         cutree_rows =2
         )
#磷酸化位点差异分析
#读取经过缺失值插补的磷酸蛋白组表达矩阵
phos_expr <- read.csv("D:/Lessons/Bioinformatics/Pj_result/phos_expr_pmm.csv",header = T,row.names = 1,check.names=F)
#导入分组
cptac_clinical <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group_original.csv",header = T,row.names = 1,check.names=F)
colnames(phos_expr) <- cptac_clinical$KRAS_mutation_subtype
#保留KRAS突变的蛋白组数据
phos_KRAS <- phos_expr[,colnames(phos_expr) %in% c("KM1","KM2")]
cptac_clinical <- cptac_clinical[cptac_clinical$KRAS_mutation_subtype%in% c("KM1","KM2"),]
colnames(phos_KRAS) <- cptac_clinical$code
#limma差异分析:转录组
# 设置或导入分组
design <- model.matrix(~factor(cptac_clinical$KRAS_mutation_subtype) + 0)
colnames(design) <- c("KM1","KM2")
#构建比较矩阵
contrast.matrix <- makeContrasts(KM1 - KM2, levels = factor(cptac_clinical$KRAS_mutation_subtype))#实验组vs对照组
#线性混合模拟
fit <- lmFit(phos_KRAS,design)#非线性最小二乘法,线性模型拟合
fit2 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit2)#用经验贝叶斯调整t-test中方差的部分
#输出差异基因
phos_DEG <- topTable(fit2,
                #adjust = 'BH',
                coef = 1,
                n = Inf,
                sort.by="logFC"
                #p.value = 0.05
)
# 绘制火山图

p_MONO <- EnhancedVolcano(phos_DEG,
                          lab = rownames(phos_DEG),
                          x = 'logFC',
                          y =  'P.Value',
                          title = 'KM1 vs KM2',
                          pointSize = 3.0,
                          labSize = 6.0,
                          legendPosition = 'right',
                          pCutoff = 0.05,
                          FCcutoff = 1)
p_MONO
#KSEA富集分析
library(KSEAapp)
KSData=read.csv('PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv',
                 header = T)#注释文件
PX=read.csv('Diff_phos.csv',
             header = T)#磷酸化数据文件
KSEA.Barplot(KSData,#注释文件
             PX,#磷酸化数据文件
             NetworKIN = T,#是否包括NetworKIN预测结果
             NetworKIN.cutoff = 5,#NetworKIN预测打分阈值
             m.cutoff = 5,#激酶对应底物的最小数量
             p.cutoff = 0.05,#p值
             export = F)#是否将图导出

14、KRAS亚型的免疫浸润分析

###xCell免疫浸润分析
#矩阵读入
gene_expression_matrix <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_COAD_RNAseq.csv",header = T,row.names = 1,check.names=F)

#反卷积方法(人):quantiseq/timer/cibersort/cibersort_abs/mcp_counter/xcell/epic/abis/consensus_tme/estimate
#反卷积方法(鼠): mmcp_counter/seqimmucc/dcq/base

#反卷积计算
xcell_TCGA <- immunedeconv::deconvolute(gene_expression_matrix, "xcell")
write.csv(xcell_TCGA, "D:/Lessons/Bioinformatics/Pj_result/xcell_TCGA.csv")
#读入KRAS分组信息
kras_group <- read.csv("D:/Lessons/Bioinformatics/Pj_data/CPTAC/CPTAC_group_original.csv",header = T)
kras_group <- kras_group[,-c(2,4)]
colnames(xcell_TCGA) <- c("cell_type",kras_group$KRAS_mutation_subtype)
#宽数据转成长数据
library(reshape2)
dd=melt(xcell_TCGA)
colnames(dd)=c("cell_type", "KRAS", "Fraction")
#因子转换
dd$KRAS <- factor(dd$KRAS, levels = c("WT","KM1","KM2"))
#箱线图
p1 <- ggplot(dd,aes(x=cell_type, y=Fraction, 
                      fill = KRAS, color = KRAS)) +
  geom_boxplot(notch = F, alpha = 0.15, 
               outlier.shape = 1,
               outlier.colour = "black", #outlier点用黑色
               outlier.size = 0.05) +
  scale_fill_manual(values= c("#01665e","#c68f3f","black")) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1,size = 10), 
        axis.text.y = element_text(angle = 90, size = 12),
        axis.title.y = element_text(angle = 90, size = 15)) +
  theme(legend.position = "top")+
  stat_compare_means(symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), 
                                      symbols = c("***", "**", "*", "ns")),
                     label = "p.signif")
p1
ggsave(p1,file='D:/Lessons/Bioinformatics/Pj_result/Fig2A_xcell.pdf',width = 12,height = 5)

Last updated