> For the complete documentation index, see [llms.txt](https://liulab-1.gitbook.io/liulab-docs/llms.txt). Markdown versions of documentation pages are available by appending `.md` to page URLs; this page is available as [Markdown](https://liulab-1.gitbook.io/liulab-docs/drylab-protocols/scrnaseq-seurat.md).

# scRNAseq-seurat

{% hint style="info" %}
示例数据与代码链接：<https://pan.baidu.com/s/1yp\\_xd7kyPoI4vqPHiPBO6w> 提取码：ybu9&#x20;
{% endhint %}

本章节只包含基础的分析流程，更多进阶分析参见下一章节“scRNA-scanpy”；

本章节示例代码基于seurat 4.3.0.1版本，出现报错可考虑重新安装为4.3.0版本seurat。

## R包安装

<pre><code><strong>## 设置镜像，加快安装速度
</strong>options(repos=structure(c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))) 
## 查看镜像源
options()$repos

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

install.packages("devtools")
install.packages("R.utils")
install.packages("Seurat")
install.packages("pheatmap") 
install.packages("tidyverse") 
install.packages("magrittr") # 管道使用 R 包
install.packages("Matrix")
devtools::install_github('satijalab/seurat-data') 
remotes::install_github('satijalab/seurat-wrappers') ## 可能比较慢, 请耐心等待
remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')
BiocManager::install("batchelor")
install.packages("harmony")
BiocManager::install("Biostrings",version = '3.14', force = TRUE)
BiocManager::install("celldex") 
BiocManager::install("SingleR")
BiocManager::install("clusterProfiler") ## 自动安装 "GO.db"
BiocManager::install("org.Hs.eg.db") ## 可能比较耗时, 请耐心等待
</code></pre>

## R包加载

```
library(Seurat)
library(SeuratData)
library(patchwork)
library(ggplot2)
library(batchelor)
library(SeuratWrappers)
library(magrittr)
library(tidyverse)
library(clusterProfiler)
library(GO.db)
library(org.Hs.eg.db)
library(DOSE)
library(DoubletFinder)
library(SingleR)
library(celldex)
library(harmony)
library(pheatmap)
```

## 1、单细胞数据读入

```
#方法一
# 读入数据,使用目录向量合并
#Read10X函数前提是提供了3个10X文件，名字分别为barcodes.tsv.gz，features.tsv.gz，matrix.mtx.gz

rm(list=ls())
obj_dir <- c("E:/Protocol and tutorial/scRNAseq/2.单细胞转录组基础分析/HNC01TIL","E:/Protocol and tutorial/scRNAseq/2.单细胞转录组基础分析/Tonsil2") 
names(obj_dir) = c("HNC01TIL","Tonsil2")

counts <- Read10X(data.dir = obj_dir)

scRNA1 = CreateSeuratObject(counts, min.cells=1) #创建Seurat对象
scRNA1 = subset(scRNA1, downsample=1000,seed=42) #从两个样本中各提取1000个细胞，减少后续运算所需的内存与时间
dim(scRNA1)
table(scRNA1@meta.data$orig.ident)

#保存读入的rds
saveRDS(scRNA1, "Step1_merged_sample.rds")

#生成seurat对象list
scRNAlist <- SplitObject(scRNA1, split.by = "orig.ident")
```

```
#方法二
#使用merge函数合并seurat对象
scRNA2list <- list()
#以下代码会把每个样本的数据创建一个seurat对象，并存放到列表scRNAlist里
for(i in 1:length(obj_dir)){
  counts <- Read10X(data.dir = obj_dir[i])
  
  scRNA2list[[i]] <- CreateSeuratObject(counts, min.cells=1)
}

#使用merge函数将2个seurat对象合并成一个seurat对象
scRNA2 <- merge(scRNA2list[[1]], y=scRNA2list[[2]])
scRNA2 = subset(scRNA2, downsample=1000,seed=42)
table(scRNA2@meta.data$orig.ident)
sum(row.names(scRNA1@meta.data) == row.names(scRNA2@meta.data)) #检验两个object之间的细胞是否一致
```

```
#txt矩阵的读入
#部分公共数据只提供了txt格式的表达矩阵

#1、单个读入
counts = read.table("matrix.txt", header = T, row.names = 1)

#2、批量读入

# 设置数据目录
data_dir <- "E:/Research/17.MMY/GSE167297"

# 获取所有矩阵文件
files <- list.files(data_dir, pattern = "CountMatrix\\.txt$", full.names = TRUE)

# 构建函数：读取 + 创建 Seurat 对象
read_to_seurat <- function(f){
  # 提取文件基础名（不含路径）
  fname <- basename(f)
  
  # 去掉后缀，得到样本名（如 Pt1_Deep）
  sample_id <- str_remove(fname, "_CountMatrix\\.txt$")
  
  message("Processing: ", sample_id)
  
  # 读取矩阵（假设行为基因、列为细胞；若相反请 transpose）
  mat <- read.table(f, header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)
  
  # 若读取为data.frame -> 转为matrix
  mat <- as.matrix(mat)
  
  # 创建 Seurat 对象
  obj <- CreateSeuratObject(
    counts = mat,
    project = sample_id,
    min.cells = 1,
    min.features = 0
  )
  
  # 设置 orig.ident
  obj$orig.ident <- sample_id
  
  return(obj)
}

# 批量读取为列表
obj_list <- lapply(files, read_to_seurat)
names(obj_list) <- sapply(files, function(f) str_remove(basename(f), "_CountMatrix\\.txt$"))

# 若需要合并为一个 Seurat 对象
combined <- Reduce(function(x, y) merge(x, y), obj_list)

combined

```

## 2、单细胞数据质控

### 2.1 去除双细胞

```
# remove doublets
obj = SplitObject(scRNA1, split.by = "orig.ident") #将整合的SeuratObject重新分为两个SeuratObject
obj_rm=list() #创造一个空的list，用于后续盛放去除双细胞后Seurat对象
doublets_plot = list()
pc.num = 1:20 #定义pc维数
dir.create("SingleCell_QC") #在当前目录下新建一个文件夹，用于存放QC结果
RemoveDoublets <-function(
    object,
    doublet.rate,
    sample_name,
    pN=0.25,
    pc.num=1:30,
    use.SCT=FALSE
  ){
    ## 寻找最优pK值
    sweep.res.list <- paramSweep_v3(object, PCs = pc.num, sct = F) #使用log标准化，sct=F，如使用SCT标准化，则为T
    sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)  
    bcmvn <- find.pK(sweep.stats) #可以看到最佳参数的点
    pK_bcmvn <- bcmvn$pK[which.max(bcmvn$BCmetric)] %>% as.character() %>% as.numeric()
    ## 排除不能检出的同源doublets，优化期望的doublets数量
    homotypic.prop <- modelHomotypic(object$seurat_clusters)   # 最好提供celltype
    nExp_poi <- round(doublet.rate*ncol(object)) 
    nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
    seu.scored <- doubletFinder_v3(object, PCs = pc.num, pN = 0.25, pK = pK_bcmvn, 
                                   nExp = nExp_poi.adj, reuse.pANN = F, sct = F)
    # pick out doublets
    cname <-colnames(seu.scored[[]])
    DF<-cname[grep('^DF',cname)]
    seu.scored[["doublet"]] <- as.numeric(seu.scored[[DF]]=="Doublet")
    
    # remove doublets
    seu.removed <- subset(seu.scored, subset = doublet != 1)
    p1 <- DimPlot(seu.scored, group.by = DF)
    res.list <- list("plot"=p1, "obj"=seu.removed)
    return(res.list)
  }

for( i in names(obj)){
    obj[[i]] <- NormalizeData(obj[[i]])
    obj[[i]] <- FindVariableFeatures(obj[[i]], selection.method = "vst", nfeatures = 2000)
    obj[[i]] <- ScaleData(obj[[i]])
    obj[[i]] <- RunPCA(obj[[i]])
    obj[[i]] <- RunUMAP(obj[[i]], dims = 1:20)
    obj[[i]] <- FindNeighbors(obj[[i]], dims = pc.num) %>% FindClusters(resolution = 0.3)
    tmp <- RemoveDoublets(obj[[i]], doublet.rate=0.008,sample_name=i,pc.num=pc.num)
    #直接查表，1000细胞对应的doublets rate是0.8%，10000细胞对应的doublets rate是～7.6%
    obj_rm[[i]] <- tmp$obj
    doublets_plot[[i]] <- tmp$plot
  }
```

```
doublets_plot[["Tonsil2"]]
```

```
doublets_plot[["HNC01TIL"]]
```

### 2.2 去除线粒体、红细胞的影响

```
#质控
scRNA <- merge(obj_rm[[1]], y=obj_rm[[2]]) #重新整合去除双细胞后的两个SeuratObject
Idents(scRNA) <- 'orig.ident' #确立细胞身份的主要标签，此处是样本来源（即Tonsil2和HNC01TIL）

# 计算线粒体基因比例
scRNA[["percent.mt"]] = PercentageFeatureSet(scRNA, 
                                                   pattern = "^MT-") #因为人的线粒体基因是MT-开头的，所以可以通过这种方式匹配，如果是小鼠的话，是以“mt-”开头的。其他特殊物种可以直接提供线粒体基因的列表

#计算红细胞基因比例
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB_m <- match(HB.genes, rownames(scRNA@assays$RNA)) 
HB.genes <- rownames(scRNA@assays$RNA)[HB_m] 
HB.genes <- HB.genes[!is.na(HB.genes)] 
scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes) #此处就是根据提供的红细胞基因列表计算红细胞基因表达的比例

# 数据过滤前，对 nFeature数、nCount 数,线粒体基因比例和红细胞基因比例进行可视化
beforeQC_vlnplot = VlnPlot(scRNA, 
                           features = c("nFeature_RNA", 
                                        "nCount_RNA", 
                                        "percent.mt",
                                        "percent.HB"), 
                           ncol = 4, 
                           pt.size = 0.1)
dir.create("SingleCell_QC")
ggsave("SingleCell_QC/BeforeQC_nFeature_nCount_percent.mt_percent.HB_vlnplot.pdf", plot = beforeQC_vlnplot)
ggsave("SingleCell_QC/BeforeQC_nFeature_nCount_percent.mt_percent.HB_vlnplot.png", plot = beforeQC_vlnplot)
beforeQC_vlnplot
```

```
#=== 对 nFeature、线粒体基因比例与 nCount 的关系进行可视化 ===#
plot1 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.HB")

pearplot <- CombinePlots(plots = list(plot1, plot2, plot3), nrow=1, legend="none") 
ggsave("SingleCell_QC/pearplot_before_qc.pdf", plot = pearplot, width = 12, height = 5) 
ggsave("SingleCell_QC/pearplot_before_qc.png", plot = pearplot, width = 12, height = 5)
plot1
plot2
plot3
```

```
# 根据基因数和线粒体比例进行数据过滤
minGene=500 #这里设置了比较严格的条件，一般可以设置200，保留较多的细胞用于后续分析，保留细胞类型多样性，由于今天演示，为保证细胞的基因表达数量足够，设置得较高。
maxGene=3000 #也可以设置为6000
pctMT=10 #也可以设置成20

scRNA = subset(scRNA, 
                     subset = nFeature_RNA > minGene & nFeature_RNA < maxGene & percent.mt < pctMT)

afterQC_vlnplot = VlnPlot(scRNA, 
                           features = c("nFeature_RNA", 
                                        "nCount_RNA", 
                                        "percent.mt",
                                        "percent.HB"), 
                           ncol = 4, 
                           pt.size = 0.1)

ggsave("SingleCell_QC/afterQC_nFeature_nCount_percent.mt_percent.HB_vlnplot.pdf", plot = afterQC_vlnplot)
ggsave("SingleCell_QC/afterQC_nFeature_nCount_percent.mt_percent.HB_vlnplot.png", plot = afterQC_vlnplot)
afterQC_vlnplot
saveRDS(scRNA,"Step2_After_QC.rds")
```

## 3、数据归一化与标准化

```
#方法一：NormalizeData+FindVariableFeatures+ScaleData
scRNA <- NormalizeData(scRNA)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst")
scRNA <- ScaleData(scRNA, features = VariableFeatures(scRNA))

#方法二：也可以使用SCTransform进行归一化
scRNA <- SCTransform(scRNA, vars.to.regress = c("nCount_RNA","nCount_RNA","percent.mt", "percent.HB"), verbose = F) #排除线粒体基因等干扰
```

```
#检查seurat对象的结构
str(scRNA)
```

```
#稀疏矩阵的格式，没有经过归一化
scRNA@assays$RNA@counts[1:6,1:60]
```

```
#稀疏矩阵的格式，经过归一化
scRNA@assays$RNA@data[1:6,1:60]
```

```
#meta.data内的信息
head(scRNA@meta.data)
```

### 4、数据批次矫正与聚类

```
#查看pca结果
scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA))
plot1 <- DimPlot(scRNA, reduction = "pca", group.by="orig.ident")
plot2 <- ElbowPlot(scRNA, ndims=30, reduction="pca") 

plot3 <- plot1+plot2
dir.create("Clustering")
ggsave("Clustering/pca.png", plot = plot3, width = 8, height = 4)
plot3
```

```
##细胞聚类
scRNA <- FindNeighbors(scRNA, dims = pc.num) #pc.num可以更改
scRNA <- FindClusters(scRNA, resolution = 0.5) #resolution可以更改，数字越小，分群越少
table(scRNA@meta.data$seurat_clusters)
metadata <- scRNA@meta.data
cell_cluster <- data.frame(cell_ID=rownames(metadata), cluster_ID=metadata$seurat_clusters)
write.csv(cell_cluster,'Clustering/cell_cluster.csv',row.names = F, quote = F)

#UMAP
scRNA <- RunUMAP(scRNA, dims = pc.num) #这一步有时候有点慢，pc.num可以更改

#group_by_cluster
plot3 = DimPlot(scRNA, reduction = "umap", label=T) 
ggsave("Clustering/UMAP.png",plot3)
plot3
```

```
#group_by_sample
plot4 = DimPlot(scRNA, reduction = "umap", group.by='orig.ident')

#split_by_sample
plot5 = DimPlot(scRNA, reduction = "umap", split.by='orig.ident')

plotc <- plot4+plot5 
ggsave("Clustering/UMAP_cluster_sample.png", plot = plotc, width = 10, height = 5)
plotc
saveRDS(scRNA,"Step3_Clustering.rds")
```

### 4.1 批次效应矫正方法一：MNN

```
#去批次后进行聚类
scRNAlist <- SplitObject(scRNA, split.by = "orig.ident")
scRNAlist <- lapply(scRNAlist, FUN = function(x) NormalizeData(x)) #如果前面用的是SCTransform就不能这样做了
scRNAlist <- lapply(scRNAlist, FUN = function(x) FindVariableFeatures(x))
scRNA_mnn <- RunFastMNN(object.list = scRNAlist)
scRNA_mnn <- FindVariableFeatures(scRNA_mnn)
scRNA_mnn <- RunUMAP(scRNA_mnn, reduction = "mnn", dims = 1:20) #稍微有点慢，dims也是可以改变的
scRNA_mnn <- FindNeighbors(scRNA_mnn, reduction = "mnn", dims = 1:20)
scRNA_mnn <- FindClusters(scRNA_mnn)
p1 <- DimPlot(scRNA_mnn, group.by = "orig.ident", pt.size=0.1) + 
  ggtitle("Integrated by fastMNN")
p2 <- DimPlot(scRNA, group.by="orig.ident", pt.size=0.1) + 
  ggtitle("No integrated")
p = p1 + p2 + plot_layout(guides='collect')
#比较去批次前后聚类结果的变化
p
saveRDS(scRNA_mnn,"Step4_MNN.rds")
```

```
p3 <- DimPlot(scRNA_mnn,  pt.size=0.1) 

p4 <- DimPlot(scRNA_mnn, split.by="orig.ident", pt.size=0.1) 

p = p3 + p4 + plot_layout(guides='collect')
#比较各个样本内cluster的组成情况
p

ggsave('Clustering/fastMNN.png', p, width=8, height=4)
```

### 4.2 批次效应矫正方法二：harmony

```
#harmony
library(harmony)
scRNA_harmony <- RunHarmony(scRNA, group.by.vars = "orig.ident")
scRNA_harmony <- RunUMAP(scRNA_harmony, reduction = "harmony", dims = 1:20) #稍微有点慢
scRNA_harmony <- FindNeighbors(scRNA_harmony, reduction = "harmony", dims = 1:20) %>% FindClusters()

p1 <- DimPlot(scRNA_harmony, group.by = "orig.ident", pt.size=0.1) + 
  ggtitle("Integrated by harmony")
p2 <- DimPlot(scRNA, group.by="orig.ident", pt.size=0.1) + 
  ggtitle("No integrated")
#比较去批次前后聚类结果的变化
p = p1 + p2 + plot_layout(guides='collect')

p
```

```
p3 = DimPlot(scRNA_harmony, reduction = "umap") 
p4 = DimPlot(scRNA_harmony, reduction = "umap", split.by='orig.ident') 
#比较各个样本内cluster的组成情况
plotc <- p3+p4 + plot_layout(guides='collect')
ggsave("Clustering/scRNA_harmony.png", plot = plotc, width = 10, height = 5)
plotc
saveRDS(scRNA_harmony,"Step4_harmony.rds")
```

### 4.3 批次效应矫正方法三：锚点整合IntegrateData

```
#数据整合
# 选取两个样本之间共有的feature
features <- SelectIntegrationFeatures(object.list = scRNAlist)
#以共有features为基础寻找锚点
scRNA.anchors <- FindIntegrationAnchors(object.list = scRNAlist,anchor.features = features)
#利用锚点整合数据
scRNA3 <- IntegrateData(anchorset = scRNA.anchors)
#重跑前面聚类的一般流程
scRNA3 <- ScaleData(scRNA3, verbose = FALSE)
scRNA3 <- RunPCA(scRNA3, npcs = 30, verbose = FALSE)
scRNA3 <- RunUMAP(scRNA3, reduction = "pca", dims = 1:20)
scRNA3 <- FindNeighbors(scRNA3, reduction = "pca", dims = 1:20)
scRNA3 <- FindClusters(scRNA3, resolution = 0.5)
#比较整合前后聚类结果的变化
p1 <- DimPlot(scRNA3, group.by = "orig.ident", pt.size=0.1) + 
  ggtitle("Integrated")
p2 <- DimPlot(scRNA, group.by="orig.ident", pt.size=0.1) + 
  ggtitle("No integrated")
p = p1 + p2 + plot_layout(guides='collect')

p
```

```
p3 <- DimPlot(scRNA3, reduction = "umap")
p4 <- DimPlot(scRNA3, reduction = "umap", split.by = "orig.ident")
Integrated_dimplot <- p3 + p4 +  plot_layout(guides='collect')
#比较各个样本内cluster的组成情况
Integrated_dimplot
ggsave('Clustering/Integrated.png', Integrated_dimplot, width=8, height=4)
```

### 5、marker基因鉴定

```
#鉴定 marker 基因
dir.create("Marker")
# 鉴定各 cluster 的 Marker 基因
all.markers = FindAllMarkers(scRNA_mnn, 
                                     min.pct = 0.25, 
                                     logfc.threshold = 0.25, 
                                     only.pos = TRUE)
head(all.markers)
```

```
# cluster5 和cluster0、3进行比较
cluster5.markers <- FindMarkers(scRNA_mnn, 
                                ident.1 = 5, 
                                ident.2 = c(0, 3), 
                                min.pct = 0.25, 
                                only.pos = TRUE)
head(cluster5.markers)
```

```
# 根据 FoldChange 进行排序选取每群细胞的 top10 Marker 基因
top10 = all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)

# 保存 Marker 基因
write.table(all.markers, 
            "Marker/all_Markers_of_each_clusters.xls", 
            col.names = T, 
            row.names = F, 
            sep = "\t")
write.table(top10, 
            "Marker/top10_Markers_of_each_clusters.xls", 
            col.names = T, 
            row.names = F, 
            sep = "\t")
```

```
# 绘制热图
scRNA_mnn <- ScaleData(scRNA_mnn, features = row.names(scRNA_mnn))
heatmap_plot = DoHeatmap(object = scRNA_mnn, 
                         features = as.vector(top10$gene), 
                         group.by = "seurat_clusters", 
                         group.bar = T, 
                         size = 2) +
  theme(axis.text.y = element_text(size = 4))

# 保存绘图结果
ggsave("Marker/top10_marker_of_each_cluster_heatmap.pdf", width = 12, height = 12,
       plot = heatmap_plot)
ggsave("Marker/top10_marker_of_each_cluster_heatmap.png", width = 12, height = 12,
       plot = heatmap_plot)
heatmap_plot
```

```
top1 = all.markers %>% group_by(cluster) %>% top_n(n = 1, wt = avg_log2FC)
# 绘制单个基因的小提琴图
VlnPlot(scRNA_mnn, 
                  cols = NULL, 
                  features = "RGS13", 
                  pt.size = 0, 
                  group.by = "seurat_clusters")

#批量绘制小提琴图
dir.create("Marker/vlnplot/")
for (gene in as.vector(top1$gene)) {
  vlnplot = VlnPlot(scRNA_mnn, 
                    features = gene, 
                    pt.size = 0, 
                    group.by = "seurat_clusters")
  ggsave(paste0("Marker/vlnplot/",gene,"_vlnplot.pdf"), plot = vlnplot)
  ggsave(paste0("Marker/vlnplot/",gene,"_vlnplot.png"), plot = vlnplot)
}
```

```
# 绘制单个基因的featureplot
FeaturePlot(scRNA_mnn, 
            features = "RGS13", 
            pt.size = 0.5,
            reduction = "umap",
            cols = c("lightgrey", "#DE1F1F"))


#批量绘制 featureplot 图
dir.create("Marker/featureplot/")
for (gene in as.vector(top1$gene)) {
  featureplot = FeaturePlot(scRNA_mnn, 
                            features = gene, 
                            pt.size = 0.5, 
                            cols = c("lightgrey", "#DE1F1F"))
  ggsave(paste0("Marker/featureplot/",gene,"_featureplot.pdf"), plot = featureplot)
  ggsave(paste0("Marker/featureplot/",gene,"_featureplot.png"), plot = featureplot)
}
```

## 6、差异基因鉴定

```
#将数据集的idents从clusters切换回orig.ident(各样本)
Idents(scRNA_mnn) <- 'orig.ident'
# 鉴定各样本的差异基因
Diff_exp = FindAllMarkers(scRNA_mnn, 
                               min.pct = 0.25, 
                               logfc.threshold = 0.25, 
                               only.pos = TRUE)
head(Diff_exp)
top10 = Diff_exp %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
dir.create("Diffexp")
write.table(Diff_exp, 
            "Diffexp/group_HNC01TIL-vs-Tonsil2-diff.xls", 
            col.names = T, 
            row.names = F, 
            sep = "\t")
write.table(top10, 
            "Diffexp/group_HNC01TIL-vs-Tonsil2-top10-diff.xls", 
            col.names = T, 
            row.names = F, 
            sep = "\t")
```

```
heatmap_plot = DoHeatmap(object = scRNA_mnn, 
                         features = as.vector(top10$gene), 
                         group.by = "orig.ident", 
                         group.bar = T, 
                         size = 2) +
  theme(axis.text.y = element_text(size = 4))

# 保存绘图结果
ggsave("Diffexp/top10_marker_of_each_cluster_heatmap.pdf",
       plot = heatmap_plot)
ggsave("Diffexp/top10_marker_of_each_cluster_heatmap.png", 
       plot = heatmap_plot)
heatmap_plot
```

## 7、富集分析

```
#富集分析
genes_symbol <- as.character(Diff_exp$gene)
#转换ID
eg = bitr(genes_symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
id = as.character(eg[,2])
dir.create("enrichment")
#GO
ego <- enrichGO(gene = id,
                OrgDb = org.Hs.eg.db,
                ont = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff = 0.05,
                qvalueCutoff = 0.05,
                readable = TRUE)
GO_dot <- dotplot(ego)
GO_bar <- barplot(ego)
res_plot <- CombinePlots(list(GO_dot,GO_bar), nrow=1)
ggsave("enrichment/GO_results.pdf", plot=res_plot, width = 12,height = 7)
ggsave("enrichment/GO_results.png", plot=res_plot, width = 12,height = 7)
```

```
GO_dot
```

```
GO_bar
```

## 8、细胞类型鉴定

### 8.1 方法一：SingleR（有时候不太准确）

```
dir.create("SingleR")
Idents(scRNA_mnn) <- 'seurat_clusters'
# 读入参考数据集
load("HumanPrimaryCellAtlas_hpca.se_human.RData")
norm_count = GetAssayData(scRNA_mnn, 
                          slot="data") #标准化后的矩阵
pred<- SingleR(test = norm_count, 
               ref = hpca.se, 
               labels = hpca.se$label.main)
#将注释结果添加到seurat对象里
scRNA_mnn$singleR_celltype = pred$labels

# 对细胞类型注释结果进行可视化
celltype_plot = DimPlot(scRNA_mnn, 
                        reduction = "umap", 
                        group.by = "singleR_celltype")
ggsave("SingleR/celltype_plot.pdf", celltype_plot)
ggsave("SingleR/celltype_plot.png", celltype_plot)
celltype_plot
```

```
# 通过热图了解每个细胞对应细胞类型的打分情况
score_heatmap <-  plotScoreHeatmap(pred, clusters = scRNA_mnn@meta.data$seurat_clusters, order.by = "clusters")
ggsave("SingleR/score_heatmap.pdf", score_heatmap, width = 8, height = 10)
ggsave("SingleR/score_heatmap.png", score_heatmap, width = 8, height = 10)
score_heatmap
```

```
# 通过热图了解每个cluster对应细胞类型的打分情况
tab <- table(cluster=scRNA_mnn@meta.data$seurat_clusters, label=pred$labels) 
cluster_type <- pheatmap::pheatmap(log10(tab+10)) # using a larger pseudo-count for smoothing. 
ggsave("SingleR/cluster_type.pdf", cluster_type)
ggsave("SingleR/cluster_type.png", cluster_type)
cluster_type
```

### 8.2 方法二：根据细胞marker手动注释

```
#比如可以找到一些T细胞marker，用散点图或者小提琴图展示出来，观察其分布情况
#也可以根据各细胞亚群的top差异基因（FindAllMarkers的结果）直接注释

genes_to_check = c('PTPRC', 'CD3D', 'CD3E','FOXP3',
                   'CD4','IL7R','NKG7','CD8A',"GZMB",'GNLY','MKI67','TCF7','IFNG','IL17A','PDCD1','TOX','CD160','KLRC2','FGFBP2', 'CD27','CCR7')
DotPlot(scRNA_mnn, group.by = 'seurat_clusters',
        features = unique(genes_to_check)) + RotatedAxis()
```

### 在meta.data中添加细胞注释

```
# 将singleR得到的细胞类型加入到对象的meta.data中，后续用于高级分析
scRNA_mnn <- RenameIdents(scRNA_mnn, '0'='T_cells','1'='B_cell','2'='T_cells','3'='B_cell','4'='T_cells','5'='T_cells','6'='T_cells','7'='Monocyte','8'='B_cell','9'='Monocyte','10'='B_cell','11'='Pre-B_cell_CD34-','12'='CMP')
scRNA_mnn[["celltype"]] <- Idents(scRNA_mnn)
head(scRNA_mnn@meta.data)
```

```
saveRDS(scRNA_mnn, "Step5_CellTyping.rds")
```

```
p1 <- DimPlot(scRNA_mnn, group.by = "seurat_clusters")
p2 <- DimPlot(scRNA_mnn, group.by = "celltype")
#查看cluster与细胞类型的对应关系
p3 = p1+p2
ggsave("SingleR/celltype_contrast.pdf",p3,width = 12, height = 6)
ggsave("SingleR/celltype_contrast.png",p3, width = 12, height = 6)
p3
```

```
#查看各样本中细胞类型的组成情况
DimPlot(scRNA_mnn, group.by = "celltype", split.by = "orig.ident")
```


---

# Agent Instructions
This documentation is published with GitBook. GitBook is the documentation platform designed so that both humans and AI agents can read, navigate, and reason over technical content effectively. Learn more at gitbook.com.

## Querying This Documentation
If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://liulab-1.gitbook.io/liulab-docs/drylab-protocols/scrnaseq-seurat.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
