scRNAseq-scanpy
By Kaiyi Fu
Environment setup
Anaconda
Download and install Anaconda: https://www.anaconda.com/download
Configure channels: open Anaconda Navigator, go to Channels, add conda-forge and bioconda, then click "Update channels"
Install packages in a conda environment: open the Anaconda Powershell Prompt and use conda commands to create environments and install Python packages
VS Code
Download and install VS Code
Install extensions: Python, C/C++, R, R Debugger, the Jupyter extensions, GitHub Copilot, Auto Import, Remote - SSH, WSL, etc.
Create a new .ipynb file and choose the conda environment under "Select Kernel"
PyTorch
Pick the PyTorch build that matches your driver and CUDA version on the official site, activate the conda environment, and paste the install command into the Conda Prompt
After installation, run "pip list" and check whether "torch" appears; if it does, the installation succeeded
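A quicker check than scanning "pip list" is to import torch directly; a minimal snippet, run inside the activated environment:
// Python
import torch
print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if the CUDA build can see a GPU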
Common conda commands
// Conda Powershell
# 创建名称为scanpy,python版本为3.11的虚拟环境
conda create -n scanpy python=3.11 --yes
# 激活scanpy环境
conda activate scanpy
# 退出当前虚拟环境
conda deactivate
# 查看所有虚拟环境
conda env list
#删除环境
conda remove -n 名字 --all
# Renaming an environment: conda >= 4.14 provides `conda rename -n old_name new_name`; on older versions the only option is to clone the environment and then delete the original
conda create -n b --clone a
conda remove -n a --all
#安装包
conda install pip
#格式:conda install [-n env_name | -p path] [-c channel_address] [packages]
#安装指定版本的包
conda install package_name=x.x
#移除包
conda remove -n my_env numpy
# conda uninstall is an alias of conda remove and behaves the same way
#列出环境中所有包
conda list
#更新包
conda update -n my_env numpy scipy
#格式:conda update [-n env_name | -p path] [packages] [--all]
#设置镜像
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.ustc.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.ustc.edu.cn/anaconda/pkgs/main/
#设置搜索时显示Channel地址
conda config --set show_channel_urls yes
系统教程
安装Python包
// Conda Powershell
# 安装jupyter notebook内核,-i参数指定了清华镜像作为安装包来源,可以加快安装速度。
pip install ipykernel -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m ipykernel install --user --name scanpy --display-name "Scanpy"
# 安装常用分析包
pip install scanpy -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install pooch -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install scrublet -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install doubletdetection -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install bbknn -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install scanorama -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install gseapy -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install adjustText -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install cellphonedb -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install ktplotspy -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install palantir -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install infercnvpy -i https://pypi.tuna.tsinghua.edu.cn/simple
conda install -c conda-forge fa2
pip install torch-geometric
pip install diopy
pip install numba
pip install omicverse
pip install scvi-tools
### pyscenic环境
conda create -n pyscenic python=3.10 --yes
conda activate pyscenic
pip install ipykernel -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m ipykernel install --user --name pyscenic --display-name "pySCENIC"
#新版本numpy导致module 'numpy' has no attribute 'object'的报错,因此指定版本
pip install numpy==1.23.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install scanpy -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install pyscenic -i https://pypi.tuna.tsinghua.edu.cn/simple
conda deactivate
分析前参数设置
// Python
# 当前工作目录
import os
os.getcwd()
# 目录下文件
os.listdir()
#导入包
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Basic settings
import torch  # torch is only needed for the matmul-precision setting below
torch.set_float32_matmul_precision("high")
sc.settings.verbosity = 3
# sc is the scanpy alias; settings.verbosity controls how verbose the logging is (3 = show hints)
sc.logging.print_header()
# prints the versions of dependencies that may affect numerical results
sc.settings.set_figure_params(dpi=80, facecolor="white")
# figure resolution and background color
sc.set_figure_params(figsize=(6, 6), frameon=False)
# figure size and frame
#所有参数
# sc.settings.set_figure_params(
# *,
# scanpy: 'bool' = True,
# dpi: 'int' = 80,
# dpi_save: 'int' = 150,
# frameon: 'bool' = True,
# vector_friendly: 'bool' = True,
# fontsize: 'int' = 14,
# figsize: 'int | None' = None,
# color_map: 'str | None' = None,
# format: '_Format' = 'pdf',
# facecolor: 'str | None' = None,
# transparent: 'bool' = False,
# ipython_format: 'str' = 'png2x',
# ) -> 'None'
# scanpy: 如果为True,使用Scanpy的默认matplotlib设置。
# dpi: 图像显示的分辨率。
# dpi_save: 保存图像的分辨率。
# frameon: 是否在图像周围显示框架。
# vector_friendly: 是否生成对矢量图友好的输出。
# fontsize: 图形中使用的字体大小。
# figsize: 图形的尺寸,如果为None,则使用默认尺寸。
# color_map: 使用的颜色映射。
# format: 保存图形时使用的格式。
# facecolor: 图形的背景颜色,如果为None,使用默认颜色。
# transparent: 是否使图形背景透明。
# ipython_format: 在Jupyter Notebook中显示图形时使用的格式。
# 设定一个文件路径
results_file = "output/"  # directory (or file path) where the analysis results will be written
数据的基本处理
1. Data format
Scanpy works with AnnData objects, which store the data in several slots:
obs holds the cell-level metadata (the equivalent of meta.data in a Seurat object)
X is a sparse matrix (scipy.sparse.csr_matrix) holding the expression values; it is transposed relative to a Seurat object, i.e. in Scanpy rows are cells (observations) and columns are genes
Multiple versions of X are managed through layers; X itself is always just one matrix. For example, X can hold the raw, un-normalized counts while normalized or otherwise transformed matrices are added to layers
var holds the gene (feature) level information
uns holds unstructured information added during the analysis
obsm and varm behave like dictionaries: matrices (pandas DataFrames, scipy sparse matrices, numpy arrays) such as UMAP embeddings can be added as key-value pairs; every entry added to obsm must have length n_obs, and every entry in varm must have length n_vars
obsp and varp store pairwise relationship matrices between observations or between variables
AnnData object with n_obs × n_vars = 8931 × 33538
n_obs: the number of observations (cells)
n_vars: the number of variables, i.e. the number of measured genes
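A minimal toy example (illustrative only; the matrix and names are made up) showing how these slots fit together:
// Python
import numpy as np
import pandas as pd
import anndata as ad
from scipy import sparse

# 100 cells x 50 genes of random counts stored as a sparse matrix in X
counts = sparse.csr_matrix(np.random.poisson(1, size=(100, 50)), dtype=np.float32)
cell_meta = pd.DataFrame(index=[f"Cell_{i}" for i in range(100)])
gene_meta = pd.DataFrame(index=[f"Gene_{j}" for j in range(50)])
adata = ad.AnnData(X=counts, obs=cell_meta, var=gene_meta)

adata.layers["counts"] = adata.X.copy()                           # keep raw counts as a layer
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()   # per-cell metadata in obs
adata.obsm["X_random"] = np.random.normal(size=(adata.n_obs, 2))  # embedding of length n_obs
adata.uns["note"] = "toy example"                                 # unstructured info
print(adata)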
Common scanpy components
sc.pp: the preprocessing module
sc.tl: the tools module for downstream analysis
sc.pl: the plotting module
Reading and writing are top-level functions accessed directly via sc
// Python
#查看数据
adata.obs.shape # 2700个细胞
adata.var.shape # 32738个基因
adata.to_df().shape # 2700*32738
adata.obs.head()
adata.var.head()
adata.to_df().iloc[0:5,0:5]
#查看细胞名(barcodes的名称)
print(adata.obs_names)
#查看barcodes的meta_data
print(adata.obs)
#查看基因名(features)
print(adata.var_names)
#查看核糖体基因
ribo=adata.var_names.str.match('^RP[SL]') #鼠是小写
print(adata.var_names[ribo])
#查看barcodes的数量
print(len(adata.obs_names))
#查看features的数量
print(len(adata.var_names))
#查看counts数据
print(adata.X)
#查看数据对象的形状
print(adata.shape)
2、数据读入
// Python
# 读取 AnnData 对象
filename = "E:/.../abc.h5ad"
adata = sc.read_h5ad(filename)
# 读取matrix.mtx, barcodes.tsv和genes.tsv三个10x文件
adata = sc.read_10x_mtx('./10X')
print(adata)
# read the three 10x files with explicit options (gene symbols as var names, cached for faster re-reads)
adata = sc.read_10x_mtx(
'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
adata
# 读取h5
adata = sc.read_10x_h5('./h5')
adata
# Build the AnnData object directly from an expression matrix, cell info and gene info
# create the AnnData object (count.csv here is assumed to be cells x genes)
df = pd.read_csv('processfile/count.csv', index_col=0)
meta = pd.read_csv('processfile/metadata.csv', index_col=0)
cellinfo = pd.DataFrame(df.index, index=df.index, columns=['sample_index'])
geneinfo = pd.DataFrame(df.columns, index=df.columns, columns=['genes_index'])
adata = sc.AnnData(df, obs=cellinfo, var=geneinfo)
# deduplicate gene and cell names after reading
adata.var_names_make_unique()
adata.obs_names_make_unique()
adata
3、数据提取与导出
// Python
import scanpy as sc
import scipy.sparse as sparse
import scipy.io as sio
# 保存h5ad
adata.write('my_results.h5ad', compression="gzip")
# 将AnnData对象其中一层的数据转换为一个DataFrame
#数据框为 n_obs 行 n_vars 列,索引为 obs_names,列名为 var_names
df = adata.to_df(layer="log_transformed")
df.shape
# 提取细胞和基因信息
cellinfo = adata.obs
geneinfo = adata.var
# 提取稀疏矩阵(计数矩阵)并转置
mtx = adata.layers['counts'].T # 使用 'counts' 层的数据
# 保存细胞信息和基因信息为 CSV 文件
cellinfo.to_csv("e:/MyPython/singlecell_scanpy/cellinfo.csv")
geneinfo.to_csv("e:/MyPython/singlecell_scanpy/geneinfo.csv")
# 导出metadata的csv
adata.obs[['doublet_score', 'predicted_doublet']].to_csv('scrublet.prediction.res.csv')
# 保存稀疏矩阵为 .mtx 文件
sparse_matrix = sparse.csr_matrix(mtx) # 将稀疏矩阵转换为 CSR 格式
sio.mmwrite("e:/MyPython/singlecell_scanpy/sparse_matrix.mtx", sparse_matrix)
4、数据转换(H5AD/Seurat/H5等)
H5AD转换为Seurat
// Python
#scanpy直接输出单细胞对象的各类信息
import scanpy as sc
from scipy import io
adata = sc.read_h5ad(filename)
adata = adata.raw.to_adata()
with open('barcodes.tsv', 'w') as f:
    for item in adata.obs_names:
        f.write(item + '\n')
with open('features.tsv', 'w') as f:
    for item in ['\t'.join([x, x, 'Gene Expression']) for x in adata.var_names]:
        f.write(item + '\n')
io.mmwrite('matrix', adata.X.T)
adata.obs.to_csv('metadata.csv')
// R
#R中读取
mymatrix <- Read10X('./')
scRNA <- CreateSeuratObject(counts = mymatrix)
mymetadata <- read.csv('metadata.csv')
scRNA <- AddMetaData(scRNA, metadata = mymetadata)
// R
#方法一:zellkonverter
library(Seurat)
library(zellkonverter)
library(SingleCellExperiment)
# 读取.h5ad文件
sce <- zellkonverter::readH5AD("GSM4648564_adipose_raw_counts.h5ad")
# 查看SingleCellExperiment对象
sce
# 访问主表达矩阵
expression_matrix <- assay(sce)
# 访问细胞元数据
cell_metadata <- colData(sce)
# 访问基因元数据
gene_metadata <- rowData(sce)
#总步骤:
sce<-zellkonverter::readH5AD("test.h5ad",verbose=T,reader = "R")
assay(sce,"counts")<-assay(sce,"X")
assay(sce,"logcounts")<-log1p(assay(sce,"counts"))
sc <- as.Seurat(sce,counts="counts",data="logcounts")
# 修改assay name为RNA
sc <- RenameAssays(object = sc, originalexp = 'RNA')
# 转换为V5
sc[["RNA"]] <- as(object = sc[["RNA"]], Class = "Assay5")
#此方法转换的seurat对象为v3 assays,后面要转化成v5。
#转换后的orig.ident为sample,需要自己修改,多样本最好转换前,在obs上增加一列sample信息的列,转换后替代成orig.ident
// R
#方法二:使用reticulate
library(reticulate)
# 导入scanpy
sc <- import('scanpy')
# 读入h5ad文件
scvi_h5ad <- sc$read_h5ad('data/GSE136831/04_annotation.h5ad')
scvi_h5ad
## 在这里我只取需要的信息
## 1.样本信息
meta = scvi_h5ad$obs # obs储存细胞信息,对应seuratobj@meta.data
var = scvi_h5ad$var # var储存基因信息,这是由于anndata提取的矩阵没有行列名,所以这里是为了提取基因名
## 2.scvi坐标
X_umap_scVI <- scvi_h5ad$obsm['X_umap_scVI']
rownames(X_umap_scVI) = rownames(meta)
colnames(X_umap_scVI) = c('scVI_UMAP-1', 'scVI_UMAP-2')
head(X_umap_scVI)
X_umap_Harmony <- scvi_h5ad$obsm['X_umap_Harmony']
rownames(X_umap_Harmony) = rownames(meta)
colnames(X_umap_Harmony) = c('Harmony_UMAP-1', 'Harmony_UMAP-2')
head(X_umap_Harmony)
# 提取矩阵,这里需要行列转置,因为anndata矩阵是 cells x genes,而seurat是genes x cells
filter_counts = t(scvi_h5ad$layers['counts'])
dim(filter_counts)
# 提取的矩阵缺少行列名
colnames(filter_counts) = rownames(meta)
# 这里的矩阵是只保留了高可变基因
rownames(filter_counts) = rownames(var)[var$highly_variable_features]
filter_counts[1:5,1:3]
# 如果你希望拿到原始矩阵,你可以从这里提取: adata$raw$X (前提是你在Scanpy中保存了原始矩阵)
# 对应的,你应该从adata$raw$var提取基因名
# 如果你对anndata对象不熟悉,可以忽略这几行注释
# 重新构建seurat对象
scobj <- CreateSeuratObject(counts = filter_counts,
meta.data = meta)
scobj
scobj <- NormalizeData(scobj)
scobj <- FindVariableFeatures(scobj, selection.method = "vst", nfeatures = 1000)
scobj <- ScaleData(scobj, features = VariableFeatures(object = scobj))
# 添加 reductions
scobj[['X_umap_scVI']] <- CreateDimReducObject(embeddings = X_umap_scVI)
scobj[['X_umap_Harmony']] <- CreateDimReducObject(embeddings = X_umap_Harmony)
library(SCP)
CellDimPlot(
srt = scobj,
group.by = c("celltype"),
reduction = "X_umap_scVI",
theme_use = "theme_blank",
xlab = "UMAP-1",
ylab = "UMAP-2",
label = F,
label.size = 3.5
)
// R
#方法三:
#devtools::install_github("cellgeni/schard")
sc <- schard::h5ad2seurat(h5ad_file_path)
Seurat转换为H5AD
// R
#方法一: 安装 zellkonverter 包
install.packages("zellkonverter")
library(zellkonverter) # 加载必要的库
library(Seurat)
library(SingleCellExperiment) # 将 Seurat 对象转换为 SingleCellExperiment 对象
sce_object <- as.SingleCellExperiment(seurat_object) # 保存为 h5ad 格式
writeH5AD(sce_object, "seurat_object.h5ad")
#方法二:MudataSeurat包
# remotes::install_github("zqfang/MuDataSeurat", ref='dev', force = T)
# 单模态数据
MuDataSeurat::WriteH5AD(scRNA, "Step1_merged_raw.h5ad", assay="RNA")
#Seurat v4适用,Seurat v5需要先转换为v4对象
#方法三:scCustomize
# https://samuel-marsh.github.io/scCustomize/articles/Object_Conversion.html
#此包同样支持seurat对象v3/v4/v5互转
# Currently scCustomize 3.0.0 is not released yet. You may want to use
pak::pkg_install("samuel-marsh/scCustomize@release/3.0.0")
# to get the latest features
sc <- scCustomize::as.anndata(x = pbmc, file_path = "~/Desktop", file_name = "pbmc_anndata.h5ad")
5、子集操作与添加metadata
// Python
#提取矩阵子集的通用格式
adata[["Cell_1", "Cell_10"], ["Gene_5", "Gene_1900"]]
#提取某类细胞
adata[adata.obs.cell_type == "B"]
#选取线粒体比率>0.02的子集
sub_adata=adata[adata.obs.percent_mito>0.02]
#查看子集
print(sub_adata)
#获取counts值大于7000的barcodes
i=adata.obs.n_counts>7000
print(adata.obs_names[i])
#注意:对AnnData对象进行子集总是会返回视图(View),例如
adata_view = adata[:5, ["Gene_1", "Gene_3"]]
adata_view
# View of AnnData object with n_obs × n_vars = 5 × 2
# obs: 'time_yr', 'subject_id', 'instrument_type', 'site'
#要在内存中保存从 AnnData 索引的子集,就必须调用 .copy() 方法!
adata_subset = adata[:5, ["Gene_1", "Gene_3"]].copy()
#添加元数据
#adata.obs 和 adata.var 都是 Pandas DataFrame 类型,可以直接添加对应的信息列
ct = np.random.choice(["B", "T", "Monocyte"], size=(adata.n_obs,))
adata.obs["cell_type"] = pd.Categorical(ct) # 分类类型性能更好
adata.obs.head(3)
# cell_type
# Cell_0 T
# Cell_1 T
# Cell_2 T
#添加obsm和varm
#通过字典的方式来插入的不同矩阵,但是要保证数据的维度必须要与对应的 n_obs 和 n_vars 保持一致
adata.obsm["X_umap"] = np.random.normal(0, 1, size=(adata.n_obs, 2))
adata.varm["gene_stuff"] = np.random.normal(0, 1, size=(adata.n_vars, 5))
adata.obsm
#添加layers
adata.layers["log_transformed"] = np.log1p(adata.X)
adata
6、对象合并
// Python
# https://mp.weixin.qq.com/s/ypI9fvqRaBvekg4enaFb4Q
# 两个或多个 AnnData 对象可以使用 concat 函数连接(Concatenation)或合并(Merging)起来,其中
# 连接是指我们保留每个对象的所有子元素,并以有序的方式堆叠这些元素
# 合并不与连接轴对齐的元素
adata1=ov.read('neurips2021_s1d3.h5ad')
adata1.obs['batch']='s1d3'
adata2=ov.read('neurips2021_s2d1.h5ad')
adata2.obs['batch']='s2d1'
adata3=ov.read('neurips2021_s3d7.h5ad')
adata3.obs['batch']='s3d7'
adata=sc.concat([adata1,adata2,adata3],merge='same')
adata
adata.obs['batch'].unique()
import numpy as np
adata.X=adata.X.astype(np.int64)
数据预处理
Single-cell data require quality control before analysis: doublets, low-count cells, and genes detected in too few cells are removed. In addition, cells are filtered on mitochondrial gene fraction, transcript (UMI) counts, number of genes detected per cell, and cell complexity.
1、背景RNA校正
CellBender:
需要输入 cellranger count 输出的没有 filter 的矩阵(raw_feature_bc_matrix)
如果没有cellranger的输出文件,可以在R中将三个未经过滤的的10X文件转为h5文件,然后在linux的conda终端中运行cellbender
// R
library(DropletUtils)
library(Seurat)
filter_matrix <- Read10X("./filtered_feature_bc_matrix/")
write10xCounts("./filtered_feature_bc_matrix.h5", filter_matrix, type = "HDF5",
genome = "mm10", version = "3", overwrite = TRUE,
gene.id = rownames(filter_matrix),
gene.symbol = rownames(filter_matrix))
// Conda Prompt(Linux)
conda create -n cellbender python=3.7
conda activate cellbender
conda install pip
pip install cellbender
pip install lxml_html_clean
#CellBender的运行和R包不太一样,只需要直接指定h5文件即可,最后会输出一个过滤的h5文件用于下游分析
cellbender remove-background --cuda --input D_raw_feature_bc_matrix.h5 --output D_cellbender.h5
decontX
在R中运行,可以自行设定阈值
// R
####读入整合####
#读入三个10X文件(去除空液滴后的)
rm(list=ls())
obj_dir <- c("./D_EmptyDrops_CR_289238_matrix","DP_EmptyDrops_CR_289237_matrix")
names(obj_dir) = c("D","DP")
counts.filter <- Read10X(data.dir = obj_dir)
####DecontX去除RNA污染####
#https://bioconductor.org/packages/devel/bioc/vignettes/celda/inst/doc/decontX.html
#另外读入三个raw 10X文件(没有去除empty drop的raw matrix)
obj_dir <- c("./D_all_matrix","DP_all_matrix")
names(obj_dir) = c("D","DP")
counts.raw <- Read10X(data.dir = obj_dir)
library(SingleCellExperiment)
#生成sce对象
sce.filter <- SingleCellExperiment(list(counts=counts.filter))
sce.raw <-SingleCellExperiment(list(counts=counts.raw))
#运行decontX
decontX_results <- decontX(sce.filter,background =sce.raw) #可以添加batch参数指定批次,batch = sce$orig.ident
sce <- CreateSeuratObject(counts.filter)
#过滤前13107个细胞
#提取污染分数
sce$decontX_contamination <- decontX_results$decontX_contamination
#提取校正矩阵
sce[["decontX"]]=CreateAssayObject(counts=round(assays(decontX_results)[[2]])) ##decontX 的净化矩阵由浮点数组成,四舍五入成整数
### 过滤####
#根据contamination值进行过滤,一般是保留<0.2的细胞,其实有点多,可能会丢失过多的信息。
sce_filt <- sce[,sce$decontX_contamination<0.4]
#阈值0.2时,剩余7232个,过多
#阈值0.4时,剩余11196个
table(sce_filt@meta.data$orig.ident)
#D DP
#6356 4840
#导出为h5ad
MuDataSeurat::WriteH5AD(sce_filt, "E:/Research/scRNAseq/h5ad/Step2_afterDecontX.h5ad", assay="decontX")
#saveRDS(scRNA1,"E:/Research/scRNAseq/h5ad/Step1_merged.rds")
### 结果的可视化 ####
#绘制 DecontX 结果
umap <- reducedDim(decontX_results, "decontX_UMAP")
library(celda)
plotDimReduceCluster(x = decontX_results$decontX_clusters,
dim1 = umap[, 1], dim2 = umap[, 2])
plotDecontXContamination(decontX_results)
#查看污染的分布
library(scater)
decontX_results <- logNormCounts(decontX_results)
plotDimReduceFeature(as.matrix(logcounts(decontX_results)),
dim1 = umap[, 1],
dim2 = umap[, 2],
features = c("Cd3d", "Cd3e", "Gnly",'Ly6g','Siglecf','Jchain',
"S100a8", "S100a9","C1qa",
"Cd79a", "Cd79b", "Ms4a1"),
exactMatch = TRUE)
#主要在巨噬
#调整先验以影响污染估计
metadata(decontX_results)$decontX$estimates$all_cells$delta
sce.delta <- decontX(decontX_results, delta = c(3, 20), estimateDelta = FALSE)
plot(decontX_results$decontX_contamination, sce.delta$decontX_contamination,
xlab = "DecontX estimated priors",
ylab = "Setting priors to estimate higher contamination")
abline(0, 1, col = "red", lwd = 2)
#阈值设为0.4较为合理
2、去除双细胞
COMPOSITE
// Python
#Omicverse一键质控
import scanpy as sc
import omicverse as ov
ov.plot_set()
adata=ov.pp.qc(adata,
tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},
doublets_method='sccomposite', #也可以是scrublet
batch_key=None)
adata
#去除双细胞的方法有两种可以选择,composite会比Scrublet去除更多的细胞
Scrublet
// Python
sc.external.pp.scrublet(adata, expected_doublet_rate = 0.05,threshold=0.25,batch_key = "orig.ident")
#导出
adata.obs[['doublet_score', 'predicted_doublet']].to_csv('scrublet.prediction.res.csv')
#保留单细胞
adata = adata[adata.obs['predicted_doublet'] == False] #obs 为细胞维度对象,var是基因维度对象
adata
3、线粒体/核糖体/红细胞过滤
// Python
# 线粒体基因
adata.var["mt"] = adata.var_names.str.startswith("mt-")
# 核糖体基因
adata.var["ribo"] = adata.var_names.str.startswith(("Rps", "Rpl"))
# 血红蛋白基因
adata.var["hb"] = adata.var_names.str.contains(("^Hb[^(p)]"))
#计算协变量
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
adata
#查看
adata.obs[['total_counts','n_genes_by_counts','pct_counts_mt','pct_counts_ribo','pct_counts_hb',]]
#绘制三个协变量的分布图
import matplotlib.pyplot as plt
mito_filter = 15
n_counts_filter = 6000
fig, axs = plt.subplots(ncols = 2, figsize = (8,4))
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt',ax = axs[0], show=False)
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts',ax = axs[1], show = False)
#draw horizontal red lines indicating thresholds.
axs[0].hlines(y = mito_filter, xmin = 0, xmax = max(adata.obs['total_counts']), color = 'red', ls = 'dashed')
axs[1].hlines(y = n_counts_filter, xmin = 0, xmax = max(adata.obs['total_counts']), color = 'red', ls = 'dashed')
fig.tight_layout()
plt.show()
#n_genes_by_counts/detected_genes: 一个细胞中发现的有效基因数量(即表达量不为0)
#total_counts/nUMIs: 一个细胞中发现的分子数量(UMI),通常也可以被认为是这个细胞的文库大小
#pct_counts_mt/mito_perc: 一个细胞中线粒体基因的表达计数占比
#在scanpy的官方教程中,高计数的细胞被认为是双细胞进而过滤,所以我们绘制了两条红线,但在我们的教程中,对于双细胞我们将采用其他方法进行过滤,
#所以我们只需要对一些低表达的细胞进行质控,比如nUMI小于500的细胞,比如detected_gene小于250的细胞,线粒体基因的计数比例不超过15%。但这个过滤我们要在双细胞过滤完后再进行。
#手动过滤低质量读数的细胞
import numpy as np
tresh={'pct_counts_mt': 15, 'total_counts': 500, 'n_genes_by_counts': 250}
adata.obs['passing_mt'] = adata.obs['pct_counts_mt'] < tresh['pct_counts_mt']
adata.obs['passing_nUMIs'] = adata.obs['total_counts'] > tresh['total_counts']
adata.obs['passing_ngenes'] = adata.obs['n_genes_by_counts'] > tresh['n_genes_by_counts']
#需要对数据取保留的细胞的交集
QC_test = (adata.obs['passing_mt']) & (adata.obs['passing_nUMIs']) & (adata.obs['passing_ngenes'])
removed = QC_test.loc[lambda x : x == False]
adata_manual = adata[QC_test, :].copy()
#QC后的小提琴图
sc.pl.violin(adata_manual, ['total_counts','n_genes_by_counts','pct_counts_mt',],
jitter=0.4, multi_panel=True)
4、基因和细胞数过滤
// Python
#只保留至少有200个基因表达的细胞,只保留至少有3个细胞表达的基因
sc.pp.filter_cells(adata_manual, min_genes=200)
sc.pp.filter_genes(adata_manual, min_cells=3)
adata_manual
5、计算细胞周期
// Python
#计算细胞周期
ov.pp.score_genes_cell_cycle(adata_manual,species='mouse') #human
adata_manual
##保存scanpy.h5ad用于后续使用
adata_manual.write('E:/Research/scRNAseq/h5ad/Step3_afterQC.h5ad',compression='gzip')
归一化+选择高可变基因
使用scanpy函数
// Python
#读入QC后的数据
adata = sc.read(
filename="E:/Research/scRNAseq/h5ad/Step3_afterQC.h5ad")
adata
# 将count data保存至layers["counts"]中
adata.layers["counts"] = adata.X.copy()
# Normalizing to median total counts
sc.pp.normalize_total(adata)
# Logarithmize the data
sc.pp.log1p(adata)
#保存raw数据
adata.raw = adata
#选择高可变基因
sc.pp.highly_variable_genes(
adata,
flavor="seurat_v3",
n_top_genes=2000,
layer="counts",
batch_key="batch",
subset=True,
#n_bin =40
#span=0.5, #可以防止报错
)
使用omicverse
// Python
#Omicverse归一化+高可变基因筛选
#移位对数(shiftlog)分析,筛选2000个高可变基因
adata=ov.pp.preprocess(adata,
mode='shiftlog|pearson',
n_HVGs=2000,)
adata
# 重要!!!将 AnnData对象的.raw 属性设置为标准化和对数化的原始基因表达,以便以后用于基因表达的差异检测和可视化
adata.raw = adata
#选择高可变基因进行后续分析
adata = adata[:, adata.var.highly_variable_features]
adata
##保存scanpy.h5ad用于后续使用
adata.write('E:/Research/scRNAseq/h5ad/Step4_afternorm.h5ad',compression='gzip')
数据整合与批次校正
scVI
// Python
import scvi
#读入预处理后的数据
adata = sc.read(
filename="E:/Research/scRNAseq/h5ad/Step4_afternorm.h5ad")
adata
scvi.model.SCVI.setup_anndata(adata, layer="counts",
categorical_covariate_keys= ['phase'],
continuous_covariate_keys=['pct_counts_mt'],
)#, batch_key="orig.ident"
vae = scvi.model.SCVI(adata, n_layers=2, n_latent=30, gene_likelihood="nb")
vae
vae.train()
#获取批次效应校正后的降维结果
adata.obsm["X_scVI"] = vae.get_latent_representation()
#可视化
ov.utils.embedding(adata,
basis='X_scVI',frameon='small',
color=['orig.ident'],show=False)
##保存scanpy.h5ad用于后续使用
adata.write('E:/Research/scRNAseq/h5ad/Step5_afterscVI.h5ad',compression='gzip')
CellANOVA
原始代码:
// Python
import anndata as ad
import scanpy as sc
import gc
import sys
import cellanova as cnova
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
sc.settings.verbosity = 0
sc.settings.set_figure_params(dpi=80)
pd.set_option('display.max_columns', None)
seed = 10
np.random.seed(seed)
#前期完成质控、归一化和高可变基因筛选
integrate_key = 'orig.ident'
condition_key = 'batch'
control_name = 'Batch1'
control_batches = list(set(adata_prep[adata_prep.obs[condition_key]==control_name,].obs[integrate_key]))
control_dict = {
'g1': control_batches,
}
control_dict
adata_prep= cnova.model.calc_ME(adata_prep, integrate_key='orig.ident')
adata_prep
adata_prep = cnova.model.calc_BE(adata_prep,
integrate_key='orig.ident',
control_dict=control_dict)
adata_prep
adata_prep = cnova.model.calc_TE(adata_prep, integrate_key='orig.ident')
adata_prep
integrated = ad.AnnData(adata_prep.layers['denoised'], dtype=np.float32)
integrated.obs = adata_prep.obs.copy()
integrated.var_names = adata_prep.var_names
sc.pp.neighbors(integrated, n_neighbors=15, n_pcs=30)
sc.tl.umap(integrated)
sc.pl.umap(integrated, color=['orig.ident'])
sc.pl.umap(integrated, color=['batch'])
sc.pl.umap(integrated, color=['site'])
sc.pl.umap(integrated, color=['time'])
#分群
sc.tl.leiden(integrated, key_added="leiden_CellANOVA", resolution=0.5)
sc.pl.umap(integrated, color=["leiden_CellANOVA"], ncols=1)
Omicverse中调用:
// Python
## 指定对照样本,construct control pool
control_dict = {
'pool1': ['s1d3','s2d1'],
}
ov.single.batch_correction(adata,batch_key='batch',n_pcs=50,
methods='CellANOVA',control_dict=control_dict)
adata
adata.obsm["X_mde_cellANOVA"] = ov.utils.mde(adata.obsm["X_cellanova"])
ov.pl.embedding(adata,
basis='X_mde_cellANOVA',frameon='small',
color=['batch','cell_type'],show=False)
Harmony
scanpy函数
// Python
ov.pp.scale(adata)
adata
ov.pp.pca(adata,layer='scaled',n_pcs=30)
adata
adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']
sc.external.pp.harmony_integrate(adata, 'batch',basis = 'X_pca',max_iter_harmony = 30)
sc.pp.neighbors(adata, use_rep='X_pca_harmony',n_neighbors=30, n_pcs=30)
sc.tl.umap(adata,min_dist=0.4)
sc.pl.umap(adata, color=['orig.ident','batch','site'], frameon=False)
Omicverse调用:
// python
ov.single.batch_correction(adata,batch_key='batch',
methods='harmony',n_pcs=50)
adata
adata.obsm["X_mde_harmony"] = ov.utils.mde(adata.obsm["X_harmony"])
ov.utils.embedding(adata,
basis='X_mde_harmony',frameon='small',
color=['batch','cell_type'],show=False)
BBKNN
// Python
#https://mp.weixin.qq.com/s/CJq1WJ2EMi2z9RDT5axPAg
#https://scanpy.readthedocs.io/en/stable/tutorials/basics/integrating-data-using-ingest.html
sc.external.pp.bbknn(adata_all, batch_key="batch")
sc.tl.umap(adata_all)
sc.pl.umap(adata_all, color=["batch", "celltype"])
scAtlasVAE
// Linux终端
#https://github.com/WanluLiuLab/scAtlasVAE
#新建一个scatlasvae环境
conda create -n scatlasvae python=3.8
conda activate scatlasvae
pip install ipykernel -i https://pypi.tuna.tsinghua.edu.cn/simple
#安装一系列依赖包
#https://github.com/WanluLiuLab/scAtlasVAE/blob/master/environment.yml
pip3 install "scatlasvae[gpu]"
#安装torch
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# Testing if CUDA is available (these are Python statements, so run them inside python):
python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"
// Python
#https://scatlasvae.readthedocs.io/en/latest/
#选择scatlasvae终端
import scatlasvae
adata = scatlasvae.read_h5ad("path_to_adata")
vae_model = scatlasvae.model.scAtlasVAE(
adata,
batch_key="sample_name"
)
vae_model.fit()
不同整合方式的比较
可以从生物保留性、批次效应消除、以及簇同质性三个维度对不同方法进行评价(https://mp.weixin.qq.com/s/DK91JN-hsYSZW0BaMeetxw)
(https://mp.weixin.qq.com/s/CJq1WJ2EMi2z9RDT5axPAg)
// Python
#Omicverse
#在conda先安装包:pip install scib_metrics
adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'].copy()
adata.obsm['X_mira_topic']=adata.obsm['X_topic_compositions'].copy()
adata.obsm['X_mira_feature']=adata.obsm['X_umap_features'].copy()
#需要指定batch key和label key(细胞类型)
from scib_metrics.benchmark import Benchmarker
bm = Benchmarker(
adata,
batch_key="batch",
label_key="cell_type",
embedding_obsm_keys=["X_pca", "X_combat", "X_harmony",'X_cellanova',
'X_scanorama','X_mira_topic','X_mira_feature','X_scVI',],
n_jobs=8,
)
#运行
bm.benchmark()
#展示benchmarking的各个指标
bm.plot_results_table(min_max_scale=False)
降维+聚类
计算neighbors+leiden分群+UMAP展示
// Python
sc.pp.neighbors(adata, n_neighbors=30,
n_pcs=30,use_rep='X_scVI')
#一开始可能会报错,后来根据https://github.com/lmcinnes/umap/issues/702删除了System32下面的svml_dispmd.dll成功了
sc.tl.umap(adata, min_dist=0.3)
#Scanpy默认聚类
sc.tl.leiden(adata, key_added="leiden_scvi", resolution=0.8)
sc.pl.umap(
    adata,
    color=["batch", "leiden_scvi"],
    frameon=False,
    ncols=1,
)
#UMAP可视化
adata_raw=adata.raw.to_adata()
adata_raw
#普通可视化:按照metadata展示
sc.pl.umap(adata, color=["leiden_scvi","orig.ident"],
legend_loc="on data")
#单个基因
ov.pl.embedding(adata_raw,
basis='X_umap',
color='Cd86',
frameon='small')
# 按照metadata拆分展示(相当于split.by)
for site in ["BM", "Blood", "Colon"]:
sc.pl.umap(rna, color="site", groups=[site])
Choosing an appropriate clustering resolution
Cluster trees and marker-gene AUC values (see the Python sketch after the references below)
// R
#https://mp.weixin.qq.com/s/sQgMIoXN15N1zL84uC7y2A
#1、cluster tree
#2、marker gene AUC 值
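As a complement to the R-based cluster-tree / AUC approaches linked above, a minimal Python sketch (my addition; the resolution grid is arbitrary and it assumes the neighbors graph was built on the X_scVI embedding) that scans several Leiden resolutions and reports cluster counts plus a subsampled silhouette score:
// Python
import numpy as np
import scanpy as sc
from sklearn.metrics import silhouette_score

resolutions = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2]   # hypothetical grid to scan
emb = adata.obsm['X_scVI']                      # embedding the neighbors graph was built on
idx = np.random.choice(adata.n_obs, size=min(5000, adata.n_obs), replace=False)  # subsample for speed
for res in resolutions:
    key = f'leiden_res{res}'
    sc.tl.leiden(adata, resolution=res, key_added=key)
    labels = adata.obs[key].to_numpy()
    n_clusters = len(np.unique(labels))
    sil = silhouette_score(emb[idx], labels[idx]) if n_clusters > 1 else float('nan')
    print(f'resolution={res}: {n_clusters} clusters, silhouette={sil:.3f}')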
ROGUE:评估细胞亚群纯度
// R
#https://mp.weixin.qq.com/s/asxSHi8nAKVbN8RLqxembA
#前期走常规的Seurat/Scanpy流程,把细胞大类计算出来
#分别计算所注释细胞的ROGUE(或者只计算我们感兴趣的细胞群)(ROGUE小于0.9则认为异质性较大,可细分)
#将ROGUE小于0.9的目标细胞亚群提取出来,重新降维聚类,进一步注释
#注释结果继续用ROGUE评价,直到不能细分结束(两种情况:ROGUE都大于0.9;或者细胞太少不能继续分)
#亚群细分后,就可以看看这些亚群的生物学效应,比如找找哪些亚群在肿瘤组织特异存在
#devtools::install_github("PaulingLiu/ROGUE")
suppressMessages(library(ROGUE))
suppressMessages(library(ggplot2))
suppressMessages(library(tidyverse))
#1、读取数据 ####
library(schard)
sc <- schard::h5ad2seurat('E:/Research/KMJ/scRNAseq/h5ad/Step7_afterAnnotation_raw.h5ad')
#提取表达矩阵
#对于表达矩阵,行应该是基因,列应该是细胞。表达式值应该是UMI counts(基于液滴的数据集)或TPM(基于全长的数据集)
expr <- as.data.frame(sc[["RNA"]]@counts)
expr[1:5, 1:4]
#提取Meta 信息
#列"celltype"包含相应的细胞亚型,列"Patient"包含每个细胞所属的样本(如患者)。
meta = sc@meta.data
head(meta)
#2、前期准备####
#过滤掉低丰度基因和低质量细胞
expr <- matr.filter(expr, #表达矩阵(行为基因,列为细胞)
min.cells = 10,
min.genes = 10)
#表达熵模型(Expression entropy model)
#为了应用S-E模型,我们使用SE_fun函数计算每个基因的表达熵
ent.res <- SE_fun(expr)
head(ent.res)
#Gene,基因名称。
#mean.expr。Expr表示基因的平均表达水平。
#entropy(熵),一个给定的平均基因表达的期望表达熵。
#fit, 表示基因的平均表达水平。
#ds,针对零期望的熵差。
#p.value, ds的显著性。
#p.adj,调整后的P值
#使用SEplot函数来可视化S和E之间的关系
SEplot(ent.res)
#通过ROGUE鉴定出的信息丰富的基因,可用于后续的聚类分析和伪时间分析
#3、ROGUE计算 ####
##使用CalculateRogue函数计算ROGUE值。该群体的ROGUE值为0.827,从而证实了其异质性
rogue.value <- CalculateRogue(ent.res,
platform = "UMI", #UMI/full-length
k = 30) #其中K是一个重要参数,它将ROGUE值限制在0和1之间。一个对所有基因没有显著ds的细胞群体将获得1的ROGUE值,而对显著ds进行最大汇总的群体应该获得~ 0的纯度评分
rogue.value
#计算每个样本的每个假定聚类的ROGUE值
#为了准确估计每个簇的纯度,我们建议计算不同样品中每种细胞类型的ROGUE值
rogue.res <- rogue(expr, labels = meta$celltype, samples = meta$orig.ident, platform = "UMI", span = 0.6)
rogue.res
#4、可视化####
#箱线图
rogue.boxplot(rogue.res)
#可视化每一个celltype的ROGUE值,每一个点代表一个 样本,可以看到一些CT的ROGUE
# 加载必要的包
library(ggplot2)
library(tidyr)
df <- rogue.res
# 将宽格式数据转换为长格式,便于 ggplot2 绘图
df_long <- pivot_longer(df, cols = everything(), names_to = "Variable", values_to = "Value")
# 绘制箱线图
ggplot(df_long, aes(x = Variable, y = Value, fill = Variable)) +
geom_boxplot() + # 绘制箱线图
geom_jitter(width = 0.2, size = 2, color = "black") + # 添加数据点
theme_void() + # 使用空主题,去掉所有背景和网格线
scale_fill_brewer(palette = "Set3") + # 使用 Set3 调色板填充颜色
labs(
fill = "Cluster", # 修改图例标题为 "Cluster"
title = "ROUGE" # 添加图表标题
) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"), # 设置标题居中、字体大小和加粗
axis.title.x = element_blank(), # 去掉 X 轴标题
axis.title.y = element_blank(), # 去掉 Y 轴标题
axis.text.x = element_blank(), # 去掉 X 轴刻度标签
axis.text.y = element_blank(), # 去掉 Y 轴刻度标签
axis.line = element_blank(), # 去掉坐标轴线
axis.ticks = element_blank() # 去掉坐标轴刻度线
)
#热图
rogue.res[is.na(rogue.res)]=-0.8
pheatmap::pheatmap(rogue.res,
legend_breaks = c(-0.8,seq(from=0.8, to=1, by=0.02)),
legend_labels = c("NA",seq(from=0.8, to=1, by=0.02)),
legend = TRUE,
fontsize = 8
# filename = 'pheatmaps.pdf'
)
#ROGUE评分默认>0.9时是一致性比较高的细胞亚类
UMAP的优化
结合PAGA或使用force atlas layout
// Python
# 数据预处理:计算距离
sc.tl.draw_graph(rna)
sc.pl.draw_graph(rna, color='leiden_totalVI', legend_loc='on data',title = "")
# diffmap:对图形进行降噪处理,优化图网络(选做)
sc.tl.diffmap(rna)
sc.pp.neighbors(rna, n_neighbors=10, use_rep='X_diffmap')
sc.tl.draw_graph(rna)
sc.pl.draw_graph(rna, color='leiden_totalVI', legend_loc='on data')
# 基于PAGA的网络拓扑图
sc.tl.paga(rna, groups='leiden_totalVI')
sc.pl.paga(rna, color=['leiden_totalVI'])
#在PAGA构建的网络拓扑图中,两点之间的连线代表两个细胞类群之间有关系,线的长短反映两个细胞类群在聚类图上的位置关系,线的粗细表示得到的轨迹关系的置信度,线越粗,置信度越高;点大小表示细胞的数量。
# 基于 PAGA 重新计算细胞之间的距离,使用force atlas layout布局(选做)
sc.tl.draw_graph(rna, layout='fa',init_pos='paga')
sc.pl.draw_graph(rna, color=['leiden_totalVI','batch','time','site'], legend_loc='on data')
#基于PAGA起点画UMAP
sc.tl.umap(rna,min_dist = 0.3,spread=1,init_pos='paga')
sc.pl.umap(rna, color=["leiden_totalVI",'time','site'],legend_loc='on data',title = "")
UMAP密度图
// Python
#查看每个细胞亚群的密集程度
sc.tl.embedding_density(data,groupby='louvain')
sc.pl.embedding_density(data,groupby='louvain',
color_map='gnuplot2',#color_map='winter'
add_outline=True, #亚群加边框
group=['CD14+ Monocyte','CD19+ B'] #指定亚群
)
// R
#https://mp.weixin.qq.com/s/6fAYxWYEKlFdybmne0ck2w
#https://mp.weixin.qq.com/s/B4p0XtKjuIHo5fxB7vdGjA
细胞注释+寻找marker基因
手动注释
// Python
#建议用adata.raw而非过滤后的2000个高可变基因中查看marker
adata_raw=adata.raw.to_adata()
adata_raw
marker_genes = {
"Fibroblast": ['FN1','DCN',"LUM",'COL1A1','COL1A2,COL3A1','VIM','FAP','FDGFRB'],
"Fibroblastic reticular cells": ['CCL21','PDPN'],
"Smooth Muscle Cell": ['CNN1','TAGLN','MYH11','MYLK','ACTG2','TAGLN2','SMTN','MYOCD','CALD1','MUSTN1'],
"Endothelial": ['PECAM1', 'VWF','ENG','CDH5',"CLDN5"],
"Epithelial": ['EPCAM' ,'SFN', 'KRT5','KRT8','KRT19','MMP7','MUC16'],
"Mesothelial cells": ['MSLN','UPK3B','WT1','DES','UPK3B'],
"Pericytes": ['RGS5','MCAM','ACTA2','NG2','PDGFRB','SUR2','KIR6'],
"Neuron": ['MAP2','NEFL','TUBB3','SYP','SNAP25'],
"Stem cells": ['SOX2','KLF4','NANOG','PROM1','CD34'],
'Adipocytes':['COL4A2','FMO2','FGF10','PPARG'],
'Oocytes':['ZP3','TUBB8'],
"CD14+ Mono": ["FCN1", "CD14"],
"CD16+ Mono": ["TCF7L2", "FCGR3A", "LYN"],
"Macro": ['CD163','C1QA','APOE','SPP1','CD86'],
"cDC1": ["CLEC9A", "CADM1"],
"cDC2": [
"CST3",
"COTL1",
"LYZ",
"DMXL2",
"CLEC10A",
"FCER1A",
], # Note: DMXL2 should be negative
"Plasma cells": ["MZB1", "HSP90B1", "FNDC3B", "PRDM1", "IGKC", "JCHAIN"],
"CD4+ T activated": ["CD4", "IL7R", "TRBC2", "ITGB1"],
"CD4+ T naive": ["CD4", "IL7R", "TRBC2", "CCR7"],
"CD8+ T": ["CD8A", "CD8B", "GZMK", "GZMA", "CCL5", "GZMB", "GZMH", "GZMA"],
"pDC": ["GZMB", "IL3RA", "COBLL1", "TCF4"],
"HSC": ["SOX4","KIT","NRIP1", "MECOM", "PROM1", "NKAIN2", "CD34"],
"Proliferating cells": ["MKI67", "CDK1"],
"Granulocytes": ["MPO", "AMH","SERPINE2","GSTA1","HSD17B1"],
}
marker_genes_in_data = dict()
for ct, markers in marker_genes.items():
    markers_found = list()
    for marker in markers:
        if marker in adata_raw.var.index:   # check against the full (raw) gene set used for plotting
            markers_found.append(marker)
    marker_genes_in_data[ct] = markers_found
sc.tl.dendrogram(adata_raw,'leiden_scvi',use_rep="X_scVI")
sc.pl.dotplot(
adata_raw,
groupby="leiden_scvi",
var_names=marker_genes_in_data,
dendrogram=True,
standard_scale="var", # standard scale: normalize each gene to range from 0 to 1
)
cluster2annotation = {
'0': 'Epithelial cells',
'1': 'Epithelial cells',
'2': 'T cells',
'3': 'Epithelial cells',
'4': 'Fibroblasts',
'5': 'Macrophages',
'6': 'B cells',
'7': 'Epithelial cells',
'8': 'Endothelial cells',
'9': 'Normal epithelial cells',#疑似正常卵巢上皮
'10': 'Pericytes',
'11': 'Smooth muscle cells',
'12': 'Neuron',
'13': 'Plasma cells',
'14': 'Fibroblastic reticular cells',
'15': 'Normal epithelial cells',#正常上皮,疑似胰腺癌转移灶Acinar cells
'16': 'Adipocytes',
'17': 'DCs',
'18': 'Epithelial cells',#疑似干细胞
'19': 'Mast cells',
}
adata_raw.obs['major_celltype'] = adata_raw.obs['leiden_scvi'].map(cluster2annotation).astype('category')
sc.pl.umap(adata_raw, color=["major_celltype"])
自动注释
// R
#https://mp.weixin.qq.com/s/i7IwKwssas_0uNl7lQYjFg
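The linked post covers R-based automatic annotation; in Python, one common option is CellTypist. A minimal sketch (my addition, not from the linked post; the model name is just one of the built-in examples), which expects log1p-normalized expression with 10,000 counts per cell in adata.X:
// Python
import celltypist
from celltypist import models

models.download_models(model="Immune_All_Low.pkl")   # example built-in immune model
predictions = celltypist.annotate(adata, model="Immune_All_Low.pkl", majority_voting=True)
adata = predictions.to_adata()                        # adds predicted_labels / majority_voting to .obs
sc.pl.umap(adata, color=["predicted_labels", "majority_voting"])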
迁移注释
// Python
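# (This subsection is empty in the original notes.) A minimal label-transfer sketch using
# scanpy's built-in sc.tl.ingest; adata_ref is an assumed annotated reference AnnData that
# shares gene names with the query and has a 'cell_type' column in .obs.
var_names = adata_ref.var_names.intersection(adata.var_names)
adata_ref_sub = adata_ref[:, var_names].copy()
adata_query = adata[:, var_names].copy()
# the reference needs PCA, neighbors and UMAP before ingest can map the query onto it
sc.pp.pca(adata_ref_sub)
sc.pp.neighbors(adata_ref_sub)
sc.tl.umap(adata_ref_sub)
# transfer the 'cell_type' labels (and the reference UMAP coordinates) onto the query cells
sc.tl.ingest(adata_query, adata_ref_sub, obs="cell_type")
sc.pl.umap(adata_query, color="cell_type")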
寻找细胞亚群差异基因
sc.tl.rank_genes_groups:使用没有矫正的数据做差异表达分析
// Python
# 更改聚类数
sc.tl.leiden(adata, resolution = 1)
sc.tl.rank_genes_groups(adata, 'leiden')
# 可视化
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False)
plt.savefig(dir+"02-intergration_rank_genes_groups.png", dpi=300)
markers = sc.get.rank_genes_groups_df(adata, None)
markers = markers[(markers.pvals_adj < 0.05) & (markers.logfoldchanges > .5)]
markers
# 保存
markers.to_csv(dir+'markers.csv')
scVI差异分析:使用模型标准化后的值做差异表达分析
// Python
#可以用于指定不同条件的细胞的差异分析,但更适合于聚类之后的不同细胞亚群的差异分析
#https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scrna/scVI_DE_worm.html#selecting-genes-and-loading-data
#https://mp.weixin.qq.com/s/vP540N61s02H0vlcm5uRHQ
#加载选择了高可变基因的数据
adata = sc.read(
filename="E:/Research/KMJ/scRNAseq/h5ad/Step4_afternorm.h5ad")
adata
#用scVI(单元变分推理)进行去批次效应整合
scvi.model.SCVI.setup_anndata(adata, layer="counts",
categorical_covariate_keys= ['phase'],
continuous_covariate_keys=['pct_counts_mt'],
)#, batch_key="orig.ident"
vae = scvi.model.SCVI(adata, n_layers=2, n_latent=30, gene_likelihood="nb")
vae.train()
#获取批次效应校正后的降维结果
adata.obsm["X_scVI"] = vae.get_latent_representation()
#保存模型到文件夹
scvi.model.SCVI.save(vae,dir_path='E:/Research/KMJ/scRNAseq/h5ad/Step5_scVI_model',save_anndata=True)
#加载模型
adata = sc.read_h5ad('E:/Research/KMJ/scRNAseq/h5ad/Step5_scVI_model/adata.h5ad')
model = scvi.model.SCVI.load('E:/Research/KMJ/scRNAseq/h5ad/Step5_scVI_model', adata=adata) #adata is AnnData object organized in the same way as data used to train model
model
#构建邻域图
sc.pp.neighbors(adata, n_neighbors=30,
n_pcs=30,use_rep='X_scVI')
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="leiden_scvi", resolution=0.8)
sc.pl.umap(adata, color=["leiden_scvi"])
#简单看一下marker基因siglecf的表达
sc.pl.dotplot(adata, ['Siglecf'], groupby='leiden_scvi', dendrogram=True)
#一般的可以直接指定细胞亚群进行差异分析
scvi_de = model.differential_expression(
idx1 = [adata.obs['cell type_sub'] == 'AT1'],
idx2 = [adata.obs['cell type_sub'] == 'AT2']
)
scvi_de
#如果要看某条件分组下的两群细胞的差异,可以新建注释,将orig.ident为D并且leiden_svci等于7的,注释为Eos_D,将orig.ident为DP并且leiden_svci等于7的,注释为Eos_DP
adata.obs['celltype'] = "others"
adata.obs.loc[(adata.obs['orig.ident']=='D') & (adata.obs['leiden_scvi']=='7'),'celltype'] = 'Eos_D'
adata.obs.loc[(adata.obs['orig.ident']=='DP') & (adata.obs['leiden_scvi']=='7'),'celltype'] = 'Eos_DP'
adata.obs["celltype"].value_counts()
# 差异分析
scvi_de = model.differential_expression(
idx1 = [adata.obs['celltype'] == 'Eos_D'],
idx2 = [adata.obs['celltype'] == 'Eos_DP'],
weights="uniform",
#batch_correction=True,
)
scvi_de
# 显著性结果筛选
scvi_de = scvi_de[(scvi_de['is_de_fdr_0.05']) & (abs(scvi_de.lfc_mean) > .5)]
scvi_de = scvi_de.sort_values('lfc_mean')
scvi_de
# 表达筛选
scvi_de = scvi_de[(scvi_de.raw_normalized_mean1 > .5) | (scvi_de.raw_normalized_mean2 > .5)]
scvi_de
#选取celltype为Eos_D和Eos_DP的细胞
adata_log = adata[adata.obs['celltype'].isin(['Eos_D','Eos_DP'])].copy()
#将scVI标准化后的表达存为一个layer,供热图使用
adata_log.layers['scvi_normalized'] = model.get_normalized_expression(adata_log, library_size=1e4)
#绘制热图:top 25 and bottom 25 from sorted df
genes_to_show = scvi_de[-25:].index.tolist() + scvi_de[:25].index.tolist()
sc.pl.heatmap(adata_log, genes_to_show, groupby='celltype', swap_axes=True, layer='scvi_normalized', log=True)
plt.savefig(dir+"06-intergration_heatmap_scvi.png")
scvi_de.to_csv(dir+'scvi_de.csv')
COSG差异分析
// Python
sc.tl.rank_genes_groups(adata, groupby='leiden',
method='t-test',use_rep='scVI',)
ov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')
sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',
cmap='Spectral_r',key='leiden_cosg',
standard_scale='var',n_genes=3)
常用细胞marker
T细胞marker
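This subsection has no content yet; below is a small dictionary of commonly used (human) T-cell markers as a starting point, plotted with the adata_raw / leiden_scvi objects from the manual-annotation section above (use lower-case symbols for mouse):
// Python
t_cell_markers = {
    "T cells (pan)": ["CD3D", "CD3E", "CD3G", "CD2"],
    "CD4+ T": ["CD4", "IL7R"],
    "CD8+ T": ["CD8A", "CD8B"],
    "Naive/central memory": ["CCR7", "SELL", "TCF7", "LEF1"],
    "Effector/cytotoxic": ["GZMB", "GZMK", "PRF1", "NKG7", "IFNG"],
    "Regulatory T (Treg)": ["FOXP3", "IL2RA", "CTLA4"],
    "Exhausted": ["PDCD1", "HAVCR2", "LAG3", "TIGIT", "TOX"],
    "gamma-delta T": ["TRDC", "TRGC1"],
    "Proliferating": ["MKI67", "TOP2A"],
}
sc.pl.dotplot(adata_raw, t_cell_markers, groupby="leiden_scvi", standard_scale="var")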
肿瘤细胞推断
InferCNV
// Python
import infercnvpy as cnv
#inferCNV推断肿瘤细胞(系统内存不足,先选择抽样)
adata = sc.pp.subsample(adata, fraction=0.1, copy=True)
adata
#创建染色体坐标数据(gtf下载自https://www.gencodegenes.org/human/)
cnv.io.genomic_position_from_gtf(adata=adata, gtf_file='E:/Research/LZQ/scRNAseq/data/gencode.v46.chr_patch_hapl_scaff.annotation.gtf',gtf_gene_id='gene_name', inplace=True)
adata.var.loc[:, ["gene_name", "chromosome", "start", "end"]].head()
# We provide all immune cell types as "normal cells".
cnv.tl.infercnv(
adata,
reference_key="major_celltype",
reference_cat=["T cells", "Macrophages", "Mast cells", "B cells",'Plasma cells',"DCs"],
window_size=250,
)
cnv.pl.chromosome_heatmap(adata, groupby="major_celltype")
cnv.tl.pca(adata)
cnv.pp.neighbors(adata)
cnv.tl.leiden(adata)
cnv.pl.chromosome_heatmap(adata, groupby="cnv_leiden", dendrogram=True)
cnv.tl.umap(adata)
cnv.tl.cnv_score(adata)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(11, 11))
ax4.axis("off")
cnv.pl.umap(
adata,
color="cnv_leiden",
legend_loc="on data",
legend_fontoutline=2,
ax=ax1,
show=False,
)
cnv.pl.umap(adata, color="cnv_score", ax=ax2, show=False)
cnv.pl.umap(adata, color="leiden_scvi", ax=ax3)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 11), gridspec_kw={"wspace": 0.5})
ax4.axis("off")
sc.pl.umap(adata, color="cnv_leiden", ax=ax1, show=False)
sc.pl.umap(adata, color="cnv_score", ax=ax2, show=False)
sc.pl.umap(adata, color="major_celltype", ax=ax3)
// R
#CNV变异频率展示
#https://mp.weixin.qq.com/s/JIQckGOomZKdk5kO0zDwhw
细胞比例分析
绝对比例和相对比例
// Python
#Omicverse
#https://omicverse.readthedocs.io/en/latest/Tutorials-plotting/t_visualize_single/
import omicverse as ov
import scanpy as sc
#import scvelo as scv
ov.plot_set()
import matplotlib.pyplot as plt
fig,ax=plt.subplots(figsize = (1,4))
ov.pl.cellproportion(adata=adata,celltype_clusters='clusters',
groupby='age(days)',legend=True,ax=ax)
fig,ax=plt.subplots(figsize = (2,2))
ov.pl.cellproportion(adata=adata,celltype_clusters='age(days)',
groupby='clusters',groupby_li=['nIPC','Granule immature','Granule mature'],
legend=True,ax=ax)
// R
#带阴影的柱状图
#https://mp.weixin.qq.com/s/U3YvE_anYfv6ZXdU-BbN_Q
scCODA
// Python
#https://github.com/theislab/scCODA
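# A minimal scCODA sketch (my addition, following the scCODA tutorial); the obs columns
# 'celltype', 'orig.ident' and 'group' are assumptions about this particular dataset.
from sccoda.util import cell_composition_data as dat
from sccoda.util import comp_ana as mod

# build a samples x cell-types count table from the AnnData object
coda_data = dat.from_scanpy(
    adata,
    cell_type_identifier="celltype",
    sample_identifier="orig.ident",
    covariate_obs=["group"],
)
# fit the Bayesian compositional model with 'group' as the covariate
coda_model = mod.CompositionalAnalysis(coda_data, formula="group", reference_cell_type="automatic")
coda_results = coda_model.sample_hmc()
coda_results.summary()
print(coda_results.credible_effects())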
Ro/e
// Python
#omicverse
roe=ov.utils.roe(adata,sample_key='group',cell_type_key='celltype')
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(2,4))
# 创建一个 roe 的副本,用于存储经过转换的注释数据
transformed_roe = roe.copy()
transformed_roe = transformed_roe.applymap(
lambda x: '+++' if x >= 2 else ('++' if x >= 1.5 else ('+' if x >= 1 else '+/-')))
sns.heatmap(roe, annot=transformed_roe, cmap='RdBu_r', fmt='',
cbar=True, ax=ax,vmin=0.5,vmax=1.5,cbar_kws={'shrink':0.5})
# 绘制热图:
# - roe: 原始的数据,用于热图的颜色表示
# - annot: 注释,使用转换后的 `transformed_roe` 来显示符号
# - cmap: 颜色映射使用 'RdBu_r'(红色到蓝色反转)
# - fmt: 格式为空(注释符号不会应用格式化)
# - cbar: 显示颜色条(cbar=True)
# - ax: 指定绘图的轴对象
# - vmin 和 vmax 设置热图的最小值和最大值(0.5 到 1.5 之间)
# - cbar_kws: 设置颜色条的缩放比例(shrink=0.5 表示颜色条缩小为原来的一半)
# 设置 x 和 y 轴标签的字体大小
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Group',fontsize=13)
plt.ylabel('Cell type',fontsize=13)
plt.title('Ro/e',fontsize=13)
# 去除网格线
ax.grid(False)
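# The R helper functions below implement Ro/e as observed / expected cell counts, where the
# expected counts come from the chi-square test of independence. The same quantity can be
# computed directly in Python (a small sketch, assuming 'celltype' and 'group' columns in adata.obs):
import pandas as pd
from scipy.stats import chi2_contingency

obs_tab = pd.crosstab(adata.obs['celltype'], adata.obs['group'])  # observed counts
chi2, p, dof, expected = chi2_contingency(obs_tab)                # expected counts under independence
roe_manual = obs_tab / expected                                   # Ro/e = observed / expected
print(roe_manual.round(2))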
// R
#https://mp.weixin.qq.com/s/oHzfFx02Kp5w2x_v3l9iKw
#https://mp.weixin.qq.com/s/GCubkqV6Xo2VwNeWG9Up0Q
# 两个文章提供的函数
# copy from file Tissue_distribution_analysis.R, https://github.com/Sijin-ZhangLab/PanMyeloid
ROIE <- function(crosstab){
## Calculate the Ro/e value from the given crosstab
##
## Args:
#' @crosstab: the contingency table of given distribution
##
## Return:
## The Ro/e matrix
rowsum.matrix <- matrix(0, nrow = nrow(crosstab), ncol = ncol(crosstab))
rowsum.matrix[,1] <- rowSums(crosstab)
colsum.matrix <- matrix(0, nrow = ncol(crosstab), ncol = ncol(crosstab))
colsum.matrix[1,] <- colSums(crosstab)
allsum <- sum(crosstab)
roie <- divMatrix(crosstab, rowsum.matrix %*% colsum.matrix / allsum)
row.names(roie) <- row.names(crosstab)
colnames(roie) <- colnames(crosstab)
return(roie)
}
divMatrix <- function(m1, m2){
## Divide each element in turn in two same dimension matrixes
##
## Args:
#' @m1: the first matrix
#' @m2: the second matrix
##
## Returns:
## a matrix with the same dimension, row names and column names as m1.
## result[i,j] = m1[i,j] / m2[i,j]
dim_m1 <- dim(m1)
dim_m2 <- dim(m2)
if( sum(dim_m1 == dim_m2) == 2 ){
div.result <- matrix( rep(0,dim_m1[1] * dim_m1[2]) , nrow = dim_m1[1] )
row.names(div.result) <- row.names(m1)
colnames(div.result) <- colnames(m1)
for(i in 1:dim_m1[1]){
for(j in 1:dim_m1[2]){
div.result[i,j] <- m1[i,j] / m2[i,j]
}
}
return(div.result)
}
else{
warning("The dimensions of m1 and m2 are different")
}
}
metadata <- data.table::fread("input/GSE164522/GSE164522_CRLM_metadata.csv.gz",data.table = F)
rownames(metadata) = metadata[,1]
meta = metadata[,-1]
# meta <- GSE164522_objects@meta.data 从seurat中提取的样本信息
# meta <- h5ad$metadata 在R中,利用reticulate包和python库anndata 提取的样本信息 参考 https://github.com/Sijin-ZhangLab/PanMyeloid
meta_filt <- meta[meta$group %in% c('LN','MN','MT','PBMC','PN','PT'),]
meta_filt$group <- factor(as.vector(meta_filt$group),levels=c('LN','MN','MT','PBMC','PN','PT'))
summary <- table(meta_filt[,c('celltype_major','tissue')]) # 'tissue' and 'group' carry the same information in this dataset, one is just the abbreviated form
# 利用自定义函数计算ro/e
roe <- as.data.frame(ROIE(summary))
#热图
library("pheatmap")
pheatmap(roe, display_numbers = TRUE,number_color = "black",cluster_row = FALSE,cluster_col = FALSE,
color = colorRampPalette(c("#e7eb8e","#cd782d","firebrick3"))(50)) # 画图
# 棒棒糖图
roe0 <- as.data.frame(t(roe))
roe0$group <- rownames(roe0)
library(ggpubr)
# 仅在几种组别中提取b细胞进行棒棒糖图绘制
ggdotchart(roe0, x = "group", y = "Bcell",
color = "group", # Color by groups
#palette = as.vector(fifth), # Custom color palette
sorting = "descending", # Sort value in descending order
add = "segments", # Add segments from y = 0 to dots
add.params = list(color = "lightgray", size = 2), # Change segment color and size
dot.size = 6, # Large dot size
label = round(roe0$Bcell,2), # Add mpg values as dot labels
font.label = list(color = "black", size = 9,
vjust = 0.5), # Adjust label parameters
ggtheme = theme_pubr() # ggplot2 theme
)+ geom_hline(yintercept = 1, linetype = 2, color = "black")+theme(legend.position='none')+ylab("Ro/e")+ggtitle("")+theme(axis.text.x = element_text(angle = 45,vjust = 1))
差异及富集分析
Scanpy差异分析 + gseapy富集分析
// Python
#https://huarc.net/notebook/scatlasvae/analysis_final.html
#定义黑名单,里面是一些容易被组织解离影响的基因
#https://www.nature.com/articles/nmeth.4437
#https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.4437/MediaObjects/41592_2017_BFnmeth4437_MOESM1_ESM.pdf
blacklist = ["Actg1", "Btg1", "Cxcl1", "Dnajb4", "Errfi1", "H3f3b", "Hspb1", "Irf1", "Klf6", "Mir22hg", "Nfkbia", "Pcf11", "Pxdc1", "Sdc4", "Srf", "Tpm3", "Usp2", "Gadd45g", "Ankrd1", "Btg2", "Cyr61", "Dusp1", "Fam132b", "Hipk3", "Hsph1", "Irf8", "Klf9", "Mt1", "Nfkbiz", "Pde4b", "Rap1b", "Serpine1", "Srsf5", "Tppp3", "Wac", "Hspe1", "Arid5a", "Ccnl1",
"Dcn", "Dusp8", "Fos", "Hsp90aa1", "Id3", "Itpkc", "Litaf", "Mt2", "Nop58", "Per1", "Rassf1", "Skil", "Srsf7", "Tra2a", "Zc3h12a", "Ier5", "Atf3", "Ccrn4l", "Ddx3x", "Egr1", "Fosb", "Hsp90ab1", "Idi1", "Jun", "Lmna", "Myadm", "Nppc", "Phlda1", "Rhob", "Slc10a6", "Stat3", "Tra2b", "Zfand5", "Kcne4", "Atf4", "Cebpb", "Ddx5", "Egr2", "Fosl2", "Hspa1a", "Ier2", "Junb", "Maff", "Myc",
"Nr4a1", "Pnp", "Rhoh", "Slc38a2", "Tagln2", "Trib1", "Zfp36", "Bag3", "Cebpd", "Des", "Eif1", "Gadd45a", "Hspa1b", "Ier3", "Jund", "Mafk", "Myd88", "Odc1", "Pnrc1", "Ripk1", "Slc41a1", "Tiparp", "Tubb4b", "Zfp36l1", "Bhlhe40", "Cebpg", "Dnaja1", "Eif5", "Gcc1", "Hspa5", "Ifrd1", "Klf2", "Mcl1", "Nckap5l", "Osgin1", "Ppp1cc", "Sat1", "Socs3", "Tnfaip3", "Tubb6", "Zfp36l2", "Brd2", "Csrnp1", "Dnajb1",
"Erf","Gem","Hspa8","Il6","Klf4","Midn","Ncoa7","Oxnad1","Ppp1r15a","Sbno2","Sqstm1","Tnfaip6","Ubc","Zyx"
]
import gseapy
#定义函数:包含了差异分析和富集分析
def DEG_analysis(adata, groupby, query_subtype, reference_subtype, dw=True):
    # `groupby`:分组依据(例如细胞亚型)
    # `query_subtype`:要进行比较的查询亚型
    # `reference_subtype`:参考亚型
    # `dw`:布尔值,指示是否计算下调基因的富集分析
    # 使用 t-test 方法,计算 query_subtype 和 reference_subtype 之间的差异基因
    sc.tl.rank_genes_groups(adata, groupby=groupby, method="t-test", key_added='rank_genes_test',
                            groups=[query_subtype], reference=reference_subtype)
    # 将差异基因分析的结果(基因名称、log2 fold change、校正后p-value)存储为 DataFrame
    diff_exp_genes = pd.DataFrame(np.hstack([
        np.array(list(map(list, adata.uns["rank_genes_test"]["names"]))),           # 基因名称
        np.array(list(map(list, adata.uns["rank_genes_test"]['logfoldchanges']))),  # log2 fold change
        np.array(list(map(list, adata.uns["rank_genes_test"]['pvals_adj'])))        # p-value(已调整)
    ]),
        columns=list(range(3))  # 列名为 0、1、2,分别对应基因、logFC 和 p-value
    )
    # 筛选上调的差异表达基因,且排除黑名单基因
    diff_exp_genes_up = diff_exp_genes[(~diff_exp_genes[0].isin(blacklist)) &          # 排除黑名单基因
                                       (diff_exp_genes[1].astype('float') >= 0.25) &   # logFC >= 0.25
                                       (diff_exp_genes[2].astype('float') <= 0.05)]    # p-value <= 0.05
    # 筛选下调的差异表达基因,且排除黑名单基因
    diff_exp_genes_dw = diff_exp_genes[(~diff_exp_genes[0].isin(blacklist)) &
                                       (diff_exp_genes[1].astype('float') <= -0.25) &  # logFC <= -0.25
                                       (diff_exp_genes[2].astype('float') <= 0.05)]
    # 定义富集分析函数,使用 Enrichr 进行 GO 富集分析
    def enr_res(adata=adata,
                groupby=groupby,
                diff_exp_genes=diff_exp_genes_up,
                subtype=query_subtype):
        # 绘制 dotplot,显示差异基因在不同分组中的表达情况
        ht = sc.pl.dotplot(
            adata,
            diff_exp_genes.iloc[:, 0],  # 差异基因名称列
            groupby=groupby,
            show=False,       # 不直接显示图像
            return_fig=True   # 返回图形对象
        )
        # 提取 dotplot 的大小数据,筛选出在该亚型中表达比例大于 0.25 的基因
        size_df = ht.dot_size_df
        s = set(size_df.loc[subtype, size_df.loc[subtype] > 0.25].index)
        enr_res_df = pd.DataFrame(list(filter(lambda x: x in s, diff_exp_genes.iloc[:, 0])))
        # 使用 Enrichr 进行 GO Biological Process 富集分析
        enr_res_use = gseapy.enrichr(gene_list=enr_res_df.iloc[:, 0],
                                     organism='Human',
                                     gene_sets=['GO_Biological_Process_2021'])
        return enr_res_use  # 返回富集分析的结果
    # 对上调基因进行富集分析
    enr_res_up = enr_res(diff_exp_genes=diff_exp_genes_up, subtype=query_subtype)
    # 如果 `dw` 为 True,则对下调基因进行富集分析;否则沿用上调基因的富集分析结果
    if dw == True:
        enr_res_dw = enr_res(diff_exp_genes=diff_exp_genes_dw, subtype=reference_subtype)
    else:
        enr_res_dw = enr_res_up
    # 返回所有差异基因、上调基因、下调基因,以及它们的富集分析结果
    return diff_exp_genes, diff_exp_genes_up, diff_exp_genes_dw, enr_res_up, enr_res_dw
#GO Term Analysis
Eos_DP_list = DEG_analysis(subset, groupby='orig.ident',
query_subtype='DP', reference_subtype='D', dw=False) #reference_subtype='rest'
#差异基因列表导出csv
Eos_DP_list[0].to_csv('E:/Research/KMJ/scRNAseq/result/DEG_Eos_DP_list.csv')
#获取通路
Eos_DP_go = Eos_DP_list[3].res2d[Eos_DP_list[3].res2d['Adjusted P-value']<0.05] #0.05
## color palette
from colour import Color
from matplotlib.colors import LinearSegmentedColormap
def make_colormap(colors, show_palette=False):
    color_ramp = LinearSegmentedColormap.from_list('my_list', [Color(c1).rgb for c1 in colors])
    if show_palette:
        plt.figure(figsize=(15, 3))
        plt.imshow([list(np.arange(0, len(colors), 0.1))], interpolation='nearest', origin='lower', cmap=color_ramp)
        plt.xticks([])
        plt.yticks([])
    return color_ramp
Eos_DP_list[3].res2d.Term = Eos_DP_list[3].res2d.Term.str.split(" \(GO").str[0]
# 通过调用 str.split 方法,将字符串以 " (GO" 为分隔符进行拆分,并取拆分后的第一个部分。
# 目的是去掉 " (GO" 及后续的内容,保留基因本身的名称或描述
## dotplot
gseapy.dotplot(Eos_DP_list[3].res2d.iloc[[0,1,2,3],:], #选择了数据框中的特定行
size=20,
cmap=make_colormap(['#c8bad8', '#5f0caf']),
figsize=(3,5), vmin=1.3)
#plt.savefig('figures/EOS_DP_go_up.pdf')
plt.show()
Seurat::FindMarkers + 富集分析
// R
#https://mp.weixin.qq.com/s/Cl9tDWqIBWcxT-9jQbJUNg
##清空环境
rm(list=ls())
library(Seurat)
library(readr)
library(dplyr)
library(ggplot2)
library(zellkonverter)
library(SingleCellExperiment)
setwd('E:/Research/KMJ/scRNAseq/h5ad')
# 读取.h5ad文件
sce <- zellkonverter::readH5AD("./Step7_afterAnnotation_raw.h5ad")
assay(sce,"counts")<-assay(sce,"X")
assay(sce,"logcounts")<-log1p(assay(sce,"counts"))
sc <- as.Seurat(sce,counts="counts",data="logcounts")
# 修改assay name为RNA
sc <- RenameAssays(object = sc, originalexp = 'RNA')
table(sc$celltype)
#取子集,celltype为Eosinophils的细胞
Idents(sc)="celltype"
sc_EOS <- subset(sc, idents = "Eosinophils")
#不进行任何筛选的差异分析
Idents(sc_EOS)="orig.ident"
all_markers = Seurat::FindMarkers(sc_EOS,
ident.1 = "DP",ident.2 = "D",
test.use = "wilcox",min.pct = 0,logfc.threshold = 0)
#保存
write.csv(all_markers, file = "E:/Research/KMJ/scRNAseq/result/DEG_seurat_Eos.csv")
#对min.pct和logfc.threshold进行筛选的差异分析,一般min.pct = 0.1,logfc.threshold = 0.25
all_markers = Seurat::FindMarkers(sc_EOS,
ident.1 = "D",ident.2 = "DP",
test.use = "wilcox",
min.pct = 0.05,logfc.threshold = 0.2) #min.pct = 0.1,logfc.threshold = 0.25
#更改行名
all_markers$gene = rownames(all_markers)
#自定义函数:GO富集分析
GO_a <- function(interest_gene, global_gene) {
require(clusterProfiler)
# Convert gene symbols to ENTREZ IDs for interest and global genes
gene_entrez_id2GO <-
clusterProfiler::bitr(interest_gene, "SYMBOL", "ENTREZID", "org.Mm.eg.db", drop = TRUE)$ENTREZID #org.Hs.eg.db
universe_gene_entrez_id2GO <-
clusterProfiler::bitr(global_gene, "SYMBOL", "ENTREZID", "org.Mm.eg.db", drop = TRUE)$ENTREZID #org.Hs.eg.db
# Perform GO enrichment analysis and simplify results
enr_res <- clusterProfiler::enrichGO(
gene = gene_entrez_id2GO,
universe = universe_gene_entrez_id2GO,
ont = "BP",
OrgDb = 'org.Mm.eg.db') #org.Hs.eg.db
enr_res2 <- clusterProfiler::simplify(enr_res)
# Generate plots
p1 <- goplot(enr_res)
p2 <- barplot(enr_res2, showCategory = 20, color = "p.adjust")
# Return results and plots as a list
list(go_res = enr_res,
go_res_simplify = enr_res2,
p1 = p1,p2 = p2)
}
#自定义函数:KEGG富集分析
KEGG_a <- function(interest_gene, global_gene) {
require(clusterProfiler)
# Convert gene symbols to ENTREZ IDs for interest and global genes
gene_entrez_id2KEGG <-
clusterProfiler::bitr(interest_gene, "SYMBOL", "ENTREZID", "org.Mm.eg.db", drop = TRUE)$ENTREZID #org.Hs.eg.db
universe_gene_entrez_id2KEGG <-
clusterProfiler::bitr(global_gene, "SYMBOL", "ENTREZID", "org.Mm.eg.db", drop = TRUE)$ENTREZID #org.Hs.eg.db
# Perform KEGG enrichment analysis and simplify results
enr_res <- clusterProfiler::enrichKEGG(
gene = gene_entrez_id2KEGG,
universe = universe_gene_entrez_id2KEGG,
organism = "mmu") #hsa
#enr_res2 <- term_similarity_from_enrichResult(enr_res)
# Generate plots
#p1 <- goplot(enr_res)
p2 <- barplot(enr_res, showCategory = 20, color = "p.adjust")
# Return results and plots as a list
list(kegg_res = enr_res,
#kegg_res_simplify = enr_res2,
#p1 = p1,
p2 = p2
)
}
#将差异基因分为上调和下调基因
down = all_markers %>% filter(p_val < 0.1, avg_log2FC < -0.2) #p_val_adj<0.05
up = all_markers %>% filter(p_val < 0.1, avg_log2FC > 0.2)
#富集分析
down_go = GO_a(rownames(down),rownames(sc_EOS))
up_go = GO_a(rownames(up),rownames(sc_EOS))
down_kegg = KEGG_a(rownames(down),rownames(sc_EOS))
up_kegg = KEGG_a(rownames(up),rownames(sc_EOS))
#创建分组信息
process_GO <- function(GO_data, annotation) {
GO_data$go_res_simplify@result %>%
head(10) %>%mutate(annotation = annotation,
log10q = -log10(abs(qvalue)))}
GO_result <- bind_rows(
process_GO(down_go, "down_go"),
process_GO(up_go, "up_go"))
process_KEGG <- function(KEGG_data, annotation) {
KEGG_data$kegg_res@result %>%
head(5) %>%mutate(annotation = annotation,
log10q = -log10(abs(qvalue)))}
KEGG_result <- bind_rows(
process_KEGG(down_kegg, "down_kegg"),
process_KEGG(up_kegg, "up_kegg"))
#对上调和下调基因创建分组
GO_result$log10q = ifelse(GO_result$annotation == "down_go", GO_result$log10q * -1, GO_result$log10q)
GO_result$label_hjust=ifelse(GO_result$log10q>0,1,0)
KEGG_result$log10q = ifelse(KEGG_result$annotation == "down_kegg", KEGG_result$log10q * -1, KEGG_result$log10q)
KEGG_result$label_hjust=ifelse(KEGG_result$log10q>0,1,0)
# Plot the GO results
ggplot(GO_result) +
geom_bar(aes(
x = reorder(Description, log10q),
y = log10q,
fill = annotation
),
stat = "identity",
color = "white") +
geom_text(aes(
x = reorder(Description, log10q),
y = 0,
label = Description,
hjust = label_hjust
),
size = 7 * 0.35,
angle = 0) +
scale_fill_manual(
name = "",
values = c("down_go" = "#0f7b9f", "up_go" = "#d83215")
) +
coord_flip() +
cowplot::theme_cowplot() +
theme(
axis.text.x = element_text(size = 7),
text = element_text(size = 7),
axis.line = element_line(size = 0.3),
axis.ticks = element_line(size = 0.3),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
panel.border = element_blank(),
legend.position = "none"
) +
labs(x = "", y = "") +
ylim(-15, 15)
#Plot the KEGG results
ggplot(KEGG_result) +
geom_bar(aes(
x = reorder(Description, log10q),
y = log10q,
fill = annotation
),
stat = "identity",
color = "white") +
geom_text(aes(
x = reorder(Description, log10q),
y = 0,
label = Description,
hjust = label_hjust
),
size = 7 * 0.35,
angle = 0) +
scale_fill_manual(
name = "",
values = c("down_kegg" = "#0f7b9f", "up_kegg" = "#d83215")
) +
coord_flip() +
cowplot::theme_cowplot() +
theme(
axis.text.x = element_text(size = 7),
text = element_text(size = 7),
axis.line = element_line(size = 0.3),
axis.ticks = element_line(size = 0.3),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
panel.border = element_blank(),
legend.position = "none"
) +
labs(x = "", y = "") +
ylim(-15, 15)
## 保存图片
ggsave(paste0(fdir,"Figure3A.pdf"),width = 5,height = 4)
pseudobulks
// Python
#Python:https://mp.weixin.qq.com/s/JnD778-fwLsoPQkT8EvsYg
#R:https://mp.weixin.qq.com/s/ReaV52HlFpHY9gLvV019qw
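# The links above describe pseudobulk DE; below is my own minimal aggregation sketch (not from
# the linked posts): sum raw counts per sample within one cell type, then feed the resulting
# genes x samples table to a bulk DE tool such as DESeq2 or edgeR. The column names 'celltype'
# and 'orig.ident' are assumptions about this dataset.
import numpy as np
import pandas as pd
import scipy.sparse as sp

sub = adata[adata.obs['celltype'] == 'Eosinophils'].copy()   # example cell type
counts = sub.layers['counts']                                # raw counts saved earlier
counts = counts if sp.issparse(counts) else sp.csr_matrix(counts)

pseudobulk = {}
for sample in sub.obs['orig.ident'].unique():
    mask = (sub.obs['orig.ident'] == sample).to_numpy()
    pseudobulk[sample] = np.asarray(counts[mask].sum(axis=0)).ravel()  # sum over cells
pb = pd.DataFrame(pseudobulk, index=sub.var_names)           # genes x samples
pb.to_csv('pseudobulk_counts.csv')                           # import into DESeq2/edgeR in R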
SEACells/Metacell 差异分析
// Python
#https://starlitnightly.github.io/omicverse/Tutorials-single/t_scdeg/#train-and-save-the-model
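# The omicverse tutorial above covers metacell-based DE; as a rough sketch of the SEACells step
# itself (my addition following the SEACells tutorial; parameter values are placeholders and the
# 'X_scVI' representation is assumed to exist in .obsm):
import SEACells

n_seacells = max(adata.n_obs // 75, 10)          # ~1 metacell per 75 cells is a common start
model = SEACells.core.SEACells(
    adata,
    build_kernel_on='X_scVI',                    # any low-dimensional representation in .obsm
    n_SEACells=n_seacells,
    n_waypoint_eigs=10,
)
model.construct_kernel_matrix()
model.initialize_archetypes()
model.fit(min_iter=10, max_iter=100)
# aggregate counts per metacell; DE is then run on this aggregated object
seacell_adata = SEACells.core.summarize_by_SEACell(adata, SEACells_label='SEACell')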
Memento
// Conda Prompt
#https://github.com/yelabucsf/scrna-parameter-estimation
#https://nbviewer.org/github/yelabucsf/scrna-parameter-estimation/blob/master/tutorials/binary_testing.ipynb
#新建环境(不然容易安装失败)
conda create -n memento python=3.10
conda activate memento
# 安装scanpy
pip install ipykernel
conda install -c conda-forge scanpy python-igraph leidenalg
#pip install scanpy
# 安装memento
pip install -i https://test.pypi.org/simple/ memento
# Basic usage 数据下载 来自https://github.com/yelabucsf/scrna-parameter-estimation/issues/27
wget -c https://memento-examples.s3.us-west-2.amazonaws.com/pbmc-ifnb/interferon_filtered.h5ad
// Python
#https://nbviewer.org/github/yelabucsf/scrna-parameter-estimation/blob/master/tutorials/binary_testing.ipynb
#VScode 激活memento kernel
# 导入模块
import memento
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
# 导入数据
adata = sc.read('interferon_filtered.h5ad')
adata = adata[adata.obs.cell == 'CD14+ Monocytes'].copy()
#指定对照组为0
adata.obs['stim'] = adata.obs['stim'].apply(lambda x: 0 if x == 'ctrl' else 1)
adata.obs[['ind', 'stim', 'cell']].sample(5)
# 对两组进行二元比较
result_1d = memento.binary_test_1d(
adata=adata,
capture_rate=0.07, #是采样和测序中总体UMI效率的粗略估计值,0.07表示是 10× v1,0.15表示10× v2,0.25表示10× v3
treatment_col='stim',
num_cpus=12, #CPU数量
num_boot=5000) #Bootstrap迭代次数
# 打印top差异均值基因(DM)
result_1d.query('de_coef > 0').sort_values('de_pval').head(10)
# 打印top差异变异性基因(DV)
result_1d.query('dv_coef > 0 & de_coef > 0').sort_values('dv_pval').head(10)
###差异共表达分析
# 导入模块
import itertools
# 生成基因对列表
gene_pairs = list(itertools.product(['IRF7'], adata.var.index.tolist())) # 单个转录因子(IRF7)的列表
# 差异共表达的假设检验
result_2d = memento.binary_test_2d(
adata=adata,
gene_pairs=gene_pairs,
capture_rate=0.07,
treatment_col='stim',
num_cpus=12,
num_boot=5000)
# 排序和获取结果
result_2d.sort_values('corr_pval').head(5)
通路打分
scanpy.tl.score_genes
// Python
#https://mp.weixin.qq.com/s/DxDtvxVG3pxod9KeZx1lpA
## 读入gene signature
with open(dir + 'datp_sig.txt') as f:
    datp_sig = [x.strip() for x in f]
sc.tl.score_genes(subset, datp_sig, score_name = 'datp')
subset.obs
#可视化
sc.pl.violin(subset, 'datp', groupby='condition')
plt.savefig(dir+"08-intergration_datp_sig.png")
#显著性检验
from scipy import stats
a = subset[subset.obs.condition == 'COVID19'].obs.datp.values
b = subset[subset.obs.condition == 'Control'].obs.datp.values
stats.mannwhitneyu(a, b)
# 结果: MannwhitneyuResult(statistic=77312171.0, pvalue=0.0)
# 将打分UMAP可视化
sc.pl.umap(subset, color = 'datp', vmax = 0.5)
plt.savefig(dir+"09-intergration_datp_sig_umap.png")
AUCell
// Python
#https://starlitnightly.github.io/omicverse/Tutorials-single/t_aucell/
adata.X.max()
# If the maximum of adata.X is greater than 10 and the values are integers (i.e. raw counts), normalize and log1p first
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.X.max()
# First, download/prepare a gene-set file for the matching species
import omicverse as ov  # if not already imported earlier
pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2021.txt',organism='Mouse')
## 1. Assess a single gene set
geneset_name='response to vitamin (GO:0033273)'
ov.single.geneset_aucell(adata,
geneset_name=geneset_name,
geneset=pathway_dict[geneset_name])
sc.pl.embedding(adata,
basis='umap',
color=["{}_aucell".format(geneset_name)])
## 2. Assess multiple gene sets
geneset_names=['response to vitamin (GO:0033273)','response to vitamin D (GO:0033280)']
ov.single.pathway_aucell(adata,
pathway_names=geneset_names,
pathways_dict=pathway_dict)
sc.pl.embedding(adata,
basis='umap',
color=[i+'_aucell' for i in geneset_names])
## 3. Assess a custom (user-defined) gene set
ov.single.geneset_aucell(adata,
geneset_name='Sox',
geneset=['Sox17', 'Sox4', 'Sox7', 'Sox18', 'Sox5'])
sc.pl.embedding(adata,
basis='umap',
color=["Sox_aucell"])
## 4. Assess all pathways
adata_aucs=ov.single.pathway_aucell_enrichment(adata,
pathways_dict=pathway_dict,
num_workers=8)
adata_aucs.obs=adata[adata_aucs.obs.index].obs
adata_aucs.obsm=adata[adata_aucs.obs.index].obsm
adata_aucs.obsp=adata[adata_aucs.obs.index].obsp
adata_aucs
sc.pl.embedding(adata_aucs,
basis='umap',
color=geneset_names)
## Use Scanpy's cross-cluster differential expression machinery to identify cluster-specific pathways
#adata_aucs.uns['log1p']['base']=None
sc.tl.rank_genes_groups(adata_aucs, 'clusters', method='t-test',n_genes=100)
sc.pl.rank_genes_groups_dotplot(adata_aucs,groupby='clusters',
cmap='Spectral_r',
standard_scale='var',n_genes=3)
# Compare AUCell scores between groups
fig, ax = plt.subplots(figsize=(6,2))
ov.pl.bardotplot(adata,groupby='clusters',color='Sox_aucell',figsize=(6,2),
ax=ax,
ylabel='Expression',
bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4},
scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'})
ov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=0.1,
text_y=0.02,
text='$p={}$'.format(round(0.001,3)),
fontsize=11,fontcolor='#000000',
horizontalalignment='center',)
ov.single.pathway_enrichment
// Python
pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2021.txt',organism='Mouse')
res=ov.single.pathway_enrichment(adata,pathways_dict=pathway_dict,organism='Mouse',
group_by='clusters',plot=True)
ax=ov.single.pathway_enrichment_plot(res,plot_title='Enrichment',cmap='Reds',
xticklabels=True,cbar=False,square=True,vmax=10,
yticklabels=True,cbar_kws={'label': '-log10(qvalue)','shrink': 0.5,})
singleseqgset enrichment analysis
// R
#https://mp.weixin.qq.com/s/8yWFD1vnXc36L61JrOSmtQ
GSVA enrichment analysis
// R
#https://mp.weixin.qq.com/s/sRQIgp1VbpKyZp69d3J6pQ
// R
#https://www.cnblogs.com/ywliao/articles/14546349.html
## load data
.libPaths("/data2/csj/tools/Rlib")
library(Seurat)
library(dplyr)
library(monocle)
options(stringsAsFactors=FALSE)
library(reticulate)
parse_h5ad <- function(adata){
require(reticulate)
ad <- import("anndata", convert = FALSE)
ada <- ad$read_h5ad(adata)
meta <- py_to_r(ada$obs)
if(class(ada$raw$X)[1] == "scipy.sparse.csr.csr_matrix"){
exp <- t(py_to_r(ada$raw$X$toarray()))
} else {
exp <- t(py_to_r(ada$raw$X))
}
rownames(exp) <- rownames(py_to_r(ada$raw$var))
colnames(exp) <- rownames(meta)
return(
list(
metadata = meta,
expression = exp
)
)
}
############## differential expression
DEGene <- function(h5ad,Cluster1,Cluster2){
## h5ad: the result of function parse_h5ad
## Cluster1 : a vector of Cluster
## Cluster2 : a vector of Cluster
pseudocount.use =1
cell_name_1 <- rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% Cluster1,])
cell_name_2 <- rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% Cluster2,])
Expression_1 <- h5ad$expression[,cell_name_1]
Expression_2 <- h5ad$expression[,cell_name_2]
## log FC
mean_c1 <- as.data.frame(rowMeans(as.matrix(Expression_1)))
colnames(mean_c1) <- "mean_c1"
mean_c2 <- as.data.frame(rowMeans(as.matrix(Expression_2)))
colnames(mean_c2) <- "mean_c2"
log2fc <- data.frame(log2fc = log2(mean_c1$mean_c1 + pseudocount.use) - log2(mean_c2$mean_c2 + pseudocount.use))
rownames(log2fc) <- rownames(mean_c1)
log2fc$gene <- rownames(log2fc)
## wilcox test
group.info <- data.frame(row.names = c(cell_name_1, cell_name_2))
group.info[cell_name_1, "group"] <- "Group1"
group.info[cell_name_2, "group"] <- "Group2"
group.info[, "group"] <- factor(x = group.info[, "group"])
data.use <- h5ad$expression[, rownames(x = group.info), drop = FALSE]
p_val <- sapply(
X = 1:nrow(x = data.use),
FUN = function(x) {
return(wilcox.test(data.use[x, ] ~ group.info[, "group"])$p.value)
})
## BH correction
adj_p_val <- p.adjust(p_val, method="BH")
## DE table
result <- data.frame(gene=log2fc$gene, log2FC=log2fc$log2fc, Pvalue=p_val, Adj_pval=adj_p_val)
return(result)
}
##################################################### GSVA
runGSVA <- function(h5ad,Cluster1,Cluster2,kcdf="Gaussian",AdjPvalueCutoff=0.05){
## h5ad: the result of function parse_h5ad
## Cluster1: a vector of cluster labels (group 1)
## Cluster2: a vector of cluster labels (group 2)
## kcdf: "Gaussian" for continuous (e.g. log-normalized) expression, "Poisson" for integer counts
require(GSVA)
require(GSEABase)
require(GSVAdata)
require(clusterProfiler)
data(c2BroadSets)
library(limma)
expression <- h5ad$expression[,rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% c(Cluster1,Cluster2),])]
## change gene symbol to geneid
gene_entrezid <- bitr(rownames(expression), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
expression_filt <- expression[gene_entrezid$SYMBOL,]
rownames(expression_filt) <- gene_entrezid$ENTREZID
expression_filt <- as.matrix(expression_filt)
res_gsva <- gsva(expression_filt, c2BroadSets, parallel.sz=10,kcdf=kcdf)
annotation_col = data.frame(CellType = factor(h5ad$metadata[colnames(res_gsva),]$MajorCluster))
rownames(annotation_col) = colnames(res_gsva)
## using limma to conduct DE analysis
f <- factor(annotation_col$CellType)
design <- model.matrix(~0+f)
colnames(design) <- c("C1","C2")
rownames(design) <- colnames(res_gsva)
fit <- lmFit(res_gsva, design)
cont.matrix=makeContrasts('C1-C2',levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
gs <- topTable(fit2,adjust='BH', number=Inf, p.value=AdjPvalueCutoff)
gs$cluster <- ifelse(gs$t > 0 , "C1", "C2")
return(gs)
}
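// Python
# gseapy (installed earlier) provides ssGSEA as a Python-side alternative to the R GSVA
# workflow above. Only a minimal sketch under assumptions: `expr` is a genes x samples
# pandas DataFrame and 'h.all.v2023.1.Hs.symbols.gmt' is a hypothetical local GMT file.
import gseapy as gp

ss = gp.ssgsea(data=expr,
               gene_sets='h.all.v2023.1.Hs.symbols.gmt',
               sample_norm_method='rank',
               outdir=None)   # keep results in memory instead of writing to disk
ss.res2d.head()               # per-sample enrichment scores for each gene set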
irGSEA: integrating multiple scoring algorithms
// R
#https://github.com/chuiqin/irGSEA
#https://mp.weixin.qq.com/s/j2vbSgGDq9mV43QdGW8-bw
library(irGSEA)
sc_EOS <- irGSEA.score(object = sc_EOS, assay = "RNA",
slot = "data", seeds = 123, ncores = 1,
min.cells = 3, min.feature = 0,
custom = F, geneset = NULL, msigdb = T,
species = "Mus musculus", category = "H",
subcategory = NULL, geneid = "symbol",
method = c("AUCell", "UCell", "singscore",
"ssgsea", "JASMINE", "viper"),
aucell.MaxRank = NULL, ucell.MaxRank = NULL,
kcdf = 'Gaussian')
# Integrate differential gene sets across scoring methods
# If this errors out, consider adding: options(future.globals.maxSize = 100000*1024^5)
result.dge <- irGSEA.integrate(object = sc_EOS,
group.by = "orig.ident",
method = c("AUCell", "UCell", "singscore",
"ssgsea", "JASMINE", "viper"))
# Gene sets that RRA identifies as consistently differential across the scoring methods
geneset.show <- result.dge$RRA %>%
dplyr::filter(pvalue <= 0.05) %>%
dplyr::pull(Name) %>% unique(.)
heatmap.plot <- irGSEA.heatmap(object = result.dge,
method = "singscore",
top = 10,
show.geneset = NULL)
heatmap.plot
DecoupleR: integrating multiple methods
// R
#https://mp.weixin.qq.com/s/kK5QlE5OJUx_COx0W4G8vA
#https://mp.weixin.qq.com/s/L_yNW6fHpvcAIxH3GTIBnA
# Transcription factors: https://mp.weixin.qq.com/s/2kOzSe4Wg2pYFlesB9JKGw
#https://mp.weixin.qq.com/s/402Ov0GwaJh6gm2KrrRDpw
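// Python
# The links above are R-based; the same framework also ships as the Python package `decoupler`.
# A minimal sketch of its classic API (argument names can drift between versions, so treat this
# as an assumption and check the decoupler docs): score PROGENy pathway activities per cell.
import decoupler as dc

progeny = dc.get_progeny(organism='human', top=100)   # use organism='mouse' for mouse data
dc.run_mlm(mat=adata, net=progeny, source='source', target='target',
           weight='weight', verbose=True)
adata.obsm['mlm_estimate'].head()   # per-cell pathway activity estimates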
Enriched-pathway network plot
// R
# Enriched-pathway network plot
#https://mp.weixin.qq.com/s/tqAKQv-Ek7Lad_bhAS7-wQ
Pseudotime analysis
// Python
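# No code has been filled in here yet; a minimal diffusion pseudotime (DPT) sketch with scanpy,
# assuming a processed adata with a 'leiden' column and cluster '0' as a hypothetical root population:
import numpy as np
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.diffmap(adata)
adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]  # index of a root cell
sc.tl.dpt(adata)
sc.pl.umap(adata, color=['leiden', 'dpt_pseudotime'])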
RNA velocity
// Python
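# Also a placeholder so far; a minimal scVelo sketch (pip install scvelo), assuming adata carries
# 'spliced' and 'unspliced' layers (e.g. from velocyto or kb-python output):
import scvelo as scv
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.velocity(adata, mode='stochastic')
scv.tl.velocity_graph(adata)
scv.pl.velocity_embedding_stream(adata, basis='umap')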
Cell-cell communication
CellPhoneDB
// Python
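# Placeholder section; a rough sketch of the CellPhoneDB v4/v5 statistical analysis via its
# Python API. Argument names differ between CellPhoneDB versions, so verify against the official
# docs (https://github.com/ventolab/CellphoneDB); the file paths below are hypothetical.
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

cpdb_results = cpdb_statistical_analysis_method.call(
    cpdb_file_path='cellphonedb.zip',          # database zip from a CellPhoneDB release
    meta_file_path='meta.tsv',                 # two columns: cell barcode, cell type
    counts_file_path='adata_normalised.h5ad',  # normalized counts, gene symbols as var_names
    counts_data='hgnc_symbol',
    output_path='cpdb_out/')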
CellChat
// R
# Visualization: https://mp.weixin.qq.com/s/85Bs8fsjXEv4Dq8W5FW_lg
iTALK
// R
#https://mp.weixin.qq.com/s/P_KgD0Usqj16aButnY59uA
Transcription factor analysis
// See the pySCENIC section above