CITEseq
By Kaiyi
1、数据读入与整合
// 在R中进行
rm(list=ls())
setwd("E:/Research/9.PXX/citeseq")
####R包加载####
library(Seurat)
library(SeuratData)
library(patchwork)
library(ggplot2)
library(batchelor)
library(SeuratWrappers)
library(magrittr)
library(tidyverse)
library(clusterProfiler)
library(GO.db)
library(org.Hs.eg.db)
library(DOSE)
library(DoubletFinder)
library(SingleR)
library(celldex)
library(harmony)
library(pheatmap)
library(ggsci)
library(data.table)
#### 1 数据读入 ######
###### 1.1 依次读入RNA和蛋白 #####
#每个文件夹内有barcodes.tsv,genes.tsv,matrix.mtx三个文件
#genes.tsv最下面为蛋白ADT的信息(如果是抗体标签号可以对应替换为基因名)
counts1 <- Read10X(data.dir = "./rawdata/RNA_Blood_Day7")
cite_counts1 <- Read10X(data.dir = "./rawdata/CITE_Blood_Day7")
adt.names1 <- setdiff (rownames (cite_counts1),rownames (counts1))
idx1 <- which(rownames(cite_counts1 ) %in% adt.names1)
adt1 <- cite_counts1[idx1,]
counts2 <- Read10X(data.dir = "./rawdata/RNA_Blood_Day11")
cite_counts2 <- Read10X(data.dir = "./rawdata/CITE_Blood_Day11")
adt.names2 <- setdiff (rownames (cite_counts2),rownames (counts2))
idx2 <- which(rownames(cite_counts2 ) %in% adt.names2)
adt2 <- cite_counts2[idx2,]
counts3 <- Read10X(data.dir = "./rawdata/RNA_Blood_Day14")
cite_counts3 <- Read10X(data.dir = "./rawdata/CITE_Blood_Day14")
adt.names3 <- setdiff (rownames (cite_counts3),rownames (counts3))
idx3 <- which(rownames(cite_counts3 ) %in% adt.names3)
adt3 <- cite_counts3[idx3,]
counts4 <- Read10X(data.dir = "./rawdata/RNA_Colon_Day4")
cite_counts4 <- Read10X(data.dir = "./rawdata/CITE_Colon_Day4")
adt.names4 <- setdiff (rownames (cite_counts4),rownames (counts4))
idx4 <- which(rownames(cite_counts4 ) %in% adt.names4)
adt4 <- cite_counts4[idx4,]
counts5 <- Read10X(data.dir = "./rawdata/RNA_Colon_Day7")
cite_counts5 <- Read10X(data.dir = "./rawdata/CITE_Colon_Day7")
adt.names5 <- setdiff (rownames (cite_counts5),rownames (counts5))
idx5 <- which(rownames(cite_counts5 ) %in% adt.names5)
adt5 <- cite_counts5[idx5,]
counts6 <- Read10X(data.dir = "./rawdata/RNA_Colon_Day11")
cite_counts6 <- Read10X(data.dir = "./rawdata/CITE_Colon_Day11")
adt.names6 <- setdiff (rownames (cite_counts6),rownames (counts6))
idx6 <- which(rownames(cite_counts6 ) %in% adt.names6)
adt6 <- cite_counts6[idx6,]
counts7 <- Read10X(data.dir = "./rawdata/RNA_Colon_Day14")
cite_counts7 <- Read10X(data.dir = "./rawdata/CITE_Colon_Day14")
adt.names7 <- setdiff (rownames (cite_counts7),rownames (counts7))
idx7 <- which(rownames(cite_counts7 ) %in% adt.names7)
adt7 <- cite_counts7[idx7,]
counts8 <- Read10X(data.dir = "./rawdata/RNA_BM_Day14")
cite_counts8 <- Read10X(data.dir = "./rawdata/CITE_BM_Day14")
adt.names8 <- setdiff (rownames (cite_counts8),rownames (counts8))
idx8 <- which(rownames(cite_counts8 ) %in% adt.names8)
adt8 <- cite_counts8[idx8,]
scRNA01 = CreateSeuratObject(counts1,project="BBloodDay07")
scCITE01 = CreateAssayObject(adt1)
scRNA01[["ADT"]] <- scCITE01
scRNA02 = CreateSeuratObject(counts2,project="BBloodDay11")
scCITE02 = CreateAssayObject(adt2)
scRNA02[["ADT"]] <- scCITE02
scRNA03 = CreateSeuratObject(counts3,project="BBloodDay14")
scCITE03 = CreateAssayObject(adt3)
scRNA03[["ADT"]] <- scCITE03
scRNA04 = CreateSeuratObject(counts4,project="AColonDay04")
scCITE04 = CreateAssayObject(adt4)
scRNA04[["ADT"]] <- scCITE04
scRNA05 = CreateSeuratObject(counts5,project="AColonDay07")
scCITE05 = CreateAssayObject(adt5)
scRNA05[["ADT"]] <- scCITE05
scRNA06 = CreateSeuratObject(counts6,project="AColonDay11")
scCITE06 = CreateAssayObject(adt6)
scRNA06[["ADT"]] <- scCITE06
scRNA07 = CreateSeuratObject(counts7,project="AColonDay14")
scCITE07 = CreateAssayObject(adt7)
scRNA07[["ADT"]] <- scCITE07
scRNA08 = CreateSeuratObject(counts8,project="CBoneMarrowDay14")
scCITE08 = CreateAssayObject(adt8)
scRNA08[["ADT"]] <- scCITE08
# 合并RNA的Seurat对象,将所有Seurat对象合并到一个对象中
scRNA_list <- list(scRNA01,scRNA02,scRNA03,scRNA04,scRNA05,scRNA06,scRNA07,scRNA08)
scRNA <- merge(scRNA_list[[1]],
y=c(scRNA_list[[2]],scRNA_list[[3]],scRNA_list[[4]],scRNA_list[[5]],
scRNA_list[[6]],scRNA_list[[7]],scRNA_list[[8]]),
add.cell.ids= c("BloodDay07","BloodDay11","BloodDay14",
"ColonDay04","ColonDay07","ColonDay11","ColonDay14",
"BMDay14"))
table(scRNA@meta.data$orig.ident)
#AColonDay04 AColonDay07 AColonDay11 AColonDay14 BBloodDay07 BBloodDay11
#17529 15245 10256 16746 18830 21433
#BBloodDay14 CBoneMarrowDay14
#15625 20838
#将orig.ident从character转为factor
scRNA@meta.data$orig.ident <- as.factor(scRNA@meta.data$orig.ident)
levels(scRNA@meta.data$orig.ident)
#删除不需要的变量
rm(scRNA01,scRNA02,scRNA03,scRNA04,scRNA05,scRNA06,scRNA07,scRNA08)
rm(scCITE01,scCITE02,scCITE03,scCITE04,scCITE05,scCITE06,scCITE07,scCITE08)
rm(counts1,counts2,counts3,counts4,counts5,counts6,counts7,counts8)
rm(adt1,adt2,adt3,adt4,adt5,adt6,adt7,adt8)
rm(cite_counts1,cite_counts2,cite_counts3,cite_counts4,cite_counts5,cite_counts6,cite_counts7,cite_counts8)
rm(scRNA_list)
dim(scRNA)
#32589 136502
###### 1.2 批量读入RNA和蛋白(在genes.tsv顺序不一样时失效) #####
# 读入数据,使用目录向量合并
obj_dir <- c("./data/RNA_Blood_Day7","./data/RNA_Blood_Day11","./data/RNA_Blood_Day14",
"./data/RNA_Colon_Day4","./data/RNA_Colon_Day7","./data/RNA_Colon_Day11","./data/RNA_Colon_Day14",
"./data/RNA_BM_Day14")
names(obj_dir) = c("BBloodDay07","BBloodDay11","BBloodDay14",
"AColonDay04","AColonDay07","AColonDay11","AColonDay14",
"CBoneMarrowDay14")
counts <- Read10X(data.dir = obj_dir)
#读入citeseq的矩阵
cite_dir <- c("./data/CITE_Blood_Day7","./data/CITE_Blood_Day11","./data/CITE_Blood_Day14",
"./data/CITE_Colon_Day4","./data/CITE_Colon_Day7","./data/CITE_Colon_Day11","./data/CITE_Colon_Day14",
"./data/CITE_BM_Day14")
names(cite_dir) = c("BBloodDay07","BBloodDay11","BBloodDay14",
"AColonDay04","AColonDay07","AColonDay11","AColonDay14",
"CBoneMarrowDay14") #多样本整合的时候命名barcode-ID
cite_counts <- Read10X(data.dir = cite_dir)
#如报错Barcode file missing. Expecting barcodes.tsv.gz:需要把features.tsv改成genes.tsv
#寻找citeseq比转录组矩阵多出来的部分,这部分即为蛋白抗体信息
adt.names <- setdiff (rownames (cite_counts),rownames (counts))
idx <- which(rownames(cite_counts ) %in% adt.names)
adt <- cite_counts[idx,]
scRNA1 = CreateSeuratObject(counts,min.cells = 5) #创建Seurat对象,2GB,保留5个以上细胞表达的基因
adt_assay <- CreateAssayObject(counts = adt)
#Warning: Feature names cannot have underscores ('_'), replacing with dashes ('-')
scRNA1[["ADT"]] <- adt_assay
rownames(scRNA1[["ADT"]])
#注意:我们可以通过函数快速地在两个数据之间来回切换,以便后续的分析以及可视化
#DefaultAssay(scRNA) <- "RNA"
#DefaultAssay(scRNA) <- "ADT"
#scRNA@assays[["RNA"]]@counts#原始数据,就是我们读取那三个文件后获得的原始数据
#scRNA@assays[["RNA"]]@data#经过标准化的数据
#scRNA@assays[["RNA"]]@scale.data#经过标准化后执行高可变基因筛选并且归一化后的数据
rm(cite_counts)
rm(counts)
rm(adt)
rm(adt_assay)
dim(scRNA1)
#22398 136502
###### 1.3 添加metadata信息#####
table(scRNA@meta.data$orig.ident)
#AColonDay04 AColonDay07 AColonDay11 AColonDay14 BBloodDay07 BBloodDay11
#17529 15245 10256 16746 18830 21433
#BBloodDay14 CBoneMarrowDay14
#15625 20838
###### 1.3.1 添加site #####
#添加一列样本信息至meta.data$site,用于存储样本类型(区分是来自血/肠/骨髓的)
names(scRNA@meta.data)
#"orig.ident" "nCount_RNA" "nFeature_RNA" "nCount_ADT" "nFeature_ADT"
scRNA@meta.data$site <- scRNA@meta.data$orig.ident
levels(scRNA@meta.data$site)
#替换
level_BloodDay7 <- which(levels(scRNA@meta.data$site)=="BBloodDay07")
levels(scRNA@meta.data$site)[level_BloodDay7]<- "Blood"
level_BloodDay11 <- which(levels(scRNA@meta.data$site)=="BBloodDay11")
levels(scRNA@meta.data$site)[level_BloodDay11]<- "Blood"
level_BloodDay14 <- which(levels(scRNA@meta.data$site)=="BBloodDay14")
levels(scRNA@meta.data$site)[level_BloodDay14]<- "Blood"
level_ColonDay4 <- which(levels(scRNA@meta.data$site)=="AColonDay04")
levels(scRNA@meta.data$site)[level_ColonDay4]<- "Colon"
level_ColonDay7 <- which(levels(scRNA@meta.data$site)=="AColonDay07")
levels(scRNA@meta.data$site)[level_ColonDay7]<- "Colon"
level_ColonDay11 <- which(levels(scRNA@meta.data$site)=="AColonDay11")
levels(scRNA@meta.data$site)[level_ColonDay11]<- "Colon"
level_ColonDay14 <- which(levels(scRNA@meta.data$site)=="AColonDay14")
levels(scRNA@meta.data$site)[level_ColonDay14]<- "Colon"
level_BMDay14 <- which(levels(scRNA@meta.data$site)=="CBoneMarrowDay14")
levels(scRNA@meta.data$site)[level_BMDay14]<- "BM"
table(scRNA@meta.data$site)
#Colon Blood BM
#59776 55888 20838
###### 1.3.2 添加time #####
#添加一列样本信息至meta.data$time,用于存储样本类型(区分是来自DSS处理4/7/11还是处理14天的)
names(scRNA@meta.data)
#"orig.ident" "nCount_RNA" "nFeature_RNA" "nCount_ADT" "nFeature_ADT" "site"
scRNA@meta.data$time <- scRNA@meta.data$orig.ident
levels(scRNA@meta.data$time)
#替换
level_BloodDay7 <- which(levels(scRNA@meta.data$time)=="BBloodDay07")
levels(scRNA@meta.data$time)[level_BloodDay7]<- "Day7"
level_BloodDay11 <- which(levels(scRNA@meta.data$time)=="BBloodDay11")
levels(scRNA@meta.data$time)[level_BloodDay11]<- "Day11"
level_BloodDay14 <- which(levels(scRNA@meta.data$time)=="BBloodDay14")
levels(scRNA@meta.data$time)[level_BloodDay14]<- "Day14"
level_ColonDay4 <- which(levels(scRNA@meta.data$time)=="AColonDay04")
levels(scRNA@meta.data$time)[level_ColonDay4]<- "Day4"
level_ColonDay7 <- which(levels(scRNA@meta.data$time)=="AColonDay07")
levels(scRNA@meta.data$time)[level_ColonDay7]<- "Day7"
level_ColonDay11 <- which(levels(scRNA@meta.data$time)=="AColonDay11")
levels(scRNA@meta.data$time)[level_ColonDay11]<- "Day11"
level_ColonDay14 <- which(levels(scRNA@meta.data$time)=="AColonDay14")
levels(scRNA@meta.data$time)[level_ColonDay14]<- "Day14"
level_BMDay14 <- which(levels(scRNA@meta.data$time)=="CBoneMarrowDay14")
levels(scRNA@meta.data$time)[level_BMDay14]<- "Day14"
#查看修改后的metadata
table(scRNA@meta.data$time)
#Day4 Day7 Day11 Day14
#17529 34075 31689 53209
#修改metadata$orig.ident的名字
levels(scRNA@meta.data$orig.ident) <- c('ColonDay4', 'ColonDay7', 'ColonDay11','ColonDay14',
"BloodDay7", "BloodDay11", "BloodDay14",
"BMDay14")
table(scRNA@meta.data$orig.ident)
# ColonDay4 ColonDay7 ColonDay11 ColonDay14 BloodDay7 BloodDay11 BloodDay14 BMDay14
#17529 15245 10256 16746 18830 21433 15625 20838
###### 1.3.3 添加batch #####
names(scRNA@meta.data)
#"orig.ident" "nCount_RNA" "nFeature_RNA" "nCount_ADT" "nFeature_ADT" "site" "time"
scRNA@meta.data$batch <- scRNA@meta.data$orig.ident
levels(scRNA@meta.data$batch)
#替换
level_BloodDay7 <- which(levels(scRNA@meta.data$batch)=="BloodDay7")
levels(scRNA@meta.data$batch)[level_BloodDay7]<- "Batch1"
level_BloodDay11 <- which(levels(scRNA@meta.data$batch)=="BloodDay11")
levels(scRNA@meta.data$batch)[level_BloodDay11]<- "Batch1"
level_BloodDay14 <- which(levels(scRNA@meta.data$batch)=="BloodDay14")
levels(scRNA@meta.data$batch)[level_BloodDay14]<- "Batch2"
level_ColonDay4 <- which(levels(scRNA@meta.data$batch)=="ColonDay4")
levels(scRNA@meta.data$batch)[level_ColonDay4]<- "Batch2"
level_ColonDay7 <- which(levels(scRNA@meta.data$batch)=="ColonDay7")
levels(scRNA@meta.data$batch)[level_ColonDay7]<- "Batch1"
level_ColonDay11 <- which(levels(scRNA@meta.data$batch)=="ColonDay11")
levels(scRNA@meta.data$batch)[level_ColonDay11]<- "Batch1"
level_ColonDay14 <- which(levels(scRNA@meta.data$batch)=="ColonDay14")
levels(scRNA@meta.data$batch)[level_ColonDay14]<- "Batch2"
level_BMDay14 <- which(levels(scRNA@meta.data$batch)=="BMDay14")
levels(scRNA@meta.data$batch)[level_BMDay14]<- "Batch2"
table(scRNA@meta.data$batch)
#Batch2 Batch1
#70738 65764
#保存
saveRDS(scRNA, "./result/Alldata_new/Step1_merged_sample.rds")
2、数据质控
// 在python中进行
import sys
print(sys.path)#查看python安装位置
import os, sys
os.getcwd()
os.listdir(os.getcwd())
// 安装anaconda3,在annaconda中安装系列包
#运行环境:e:\Research\9.PXX\citeseq\code\.conda
#在anaconda/miniconda3中进入,输入conda env list查看环境,输入conda activate e:\Research\9.PXX\citeseq\code\envCITEseq进入当前环境
#然后可以用pip install指令安装python包
#pip install scrublet
#或pip install -i https://pypi.tuna.tsinghua.edu.cn/simple scrublet
#pip install -U omicverse
#pip install -U numba
#pip install torch-geometric
#pip install diopy
# 0.加载需要的包
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scanpy as sc
import torch
import omicverse as ov
import diopy
import infercnvpy as cnv
import mudata as md
import muon as mu
from muon import prot as pt
import tempfile
import anndata as ad
import mudata as md
import scvi
import seaborn as sns
sns.set_theme()
torch.set_float32_matmul_precision("high")
#1.读入数据
mdata = md.read('E:/Research/9.PXX/citeseq/result/Alldata_new/Step1_merged_sample.h5mu')
mdata
mdata["RNA"].obs = mdata.obs
mdata["ADT"].obs = mdata.obs
mdata
#提取RNA矩阵
adata = mdata["RNA"].copy()
adata
#提取蛋白矩阵
prot = mdata.mod['ADT'].copy()
prot
# Original QC plot
n0 = adata.shape[0]
print(f'Original cell number: {n0}')
#2.计算双细胞
sc.external.pp.scrublet(adata, expected_doublet_rate = 0.1,threshold=0.25,batch_key = "orig.ident")
#expected_doublet_rate,doublets的预期占比,通常为0.05-0.1,结果对该参数不是特别敏感
#看一下结果
adata.obs[['doublet_score', 'predicted_doublet']]
#导出结果
adata.obs[['doublet_score', 'predicted_doublet']].to_csv('E:/Research/9.PXX/citeseq/result/Alldata_new/scrublet_prediction_res.csv')
#保留单细胞
adata = adata[adata.obs['predicted_doublet'] == False] #obs 为细胞维度对象,var是基因维度对象
adata
n1 = adata.shape[0]
n1
#3、过滤低质量的细胞
# 线粒体基因
adata.var["mt"] = adata.var_names.str.startswith("mt-")
# 核糖体基因
adata.var["ribo"] = adata.var_names.str.startswith(("rps", "rpl"))
# 血红蛋白基因
adata.var["hb"] = adata.var_names.str.contains(("^Hb[^(p,e)]"))
#计算协变量
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
adata
adata.obs[['total_counts','n_genes_by_counts','pct_counts_mt','pct_counts_hb']]
#n_genes_by_counts/detected_genes: 一个细胞中发现的有效基因数量(即表达量不为0)
#total_counts/nUMIs: 一个细胞中发现的分子数量(UMI),通常也可以被认为是这个细胞的文库大小
#pct_counts_mt/mito_perc: 一个细胞中线粒体基因的表达计数占比
#绘制三个协变量的分布图
import matplotlib.pyplot as plt
mito_filter = 10
n_counts_filter = 5000
fig, axs = plt.subplots(ncols = 2, figsize = (8,4))
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt',ax = axs[0], show=False)
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts',ax = axs[1], show = False)
#draw horizontal red lines indicating thresholds.
axs[0].hlines(y = mito_filter, xmin = 0, xmax = max(adata.obs['total_counts']), color = 'red', ls = 'dashed')
axs[1].hlines(y = n_counts_filter, xmin = 0, xmax = max(adata.obs['total_counts']), color = 'red', ls = 'dashed')
fig.tight_layout()
plt.show()
#在scanpy的官方教程中,高计数的细胞被认为是双细胞进而过滤,所以我们绘制了两条红线,但在我们的教程中,对于双细胞我们将采用其他方法进行过滤,
#所以我们只需要对一些低表达的细胞进行质控,比如nUMI小于500的细胞,比如detected_gene小于250的细胞,线粒体基因的计数比例不超过15%。但这个过滤我们要在双细胞过滤完后再进行。
#手动过滤低质量读数的细胞
import numpy as np
tresh={'pct_counts_mt': 10, 'total_counts': 500, 'n_genes_by_counts': 250,'pct_counts_hb': 2.5}
adata.obs['passing_mt'] = adata.obs['pct_counts_mt'] < tresh['pct_counts_mt']
adata.obs['passing_nUMIs'] = adata.obs['total_counts'] > tresh['total_counts']
adata.obs['passing_ngenes'] = adata.obs['n_genes_by_counts'] > tresh['n_genes_by_counts']
adata.obs['passing_ngenes'] = adata.obs['n_genes_by_counts'] < 5000
adata.obs['passing_hb'] = adata.obs['pct_counts_hb'] < tresh['pct_counts_hb']
print(f'Lower treshold, total_counts: {tresh["total_counts"]}; filtered-out-cells: {n1-np.sum(adata.obs["passing_nUMIs"])}')
print(f'Lower treshold, n genes: {tresh["n_genes_by_counts"]}; filtered-out-cells: {n1-np.sum(adata.obs["passing_ngenes"])}')
print(f'Lower treshold, mito %: {tresh["pct_counts_mt"]}; filtered-out-cells: {n1-np.sum(adata.obs["passing_mt"])}')
print(f'Lower treshold, hb %: {tresh["pct_counts_hb"]}; filtered-out-cells: {n1-np.sum(adata.obs["passing_hb"])}')
#需要对数据取保留的细胞的交集
QC_test = (adata.obs['passing_mt']) & (adata.obs['passing_nUMIs']) & (adata.obs['passing_ngenes']) & (adata.obs['passing_hb'])
removed = QC_test.loc[lambda x : x == False]
print(f'Total cell filtered out with this last QC (and its chosen options): {n1-np.sum(QC_test)}')
adata_manual = adata[QC_test, :].copy()
n2 = adata_manual.shape[0]
# Store cleaned adata
print(f'Cells retained after scrublet and filtering: {n2}, {n0-n2} removed.')
#小提琴图
sc.pl.violin(adata_manual, ['total_counts','n_genes_by_counts','pct_counts_mt','pct_counts_hb',],
jitter=0.4, multi_panel=True)
#只保留至少有200个基因表达的细胞,只保留至少有5个细胞表达的基因
sc.pp.filter_cells(adata_manual, min_genes=200)
sc.pp.filter_genes(adata_manual, min_cells=5)
adata_manual
#创建mudata对象
# 确认两个anndata对象的细胞ID
prot_cells = prot.obs_names
adata_manual_cells = adata_manual.obs_names
# 获取交集细胞ID
common_cells = prot_cells.intersection(adata_manual_cells)
common_cells
# 对prot对象进行子集操作
prot_filtered = prot[common_cells, :]
prot_filtered
#整合成mudata
mdata = md.MuData({"RNA": adata_manual, "ADT": prot_filtered})
mdata
##保存
mdata.write("E:/Research/9.PXX/citeseq/result/Alldata_new/Step2_filtered.h5mu")
// Some code
Last updated