假期这几天去骑了车子,风景美如画!~🚴
各位假期都去哪里玩了呀,人山人海吗?!😂
今天是python
的scanpy
,还是要学起来这个了。😜
看现在的趋势,未来python
可能才是未来,R
再不做出改变就要被取代了。🙂↕️
import os
import scanpy as sc
import anndata as ad
import scrublet # 直接导入 Scrublet
sc.settings.set_figure_params(dpi=50, facecolor="white")
这里我发现联网读取不了原始数据,我就进行了本地读取,原教程中好像代码有写问题,这里我做了修改,写了注释,大家可以试一试!~😘
# 定义样本信息(文件名与本地路径关联)
samples = {
"s1d1": "s1d1_filtered_feature_bc_matrix.h5",
"s1d3": "s1d3_filtered_feature_bc_matrix.h5",
}
adatas = {}
for sample_id, filename in samples.items():
# 构建本地文件路径:scanpy_data/文件名
local_path = os.path.join("scanpy_data", filename)
# 读取 10x Genomics H5 文件
sample_adata = sc.read_10x_h5(local_path)
# 确保基因名唯一(解决单样本内基因名重复问题)
sample_adata.var_names_make_unique()
# 存储到字典
adatas[sample_id] = sample_adata
# 合并所有样本的 AnnData 对象(移除 `keys` 参数)
adata = ad.concat(
adatas, # 直接传入字典,自动使用字典键作为样本标识
label="sample", # 在 obs 中添加样本来源列
join="inner" # 仅保留所有样本共有的基因(解决跨样本基因名不一致问题)
)
# 确保观测名(细胞条形码)唯一(解决跨样本细胞名重复问题)
adata.obs_names_make_unique()
# 打印各样本细胞数统计
print("各样本细胞数量统计:")
print(adata.obs["sample"].value_counts())
# 返回合并后的 AnnData 对象(可用于后续分析)
adata
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)
可视化质控指标!~😜
sc.pl.violin(
adata,
["n_genes_by_counts", "total_counts", "pct_counts_mt"],
jitter=0.4,
multi_panel=True,
)
再来一张!~🫠
sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")
过滤细胞和基因!~🧐
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=3)
这里我发现代码也不太对,运行不下去了。💀
下面是我重新写的,亲测有效,有注释!~💪
# 确保 adata.obs 中存在 'predicted_doublet' 列,初始化为 False
adata.obs["predicted_doublet"] = False
for sample in adata.obs["sample"].unique():
# 提取单个样本数据
sample_mask = adata.obs["sample"] == sample
adata_sample = adata[sample_mask].copy()
# 运行 Scrublet
scrub = scrublet.Scrublet(adata_sample.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
# 确保 predicted_doublets 是布尔数组
predicted_doublets = predicted_doublets.astype(bool)
# 回填结果到原 adata(直接通过布尔索引操作)
adata.obs.loc[sample_mask, "doublet_score"] = doublet_scores
adata.obs.loc[sample_mask, "predicted_doublet"] = predicted_doublets
# 过滤双细胞(确保列存在且为布尔类型)
adata = adata[~adata.obs["predicted_doublet"], :].copy()
# Saving count data
adata.layers["counts"] = adata.X.copy()
# Normalizing to median total counts
sc.pp.normalize_total(adata)
# Logarithmize the data
sc.pp.log1p(adata)
这里是top 2000
!~😘
sc.pp.highly_variable_genes(adata, n_top_genes=2000, batch_key="sample")
sc.pl.highly_variable_genes(adata)
肘图!~😘
sc.tl.pca(adata)
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)
sc.pl.pca(
adata,
color=["sample", "sample", "pct_counts_mt", "pct_counts_mt"],
dimensions=[(0, 1), (2, 3), (0, 1), (2, 3)],
ncols=2,
size=2,
)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.pl.umap(
adata,
color="sample",
# Setting a smaller point size to get prevent overlap
size=2,
)
# Leiden 聚类(修正参数)
sc.tl.leiden(adata, resolution=0.5)
# UMAP 计算与可视化
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50)
sc.tl.umap(adata)
sc.pl.umap(adata, color=["leiden"])
sc.pl.umap(
adata,
color=["leiden", "predicted_doublet", "doublet_score"],
# increase horizontal space between panels
wspace=0.5,
size=3,
)
sc.pl.umap(
adata,
color=["leiden", "log1p_total_counts", "pct_counts_mt", "log1p_n_genes_by_counts"],
wspace=0.5,
ncols=2,
)
from leidenalg import RBConfigurationVertexPartition
# 运行不同分辨率的 Leiden 聚类
for res in [0.02, 0.5, 2.0]:
sc.tl.leiden(
adata,
key_added=f"leiden_res_{res:.2f}",
resolution=res,
partition_type=RBConfigurationVertexPartition
)
sc.pl.umap(
adata,
color=["leiden_res_0.02", "leiden_res_0.50", "leiden_res_2.00"],
legend_loc="on data",
)
marker_genes = {
"CD14+ Mono": ["FCN1", "CD14"],
"CD16+ Mono": ["TCF7L2", "FCGR3A", "LYN"],
# Note: DMXL2 should be negative
"cDC2": ["CST3", "COTL1", "LYZ", "DMXL2", "CLEC10A", "FCER1A"],
"Erythroblast": ["MKI67", "HBA1", "HBB"],
# Note HBM and GYPA are negative markers
"Proerythroblast": ["CDK6", "SYNGR1", "HBM", "GYPA"],
"NK": ["GNLY", "NKG7", "CD247", "FCER1G", "TYROBP", "KLRG1", "FCGR3A"],
"ILC": ["ID2", "PLCG2", "GNLY", "SYNE1"],
"Naive CD20+ B": ["MS4A1", "IL4R", "IGHD", "FCRL1", "IGHM"],
# Note IGHD and IGHM are negative markers
"B cells": [
"MS4A1",
"ITGB1",
"COL4A4",
"PRDM1",
"IRF4",
"PAX5",
"BCL11A",
"BLK",
"IGHD",
"IGHM",
],
"Plasma cells": ["MZB1", "HSP90B1", "FNDC3B", "PRDM1", "IGKC", "JCHAIN"],
# Note PAX5 is a negative marker
"Plasmablast": ["XBP1", "PRDM1", "PAX5"],
"CD4+ T": ["CD4", "IL7R", "TRBC2"],
"CD8+ T": ["CD8A", "CD8B", "GZMK", "GZMA", "CCL5", "GZMB", "GZMH", "GZMA"],
"T naive": ["LEF1", "CCR7", "TCF7"],
"pDC": ["GZMB", "IL3RA", "COBLL1", "TCF4"],
}
sc.pl.dotplot(adata, marker_genes, groupby="leiden_res_0.02", standard_scale="var")
adata.obs["cell_type_lvl1"] = adata.obs["leiden_res_0.02"].map(
{
"0": "Lymphocytes",
"1": "Monocytes",
"2": "Erythroid",
"3": "B Cells",
}
)
sc.pl.dotplot(adata, marker_genes, groupby="leiden_res_0.50", standard_scale="var")
# Obtain cluster-specific differentially expressed genes
sc.tl.rank_genes_groups(adata, groupby="leiden_res_0.50", method="wilcoxon")
sc.pl.rank_genes_groups_dotplot(
adata, groupby="leiden_res_0.50", standard_scale="var", n_genes=5
)
看一下Cluster 7
的高表达基因,应该是个NK细胞
。🙊
sc.get.rank_genes_groups_df(adata, group="7").head(5)
可视化一下Cluster 7
的高表达基因。🧬
dc_cluster_genes = sc.get.rank_genes_groups_df(adata, group="7").head(5)["names"]
sc.pl.umap(
adata,
color=[*dc_cluster_genes, "leiden_res_0.50"],
legend_loc="on data",
frameon=False,
ncols=3,
)
最后祝大家早日不卷!~
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有