基础脚本更新--Seurat(V5) + 排污 + 去双细胞传参脚本

原创

追风少年i

修改于 2026-01-06 11:13:42

2390

作者，Evil Genius

这会儿到过年的这段时间，基本都是更新脚本。

也不知道为什么，可能有国内拆分试剂等不清楚的原因，单细胞数据的细胞数量虽然有了很多大的提升，但是数据质量却有所下降，导致排污和去除双细胞成了分析的必须。

今天我们就来更新一下这个内容，确保单细胞数据的准确性以及后续单细胞空间联合的可靠性。

还是封装传参类的脚本，首先准备文件，共5列内容，样本名称，数据路径，数据格式（10x or csv），样本分组、疾病分期，当然，大家最好把更多的信息放进来，比如性别，年龄等内容，信息越全越好。

然后是脚本设计，通常来讲，我们Seurat V5 + decontX + scDblFinder的组合,每个样本单独运行decontX + scDblFinder，然后进行harmony整合分析。

全流程脚本如下，推荐大家学一学封装，这样以后传参分析数据即可，不用一行一行的运行,更多的时间用在课题设计，生物学解读，阅读文献等方面。

脚本基本都测试过了，完全满足分析需求。

#!/usr/bin/env Rscript
####zhaoyunfei
####20260104

suppressPackageStartupMessages({
  library(Seurat)
  library(dplyr)
  library(harmony)
  library(future)
  library(celda)
  library(Matrix)
  library(ggplot2)
  library(patchwork)
  library(argparse)
  library(Matrix)
  library(tibble)
  library(SingleCellExperiment)
  library(scater)
  library(data.table)
  library(scDblFinder)
})


# ============================================================================
# 命令行参数解析
# ============================================================================

parser <- ArgumentParser(description = "单细胞RNA-seq分析管道 - Seurat V5 Harmony整合")

# 必需参数
parser$add_argument("--sample-file", type = "character", required = TRUE,
                   help = "样本信息文件路径 (制表符分隔)")
parser$add_argument("--output-dir", type = "character", default = "./results",
                   help = "输出目录")

# 输出参数
parser$add_argument("--output-prefix", type = "character", default = "scRNA",
                   help = "输出文件前缀")

# QC参数
parser$add_argument("--min-cells", type = "integer", default = 3,
                   help = "每个基因最少表达的细胞数")
parser$add_argument("--min-features", type = "integer", default = 200,
                   help = "每个细胞最少检测到的基因数")
parser$add_argument("--max-features", type = "integer", default = 6000,
                   help = "每个细胞最多检测到的基因数")
parser$add_argument("--max-mito", type = "double", default = 10,
                   help = "线粒体基因百分比阈值")
parser$add_argument("--min-counts", type = "integer", default = 500,
                   help = "每个细胞最少UMI数")

# 分析参数
parser$add_argument("--nfeatures", type = "integer", default = 2000,
                   help = "用于可变基因分析的特征数")
parser$add_argument("--npcs", type = "integer", default = 30,
                   help = "PCA主成分数")
parser$add_argument("--dims", type = "integer", default = 20,
                   help = "用于降维的PC数")
parser$add_argument("--resolution", type = "double", default = 0.5,
                   help = "聚类分辨率")

# Harmony参数
parser$add_argument("--harmony-vars", type = "character", default = "sample_id",
                   help = "Harmony整合的变量")

# 运行参数
parser$add_argument("--threads", type = "integer", default = 8,
                   help = "并行线程数")
parser$add_argument("--seed", type = "integer", default = 42,
                   help = "随机种子")

args <- parser$parse_args()

# ============================================================================
# 设置环境
# ============================================================================

cat("Seurat V5 Harmony整合管道\n")

cat("参数设置:\n")
for (arg_name in names(args)) {
  cat(sprintf("  %-20s: %s\n", arg_name, args[[arg_name]]))
}
cat("\n")

# 创建输出目录
if (!dir.exists(args$output_dir)) {
  dir.create(args$output_dir, recursive = TRUE)
}

# 设置并行计算
if (args$threads > 1) {
  plan("multicore", workers = args$threads)
  options(future.globals.maxSize = 30 * 1024^3)
}

set.seed(args$seed)

# ============================================================================
# 1. 读取样本信息
# ============================================================================

cat("步骤1: 读取样本信息\n")

sample_info <- read.delim(args$sample_file, stringsAsFactors = FALSE)
colnames(sample_info) <- c("sample_name", "sample_path", "data_format", "group", "stage")

cat("找到", nrow(sample_info), "个样本:\n")
print(sample_info)

# ============================================================================
# 2. 读取和处理单个样本
# ============================================================================

cat("步骤2: 读取样本数据\n")

read_single_sample <- function(sample_name, sample_path, data_format, group, stage) {
  cat(sprintf("读取样本: %s\n", sample_name))
  
  if (data_format == "10x") {
    # 读取10x数据
    data <- Read10X(data.dir = sample_path)
    counts <- as.matrix(data,'dgCMatrix')
    sce <- SingleCellExperiment(assays = list(counts = counts))
    qc_stats <- perCellQCMetrics(sce)
    keep_cells <- qc_stats$sum >= 500 & qc_stats$detected >= 200
    sce_filtered <- sce[, keep_cells]
    sce_filtered <- computeLibraryFactors(sce_filtered)
    sf <- sizeFactors(sce_filtered)
    if (any(sf <= 0)) {
          warning(sprintf("发现 %d 个细胞的size factors <= 0，正在修复...", 
                                    sum(sf <= 0)))
      sf[sf <= 0] <- min(sf[sf > 0])
        sizeFactors(sce_filtered) <- sf
    }
    decontX_res <- decontX(sce_filtered)
    data = assays(decontX_res)$decontXcounts
    obj <- CreateSeuratObject(
      counts = data,
      project = sample_name,
      min.cells = args$min_cells,
      min.features = args$min_features
    )
  } else if (data_format == "csv") {
    # 读取CSV数据
    expr_matrix <- read.csv(sample_path, row.names = 1,check.names = F)
    counts <- as(as.matrix(expr_matrix), "dgCMatrix")
    sce <- SingleCellExperiment(assays = list(counts = counts))
    qc_stats <- perCellQCMetrics(sce)
    keep_cells <- qc_stats$sum >= 500 & qc_stats$detected >= 200
    sce_filtered <- sce[, keep_cells]
    sce_filtered <- computeLibraryFactors(sce_filtered)
    sf <- sizeFactors(sce_filtered)
    if (any(sf <= 0)) {
    warning(sprintf("发现 %d 个细胞的size factors <= 0，正在修复...",
    sum(sf <= 0)))
    sf[sf <= 0] <- min(sf[sf > 0])
    sizeFactors(sce_filtered) <- sf
    }
   decontX_res <- decontX(sce_filtered)
   data = assays(decontX_res)$decontXcounts
   obj <- CreateSeuratObject(
     counts = expr_matrix,
     project = sample_name,
     min.cells = args$min_cells,
     min.features = args$min_features
   )
  } else {
    stop(sprintf("不支持的数据格式: %s", data_format))
  }
  
  # 添加元数据
  obj$sample_id <- sample_name
  obj$orig.ident <- sample_name
  obj$group <- group
  obj$stage <- stage
  
  # 计算QC指标
  obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-|^mt-")
  obj[["percent.ribo"]] <- PercentageFeatureSet(obj, pattern = "^RP[SL]|^Rp[sl]")
  
  # 重命名细胞
  obj <- RenameCells(obj, add.cell.id = sample_name)
  
  cat(sprintf("  细胞数: %d, 基因数: %d\n", ncol(obj), nrow(obj)))
  return(obj)
}

# 读取所有样本
seurat_list <- list()
for (i in 1:nrow(sample_info)) {
  row <- sample_info[i, ]
  obj <- read_single_sample(row$sample_name, row$sample_path, 
                           row$data_format, row$group, row$stage)
  seurat_list[[row$sample_name]] <- obj
}

# ============================================================================
# 3. 创建Seurat V5对象
# ============================================================================

cat("步骤3: 创建Seurat V5对象\n")
for (i in 1:length(seurat_list)){
  obj = seurat_list[[i]]
  sce <- as.SingleCellExperiment(seurat_list[[i]])
  sce <- scDblFinder(sce)
  obj$scDblFinder_score <- sce$scDblFinder.score
  obj$scDblFinder_class <- sce$scDblFinder.class
  obj_clean <- subset(obj, subset = scDblFinder_class == "singlet")
  seurat_list[[i]] = obj_clean
}

seurat_merged <- seurat_list[[1]]

if (length(seurat_list) > 1) {
    for (i in 2:length(seurat_list)) {
    sample_name <- names(seurat_list)[i]
    cat(sprintf("合并样本 %d/%d: %s\n", i, length(seurat_list), sample_name))
        seurat_merged <- merge(
    x = seurat_merged,
    y = seurat_list[[i]]
    )
      }
}

# 添加统一的元数据
metadata_df <- do.call(rbind, lapply(names(seurat_list), function(sample_name) {
                           obj <- seurat_list[[sample_name]]
                             
                             # 直接从元数据中获取（如果存在）
                             if ("group" %in% colnames(obj@meta.data) && "stage" %in% colnames(obj@meta.data)) {
                                 data.frame(
                                          row.names = colnames(obj),
                                                sample_id = sample_name,
                                                group = obj$group,
                                                  stage = obj$stage,
                                                  stringsAsFactors = FALSE
                                                  )
                             } else {
                                 # 如果元数据中没有，使用默认值
                                 n_cells <- ncol(obj)
                             data.frame(
                                      row.names = colnames(obj),
                                            sample_id = rep(sample_name, n_cells),
                                            group = rep("Unknown", n_cells),
                                              stage = rep("Unknown", n_cells),
                                              stringsAsFactors = FALSE
                                              )
                               }
}))

seurat_merged <- AddMetaData(seurat_merged, metadata = metadata_df)

cat("合并后总细胞数:", ncol(seurat_merged), "\n")
cat("合并后基因数:", nrow(seurat_merged), "\n")

# ============================================================================
# 4. 质量控制
# ============================================================================

cat("步骤4: 质量控制\n")

# 重新计算QC指标（为了确保一致）
seurat_merged[["percent.mt"]] <- PercentageFeatureSet(seurat_merged, pattern = "^MT-|^mt-")

original_cells <- ncol(seurat_merged)

# 过滤细胞
seurat_filtered <- subset(seurat_merged,
                         subset = nFeature_RNA > args$min_features &
                           nFeature_RNA < args$max_features &
                           nCount_RNA > args$min_counts &
                           percent.mt < args$max_mito)

cat("过滤前:", original_cells, "个细胞\n")
cat("过滤后:", ncol(seurat_filtered), "个细胞\n")
cat("过滤掉:", original_cells - ncol(seurat_filtered), "个细胞\n")

# QC可视化
qc_plots <- VlnPlot(seurat_filtered,
                    features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
                    group.by = "sample_id",
                    pt.size = 0.1,
                    ncol = 3)

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_QC_violin.pdf")),
       qc_plots, width = 15, height = 5)

# ============================================================================
# 5. 数据标准化和预处理
# ============================================================================

cat("步骤5: 数据标准化和预处理\n")

# 标准化
seurat_filtered <- NormalizeData(seurat_filtered)

# 寻找高变基因
seurat_filtered <- FindVariableFeatures(seurat_filtered,
                                       selection.method = "vst",
                                       nfeatures = args$nfeatures)

# 缩放数据
seurat_filtered <- ScaleData(seurat_filtered)

cat("找到", length(VariableFeatures(seurat_filtered)), "个高变基因\n")

# 高变基因可视化
var_plot <- VariableFeaturePlot(seurat_filtered)
top10 <- head(VariableFeatures(seurat_filtered), 10)
var_plot <- LabelPoints(plot = var_plot, points = top10, repel = TRUE)

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_variable_genes.pdf")),
       var_plot, width = 10, height = 8)

# ============================================================================
# 6. PCA降维
# ============================================================================

cat("步骤6: PCA降维\n")

# 运行PCA
seurat_filtered <- RunPCA(seurat_filtered,
                         features = VariableFeatures(object = seurat_filtered))

# 肘部图
elbow_plot <- ElbowPlot(seurat_filtered, ndims = 50)
ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_elbow_plot.pdf")),
       elbow_plot, width = 8, height = 6)

cat("PCA计算完成\n")

# ============================================================================
# 7. Seurat V5 Harmony整合
# ============================================================================

cat("步骤7: Seurat V5 Harmony整合\n")

cat("使用 IntegrateLayers 进行 Harmony 整合...\n")

# 关键步骤：为每个样本设置layer
seurat_filtered$orig.ident <- seurat_filtered$sample_id

# 执行Harmony整合
seurat_harmony <- IntegrateLayers(
  object = seurat_filtered,
  method = HarmonyIntegration,
  orig.reduction = "pca",
  new.reduction = "harmony",
  verbose = FALSE,
  assay = "RNA"
)

cat("Harmony整合完成\n")

# 检查整合结果
cat("可用降维方法:", names(seurat_harmony@reductions), "\n")

# ============================================================================
# 8. UMAP降维
# ============================================================================

cat("步骤8: UMAP降维\n")

# 基于Harmony整合后的空间运行UMAP
seurat_harmony <- RunUMAP(seurat_harmony,
                         reduction = "harmony",
                         dims = 1:min(args$dims, ncol(seurat_harmony[["harmony"]])),
                         reduction.name = "umap.harmony",
                         reduction.key = "UMAPHARMONY_")

# 可视化
umap_by_sample <- DimPlot(seurat_harmony,
                          reduction = "umap.harmony",
                          group.by = "sample_id",
                          pt.size = 0.5,
                          label = FALSE)

umap_by_group <- DimPlot(seurat_harmony,
                         reduction = "umap.harmony",
                         group.by = "group",
                         pt.size = 0.5,
                         label = FALSE)

umap_by_stage <- DimPlot(seurat_harmony,
                         reduction = "umap.harmony",
                         group.by = "stage",
                         pt.size = 0.5,
                         label = FALSE)

# 组合图
combined_umap <- umap_by_sample + umap_by_group + umap_by_stage

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_UMAP_harmony.pdf")),
       combined_umap, width = 15, height = 5)

# 按样本分面显示
faceted_umap <- DimPlot(seurat_harmony,
                        reduction = "umap.harmony",
                        group.by = "sample_id",
                        split.by = "sample_id",
                        ncol = min(4, length(unique(seurat_harmony$sample_id))),
                        pt.size = 0.5)

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_UMAP_by_sample.pdf")),
       faceted_umap,
       width = min(15, 4 * length(unique(seurat_harmony$sample_id))),
       height = 10)

# ============================================================================
# 9. 细胞聚类
# ============================================================================

cat("步骤9: 细胞聚类\n")

# 基于Harmony整合后的空间构建邻居图
seurat_harmony <- FindNeighbors(seurat_harmony,
                               reduction = "harmony",
                               dims = 1:min(args$dims, ncol(seurat_harmony[["harmony"]])))

# 聚类分析
seurat_harmony <- FindClusters(seurat_harmony, resolution = args$resolution)

# 聚类可视化
cluster_plot <- DimPlot(seurat_harmony,
                       reduction = "umap.harmony",
                       group.by = "seurat_clusters",
                       label = TRUE,
                       pt.size = 0.5) +
  ggtitle(paste("Clusters (resolution =", args$resolution, ")"))

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_clusters.pdf")),
       cluster_plot, width = 10, height = 8)

cat("找到", length(unique(seurat_harmony$seurat_clusters)), "个cluster\n")

# 聚类统计
cluster_stats <- as.data.frame(table(seurat_harmony$seurat_clusters))
colnames(cluster_stats) <- c("Cluster", "Cell_Count")
cluster_stats$Percentage <- round(cluster_stats$Cell_Count / sum(cluster_stats$Cell_Count) * 100, 2)

write.csv(cluster_stats,
          file.path(args$output_dir, paste0(args$output_prefix, "_cluster_stats.csv")),
          row.names = FALSE)

cat("聚类统计:\n")
print(cluster_stats)

# ============================================================================
# 10. 差异表达分析
# ============================================================================

cat("步骤10: 差异表达分析\n")

seurat_harmony = JoinLayers(seurat_harmony)

# 设置默认ident为cluster
Idents(seurat_harmony) <- "seurat_clusters"

# 寻找所有cluster的marker基因
cat("寻找marker基因...\n")
all_markers <- FindAllMarkers(seurat_harmony,
                             only.pos = TRUE,
                             min.pct = 0.25,
                             logfc.threshold = 0.25,
                             test.use = "wilcox")

# 保存所有marker基因
write.csv(all_markers,
          file.path(args$output_dir, paste0(args$output_prefix, "_all_markers.csv")),
          row.names = FALSE)

cat("找到", nrow(all_markers), "个marker基因\n")

# 提取每个cluster的前10个marker基因
top10_markers <- all_markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC)

write.csv(top10_markers,
          file.path(args$output_dir, paste0(args$output_prefix, "_top10_markers.csv")),
          row.names = FALSE)

# 热图可视化（显示每个cluster的前5个marker基因）
top5_markers <- all_markers %>%
  group_by(cluster) %>%
  top_n(n = 5, wt = avg_log2FC)

# 创建热图
if (nrow(top5_markers) > 0) {
  heatmap_genes <- unique(top5_markers$gene)
  if (length(heatmap_genes) > 0) {
    heatmap_plot <- DoHeatmap(seurat_harmony,
                             features = heatmap_genes,
                             group.by = "seurat_clusters",
                             size = 3) +
      theme(axis.text.y = element_text(size = 6))
    
    ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_marker_heatmap.pdf")),
           heatmap_plot,
           width = 12,
           height = max(8, length(heatmap_genes) * 0.15))
  }
}

# ============================================================================
# 11. 保存结果
# ============================================================================

cat("步骤11: 保存结果\n")

# 保存RDS文件
rds_file <- file.path(args$output_dir, paste0(args$output_prefix, "_harmony_integrated.rds"))
saveRDS(seurat_harmony, file = rds_file)
cat("保存RDS文件:", rds_file, "\n")

# 保存元数据
metadata <- seurat_harmony@meta.data
write.csv(metadata,
          file.path(args$output_dir, paste0(args$output_prefix, "_metadata.csv")),
          row.names = TRUE)

# 保存整合信息
integration_summary <- data.frame(
  Parameter = c("Samples", "Cells_Before_QC", "Cells_After_QC", 
                "Genes", "Clusters", "Resolution", "Integration_Method"),
  Value = c(nrow(sample_info),
            original_cells,
            ncol(seurat_harmony),
            nrow(seurat_harmony),
            length(unique(seurat_harmony$seurat_clusters)),
            args$resolution,
            "Harmony (IntegrateLayers)")
)

write.csv(integration_summary,
          file.path(args$output_dir, paste0(args$output_prefix, "_integration_summary.csv")),
          row.names = FALSE)

生活很好，有你更好。

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

数据分析

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

数据分析

#单细胞

#数据分析

登录后参与评论

0 条评论

热度

基础脚本更新--Seurat(V5) + 排污 + 去双细胞传参脚本

基础脚本更新--Seurat(V5) + 排污 + 去双细胞传参脚本

作者，Evil Genius

这会儿到过年的这段时间，基本都是更新脚本。

也不知道为什么，可能有国内拆分试剂等不清楚的原因，单细胞数据的细胞数量虽然有了很多大的提升，但是数据质量却有所下降，导致排污和去除双细胞成了分析的必须。

今天我们就来更新一下这个内容，确保单细胞数据的准确性以及后续单细胞空间联合的可靠性。

还是封装传参类的脚本，首先准备文件，共5列内容，样本名称，数据路径，数据格式（10x or csv），样本分组、疾病分期，当然，大家最好把更多的信息放进来，比如性别，年龄等内容，信息越全越好。

然后是脚本设计，通常来讲，我们Seurat V5 + decontX + scDblFinder的组合,每个样本单独运行decontX + scDblFinder，然后进行harmony整合分析。

全流程脚本如下，推荐大家学一学封装，这样以后传参分析数据即可，不用一行一行的运行,更多的时间用在课题设计，生物学解读，阅读文献等方面。

脚本基本都测试过了，完全满足分析需求。

生活很好，有你更好。

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐