首页
学习
活动
专区
圈层
工具
发布
社区首页 >专栏 >基础脚本更新--Seurat(V5) + 排污 + 去双细胞传参脚本

基础脚本更新--Seurat(V5) + 排污 + 去双细胞传参脚本

原创
作者头像
追风少年i
修改2026-01-06 11:13:42
修改2026-01-06 11:13:42
2390
举报

作者,Evil Genius

这会儿到过年的这段时间,基本都是更新脚本。

也不知道为什么,可能有国内拆分试剂等不清楚的原因,单细胞数据的细胞数量虽然有了很多大的提升,但是数据质量却有所下降,导致排污和去除双细胞成了分析的必须。

今天我们就来更新一下这个内容,确保单细胞数据的准确性以及后续单细胞空间联合的可靠性。

还是封装传参类的脚本,首先准备文件,共5列内容,样本名称,数据路径,数据格式(10x or csv),样本分组、疾病分期,当然,大家最好把更多的信息放进来,比如性别,年龄等内容,信息越全越好。

然后是脚本设计,通常来讲,我们Seurat V5 + decontX + scDblFinder的组合,每个样本单独运行decontX + scDblFinder,然后进行harmony整合分析。

全流程脚本如下,推荐大家学一学封装,这样以后传参分析数据即可,不用一行一行的运行,更多的时间用在课题设计,生物学解读,阅读文献等方面。

脚本基本都测试过了,完全满足分析需求。

代码语言:javascript
复制
#!/usr/bin/env Rscript
####zhaoyunfei
####20260104

suppressPackageStartupMessages({
  library(Seurat)
  library(dplyr)
  library(harmony)
  library(future)
  library(celda)
  library(Matrix)
  library(ggplot2)
  library(patchwork)
  library(argparse)
  library(Matrix)
  library(tibble)
  library(SingleCellExperiment)
  library(scater)
  library(data.table)
  library(scDblFinder)
})


# ============================================================================
# 命令行参数解析
# ============================================================================

parser <- ArgumentParser(description = "单细胞RNA-seq分析管道 - Seurat V5 Harmony整合")

# 必需参数
parser$add_argument("--sample-file", type = "character", required = TRUE,
                   help = "样本信息文件路径 (制表符分隔)")
parser$add_argument("--output-dir", type = "character", default = "./results",
                   help = "输出目录")

# 输出参数
parser$add_argument("--output-prefix", type = "character", default = "scRNA",
                   help = "输出文件前缀")

# QC参数
parser$add_argument("--min-cells", type = "integer", default = 3,
                   help = "每个基因最少表达的细胞数")
parser$add_argument("--min-features", type = "integer", default = 200,
                   help = "每个细胞最少检测到的基因数")
parser$add_argument("--max-features", type = "integer", default = 6000,
                   help = "每个细胞最多检测到的基因数")
parser$add_argument("--max-mito", type = "double", default = 10,
                   help = "线粒体基因百分比阈值")
parser$add_argument("--min-counts", type = "integer", default = 500,
                   help = "每个细胞最少UMI数")

# 分析参数
parser$add_argument("--nfeatures", type = "integer", default = 2000,
                   help = "用于可变基因分析的特征数")
parser$add_argument("--npcs", type = "integer", default = 30,
                   help = "PCA主成分数")
parser$add_argument("--dims", type = "integer", default = 20,
                   help = "用于降维的PC数")
parser$add_argument("--resolution", type = "double", default = 0.5,
                   help = "聚类分辨率")

# Harmony参数
parser$add_argument("--harmony-vars", type = "character", default = "sample_id",
                   help = "Harmony整合的变量")

# 运行参数
parser$add_argument("--threads", type = "integer", default = 8,
                   help = "并行线程数")
parser$add_argument("--seed", type = "integer", default = 42,
                   help = "随机种子")

args <- parser$parse_args()

# ============================================================================
# 设置环境
# ============================================================================

cat("Seurat V5 Harmony整合管道\n")

cat("参数设置:\n")
for (arg_name in names(args)) {
  cat(sprintf("  %-20s: %s\n", arg_name, args[[arg_name]]))
}
cat("\n")

# 创建输出目录
if (!dir.exists(args$output_dir)) {
  dir.create(args$output_dir, recursive = TRUE)
}

# 设置并行计算
if (args$threads > 1) {
  plan("multicore", workers = args$threads)
  options(future.globals.maxSize = 30 * 1024^3)
}

set.seed(args$seed)

# ============================================================================
# 1. 读取样本信息
# ============================================================================

cat("步骤1: 读取样本信息\n")

sample_info <- read.delim(args$sample_file, stringsAsFactors = FALSE)
colnames(sample_info) <- c("sample_name", "sample_path", "data_format", "group", "stage")

cat("找到", nrow(sample_info), "个样本:\n")
print(sample_info)

# ============================================================================
# 2. 读取和处理单个样本
# ============================================================================

cat("步骤2: 读取样本数据\n")

read_single_sample <- function(sample_name, sample_path, data_format, group, stage) {
  cat(sprintf("读取样本: %s\n", sample_name))
  
  if (data_format == "10x") {
    # 读取10x数据
    data <- Read10X(data.dir = sample_path)
    counts <- as.matrix(data,'dgCMatrix')
    sce <- SingleCellExperiment(assays = list(counts = counts))
    qc_stats <- perCellQCMetrics(sce)
    keep_cells <- qc_stats$sum >= 500 & qc_stats$detected >= 200
    sce_filtered <- sce[, keep_cells]
    sce_filtered <- computeLibraryFactors(sce_filtered)
    sf <- sizeFactors(sce_filtered)
    if (any(sf <= 0)) {
          warning(sprintf("发现 %d 个细胞的size factors <= 0,正在修复...", 
                                    sum(sf <= 0)))
      sf[sf <= 0] <- min(sf[sf > 0])
        sizeFactors(sce_filtered) <- sf
    }
    decontX_res <- decontX(sce_filtered)
    data = assays(decontX_res)$decontXcounts
    obj <- CreateSeuratObject(
      counts = data,
      project = sample_name,
      min.cells = args$min_cells,
      min.features = args$min_features
    )
  } else if (data_format == "csv") {
    # 读取CSV数据
    expr_matrix <- read.csv(sample_path, row.names = 1,check.names = F)
    counts <- as(as.matrix(expr_matrix), "dgCMatrix")
    sce <- SingleCellExperiment(assays = list(counts = counts))
    qc_stats <- perCellQCMetrics(sce)
    keep_cells <- qc_stats$sum >= 500 & qc_stats$detected >= 200
    sce_filtered <- sce[, keep_cells]
    sce_filtered <- computeLibraryFactors(sce_filtered)
    sf <- sizeFactors(sce_filtered)
    if (any(sf <= 0)) {
    warning(sprintf("发现 %d 个细胞的size factors <= 0,正在修复...",
    sum(sf <= 0)))
    sf[sf <= 0] <- min(sf[sf > 0])
    sizeFactors(sce_filtered) <- sf
    }
   decontX_res <- decontX(sce_filtered)
   data = assays(decontX_res)$decontXcounts
   obj <- CreateSeuratObject(
     counts = expr_matrix,
     project = sample_name,
     min.cells = args$min_cells,
     min.features = args$min_features
   )
  } else {
    stop(sprintf("不支持的数据格式: %s", data_format))
  }
  
  # 添加元数据
  obj$sample_id <- sample_name
  obj$orig.ident <- sample_name
  obj$group <- group
  obj$stage <- stage
  
  # 计算QC指标
  obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-|^mt-")
  obj[["percent.ribo"]] <- PercentageFeatureSet(obj, pattern = "^RP[SL]|^Rp[sl]")
  
  # 重命名细胞
  obj <- RenameCells(obj, add.cell.id = sample_name)
  
  cat(sprintf("  细胞数: %d, 基因数: %d\n", ncol(obj), nrow(obj)))
  return(obj)
}

# 读取所有样本
seurat_list <- list()
for (i in 1:nrow(sample_info)) {
  row <- sample_info[i, ]
  obj <- read_single_sample(row$sample_name, row$sample_path, 
                           row$data_format, row$group, row$stage)
  seurat_list[[row$sample_name]] <- obj
}

# ============================================================================
# 3. 创建Seurat V5对象
# ============================================================================

cat("步骤3: 创建Seurat V5对象\n")
for (i in 1:length(seurat_list)){
  obj = seurat_list[[i]]
  sce <- as.SingleCellExperiment(seurat_list[[i]])
  sce <- scDblFinder(sce)
  obj$scDblFinder_score <- sce$scDblFinder.score
  obj$scDblFinder_class <- sce$scDblFinder.class
  obj_clean <- subset(obj, subset = scDblFinder_class == "singlet")
  seurat_list[[i]] = obj_clean
}

seurat_merged <- seurat_list[[1]]

if (length(seurat_list) > 1) {
    for (i in 2:length(seurat_list)) {
    sample_name <- names(seurat_list)[i]
    cat(sprintf("合并样本 %d/%d: %s\n", i, length(seurat_list), sample_name))
        seurat_merged <- merge(
    x = seurat_merged,
    y = seurat_list[[i]]
    )
      }
}

# 添加统一的元数据
metadata_df <- do.call(rbind, lapply(names(seurat_list), function(sample_name) {
                           obj <- seurat_list[[sample_name]]
                             
                             # 直接从元数据中获取(如果存在)
                             if ("group" %in% colnames(obj@meta.data) && "stage" %in% colnames(obj@meta.data)) {
                                 data.frame(
                                          row.names = colnames(obj),
                                                sample_id = sample_name,
                                                group = obj$group,
                                                  stage = obj$stage,
                                                  stringsAsFactors = FALSE
                                                  )
                             } else {
                                 # 如果元数据中没有,使用默认值
                                 n_cells <- ncol(obj)
                             data.frame(
                                      row.names = colnames(obj),
                                            sample_id = rep(sample_name, n_cells),
                                            group = rep("Unknown", n_cells),
                                              stage = rep("Unknown", n_cells),
                                              stringsAsFactors = FALSE
                                              )
                               }
}))

seurat_merged <- AddMetaData(seurat_merged, metadata = metadata_df)

cat("合并后总细胞数:", ncol(seurat_merged), "\n")
cat("合并后基因数:", nrow(seurat_merged), "\n")

# ============================================================================
# 4. 质量控制
# ============================================================================

cat("步骤4: 质量控制\n")

# 重新计算QC指标(为了确保一致)
seurat_merged[["percent.mt"]] <- PercentageFeatureSet(seurat_merged, pattern = "^MT-|^mt-")

original_cells <- ncol(seurat_merged)

# 过滤细胞
seurat_filtered <- subset(seurat_merged,
                         subset = nFeature_RNA > args$min_features &
                           nFeature_RNA < args$max_features &
                           nCount_RNA > args$min_counts &
                           percent.mt < args$max_mito)

cat("过滤前:", original_cells, "个细胞\n")
cat("过滤后:", ncol(seurat_filtered), "个细胞\n")
cat("过滤掉:", original_cells - ncol(seurat_filtered), "个细胞\n")

# QC可视化
qc_plots <- VlnPlot(seurat_filtered,
                    features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
                    group.by = "sample_id",
                    pt.size = 0.1,
                    ncol = 3)

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_QC_violin.pdf")),
       qc_plots, width = 15, height = 5)

# ============================================================================
# 5. 数据标准化和预处理
# ============================================================================

cat("步骤5: 数据标准化和预处理\n")

# 标准化
seurat_filtered <- NormalizeData(seurat_filtered)

# 寻找高变基因
seurat_filtered <- FindVariableFeatures(seurat_filtered,
                                       selection.method = "vst",
                                       nfeatures = args$nfeatures)

# 缩放数据
seurat_filtered <- ScaleData(seurat_filtered)

cat("找到", length(VariableFeatures(seurat_filtered)), "个高变基因\n")

# 高变基因可视化
var_plot <- VariableFeaturePlot(seurat_filtered)
top10 <- head(VariableFeatures(seurat_filtered), 10)
var_plot <- LabelPoints(plot = var_plot, points = top10, repel = TRUE)

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_variable_genes.pdf")),
       var_plot, width = 10, height = 8)

# ============================================================================
# 6. PCA降维
# ============================================================================

cat("步骤6: PCA降维\n")

# 运行PCA
seurat_filtered <- RunPCA(seurat_filtered,
                         features = VariableFeatures(object = seurat_filtered))

# 肘部图
elbow_plot <- ElbowPlot(seurat_filtered, ndims = 50)
ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_elbow_plot.pdf")),
       elbow_plot, width = 8, height = 6)

cat("PCA计算完成\n")

# ============================================================================
# 7. Seurat V5 Harmony整合
# ============================================================================

cat("步骤7: Seurat V5 Harmony整合\n")

cat("使用 IntegrateLayers 进行 Harmony 整合...\n")

# 关键步骤:为每个样本设置layer
seurat_filtered$orig.ident <- seurat_filtered$sample_id

# 执行Harmony整合
seurat_harmony <- IntegrateLayers(
  object = seurat_filtered,
  method = HarmonyIntegration,
  orig.reduction = "pca",
  new.reduction = "harmony",
  verbose = FALSE,
  assay = "RNA"
)

cat("Harmony整合完成\n")

# 检查整合结果
cat("可用降维方法:", names(seurat_harmony@reductions), "\n")

# ============================================================================
# 8. UMAP降维
# ============================================================================

cat("步骤8: UMAP降维\n")

# 基于Harmony整合后的空间运行UMAP
seurat_harmony <- RunUMAP(seurat_harmony,
                         reduction = "harmony",
                         dims = 1:min(args$dims, ncol(seurat_harmony[["harmony"]])),
                         reduction.name = "umap.harmony",
                         reduction.key = "UMAPHARMONY_")

# 可视化
umap_by_sample <- DimPlot(seurat_harmony,
                          reduction = "umap.harmony",
                          group.by = "sample_id",
                          pt.size = 0.5,
                          label = FALSE)

umap_by_group <- DimPlot(seurat_harmony,
                         reduction = "umap.harmony",
                         group.by = "group",
                         pt.size = 0.5,
                         label = FALSE)

umap_by_stage <- DimPlot(seurat_harmony,
                         reduction = "umap.harmony",
                         group.by = "stage",
                         pt.size = 0.5,
                         label = FALSE)

# 组合图
combined_umap <- umap_by_sample + umap_by_group + umap_by_stage

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_UMAP_harmony.pdf")),
       combined_umap, width = 15, height = 5)

# 按样本分面显示
faceted_umap <- DimPlot(seurat_harmony,
                        reduction = "umap.harmony",
                        group.by = "sample_id",
                        split.by = "sample_id",
                        ncol = min(4, length(unique(seurat_harmony$sample_id))),
                        pt.size = 0.5)

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_UMAP_by_sample.pdf")),
       faceted_umap,
       width = min(15, 4 * length(unique(seurat_harmony$sample_id))),
       height = 10)

# ============================================================================
# 9. 细胞聚类
# ============================================================================

cat("步骤9: 细胞聚类\n")

# 基于Harmony整合后的空间构建邻居图
seurat_harmony <- FindNeighbors(seurat_harmony,
                               reduction = "harmony",
                               dims = 1:min(args$dims, ncol(seurat_harmony[["harmony"]])))

# 聚类分析
seurat_harmony <- FindClusters(seurat_harmony, resolution = args$resolution)

# 聚类可视化
cluster_plot <- DimPlot(seurat_harmony,
                       reduction = "umap.harmony",
                       group.by = "seurat_clusters",
                       label = TRUE,
                       pt.size = 0.5) +
  ggtitle(paste("Clusters (resolution =", args$resolution, ")"))

ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_clusters.pdf")),
       cluster_plot, width = 10, height = 8)

cat("找到", length(unique(seurat_harmony$seurat_clusters)), "个cluster\n")

# 聚类统计
cluster_stats <- as.data.frame(table(seurat_harmony$seurat_clusters))
colnames(cluster_stats) <- c("Cluster", "Cell_Count")
cluster_stats$Percentage <- round(cluster_stats$Cell_Count / sum(cluster_stats$Cell_Count) * 100, 2)

write.csv(cluster_stats,
          file.path(args$output_dir, paste0(args$output_prefix, "_cluster_stats.csv")),
          row.names = FALSE)

cat("聚类统计:\n")
print(cluster_stats)

# ============================================================================
# 10. 差异表达分析
# ============================================================================

cat("步骤10: 差异表达分析\n")

seurat_harmony = JoinLayers(seurat_harmony)

# 设置默认ident为cluster
Idents(seurat_harmony) <- "seurat_clusters"

# 寻找所有cluster的marker基因
cat("寻找marker基因...\n")
all_markers <- FindAllMarkers(seurat_harmony,
                             only.pos = TRUE,
                             min.pct = 0.25,
                             logfc.threshold = 0.25,
                             test.use = "wilcox")

# 保存所有marker基因
write.csv(all_markers,
          file.path(args$output_dir, paste0(args$output_prefix, "_all_markers.csv")),
          row.names = FALSE)

cat("找到", nrow(all_markers), "个marker基因\n")

# 提取每个cluster的前10个marker基因
top10_markers <- all_markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC)

write.csv(top10_markers,
          file.path(args$output_dir, paste0(args$output_prefix, "_top10_markers.csv")),
          row.names = FALSE)

# 热图可视化(显示每个cluster的前5个marker基因)
top5_markers <- all_markers %>%
  group_by(cluster) %>%
  top_n(n = 5, wt = avg_log2FC)

# 创建热图
if (nrow(top5_markers) > 0) {
  heatmap_genes <- unique(top5_markers$gene)
  if (length(heatmap_genes) > 0) {
    heatmap_plot <- DoHeatmap(seurat_harmony,
                             features = heatmap_genes,
                             group.by = "seurat_clusters",
                             size = 3) +
      theme(axis.text.y = element_text(size = 6))
    
    ggsave(file.path(args$output_dir, paste0(args$output_prefix, "_marker_heatmap.pdf")),
           heatmap_plot,
           width = 12,
           height = max(8, length(heatmap_genes) * 0.15))
  }
}

# ============================================================================
# 11. 保存结果
# ============================================================================

cat("步骤11: 保存结果\n")

# 保存RDS文件
rds_file <- file.path(args$output_dir, paste0(args$output_prefix, "_harmony_integrated.rds"))
saveRDS(seurat_harmony, file = rds_file)
cat("保存RDS文件:", rds_file, "\n")

# 保存元数据
metadata <- seurat_harmony@meta.data
write.csv(metadata,
          file.path(args$output_dir, paste0(args$output_prefix, "_metadata.csv")),
          row.names = TRUE)

# 保存整合信息
integration_summary <- data.frame(
  Parameter = c("Samples", "Cells_Before_QC", "Cells_After_QC", 
                "Genes", "Clusters", "Resolution", "Integration_Method"),
  Value = c(nrow(sample_info),
            original_cells,
            ncol(seurat_harmony),
            nrow(seurat_harmony),
            length(unique(seurat_harmony$seurat_clusters)),
            args$resolution,
            "Harmony (IntegrateLayers)")
)

write.csv(integration_summary,
          file.path(args$output_dir, paste0(args$output_prefix, "_integration_summary.csv")),
          row.names = FALSE)

生活很好,有你更好。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 作者,Evil Genius
  • 这会儿到过年的这段时间,基本都是更新脚本。
  • 也不知道为什么,可能有国内拆分试剂等不清楚的原因,单细胞数据的细胞数量虽然有了很多大的提升,但是数据质量却有所下降,导致排污和去除双细胞成了分析的必须。
  • 今天我们就来更新一下这个内容,确保单细胞数据的准确性以及后续单细胞空间联合的可靠性。
  • 还是封装传参类的脚本,首先准备文件,共5列内容,样本名称,数据路径,数据格式(10x or csv),样本分组、疾病分期,当然,大家最好把更多的信息放进来,比如性别,年龄等内容,信息越全越好。
  • 然后是脚本设计,通常来讲,我们Seurat V5 + decontX + scDblFinder的组合,每个样本单独运行decontX + scDblFinder,然后进行harmony整合分析。
  • 全流程脚本如下,推荐大家学一学封装,这样以后传参分析数据即可,不用一行一行的运行,更多的时间用在课题设计,生物学解读,阅读文献等方面。
  • 脚本基本都测试过了,完全满足分析需求。
  • 生活很好,有你更好。
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档