import warnings
import numpy as np
import pandas as pd
import scanpy as sc
import snapatac2 as snap
import polars as pl
warnings.filterwarnings(action="ignore")
snap.tl.call_peaks这个函数需要anndata对象中.obsm['insertion']和.uns['reference_sequences']两个数据去call peaks,但是atac_annot对象中没有,因此需要加进去。
dataset = snap.read_dataset("pbmc.h5ads")
atac_annot=sc.read("atac_annot.h5ad")
atac_annot.obsm['insertion']=dataset.adatas.obsm['insertion']
pbmc_5k = sc.read("pbmc_5k.h5ad")
atac_annot.uns['reference_sequences']=pbmc_5k.uns['reference_sequences']
del pbmc_5k
dataset.close()
callpeak
Use the callpeak
command in MACS2 to identify regions enriched with TN5 insertions. The parameters passed to MACS2 are: “-shift -100 -extsize 200 -nomodel -callsummits -nolambda -keep-dup all”
snap.tl.call_peaks(atac_annot, groupby="CellType",out_dir="tmp",key_added="Peaks_CellType")
peak_mat = snap.pp.make_peak_matrix(atac_annot, file="peak_matrix.h5ad",use_rep="Peaks_CellType")
peak_mat.uns['Peaks_CellType']=atac_annot.uns["Peaks_CellType"]
del atac_annot
identify marker regions for each cell type
tl.marker_regions
function aggregates signal across cells and utilizes z-scores to identify specifically enriched peaks.
Aggregate values in adata.X in a row-wise fashion. This is used to compute RPKM or RPM values stratified by user-provided groupings.
tl.marker_regions
这个函数对每类细胞进行summarize(R中的函数),aggregates(python中的聚合),再对的到的matrix归一化,画热图
marker_peaks = snap.tl.marker_regions(peak_mat, groupby="CellType", pvalue=0.01)
snap.pl.regions(peak_mat, groupby="CellType", peaks=marker_peaks, interactive=False,show=True)
Identify enriched transcription factor motifs.
genome_fasta参数,必须是绝对路径,没有压缩的fasta格式的基因组文件
motifs = snap.tl.motif_enrichment(
motifs=snap.datasets.cis_bp(unique=True),
regions=marker_peaks,
genome_fasta="/User/victor/DataHub/Genomics/GENCODE/GRCh38.primary_assembly.genome.fa",
)
snap.pl.motif_enrichment(enrichment=motifs,min_log_fc=1, max_fdr=0.0001, height=1200, interactive=False)
group1 = "Naive B"
group2 = "Memory B"
naive_B = np.array(peak_mat.obs['CellType'] == group1)
memory_B = np.array(peak_mat.obs['CellType'] == group2)
peaks_selected = np.logical_or(
peak_mat.uns["Peaks_CellType"][group1].to_numpy(),
peak_mat.uns["Peaks_CellType"][group2].to_numpy(),
)
cell_group1和cell_group2这两个参数一定要是np.array否则报错。
diff_peaks = snap.tl.diff_test(
data=peak_mat,
cell_group1=naive_B,
cell_group2=memory_B,
features=peaks_selected,
direction="both"
min_log_fc=0.25,
min_pct=0.05
)
diff_peaks = diff_peaks.filter(pl.col('p-value') < 0.01)
diff_peaks
shape: (490, 4)
┌───────────────────────────┬───────────────────┬──────────┬──────────────────┐
│ feature name ┆ log2(fold_change) ┆ p-value ┆ adjusted p-value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 │
╞═══════════════════════════╪═══════════════════╪══════════╪══════════════════╡
│ chr1:40152971-40153472 ┆ -1.616357 ┆ 0.000054 ┆ 0.263484 │
│ chr13:110593401-110593902 ┆ 1.445044 ┆ 0.000052 ┆ 0.263484 │
│ chr16:11768789-11769290 ┆ -2.19686 ┆ 0.00004 ┆ 0.263484 │
│ chr11:59135818-59136319 ┆ 0.781031 ┆ 0.000163 ┆ 0.264818 │
│ … ┆ … ┆ … ┆ … │
│ chr9:35619345-35619846 ┆ 0.501729 ┆ 0.00957 ┆ 0.295918 │
│ chr9:94639596-94640097 ┆ 0.418962 ┆ 0.009847 ┆ 0.295918 │
│ chr9:99140650-99141151 ┆ 0.814029 ┆ 0.009244 ┆ 0.295918 │
│ chrX:10121779-10122280 ┆ 0.948257 ┆ 0.009888 ┆ 0.295918 │
└───────────────────────────┴───────────────────┴──────────┴──────────────────┘
snap.pl.regions(
peak_mat,
groupby = 'CellType',
peaks = {
group1: diff_peaks.filter(pl.col("log2(fold_change)") > 0)['feature name'].to_numpy(),
group2: diff_peaks.filter(pl.col("log2(fold_change)") < 0)['feature name'].to_numpy(),
},
interactive = False,
)
从非memory_B的细胞类型中各挑选100个细胞组成背景(background),通过差异分析找出memory_B相对于背景的特异的peaks
barcodes = np.array(peak_mat.obs_names)
background = []
for i in np.unique(peak_mat.obs['CellType']):
if i != group2:
cells = np.random.choice(
barcodes[peak_mat.obs['CellType'] == i],
size = 100,
replace = False,
)
background.append(cells)
background = np.concatenate(background)
diff_peaks = snap.tl.diff_test(
peak_mat,
cell_group1 = memory_B,
cell_group2 = background,
features = peak_mat.uns["Peaks_CellType"][group2].to_numpy(),
direction = "positive",
)
diff_peaks = diff_peaks.filter(pl.col('p-value') < 0.01)
diff_peaks
shape: (5572, 4)
┌──────────────────────────┬───────────────────┬──────────┬──────────────────┐
│ feature name ┆ log2(fold_change) ┆ p-value ┆ adjusted p-value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 │
╞══════════════════════════╪═══════════════════╪══════════╪══════════════════╡
│ chr17:32252971-32253472 ┆ 1.046856 ┆ 0.000089 ┆ 0.138804 │
│ chr19:6459579-6460080 ┆ 0.994567 ┆ 0.000108 ┆ 0.138804 │
│ chr19:40529039-40529540 ┆ 0.48354 ┆ 0.000149 ┆ 0.138804 │
│ chr3:155870796-155871297 ┆ 0.606834 ┆ 0.000148 ┆ 0.138804 │
│ … ┆ … ┆ … ┆ … │
│ chr1:1629312-1629813 ┆ 0.283976 ┆ 0.986251 ┆ 0.986782 │
│ chr2:111898593-111899094 ┆ 0.392095 ┆ 0.987453 ┆ 0.987807 │
│ chr18:13562387-13562888 ┆ 0.252019 ┆ 0.996619 ┆ 0.996798 │
│ chr19:45667913-45668414 ┆ 0.31387 ┆ 0.999187 ┆ 0.999187 │
└──────────────────────────┴───────────────────┴──────────┴──────────────────┘
snap.pl.regions(
peak_mat,
groupby = 'CellType',
peaks = {
group2: diff_peaks['feature name'].to_numpy(),
},
interactive = False,
)
peak_mat.close()
├── 10x-Multiome-Pbmc10k-RNA.h5ad
├── ATAC.01.pp.ipynb
├── ATAC.02.annot.ipynb
├── ATAC.03.DAR.ipynb
├── atac_annot.h5ad
├── data
│ ├── pbmc_10k
│ │ ├── fragments.tsv.gz
│ │ └── web_summary.html
│ └── pbmc_5k
│ ├── fragments.tsv.gz
│ └── web_summary.html
├── pbmc.h5ads
├── pbmc_10k.h5ad
├── pbmc_5k.h5ad
├── peak_matrix.h5ad
├── quantify.sh.log
└── tmp
├── CD14 Mono.NarrowPeak.gz
├── CD14 Mono_insertion.bed.gz
├── CD4 Naive.NarrowPeak.gz
├── CD4 Naive_insertion.bed.gz
├── Intermediate B.NarrowPeak.gz
├── Intermediate B_insertion.bed.gz
├── MAIT.NarrowPeak.gz
├── MAIT_insertion.bed.gz
├── Memory B.NarrowPeak.gz
├── Memory B_insertion.bed.gz
├── NK.NarrowPeak.gz
├── NK_insertion.bed.gz
├── Naive B.NarrowPeak.gz
├── Naive B_insertion.bed.gz
├── Treg.NarrowPeak.gz
├── Treg_insertion.bed.gz
├── cDC.NarrowPeak.gz
├── cDC_insertion.bed.gz
├── pDC.NarrowPeak.gz
└── pDC_insertion.bed.gz