
ClickHouse 是一款列式存储的分布式 OLAP 数据库,其核心设计围绕高性能分析查询优化:
PRIMARY KEY 创建跳数索引,加速数据定位。Distributed 表引擎实现集群查询。GROUP BY、JOIN 等复杂聚合。INNER/LEFT JOIN 而非全连接。ReplacingMergeTree 引擎去重。
TTL 自动过期旧数据。
windowFunnel 函数追踪玩家转化路径。
-- 创建分布式表(本地表 + 分布式表)
CREATE TABLE logs_local (
timestamp DateTime,
user_id UInt64,
event_type String,
device String
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/logs', '{replica}')
PARTITION BY toYYYYMM(timestamp)
ORDER BY (user_id, timestamp);
CREATE TABLE logs_dist AS logs_local
ENGINE = Distributed(cluster_name, default, logs_local, rand());-- 实时计算每5分钟活跃用户数
SELECT
toStartOfFiveMinute(timestamp) AS time_window,
uniq(user_id) AS active_users
FROM logs_dist
WHERE timestamp > now() - INTERVAL 1 HOUR
GROUP BY time_window
ORDER BY time_window;# 并行导入CSV数据
cat data.csv | clickhouse-client \
--query="INSERT INTO logs_dist FORMAT CSV" \
--max_insert_block_size=100000-- 计算7日留存率
WITH
first_day_users AS (
SELECT user_id
FROM logs_dist
WHERE toDate(timestamp) = '2023-01-01'
),
retained_users AS (
SELECT DISTINCT user_id
FROM logs_dist
WHERE toDate(timestamp) BETWEEN '2023-01-07' AND '2023-01-08'
AND user_id IN first_day_users
)
SELECT
count() AS day1_users,
(SELECT count() FROM retained_users) AS day7_retained,
day7_retained / day1_users AS retention_rate
FROM first_day_users;PARTITION BY toYYYYMM())加速范围查询。ORDER BY 末尾,低基数字段靠前。MaterializedView 预计算常用指标。JBOD 或 S3 引擎归档历史数据。ZooKeeper 负载,大规模集群建议分片数量控制在 100 以内。EXPLAIN 分析查询计划,重点关注 ReadRows 和 BytesRead 指标。