
ClickHouse 分析:PB 级查询性能
ClickHouse 可以在单台服务器上每秒扫描数十亿行。要实现这一性能,需要理解其与 OLTP 数据库根本不同的架构。
ClickHouse 为何快速
- 列式存储:仅读取查询涉及的列
- 压缩:相似值的压缩比行存储好 5-10 倍
- 向量化执行:使用 SIMD 指令一次处理 128 行
- MergeTree 颗粒:稀疏索引跳过无关的 8192 行块
- 并行执行:自动使用所有 CPU 核心

选择合适的引擎
-- ReplacingMergeTree:按主键去重
CREATE TABLE events (
event_id UUID,
user_id UInt64,
event_type LowCardinality(String),
created_at DateTime64(3),
version UInt64
)
ENGINE = ReplacingMergeTree(version)
PARTITION BY toYYYYMM(created_at)
ORDER BY (user_id, event_type, created_at);
-- AggregatingMergeTree:预聚合分析状态
CREATE TABLE hourly_stats (
hour DateTime,
user_id UInt64,
event_count AggregateFunction(count, UInt8),
total_amount AggregateFunction(sum, Decimal(18, 2))
)
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMM(hour)
ORDER BY (hour, user_id);
-- SummingMergeTree:合并时自动求和计数器
CREATE TABLE ad_metrics (
hour DateTime,
campaign_id UInt32,
impressions UInt64,
clicks UInt64,
spend Float64
)
ENGINE = SummingMergeTree((impressions, clicks, spend))
ORDER BY (hour, campaign_id);
ORDER BY 设计:最关键的决定
ORDER BY 是排序键,而非唯一性约束——它决定了查询性能:
-- 错误:高基数在前,无法按 user_id 跳过颗粒
ORDER BY (event_id, user_id, created_at)
-- 正确:从低基数到高基数
ORDER BY (user_id, event_type, created_at)
-- 按 user_id 查询可跳过 99%+ 的颗粒!

最佳数据类型
-- 使用适合数据的最小类型
CREATE TABLE optimized (
user_id UInt64, -- 8 字节
country LowCardinality(String), -- 字典编码
status UInt8, -- 1 字节
amount Decimal(18, 2), -- 足够精度
created_at DateTime64(3) -- 毫秒精度
);
-- 基准测试:String vs LowCardinality 用于 country(1 亿行)
-- String:15GB,LowCardinality:2GB,GROUP BY:快 15 倍
用于预聚合的物化视图
-- 每次 INSERT 自动填充聚合表
CREATE MATERIALIZED VIEW mv_hourly
TO hourly_stats AS
SELECT
toStartOfHour(created_at) AS hour,
user_id,
countState() AS event_count,
sumState(amount) AS total_amount
FROM events
GROUP BY hour, user_id;
-- 查询:比扫描原始事件快 1000 倍
SELECT
hour, user_id,
countMerge(event_count) AS events,
sumMerge(total_amount) AS revenue
FROM hourly_stats
WHERE hour >= now() - INTERVAL 24 HOUR
GROUP BY hour, user_id
ORDER BY revenue DESC;

查询优化模式
-- PREWHERE:在读取其他列之前先使用一列预过滤
SELECT user_id, amount
FROM events
PREWHERE event_type = 'purchase' -- 仅先读取 event_type
WHERE amount > 100; -- 然后应用 amount 过滤
-- 近似函数:2% 误差,快 10-50 倍
SELECT uniqHLL12(user_id) FROM events; -- 替代 COUNT(DISTINCT)
SELECT quantileTDigest(0.95)(response_ms) FROM requests; -- 替代精确 p95
-- 绝不使用 SELECT *:列式存储中会读取所有列
SELECT user_id, event_type, amount FROM events -- 仅指定需要的列
WHERE created_at > '2026-01-01';
分区与 TTL
-- 时间序列按月分区
CREATE TABLE logs (
timestamp DateTime,
level LowCardinality(String),
message String
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp)
ORDER BY (level, timestamp);
-- 自动删除旧数据
ALTER TABLE logs MODIFY TTL timestamp + INTERVAL 90 DAY;
-- 分层存储:热数据在 SSD,冷数据在 HDD
ALTER TABLE events MODIFY TTL
created_at + INTERVAL 30 DAY TO DISK 'ssd',
created_at + INTERVAL 1 YEAR TO DISK 'hdd';
分布式设置
-- 每个分片上的复制本地表
CREATE TABLE events_local ON CLUSTER my_cluster (...)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}')
...
-- 分布式表从所有分片读取
CREATE TABLE events ON CLUSTER my_cluster
AS events_local
ENGINE = Distributed(my_cluster, default, events_local, rand());
性能分析
-- 验证颗粒跳过是否生效
EXPLAIN indexes = 1
SELECT count() FROM events WHERE user_id = 123;
-- "Granules: 5/100000" = 跳过了 99.995% 的数据!
-- 查找慢查询
SELECT query, read_rows, memory_usage, query_duration_ms
FROM system.query_log
WHERE type = 'QueryFinish' AND query_start_time > now() - INTERVAL 1 HOUR
ORDER BY query_duration_ms DESC LIMIT 10;
关键洞察:ClickHouse 模式设计必须由查询模式驱动,而非规范化。积极反规范化,谨慎选择 ORDER BY,并使用物化视图处理常见聚合。