yjhatfdu2
2024-04-02 14:55:08 +08:00
clickhouse 造一天数据试试看,单机 64 核 epyc 256G ram
建表,目前试下来效率最高的表结构
create table test4
(
    time     datetime CODEC (DoubleDelta, LZ4),
    country  UInt8 CODEC (DoubleDelta, LZ4),
    province UInt8 CODEC (DoubleDelta, LZ4),
    city     UInt16 CODEC (DoubleDelta, LZ4),
    uid      UInt32
) engine = MergeTree() partition by toYYYYMMDD(time)
      order by (time, country, province, city) settings index_granularity = 65536;
先造 10 亿数据,分布在一天内
insert into test4
select dateAdd(second, number / 1000000000, toDateTime('2024-04-02'))
     , rand32() % 200
     , rand32(1) % 250
     , rand32(2) % 100
     , number + 1
from numbers(1000000000);
-- 然后扩增到 32 倍
insert into test4 select * from test4;
insert into test4 select * from test4;
insert into test4 select * from test4;
insert into test4 select * from test4;
insert into test4 select * from test4;
SELECT count(*)
FROM test4
Query id: a4a01197-a22b-4a0d-9747-26555229ff58
┌─────count()─┐
│ 32000000000 │
└─────────────┘
1 row in set. Elapsed: 0.004 sec. 
一共 320 亿
等后台 merge 完才 14.28 GiB 磁盘占用
楼主要的查询
WITH r AS
    (
        SELECT count() AS c
        FROM test4
        WHERE country = 100
        GROUP BY uid
    )
SELECT avg(c)
FROM r
Query id: c634e7a7-13fa-4d40-9f30-e6e43105ffe9
┌─avg(c)─┐
│     32 │
└────────┘
1 row in set. Elapsed: 0.168 sec. Processed 160.30 million rows, 801.18 MB (954.12 million rows/s., 4.77 GB/s.)
0.168 秒完成
这样看起来,一年的数据单机也问题不大
注意,不同的建表语句尤其是 CODEC 非常影响存储空间和性能