ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS)。关于ClickHouse是特性,[4]官方文档有详细说明,不再赘述。简单理解,一个字“快”,两个字“很快”,高性能是其最大的卖点,[5]有ClickHouse与各种分析性数据库的性能对比,从各大公司的实际使用情况看,相比Hive等有几十、几百倍的性能提升。本文通过docker-compose的方式在单机上模拟多机分布式环境,试玩ClickHouse的基本操作,对ClickHouse有一个基本认识。
值得注意的是,ClickHouse也有一些缺陷[4]。
docker-compose配置放置在github上clickhouse-playgrounds。
git clone https://github.com/lzshlzsh/clickhouse-playgrounds.git
cd clickhouse-playgrounds/playground-docker-compose
docker-compose up -d
控制台1
docker-compose run --no-deps --rm client
启动了6个节点的集群cluster_3shards_2replicas
,共3个分片,每个分片2备份。在client命令行运行如下sql。
select * from system.clusters;
┌─cluster───────────────────────────┬─shard_num─┬─shard_weight─┬─replica_num─┬─host_name─┬─host_address─┬─port─┬─is_local─┬─user────┬─default_database─┬─errors_count─┬─estimated_recovery_time─┐
│ cluster_3shards_2replicas │ 1 │ 1 │ 1 │ ck1 │ 172.18.0.7 │ 9000 │ 1 │ default │ │ 0 │ 0 │
│ cluster_3shards_2replicas │ 1 │ 1 │ 2 │ ck2 │ 172.18.0.8 │ 9000 │ 0 │ default │ │ 0 │ 0 │
│ cluster_3shards_2replicas │ 2 │ 1 │ 1 │ ck3 │ 172.18.0.4 │ 9000 │ 0 │ default │ │ 0 │ 0 │
│ cluster_3shards_2replicas │ 2 │ 1 │ 2 │ ck4 │ 172.18.0.6 │ 9000 │ 0 │ default │ │ 0 │ 0 │
│ cluster_3shards_2replicas │ 3 │ 1 │ 1 │ ck5 │ 172.18.0.2 │ 9000 │ 0 │ default │ │ 0 │ 0 │
│ cluster_3shards_2replicas │ 3 │ 1 │ 2 │ ck6 │ 172.18.0.5 │ 9000 │ 0 │ default │ │ 0 │ 0 │
docker-compose down -v
导入OnTime数据[7],这里只导入了1988、1989
两年的数据
控制台2
cd clickhouse-playgrounds
mkdir -p test/ontime
cd test/ontime
for s in {1988..1989}
do
for m in {1..12}
do
wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip
done
done
控制台1
的ClickHouse client上命令行操作CREATE TABLE `ontime` (
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`UniqueCarrier` FixedString(7),
`AirlineID` Int32,
`Carrier` FixedString(2),
`TailNum` String,
`FlightNum` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
`Origin` FixedString(5),
`OriginCityName` String,
`OriginState` FixedString(2),
`OriginStateFips` String,
`OriginStateName` String,
`OriginWac` Int32,
`DestAirportID` Int32,
`DestAirportSeqID` Int32,
`DestCityMarketID` Int32,
`Dest` FixedString(5),
`DestCityName` String,
`DestState` FixedString(2),
`DestStateFips` String,
`DestStateName` String,
`DestWac` Int32,
`CRSDepTime` Int32,
`DepTime` Int32,
`DepDelay` Int32,
`DepDelayMinutes` Int32,
`DepDel15` Int32,
`DepartureDelayGroups` String,
`DepTimeBlk` String,
`TaxiOut` Int32,
`WheelsOff` Int32,
`WheelsOn` Int32,
`TaxiIn` Int32,
`CRSArrTime` Int32,
`ArrTime` Int32,
`ArrDelay` Int32,
`ArrDelayMinutes` Int32,
`ArrDel15` Int32,
`ArrivalDelayGroups` Int32,
`ArrTimeBlk` String,
`Cancelled` UInt8,
`CancellationCode` FixedString(1),
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Int32,
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
`CarrierDelay` Int32,
`WeatherDelay` Int32,
`NASDelay` Int32,
`SecurityDelay` Int32,
`LateAircraftDelay` Int32,
`FirstDepTime` String,
`TotalAddGTime` String,
`LongestAddGTime` String,
`DivAirportLandings` String,
`DivReachedDest` String,
`DivActualElapsedTime` String,
`DivArrDelay` String,
`DivDistance` String,
`Div1Airport` String,
`Div1AirportID` Int32,
`Div1AirportSeqID` Int32,
`Div1WheelsOn` String,
`Div1TotalGTime` String,
`Div1LongestGTime` String,
`Div1WheelsOff` String,
`Div1TailNum` String,
`Div2Airport` String,
`Div2AirportID` Int32,
`Div2AirportSeqID` Int32,
`Div2WheelsOn` String,
`Div2TotalGTime` String,
`Div2LongestGTime` String,
`Div2WheelsOff` String,
`Div2TailNum` String,
`Div3Airport` String,
`Div3AirportID` Int32,
`Div3AirportSeqID` Int32,
`Div3WheelsOn` String,
`Div3TotalGTime` String,
`Div3LongestGTime` String,
`Div3WheelsOff` String,
`Div3TailNum` String,
`Div4Airport` String,
`Div4AirportID` Int32,
`Div4AirportSeqID` Int32,
`Div4WheelsOn` String,
`Div4TotalGTime` String,
`Div4LongestGTime` String,
`Div4WheelsOff` String,
`Div4TailNum` String,
`Div5Airport` String,
`Div5AirportID` Int32,
`Div5AirportSeqID` Int32,
`Div5WheelsOn` String,
`Div5TotalGTime` String,
`Div5LongestGTime` String,
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree() PARTITION BY toYYYYMM(FlightDate) ORDER BY (Year, FlightDate)
控制台2
操作cd ../../playground-docker-compose/
for i in ../test/ontime/*.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | docker-compose run --no-deps --rm client --host=ck1 --query="INSERT INTO ontime FORMAT CSVWithNames"; done
控制台1
操作c1979307f870 :) select count(1) from ontime;
SELECT count(1)
FROM ontime
┌─count(1)─┐
│ 10243296 │
└──────────┘
1 rows in set. Elapsed: 0.005 sec.
# Q0
SELECT avg(c1)
FROM
(
SELECT
Year,
Month,
count(*) AS c1
FROM ontime
GROUP BY
Year,
Month
)
┌─avg(c1)─┐
│ 426804 │
└─────────┘
1 rows in set. Elapsed: 0.084 sec. Processed 10.24 million rows, 30.73 MB (122.55 million rows/s., 367.66 MB/s.)
# Q1.查询从1988年到1989年每天的航班数
SELECT
DayOfWeek,
count(*) AS c
FROM ontime
WHERE (Year >= 1988) AND (Year <= 1989)
GROUP BY DayOfWeek
ORDER BY c DESC
┌─DayOfWeek─┬───────c─┐
│ 5 │ 1497912 │
│ 3 │ 1493143 │
│ 2 │ 1492544 │
│ 1 │ 1489357 │
│ 4 │ 1484423 │
│ 7 │ 1420042 │
│ 6 │ 1365875 │
└───────────┴─────────┘
7 rows in set. Elapsed: 0.060 sec. Processed 10.24 million rows, 30.73 MB (169.39 million rows/s., 508.18 MB/s.)
# Q2.查询从1988年到1989年每周延误超过10分钟的航班数
SELECT
DayOfWeek,
count(*) AS c
FROM ontime
WHERE (DepDelay > 10) AND (Year >= 1988) AND (Year <= 1989)
GROUP BY DayOfWeek
ORDER BY c DESC
┌─DayOfWeek─┬──────c─┐
│ 4 │ 307762 │
│ 5 │ 304375 │
│ 3 │ 290879 │
│ 2 │ 270656 │
│ 1 │ 243052 │
│ 7 │ 234229 │
│ 6 │ 219208 │
└───────────┴────────┘
7 rows in set. Elapsed: 0.081 sec. Processed 10.24 million rows, 71.70 MB (126.90 million rows/s., 888.31 MB/s.)
# Q3.查询1988年到1989年每个机场延误超过10分钟以上的次数
SELECT
Origin,
count(*) AS c
FROM ontime
WHERE (DepDelay > 10) AND (Year >= 1988) AND (Year <= 1989)
GROUP BY Origin
ORDER BY c DESC
LIMIT 10
┌─Origin─┬──────c─┐
│ ORD │ 138100 │
│ DFW │ 80991 │
│ ATL │ 80184 │
│ CLT │ 68363 │
│ PIT │ 67484 │
│ LAX │ 63626 │
│ DEN │ 60269 │
│ SFO │ 58458 │
│ EWR │ 55438 │
│ STL │ 53238 │
└────────┴────────┘
10 rows in set. Elapsed: 0.086 sec. Processed 10.24 million rows, 112.68 MB (118.85 million rows/s., 1.31 GB/s.)
# Q4. 查询1988年各航空公司延误超过10分钟以上的次数
SELECT
Carrier,
count(*)
FROM ontime
WHERE (DepDelay > 10) AND (Year = 1988)
GROUP BY Carrier
ORDER BY count(*) DESC
┌─Carrier─┬─count()─┐
│ UA │ 113208 │
│ AA │ 96946 │
│ DL │ 96443 │
│ PI │ 94642 │
│ CO │ 80925 │
│ AL │ 78368 │
│ EA │ 66918 │
│ NW │ 64962 │
│ TW │ 51121 │
│ WN │ 44572 │
│ US │ 26604 │
│ HP │ 21175 │
│ AS │ 13352 │
│ PA │ 10181 │
│ PS │ 5020 │
└─────────┴─────────┘
15 rows in set. Elapsed: 0.041 sec. Processed 5.20 million rows, 41.62 MB (126.31 million rows/s., 1.01 GB/s.)
# Q5. 查询1988年各航空公司延误超过10分钟以上的百分比
SELECT
Carrier,
c,
c2,
(c * 100) / c2 AS c3
FROM
(
SELECT
Carrier,
count(*) AS c
FROM ontime
WHERE (DepDelay > 10) AND (Year = 1988)
GROUP BY Carrier
)
ANY INNER JOIN
(
SELECT
Carrier,
count(*) AS c2
FROM ontime
WHERE Year = 1988
GROUP BY Carrier
) USING (Carrier)
ORDER BY c3 DESC
┌─Carrier─┬──────c─┬─────c2─┬─────────────────c3─┐
│ AL │ 78368 │ 361059 │ 21.705039896526607 │
│ PI │ 94642 │ 470957 │ 20.095677524699706 │
│ US │ 26604 │ 133324 │ 19.954396807776543 │
│ UA │ 113208 │ 587144 │ 19.281130353030942 │
│ TW │ 51121 │ 275819 │ 18.534256160743094 │
│ CO │ 80925 │ 457031 │ 17.70667635236997 │
│ EA │ 66918 │ 389292 │ 17.18966739619617 │
│ WN │ 44572 │ 262422 │ 16.984856452584005 │
│ NW │ 64962 │ 431440 │ 15.057018357129612 │
│ AS │ 13352 │ 89822 │ 14.864955133486228 │
│ PA │ 10181 │ 72264 │ 14.088619506254844 │
│ AA │ 96946 │ 694757 │ 13.953943609060435 │
│ DL │ 96443 │ 753983 │ 12.791137200706117 │
│ PS │ 5020 │ 41911 │ 11.9777624012789 │
│ HP │ 21175 │ 180871 │ 11.707238860845575 │
└─────────┴────────┴────────┴────────────────────┘
15 rows in set. Elapsed: 0.067 sec. Processed 10.40 million rows, 62.43 MB (155.81 million rows/s., 934.89 MB/s.)
## 更好的查询版本
SELECT
Carrier,
avg(DepDelay > 10) * 100 AS c3
FROM ontime
WHERE Year = 1988
GROUP BY Carrier
ORDER BY c3 DESC
┌─Carrier─┬─────────────────c3─┐
│ AL │ 21.705039896526607 │
│ PI │ 20.095677524699706 │
│ US │ 19.954396807776543 │
│ UA │ 19.281130353030942 │
│ TW │ 18.534256160743094 │
│ CO │ 17.70667635236997 │
│ EA │ 17.18966739619617 │
│ WN │ 16.984856452584005 │
│ NW │ 15.057018357129612 │
│ AS │ 14.86495513348623 │
│ PA │ 14.088619506254844 │
│ AA │ 13.953943609060435 │
│ DL │ 12.791137200706117 │
│ PS │ 11.9777624012789 │
│ HP │ 11.707238860845575 │
└─────────┴────────────────────┘
15 rows in set. Elapsed: 0.046 sec. Processed 5.20 million rows, 41.62 MB (112.93 million rows/s., 903.44 MB/s.)