os: centos 7.4
db: postgresql 10.10
pg_pathman: 1.5
pg_pathman是postgresql管理分区插件,postgresql 9.6、10 的内置分区管理也一直都在完善。使用哪种方式来管理,用户自己决定。不过pg_pathman 确实很方便。
由于pg_pathman使用了custom scan provider api,所以只支持PostgreSQL 9.5以及以上的版本
The pg_pathman module provides optimized partitioning mechanism and functions to manage partitions.
The extension is compatible with:
PostgreSQL 9.5, 9.6, 10;
Postgres Pro Standard 9.5, 9.6;
Postgres Pro Enterprise;
创建新表
peiybdb=# create table tmp_range(
id int,
name text,
insert_timestamp timestamp with time zone not null
);
分区设置
peiybdb=# select create_range_partitions(
'tmp_range'::regclass, -- 主表oid
'insert_timestamp', -- 分区字段,一定要not null约束
'2020-01-01 00:00:00'::timestamp with time zone, -- 开始时间
interval '1 month', -- 分区间隔,一个月
20, -- 分区表数量
false -- 不立即将数据从主表迁移到分区
);
peiybdb=# select set_enable_parent('tmp_range'::regclass, false);
插入数据
peiybdb=# insert into tmp_range
select id,
md5(id::text),
current_date - mod(id,1000)
from generate_series(1,1000000) as id;
peiybdb=# select count(*) from tmp_range;
count
---------
1000000
(1 row)
peiybdb=# select count(*) from only tmp_range;
count
-------
0
(1 row)
查看执行计划
peiybdb=# explain select * from tmp_range where insert_timestamp = '2020-03-12 11:59:46'::timestamp with time zone;
QUERY PLAN
-----------------------------------------------------------------------------------------
Append (cost=0.00..415.50 rows=1 width=44)
-> Seq Scan on tmp_range_3 (cost=0.00..415.50 rows=1 width=45)
Filter: (insert_timestamp = '2020-03-12 11:59:46+08'::timestamp with time zone)
(3 rows)
使用了= ‘2020-03-12 11:59:46’::timestamp with time zone ,被认为常量,进行了分区裁剪
peiybdb=# explain select *
from tmp_range
where insert_timestamp between '2020-03-10 11:59:46'::timestamp with time zone
and '2020-03-12 11:59:46'::timestamp with time zone
;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Append (cost=0.00..463.00 rows=2000 width=44)
-> Seq Scan on tmp_range_3 (cost=0.00..463.00 rows=2000 width=45)
Filter: ((insert_timestamp >= '2020-03-10 11:59:46+08'::timestamp with time zone) AND (insert_timestamp <= '2020-03-12 11:59:46+08'::timestamp with time zone))
(3 rows)
使用了 between and 也进行了分区裁剪
peiybdb=# explain select *
from tmp_range
where insert_timestamp >= '2020-03-10 11:59:46'::timestamp with time zone
and insert_timestamp <= '2020-03-12 11:59:46'::timestamp with time zone
;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Append (cost=0.00..463.00 rows=2000 width=44)
-> Seq Scan on tmp_range_3 (cost=0.00..463.00 rows=2000 width=45)
Filter: ((insert_timestamp >= '2020-03-10 11:59:46+08'::timestamp with time zone) AND (insert_timestamp <= '2020-03-12 11:59:46+08'::timestamp with time zone))
(3 rows)
使用了 >= <= 也进行了分区裁剪
peiybdb=# explain select * from tmp_range where insert_timestamp = now() - interval '1 day';
QUERY PLAN
---------------------------------------------------------------------
Append (cost=0.00..27366.18 rows=136 width=44)
-> Seq Scan on tmp_range_51 (cost=0.00..188.50 rows=1 width=45)
Filter: (insert_timestamp = (now() - '1 day'::interval))
-> Seq Scan on tmp_range_50 (cost=0.00..832.50 rows=1 width=45)
Filter: (insert_timestamp = (now() - '1 day'::interval))
-> Seq Scan on tmp_range_18 (cost=0.00..29.78 rows=6 width=44)
Filter: (insert_timestamp = (now() - '1 day'::interval))
-> Seq Scan on tmp_range_19 (cost=0.00..29.78 rows=6 width=44)
Filter: (insert_timestamp = (now() - '1 day'::interval))
-> Seq Scan on tmp_range_20 (cost=0.00..29.78 rows=6 width=44)
Filter: (insert_timestamp = (now() - '1 day'::interval))
(103 rows)
为什么走了所有分区?
insert_timestamp 列为 timestamp with time zone
now() - interval ‘1 day’ 也为 timestamp with time zone
peiybdb=# select pg_typeof( now() - interval '1 day');
pg_typeof
--------------------------
timestamp with time zone
(1 row)
如果插入的分区值不在初始化分区表的范围内,pg_pathman会自动创建相关的分区表,这个确实很实用、很方便。
模拟已有表,且里面有大量数据
peiybdb=# create table tmp_range_his(
id int,
name text,
insert_timestamp timestamp with time zone not null
);
peiybdb=# insert into tmp_range_his
select id,
md5(id::text),
current_date - mod(id,1000)
from generate_series(1,2000000) as id;
默认创建200个分区,每个分区包含一个月的数据。
需要注意如下:
1)开始日期需要比表中已有记录的开始日期要小
2)分区表数量要能够覆盖到表现有的数据
peiybdb=# select min(insert_timestamp) from tmp_range_his;
min
------------------------
2017-06-24 00:00:00+08
(1 row)
peiybdb=# select create_range_partitions(
'tmp_range_his'::regclass, -- 主表oid
'insert_timestamp', -- 分区字段,一定要not null约束
'2017-01-01 00:00:00'::timestamp with time zone, -- 开始时间
interval '1 month', -- 分区间隔,一个月
200, -- 分区表数量
false -- 不立即将数据从主表迁移到分区
);
执行非堵塞迁移
peiybdb=# select partition_table_concurrently(
'tmp_range_his'::regclass, -- 主表OID
1000, -- 一个事务批量迁移多少记录
1.0 -- 获得行锁失败时,休眠多久再次获取,重试60次退出任务
);
查看迁移进度
peiybdb=# select * from pathman_concurrent_part_tasks;
userid | pid | dbid | relid | processed | status
----------+------+--------+---------------+-----------+---------
postgres | 5345 | 148454 | tmp_range_his | 654000 | working
(1 row)
peiybdb=# select * from pathman_concurrent_part_tasks;
userid | pid | dbid | relid | processed | status
--------+-----+------+-------+-----------+--------
(0 rows)
禁用主表
peiybdb=# select set_enable_parent('tmp_range_his'::regclass, false);
迁移后,查看数据
peiybdb=# select count(*) from tmp_range_his;
count
-------
2000000
(1 row)
peiybdb=# select count(*) from only tmp_range_his;
count
-------
0
(1 row)
建议
参考:
https://github.com/postgrespro/pg_pathman
https://github.com/postgrespro/pg_pathman/wiki
https://github.com/postgrespro/pg_pathman_build