参考链接:
https://github.com/ceph/ceph/blob/master/doc/man/8/ceph-objectstore-tool.rst
https://github.com/ceph/ceph/blob/master/src/tools/ceph_objectstore_tool.cc
ceph-objectstore-tool 是 ceph 提供的一个操作 pg 及 pg 里面对象的工具。
ceph-objectstore-tool是修改OSD状态的工具。它有助于操作对象的内容、删除对象、列出omap、操作omap头、操作omap键、列出对象属性和操作对象属性键。
[root@localhost build]# ./bin/ceph-objectstore-tool -h
Must provide --data-path
Allowed options:
--help produce help message,帮助
--type arg Arg is one of [bluestore (default), filestore,
memstore],存储引擎类型,默认 bluestore
--data-path arg path to object store, mandatory,存储路径,一般 /var/ceph/osd-0
--journal-path arg path to journal, use if tool can't find it,filestore 时使用
--pgid arg PG id, mandatory for info, log, remove, export,
export-remove, mark-complete, trim-pg-log, and
mandatory for apply-layout-settings if --pool is
not specified,某些情况下并且 pool 未指定时,为必填项
--pool arg Pool name, mandatory for apply-layout-settings if
--pgid is not specified,apply-layout-settings 时并且 pgid 未指定时,必填
--op arg Arg is one of [info, log, remove, mkfs, fsck,
repair, fuse, dup, export, export-remove, import,
list, fix-lost, list-pgs, dump-journal,
dump-super, meta-list, get-osdmap, set-osdmap,
get-inc-osdmap, set-inc-osdmap, mark-complete,
reset-last-complete, apply-layout-settings,
update-mon-db, dump-export, trim-pg-log],操作
--epoch arg epoch# for get-osdmap and get-inc-osdmap, the
current epoch in use if not specified,指定 epoch,未指定时使用当前 epoch
--file arg path of file to export, export-remove, import,
get-osdmap, set-osdmap, get-inc-osdmap or
set-inc-osdmap,指定文件输入输出路径
--mon-store-path arg path of monstore to update-mon-db
--fsid arg fsid for new store created by mkfs
--target-data-path arg path of target object store (for --op dup)
--mountpoint arg fuse mountpoint
--format arg (=json-pretty) Output format which may be json, json-pretty,
xml, xml-pretty
--debug Enable diagnostic output to stderr
--force Ignore some types of errors and proceed with
operation - USE WITH CAUTION: CORRUPTION POSSIBLE
NOW OR IN THE FUTURE
--skip-journal-replay Disable journal replay
--skip-mount-omap Disable mounting of omap
--head Find head/snapdir when searching for objects by
name
--dry-run Don't modify the objectstore
--namespace arg Specify namespace when searching for objects
--rmtype arg Specify corrupting object removal 'snapmap' or
'nosnapmap' - TESTING USE ONLY
Positional syntax:
ceph-objectstore-tool ... <object> (get|set)-bytes [file]
ceph-objectstore-tool ... <object> set-(attr|omap) <key> [file]
ceph-objectstore-tool ... <object> (get|rm)-(attr|omap) <key>
ceph-objectstore-tool ... <object> get-omaphdr
ceph-objectstore-tool ... <object> set-omaphdr [file]
ceph-objectstore-tool ... <object> list-attrs
ceph-objectstore-tool ... <object> list-omap
ceph-objectstore-tool ... <object> remove|removeall
ceph-objectstore-tool ... <object> dump
ceph-objectstore-tool ... <object> set-size
ceph-objectstore-tool ... <object> clear-data-digest
ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>
<object> can be a JSON object description as displayed
by --op list.
<object> can be an object name which will be looked up in all
the OSD's PGs.
<object> can be the empty string ('') which with a provided pgid
specifies the pgmeta object
The optional [file] argument will read stdin or write stdout
if not specified or if '-' specified.
通用示例
ceph-objectstore-tool --data-path path-to-osd --op <operation>
使用前,关闭相关 OSD 服务
ceph osd set noout
systemctl stop ceph-osd@$OSD_NUMBER
systemctl status ceph-osd@$OSD_NUMBER
使用结束,重启 OSD 服务
systemctl restart ceph-osd@$OSD_NUMBER
ceph osd unset noout
crash 警告解决方法
HEALTH_WARN 5 daemons have recently crashed
ceph crash ls-new
ceph crash archive-all
结果以 [pgid, {oid,对象信息}] 的形式展示每个对象。
ceph-objectstore-tool --data-path $PATH_TO_OSD --op list
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op list
...
["2.7",{"oid":"rbd_data.20e5ff0224ec0.00000000000000a0","key":"","snapid":-2,"hash":467323015,"max":0,"pool":2,"namespace":"","max":0}]
["2.3a",{"oid":"rbd_info","key":"","snapid":-2,"hash":2886620986,"max":0,"pool":2,"namespace":"","max":0}]
["2.33",{"oid":"rbd_data.20e5ff0224ec0.0000000000000000","key":"","snapid":-2,"hash":2764933619,"max":0,"pool":2,"namespace":"","max":0}]
["2.25",{"oid":"rbd_id.rbd-pool-image-1","key":"","snapid":-2,"hash":2198578149,"max":0,"pool":2,"namespace":"","max":0}]
["2.1c",{"oid":"rbd_directory","key":"","snapid":-2,"hash":816417820,"max":0,"pool":2,"namespace":"","max":0}]
["2.1d",{"oid":"rbd_header.20e5ff0224ec0","key":"","snapid":-2,"hash":1624672572,"max":0,"pool":2,"namespace":"","max":0}]
...
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID --op list
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 1.68 --op list
["1.68",{"oid":"benchmark_data_node-1_2694_object67","key":"","snapid":-2,"hash":1705301608,"max":0,"pool":1,"namespace":"","max":0}]
...
可以指定对象id,也可以通过在结果中 grep 来筛选。
ceph-objectstore-tool --data-path $PATH_TO_OSD --op list $OBJECT_ID
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op list benchmark_data_node-1_2694_object61
["1.26",{"oid":"benchmark_data_node-1_2694_object61","key":"","snapid":-2,"hash":1072655526,"max":0,"pool":1,"namespace":"","max":0}]
$OBJECT 可以是 json 格式的对象,也可以直接是 oid。这一点在帮助文档中有提到。
ceph-objectstore-tool --data-path $PATH_TO_OSD $OBJECT dump
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ rbd_header.20e5ff0224ec0 dump
{
"id": {
"oid": "rbd_header.20e5ff0224ec0",
"key": "",
"snapid": -2,
"hash": 1624672572,
"max": 0,
"pool": 2,
"namespace": "",
"max": 0
},
"info": {
"oid": {
"oid": "rbd_header.20e5ff0224ec0",
"key": "",
"snapid": -2,
"hash": 1624672572,
"max": 0,
"pool": 2,
"namespace": ""
},
"version": "137'29",
"prior_version": "137'28",
"last_reqid": "osd.1.0:2",
"user_version": 27,
"size": 0,
"mtime": "2021-05-27 09:33:24.367195",
"local_mtime": "2021-05-27 09:33:24.422271",
"lost": 0,
"flags": [
"dirty",
"omap",
"data_digest",
"omap_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"data_digest": "0xffffffff",
"omap_digest": "0x4bbef111",
"expected_object_size": 0,
"expected_write_size": 0,
"alloc_hint_flags": 0,
"manifest": {
"type": 0
},
"watchers": {}
},
"stat": {
"size": 0,
"blksize": 4096,
"blocks": 0,
"nlink": 1
},
"SnapSet": {
"snap_context": {
"seq": 0,
"snaps": []
},
"clones": []
}
}
注意:fix 功能还需要完善,暂时不可用
ceph-objectstore-tool --data-path $PATH_TO_OSD --op fix-lost
注意:fix 功能还需要完善,暂时不可用
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID --op fix-lost
注意:fix 功能还需要完善,暂时不可用
ceph-objectstore-tool --data-path $PATH_TO_OSD --op fix-lost $OBJECT_ID
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT get-bytes > $OBJECT_FILE_NAME
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-bytes > rbd_header
[root@node-1 ~]# ls -al rbd_header
-rw-r--r-- 1 root root 4194304 6月 9 14:54 rbd_header
结合 get-bytes 命令,可以用于替换损坏对象
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT set-bytes < $OBJECT_FILE_NAME
[root@osd ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 --pgid 0.1c '{"oid":"zone_info.default","key":"","snapid":-2,"hash":235010478,"max":0,"pool":11,"namespace":""}' set-bytes < zone_info.default.working-copy
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT remove
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object67 remove
remove #1:166b25a6:::benchmark_data_node-1_2694_object67:head#
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT list-omap
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ rbd_header.20e5ff0224ec0 list-omap
access_timestamp
create_timestamp
features
flags
modify_timestamp
object_prefix
order
size
snap_seq
必须要指定key,可以先通过 list-omap 获取所有key
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT get-omap <key> [> $OBJECT_MAP_FILE_NAME]
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ rbd_header.20e5ff0224ec0 get-omap object_prefix
Base64:FgAAAHJiZF9kYXRhLjIwZTVmZjAyMjRlYzA=
[root@node-1 ~]# echo FgAAAHJiZF9kYXRhLjIwZTVmZjAyMjRlYzA= | base64 -d
rbd_data.20e5ff0224ec0
此操作会覆盖之前的 value 值,若要以追加或者修改的方式更改 value,需要先使用 get-omap命令获取原本的 value。
注意:必须提供 key,文件路径。
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT set-omap <$KEY> <$OBJECT_MAP_FILE_NAME>
[root@node-1 ~]# vi my_omap_value
this is my omap value
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 set-omap my_omap_key my_omap_value
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-omap my_omap_key
Base64:dGhpcyBpcyBteSBvbWFwIHZhbHVlCg==
[root@node-1 ~]# echo dGhpcyBpcyBteSBvbWFwIHZhbHVlCg== | base64 -d
this is my omap value
必须指定 key。
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT rm-omap $KEY
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 rm-omap my_omap_key
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-omap my_omap_key
Key not found
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT set-omaphdr [< $OBJECT_MAP_FILE_NAME]
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-omaphdr
Base64:dGhpcyBpcyBvbWFwIGhlYWRlcgo=
[root@node-1 ~]# echo dGhpcyBpcyBvbWFwIGhlYWRlcgo= | base64 -d
this is omap header
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT set-omaphdr [< $OBJECT_MAP_FILE_NAME]
[root@node-1 ~]# cat omaphdr
this is omap header
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 set-omaphdr < omaphdr
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-omaphdr
Base64:dGhpcyBpcyBvbWFwIGhlYWRlcgo=
[root@node-1 ~]# echo dGhpcyBpcyBvbWFwIGhlYWRlcgo= | base64 -d
this is omap header
列出对象的 xattr 属性的所有 key 。
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT list-attrs
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 list-attrs
_
snapset
需要指定对象及其 xattr 的 key
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT get-attr $KEY [> $OBJECT_ATTRS_FILE_NAME]
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-attr _
Base64:EQgcAQAABANEAAAAAAAAACMAAABiZW5jaG1hcmtfZGF0YV9ub2RlLTFfMjY5NF9vYmplY3Q2Mf7/pmzvPwAAAAAAAQAAAAAAAAAGAxwAAAABAAAAAAAAAP8AAAAAAAAAAP//AAAAAAIAAAAAAAAAMgAAAAEAAAAAAAAAHQAAAAICFQAAAAQCAAAAAAAAAAcAAAAAAAAAAAAAAAAAQAAAAAAAv9eYYNTq8SUCAhUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAADQAAAC/15hgasR6MwnBjuz/AABAAAAAAAAAAEAAAAAAADUAAAA=
注意:必须指定 oid、key、value文件路径
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT set-attr $KEY < $OBJECT_ATTRS_FILE_NAME
[root@node-1 ~]# vi my_xattr
this is my xattr
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 set-attr my_xattr_key my_xattr
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-attr my_xattr_key
Base64:dGhpcyBpcyBteSB4YXR0cgo=
[root@node-1 ~]# echo dGhpcyBpcyBteSB4YXR0cgo= | base64 -d
this is my xattr
ceph-objectstore-tool --data-path $PATH_TO_OSD --pgid $PG_ID $OBJECT rm-attr $KEY
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 rm-attr my_xattr_key
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ benchmark_data_node-1_2694_object61 get-attr my_xattr_key
getattr: (61) No data available
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op [fsck|repair]
此组合操作可以拷贝出一个完整的 osd。
首先mkfs,需要指定路径,type 可选项(默认 bluestore),fsid 可选项(默认 随机生成)
[root@node-1 ~]# ceph-objectstore-tool --data-path /root/osd.dir/ --op mkfs
failed to fetch mon config (--no-mon-config to skip)
[root@node-1 ~]# ceph-objectstore-tool --data-path /root/osd.dir/ --op mkfs --no-mon-config
然后使用 dup 命令复制 osd,需要指出 源路径 和 目标路径
本人 dup 失败了,下面贴出报错信息。
[root@node-1 osd.dir]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --target-data-path ~/osd.dir/ --op dup
dup from bluestore: /var/lib/ceph/osd/ceph-0/
to bluestore: /root/osd.dir/
src fsid 9912f587-6c2c-4098-8635-b97fd46f721e != dest 2c65c351-a968-4dee-b97a-82e9107ef749
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.10/rpm/el7/BUILD/ceph-14.2.10/src/os/bluestore/Allocator.cc: In function 'virtual Allocator::SocketHook::~SocketHook()' thread 7fcfc160f780 time 2021-06-10 10:34:15.927047
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.10/rpm/el7/BUILD/ceph-14.2.10/src/os/bluestore/Allocator.cc: 43: FAILED ceph_assert(r == 0)
ceph version 14.2.10 (b340acf629a010a74d90da5782a2c5fe0b54ac20) nautilus (stable)
1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x14a) [0x7fcfb75ba2d5]
2: (()+0x25449d) [0x7fcfb75ba49d]
3: (()+0x925a75) [0x561b5b38fa75]
4: (BitmapAllocator::~BitmapAllocator()+0x12f) [0x561b5b3ded7f]
5: (BlueFS::_stop_alloc()+0xb3) [0x561b5b39f853]
6: (BlueFS::umount(bool)+0x13e) [0x561b5b3b9e6e]
7: (BlueStore::_close_bluefs(bool)+0x11) [0x561b5b29a401]
8: (BlueStore::_close_db_and_around(bool)+0x91) [0x561b5b31dac1]
9: (BlueStore::umount()+0x299) [0x561b5b31e4c9]
10: (dup(std::string, ObjectStore*, std::string, ObjectStore*)+0x39c) [0x561b5ae20e4c]
11: (main()+0x3139) [0x561b5ade1789]
12: (__libc_start_main()+0xf5) [0x7fcfb444d555]
13: (()+0x3a52a0) [0x561b5ae0f2a0]
*** Caught signal (Aborted) **
in thread 7fcfc160f780 thread_name:ceph-objectstor
ceph version 14.2.10 (b340acf629a010a74d90da5782a2c5fe0b54ac20) nautilus (stable)
1: (()+0xf630) [0x7fcfb5a8f630]
2: (gsignal()+0x37) [0x7fcfb4461387]
3: (abort()+0x148) [0x7fcfb4462a78]
4: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x7fcfb75ba324]
5: (()+0x25449d) [0x7fcfb75ba49d]
6: (()+0x925a75) [0x561b5b38fa75]
7: (BitmapAllocator::~BitmapAllocator()+0x12f) [0x561b5b3ded7f]
8: (BlueFS::_stop_alloc()+0xb3) [0x561b5b39f853]
9: (BlueFS::umount(bool)+0x13e) [0x561b5b3b9e6e]
10: (BlueStore::_close_bluefs(bool)+0x11) [0x561b5b29a401]
11: (BlueStore::_close_db_and_around(bool)+0x91) [0x561b5b31dac1]
12: (BlueStore::umount()+0x299) [0x561b5b31e4c9]
13: (dup(std::string, ObjectStore*, std::string, ObjectStore*)+0x39c) [0x561b5ae20e4c]
14: (main()+0x3139) [0x561b5ade1789]
15: (__libc_start_main()+0xf5) [0x7fcfb444d555]
16: (()+0x3a52a0) [0x561b5ae0f2a0]
已放弃
# 终端1
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op fuse --mountpoint /mnt/ceph-osd@0/
mounting fuse at /mnt/ceph-osd@0/ ...
# 终端2
[root@node-1 mnt]# df
文件系统 1K-块 已用 可用 已用% 挂载点
foo 10481664 3820032 6661632 37% /mnt/ceph-osd@0
# 使用完,记得卸载
[root@node-1 mnt]# umount /mnt/ceph-osd\@0/
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op dump-super
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op dump-super
{
"cluster_fsid": "60e065f1-d992-4d1a-8f4e-f74419674f7e",
"osd_fsid": "9912f587-6c2c-4098-8635-b97fd46f721e",
"whoami": 0,
"current_epoch": 156,
"oldest_map": 1,
"newest_map": 156,
"weight": 0,
"compat": {
"compat": {},
"ro_compat": {},
"incompat": {
"feature_1": "initial feature set(~v.18)",
"feature_2": "pginfo object",
"feature_3": "object locator",
"feature_4": "last_epoch_clean",
"feature_5": "categories",
"feature_6": "hobjectpool",
"feature_7": "biginfo",
"feature_8": "leveldbinfo",
"feature_9": "leveldblog",
"feature_10": "snapmapper",
"feature_11": "sharded objects",
"feature_12": "transaction hints",
"feature_13": "pg meta object",
"feature_14": "explicit missing set",
"feature_15": "fastinfo pg attr",
"feature_16": "deletes in missing set"
}
},
"clean_thru": 156,
"last_epoch_mounted": 154
}
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op list-pgs
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op list-pgs
2.e
2.d
2.c
2.a
2.9
2.8
2.7
2.3f
2.f
2.3e
2.3a
...
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 1.0 --op [info|log]
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 1.2 --op log
{
"pg_log_t": {
"head": "50'2",
"tail": "0'0",
"log": [
{
"op": "modify",
"object": "1:416569a2:::benchmark_data_node-1_2694_object81:head",
"version": "29'1",
"prior_version": "0'0",
"reqid": "client.24213.0:82",
"extra_reqids": [],
"mtime": "2021-05-10 14:50:40.083127",
"return_code": 0,
"mod_desc": {
"object_mod_desc": {
"can_local_rollback": false,
"rollback_info_completed": false,
"ops": []
}
}
},
{
"op": "modify",
"object": "1:416569a2:::benchmark_data_node-1_2694_object81:head",
"version": "50'2",
"prior_version": "29'1",
"reqid": "osd.1.0:9",
"extra_reqids": [],
"mtime": "0.000000",
"return_code": 0,
"mod_desc": {
"object_mod_desc": {
"can_local_rollback": false,
"rollback_info_completed": false,
"ops": []
}
}
}
],
"dups": []
},
"pg_missing_t": {
"missing": [],
"may_include_deletes": true
}
}
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 1.2 --op info
{
"pgid": "1.2",
"last_update": "50'2",
"last_complete": "50'2",
"log_tail": "0'0",
"last_user_version": 1,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 18,
"epoch_pool_created": 18,
"last_epoch_started": 324,
"last_interval_started": 323,
"last_epoch_clean": 324,
"last_interval_clean": 323,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 323,
"same_interval_since": 323,
"same_primary_since": 322,
"last_scrub": "50'2",
"last_scrub_stamp": "2021-07-08 16:27:39.579601",
"last_deep_scrub": "50'2",
"last_deep_scrub_stamp": "2021-07-08 16:27:39.579601",
"last_clean_scrub_stamp": "2021-07-08 16:27:39.579601"
},
"stats": {
"version": "50'2",
"reported_seq": "189",
"reported_epoch": "321",
"state": "unknown",
"last_fresh": "2021-07-13 14:36:21.560457",
"last_change": "2021-07-13 14:36:21.560457",
"last_active": "2021-06-23 14:50:51.902398",
"last_peered": "2021-06-23 14:49:25.437981",
"last_clean": "2021-06-23 14:49:25.437981",
"last_became_active": "2021-06-23 14:44:21.614072",
"last_became_peered": "2021-06-23 14:44:21.614072",
"last_unstale": "2021-07-13 14:36:21.560457",
"last_undegraded": "2021-07-13 14:36:21.560457",
"last_fullsized": "2021-07-13 14:36:21.560457",
"mapping_epoch": 323,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 18,
"last_epoch_clean": 311,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "50'2",
"last_scrub_stamp": "2021-07-08 16:27:39.579601",
"last_deep_scrub": "50'2",
"last_deep_scrub_stamp": "2021-07-08 16:27:39.579601",
"last_clean_scrub_stamp": "2021-07-08 16:27:39.579601",
"log_size": 2,
"ondisk_log_size": 2,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 4194304,
"num_objects": 1,
"num_object_clones": 0,
"num_object_copies": 3,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 1,
"num_write_kb": 4096,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
1,
0,
2
],
"acting": [
1,
0,
2
],
"avail_no_missing": [],
"object_location_counts": [],
"blocked_by": [],
"up_primary": 1,
"acting_primary": 1,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 324,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
# export:导出不删除,export-remove:导出并从 OSD 中移除该 PG
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 1.0 --op [export|export-remove] --file export.file
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 2.1 --op export --file pg2.1.file
Exporting 2.1 info 2.1( v 119'6 (0'0,119'6] local-lis/les=154/155 n=0 ec=64/64 lis/c 154/154 les/c/f 155/155/0 154/154/154)
Export successful
[root@node-1 ~]# ls
anaconda-ks.cfg ceph-deploy pg2.1.file
# 查看,需要先使用 export 导出pg
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --file ./export.file --op dump-export
[root@node-1 ~]# ceph-objectstore-tool --file pg2.1.file --op dump-export
failed to fetch mon config (--no-mon-config to skip)
[root@node-1 ~]# ceph-objectstore-tool --file pg2.1.file --op dump-export --no-mon-config
{
"pgid": "2.1",
"cluster_fsid": "60e065f1-d992-4d1a-8f4e-f74419674f7e",
"features": "compat={},rocompat={},incompat={1=initial feature set(~v.18),2=pginfo object,3=object locator,4=last_epoch_clean,5=categories,6=hobjectpool,7=biginfo,8=leveldbinfo,9=leveldblog,10=snapmapper,12=transaction hints,13=pg meta object,14=explicit missing set,15=fastinfo pg attr,16=deletes in missing set}",
"metadata_section": {
"pg_disk_version": 10,
"map_epoch": 155,
"OSDMap": {
"epoch": 155,
"fsid": "60e065f1-d992-4d1a-8f4e-f74419674f7e",
"created": "2020-08-07 13:40:34.125175",
"modified": "2021-06-10 08:50:55.264485",
"last_up_change": "2021-06-10 08:50:54.258791",
"last_in_change": "2021-06-09 13:47:51.144852",
"flags": "sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit",
"flags_num": 5799936,
"flags_set": [
"pglog_hardlimit",
"purged_snapdirs",
"recovery_deletes",
"sortbitwise"
],
"crush_version": 7,
"full_ratio": 0.94999998807907104,
"backfillfull_ratio": 0
...
...
}
[root@node-1 ceph-objectstore-tool-test]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 1.0 --op import --file import.file
# pgid要匹配
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 2.0 --op import --file pg2.1.file
specified pgid 2.0 does not match actual pgid 2.1
# pg需要为空
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 2.1 --op import --file pg2.1.file
get_pg_num_history pg_num_history pg_num_history(e156 pg_nums {1={18=128},2={64=64}} deleted_pools )
pgid 2.1 already exists
# 导出,再导入
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 2.0 --op export-remove --file pg2.1_repli.file
Exporting 2.0 info 2.0( v 119'6 (0'0,119'6] local-lis/les=154/155 n=0 ec=64/64 lis/c 154/154 les/c/f 155/155/0 154/154/152)
Export successful
marking collection for removal
setting '_remove' omap key
finish_remove_pgs 2.0_head removing 2.0
Remove successful
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --pgid 2.0 --op import --file pg2.1_repli.file
get_pg_num_history pg_num_history pg_num_history(e156 pg_nums {1={18=128},2={64=64}} deleted_pools )
Importing pgid 2.0
write_pg epoch 155 info 2.0( v 119'6 (0'0,119'6] local-lis/les=154/155 n=0 ec=64/64 lis/c 154/154 les/c/f 155/155/0 154/154/152)
Import successful
源码位于:src/tools/ceph_objectstore_tool.cc
// Excerpt from src/tools/ceph_objectstore_tool.cc; "..." marks elided code.
int main(int argc, char **argv)
{
// Collect all supported command-line options in an options_description.
po::options_description desc("Allowed options");
// Option declaration/parsing.
desc.add_options()
("type", po::value<string>(&type),
"Arg is one of [bluestore (default), filestore, memstore]")
...;
vector<string> ceph_option_strings;
po::variables_map vm;
try {
po::parsed_options parsed =
po::command_line_parser(argc, argv).options(all).allow_unregistered().positional(pd).run();
po::store( parsed, vm);
po::notify(vm);
// Keep any options boost did not recognize; they are forwarded to
// ceph's own configuration machinery below.
ceph_option_strings = po::collect_unrecognized(parsed.options,
po::include_positional);
} catch(po::error &e) {
std::cerr << e.what() << std::endl;
return 1;
}
// Argument validation.
...
// Prepend to ceph_option_strings: "-n osd.<whoami>" and "--osd-data <data-path>".
char fn[PATH_MAX];
snprintf(fn, sizeof(fn), "%s/whoami", dpath.c_str());
int fd = ::open(fn, O_RDONLY);
if (fd >= 0) {
bufferlist bl;
bl.read_fd(fd, 64);
string s(bl.c_str(), bl.length());
int whoami = atoi(s.c_str());
vector<string> tmp;
// identify ourselves as this osd so we can auth and fetch our configs
tmp.push_back("-n");
tmp.push_back(string("osd.") + stringify(whoami));
// populate osd_data so that the default keyring location works
tmp.push_back("--osd-data");
tmp.push_back(dpath);
tmp.insert(tmp.end(), ceph_option_strings.begin(),
ceph_option_strings.end());
tmp.swap(ceph_option_strings);
}
// Read the osd store type (e.g. "bluestore") from <data-path>/type.
snprintf(fn, sizeof(fn), "%s/type", dpath.c_str());
...
// Extra argument-completeness checks for a few special ops.
if (op == "fuse" && mountpoint.length() == 0) {
cerr << "Missing fuse mountpoint" << std::endl;
usage(desc);
return 1;
}
...
// Create the ObjectStoreTool.
ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run);
...
// Initialize the global ceph context.
auto cct = global_init(
NULL, ceph_options,
CEPH_ENTITY_TYPE_OSD,
CODE_ENVIRONMENT_UTILITY_NODOUT,
init_flags);
common_init_finish(g_ceph_context);
...
// Create the object-store handle (filestore|bluestore) and mount it.
ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
int ret = fs->mount();
// Open the "meta" collection.
auto ch = fs->open_collection(coll_t::meta());
...
// Read and decode the OSD superblock (skipped for ops that don't need it).
std::unique_ptr <OSDSuperblock> superblock;
if (!no_superblock) {
superblock.reset(new OSDSuperblock);
bufferlist::const_iterator p;
ret = fs->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
if (ret < 0) {
cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl;
goto out;
}
p = bl.cbegin();
decode(*superblock, p);
}
}
// Dispatch to the handler for the requested op.
...
}
// Export file layout:
// |super-header|pg-begin|metadata|object...|pg-end|
int ObjectStoreTool::do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
const OSDSuperblock &superblock,
PastIntervals &past_intervals) {
PGLog::IndexedLog log;
pg_missing_t missing;
int ret = get_log(fs, struct_ver, pgid, info, log, missing);
if (ret > 0)
return ret;
// Write the super-header to the export file.
write_super();
// pg_begin section: carries the pgid and the OSD superblock.
pg_begin pgb(pgid, superblock);
// Special case: If replicated pg don't require the importing OSD to have shard feature
if (pgid.is_no_shard()) {
pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
}
// Write the pg_begin section to the export file.
ret = write_section(TYPE_PG_BEGIN, pgb, file_fd);
if (ret)
return ret;
// The metadata_section is now before files, so import can detect
// errors and abort without wasting time.
metadata_section ms(
struct_ver,
map_epoch,
info,
log,
past_intervals,
missing);
ret = add_osdmap(fs, ms);
if (ret)
return ret;
// Write the metadata_section to the export file.
ret = write_section(TYPE_PG_METADATA, ms, file_fd);
if (ret)
return ret;
// Export the contents of every object in the PG.
ret = export_files(fs, coll);
if (ret) {
cerr << "export_files error " << ret << std::endl;
return ret;
}
// Write the pg_end marker.
ret = write_simple(TYPE_PG_END, file_fd);
if (ret)
return ret;
return 0;
}
// Reads a previously exported PG from file_fd and pretty-prints its
// sections (pg_begin, pg metadata, objects) through the given Formatter.
// Returns 0 on success, -EFAULT/-EINVAL on malformed or unsupported input.
int ObjectStoreTool::dump_export(Formatter *formatter)
{
bufferlist ebl;
pg_info_t info;
PGLog::IndexedLog log;
//bool skipped_objects = false;
// Read and validate the super-header (magic number + format version).
int ret = read_super();
if (ret)
return ret;
if (sh.magic != super_header::super_magic) {
cerr << "Invalid magic number" << std::endl;
return -EFAULT;
}
if (sh.version > super_header::super_ver) {
cerr << "Can't handle export format version=" << sh.version << std::endl;
return -EINVAL;
}
formatter->open_object_section("Export");
//First section must be TYPE_PG_BEGIN
sectiontype_t type;
ret = read_section(&type, &ebl);
if (ret)
return ret;
if (type == TYPE_POOL_BEGIN) {
cerr << "Dump of pool exports not supported" << std::endl;
return -EINVAL;
} else if (type != TYPE_PG_BEGIN) {
cerr << "Invalid first section type " << std::to_string(type) << std::endl;
return -EFAULT;
}
auto ebliter = ebl.cbegin();
pg_begin pgb;
pgb.decode(ebliter);
spg_t pgid = pgb.pgid;
// Top-level identification: pgid, cluster fsid and compat feature set.
formatter->dump_string("pgid", stringify(pgid));
formatter->dump_string("cluster_fsid", stringify(pgb.superblock.cluster_fsid));
formatter->dump_string("features", stringify(pgb.superblock.compat_features));
bool done = false;
bool found_metadata = false;
metadata_section ms;
bool objects_started = false;
// Walk the remaining sections until TYPE_PG_END is seen.
while(!done) {
ret = read_section(&type, &ebl);
if (ret)
return ret;
if (debug) {
cerr << "dump_export: Section type " << std::to_string(type) << std::endl;
}
if (type >= END_OF_TYPES) {
cerr << "Skipping unknown section type" << std::endl;
continue;
}
switch(type) {
case TYPE_OBJECT_BEGIN:
// Open the "objects" array lazily, on the first object section.
if (!objects_started) {
formatter->open_array_section("objects");
objects_started = true;
}
ret = dump_object(formatter, ebl);
if (ret) return ret;
break;
case TYPE_PG_METADATA:
if (objects_started)
cerr << "WARNING: metadata_section out of order" << std::endl;
ret = dump_pg_metadata(formatter, ebl, ms);
if (ret) return ret;
found_metadata = true;
break;
case TYPE_PG_END:
if (objects_started) {
formatter->close_section();
}
done = true;
break;
default:
cerr << "Unknown section type " << std::to_string(type) << std::endl;
return -EFAULT;
}
}
// A valid export always carries a metadata section.
if (!found_metadata) {
cerr << "Missing metadata section" << std::endl;
return -EFAULT;
}
formatter->close_section();
formatter->flush(cout);
return 0;
}
// BlueStore::fsck() -> BlueStore::_fsck() performs the data consistency check.
// Check-only entry point: scans without repairing (repair flag = false).
int fsck(bool deep) override {
if (deep) {
return _fsck(FSCK_DEEP, false);
}
return _fsck(FSCK_REGULAR, false);
}
// Repair entry point: same scan as fsck(), but with fixing enabled.
int repair(bool deep) override {
if (deep) {
return _fsck(FSCK_DEEP, true);
}
return _fsck(FSCK_REGULAR, true);
}
// Fast fix-up pass: shallow scan with repairs enabled.
int quick_fix() override {
const bool do_repair = true;
return _fsck(FSCK_SHALLOW, do_repair);
}
/**
An overview for currently implemented repair logics
performed in fsck in two stages: detection(+preparation) and commit.
Detection stage (in processing order):
(Issue -> Repair action to schedule)
- Detect undecodable keys for Shared Blobs -> Remove
- Detect undecodable records for Shared Blobs -> Remove
(might trigger missed Shared Blob detection below)
- Detect stray records for Shared Blobs -> Remove
- Detect misreferenced pextents -> Fix
Prepare Bloom-like filter to track cid/oid -> pextent
Prepare list of extents that are improperly referenced
Enumerate Onode records that might use 'misreferenced' pextents
(Bloom-like filter applied to reduce computation)
Per each questionable Onode enumerate all blobs and identify broken ones
(i.e. blobs having 'misreferences')
Rewrite each broken blob data by allocating another extents and
copying data there
If blob is shared - unshare it and mark corresponding Shared Blob
for removal
Release previously allocated space
Update Extent Map
- Detect missed Shared Blobs -> Recreate
- Detect undecodable deferred transaction -> Remove
- Detect Freelist Manager's 'false free' entries -> Mark as used
- Detect Freelist Manager's leaked entries -> Mark as free
- Detect statfs inconsistency - Update
Commit stage (separate DB commit per each step):
- Apply leaked FM entries fix
- Apply 'false free' FM entries fix
- Apply 'Remove' actions
- Apply fix for misreference pextents
- Apply Shared Blob recreate
(can be merged with the step above if misreferences were detected)
- Apply StatFS update
*/
int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
{
dout(1) << __func__
<< (repair ? " repair" : " check")
<< (depth == FSCK_DEEP ? " (deep)" :
depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
<< dendl;
// in deep mode we need R/W write access to be able to replay deferred ops
bool read_only = !(repair || depth == FSCK_DEEP);
int r = _open_db_and_around(read_only);
if (r < 0)
return r;
if (!read_only) {
r = _upgrade_super();
if (r < 0) {
goto out_db;
}
}
r = _open_collections();
if (r < 0)
goto out_db;
mempool_thread.init();
// we need finisher and kv_{sync,finalize}_thread *just* for replay
// enable in repair or deep mode modes only
if (!read_only) {
_kv_start();
r = _deferred_replay();
_kv_stop();
}
if (r < 0)
goto out_scan;
// 校验并修复元数据,具体内容参见:src/os/bluestore/BlueStore.cc
r = _fsck_on_open(depth, repair);
out_scan:
mempool_thread.shutdown();
_shutdown_cache();
out_db:
_close_db_and_around(false);
return r;
}
// Called as fs->mkfs(): initialize a fresh BlueStore on the OSD data path.
// (Excerpted from BlueStore::mkfs(); "..." marks elided code.)
int BlueStore::mkfs() {
...
{
// If mkfs already completed before ("mkfs_done" meta exists), only
// re-run fsck and return — mkfs is idempotent.
r = read_meta("mkfs_done", &done);
...
r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
...
return r; // idempotent
}
// Record the store "type" meta entry as bluestore, or fail if a
// different type is already recorded under the OSD data path.
{
...
r = read_meta("type", &type);
if (r == 0) {
if (type != "bluestore") {
derr << __func__ << " expected bluestore, but type is " << type << dendl;
return -EIO;
}
} else {
r = write_meta("type", "bluestore");
if (r < 0)
return r;
}
}
freelist_type = "bitmap";
// Open the OSD data directory (/osd-data-path/).
r = _open_path();
if (r < 0)
return r;
// Open (creating if necessary) /osd-data-path/fsid.
r = _open_fsid(true);
if (r < 0)
goto out_path_fd;
// Take the lock on the fsid file.
r = _lock_fsid();
if (r < 0)
goto out_close_fsid;
// Read the on-disk fsid; if absent, pick or generate one.
r = _read_fsid(&old_fsid);
if (r < 0 || old_fsid.is_zero()) {
if (fsid.is_zero()) {
fsid.generate_random(); // no fsid provided: make a random one
dout(1) << __func__ << " generated fsid " << fsid << dendl;
} else {
dout(1) << __func__ << " using provided fsid " << fsid << dendl;
}
// we'll write it later.
} else {
if (!fsid.is_zero() && fsid != old_fsid) {
derr << __func__ << " on-disk fsid " << old_fsid
<< " != provided " << fsid << dendl;
r = -EINVAL;
goto out_close_fsid;
}
fsid = old_fsid;
}
// Create the "block" file/symlink in the data dir pointing at the real
// bluestore_block_path, preallocating bluestore_block_size if requested.
r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
cct->_conf->bluestore_block_size,
cct->_conf->bluestore_block_create);
if (r < 0)
goto out_close_fsid;
// With dedicated WAL/DB devices configured, also create the block.wal
// and block.db links and preallocate their space.
if (cct->_conf->bluestore_bluefs) {
r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
cct->_conf->bluestore_block_wal_size,
cct->_conf->bluestore_block_wal_create);
if (r < 0)
goto out_close_fsid;
r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
cct->_conf->bluestore_block_db_size,
cct->_conf->bluestore_block_db_create);
if (r < 0)
goto out_close_fsid;
}
// Create and open the BlockDevice (pmem/kernel/ust-nvme backends; the
// kernel backend drives the raw device via libaio, bypassing the FS).
r = _open_bdev(true);
if (r < 0)
goto out_close_fsid;
// choose min_alloc_size
if (cct->_conf->bluestore_min_alloc_size) {
min_alloc_size = cct->_conf->bluestore_min_alloc_size;
} else {
ceph_assert(bdev);
if (bdev->is_rotational()) {
min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
} else {
min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
}
}
// Verify the block device is large enough to host BlueFS.
_validate_bdev();
// make sure min_alloc_size is power of 2 aligned.
if (!isp2(min_alloc_size)) {
...
goto out_close_bdev;
}
// Create BlueFS and the metadata KV database (usually RocksDB) on it.
r = _open_db(true);
if (r < 0)
goto out_close_bdev;
...
// Record the kv_backend database type.
r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
if (r < 0)
goto out_close_fm;
// Record whether BlueFS replaces a regular filesystem (almost always 1).
r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
if (r < 0)
goto out_close_fm;
// Persist the (possibly newly generated) fsid.
if (fsid != old_fsid) {
r = _write_fsid();
if (r < 0) {
derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
goto out_close_fm;
}
}
if (out_of_sync_fm.fetch_and(0)) {
_sync_bluefs_and_fm();
}
// Cleanup ladder: the success path falls through every label below too.
out_close_fm:
_close_fm();
out_close_db:
_close_db();
out_close_bdev:
_close_bdev();
out_close_fsid:
_close_fsid();
out_path_fd:
_close_path();
if (r == 0 &&
cct->_conf->bluestore_fsck_on_mkfs) {
int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
if (rc < 0)
return rc;
if (rc > 0) {
derr << __func__ << " fsck found " << rc << " errors" << dendl;
r = -EIO;
}
}
if (r == 0) {
// indicate success by writing the 'mkfs_done' file
r = write_meta("mkfs_done", "yes");
}
if (r < 0) {
derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
} else {
dout(0) << __func__ << " success" << dendl;
}
return r;
}
if (op == "dup") {
string target_type;
char fn[PATH_MAX];
snprintf(fn, sizeof(fn), "%s/type", target_data_path.c_str());
// Read the target store's type from target-path/type (e.g. "bluestore\n").
int fd = ::open(fn, O_RDONLY);
if (fd < 0) {
// Previously the descriptor was used unchecked; a missing or
// unreadable target path made bl.read_fd() operate on an invalid fd.
cerr << "Unable to open " << fn << ": " << cpp_strerror(errno) << std::endl;
return 1;
}
bufferlist bl;
bl.read_fd(fd, 64);
if (bl.length()) {
target_type = string(bl.c_str(), bl.length() - 1); // drop \n
}
::close(fd);
// Instantiate the destination object store and copy everything over.
ObjectStore *targetfs = ObjectStore::create(
g_ceph_context, target_type,
target_data_path, "", 0);
if (targetfs == NULL) {
cerr << "Unable to open store of type " << target_type << std::endl;
return 1;
}
int r = dup(dpath, fs, target_data_path, targetfs);
if (r < 0) {
cerr << "dup failed: " << cpp_strerror(r) << std::endl;
return 1;
}
return 0;
}
if (op == "fuse") {
#ifdef HAVE_LIBFUSE
// Expose the object store as a FUSE filesystem rooted at `mountpoint`.
FuseStore fuse(fs, mountpoint);
cout << "mounting fuse at " << mountpoint << " ..." << std::endl;
// Mount via userspace libfuse; see FuseStore::main() below.
int r = fuse.main();
if (r < 0) {
cerr << "failed to mount fuse: " << cpp_strerror(r) << std::endl;
return 1;
}
#else
cerr << "fuse support not enabled" << std::endl;
#endif
return 0;
}
// Mount the object store as a FUSE filesystem at mount_point.
// Builds a fake argv for libfuse: argv[0] is a dummy program name, then
// the mount point and "-f" (foreground); "-d" (libfuse debug output) is
// included only when the fuse_debug config option is set.
int FuseStore::main()
{
const char *v[] = {
"foo",
mount_point.c_str(),
"-f",
"-d", // debug
};
int c = 3; // argc without the trailing "-d"
auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug");
if (fuse_debug)
++c; // extend argc to also pass "-d"
// Hand control to libfuse's fuse_main() with our operations table;
// `this` is the user-data pointer so callbacks can reach the store.
return fuse_main(c, (char**)v, &fs_oper, (void*)this);
}
// Import a PG previously saved with --op export (or export-remove).
// Reads the file's super header and the pg_begin section, then loops over
// object / metadata sections until pg_end, recreating the PG collection
// and its contents in `store`.
// Returns 0 on success, a negative errno on failure, or a positive value
// meant to be used directly as the process exit status (11, 12).
int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock &sb,
bool force, std::string pgidstr) {
bufferlist ebl;
pg_info_t info;
PGLog::IndexedLog log;
bool skipped_objects = false;
if (!dry_run)
// Recursively drop PG contents flagged is_temp or carrying the remove
// flag: OSD::recursive_remove_collection(g_ceph_context, store, pgid, *it);
finish_remove_pgs(store);
// Read the super_header of the file being imported. The exporter wrote:
// super_header, pg_begin, objects, metadata, pg_end.
// NOTE(review): ret from read_super()/read_section() below is never
// checked before use — confirm whether these abort internally on error.
int ret = read_super();
//First section must be TYPE_PG_BEGIN
sectiontype_t type;
// Read the pg_begin section.
ret = read_section(&type, &ebl);
auto ebliter = ebl.cbegin();
pg_begin pgb;
pgb.decode(ebliter);
spg_t pgid = pgb.pgid;
if (pgidstr.length()) {
spg_t user_pgid;
// Verify the pgid given on the command line matches the one in the file.
bool ok = user_pgid.parse(pgidstr.c_str());
// This succeeded in main() already
ceph_assert(ok);
if (pgid != user_pgid) {
cerr << "specified pgid " << user_pgid
<< " does not match actual pgid " << pgid << std::endl;
return -EINVAL;
}
}
// The cluster fsid must match: the export must come from this cluster.
if (!pgb.superblock.cluster_fsid.is_zero()
&& pgb.superblock.cluster_fsid != sb.cluster_fsid) {
cerr << "Export came from different cluster with fsid "
<< pgb.superblock.cluster_fsid << std::endl;
return -EINVAL;
}
// Special case: Old export has SHARDS incompat feature on replicated pg, remove it
if (pgid.is_no_shard())
pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
CompatSet unsupported = sb.compat_features.unsupported(pgb.superblock.compat_features);
cerr << "Export has incompatible features set " << unsupported << std::endl;
// Let them import if they specify the --force option
if (!force)
return 11; // Positive return means exit status
}
// we need the latest OSDMap to check for collisions
OSDMap curmap;
bufferlist bl;
// Load the current osdmap from the store.
ret = get_osdmap(store, sb.current_epoch, curmap, bl);
pool_pg_num_history_t pg_num_history;
get_pg_num_history(store, &pg_num_history);
ghobject_t pgmeta_oid = pgid.make_pgmeta_oid();
// Check for PG already present.
coll_t coll(pgid);
if (store->collection_exists(coll)) {
cerr << "pgid " << pgid << " already exists" << std::endl;
return -EEXIST;
}
// Handles for the new PG collection and the snap mapper's OSDriver.
ObjectStore::CollectionHandle ch;
OSDriver driver(
store,
coll_t(),
OSD::make_snapmapper_oid());
// Maps snapshots to objects for this PG shard.
SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pgid.shard);
bool done = false;
bool found_metadata = false;
metadata_section ms;
while (!done) {
// Read the section header at the current file offset, returning its
// type and payload.
ret = read_section(&type, &ebl);
// Skip sections this build does not recognize.
if (type >= END_OF_TYPES) {
cout << "Skipping unknown section type" << std::endl;
continue;
}
// Dispatch on the section type: object data, PG metadata, or pg_end.
switch (type) {
case TYPE_OBJECT_BEGIN:
ceph_assert(found_metadata);
// Import one object (data, omap, attrs).
ret = get_object(store, driver, mapper, coll, ebl, ms.osdmap,
&skipped_objects);
if (ret) return ret;
break;
case TYPE_PG_METADATA:
ret = get_pg_metadata(store, ebl, ms, sb, pgid);
if (ret) return ret;
found_metadata = true;
if (pgid != ms.info.pgid) {
cerr << "specified pgid " << pgid << " does not match import file pgid "
<< ms.info.pgid << std::endl;
return -EINVAL;
}
// make sure there are no conflicting splits or merges
if (ms.osdmap.have_pg_pool(pgid.pgid.pool())) {
auto p = pg_num_history.pg_nums.find(pgid.pgid.m_pool);
if (p != pg_num_history.pg_nums.end() &&
!p->second.empty()) {
unsigned start_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool());
unsigned pg_num = start_pg_num;
for (auto q = p->second.lower_bound(ms.map_epoch);
q != p->second.end();
++q) {
unsigned new_pg_num = q->second;
cout << "pool " << pgid.pgid.pool() << " pg_num " << pg_num
<< " -> " << new_pg_num << std::endl;
// check for merge target
spg_t target;
if (pgid.is_merge_source(pg_num, new_pg_num, &target)) {
// FIXME: this checks assumes the OSD's PG is at the OSD's
// map epoch; it could be, say, at *our* epoch, pre-merge.
coll_t coll(target);
if (store->collection_exists(coll)) {
cerr << "pgid " << pgid << " merges to target " << target
<< " which already exists" << std::endl;
return 12;
}
}
// check for split children
set <spg_t> children;
if (pgid.is_split(start_pg_num, new_pg_num, &children)) {
cerr << " children are " << children << std::endl;
for (auto child : children) {
coll_t coll(child);
if (store->collection_exists(coll)) {
cerr << "pgid " << pgid << " splits to " << children
<< " and " << child << " exists" << std::endl;
return 12;
}
}
}
pg_num = new_pg_num;
}
}
} else {
cout << "pool " << pgid.pgid.pool() << " doesn't existing, not checking"
<< " for splits or mergers" << std::endl;
}
if (!dry_run) {
ObjectStore::Transaction t;
ch = store->create_new_collection(coll);
create_pg_collection(
t, pgid,
pgid.get_split_bits(ms.osdmap.get_pg_pool(pgid.pool())->get_pg_num()));
init_pg_ondisk(t, pgid, NULL);
// mark this coll for removal until we're done
map <string, bufferlist> values;
encode((char) 1, values["_remove"]);
t.omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
store->queue_transaction(ch, std::move(t));
}
break;
// pg_end marks that the whole PG file has been consumed.
case TYPE_PG_END:
ceph_assert(found_metadata);
done = true;
break;
default:
cerr << "Unknown section type " << std::to_string(type) << std::endl;
return -EFAULT;
}
}
ObjectStore::Transaction t;
if (!dry_run) {
pg_log_t newlog, reject;
pg_log_t::filter_log(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace,
ms.log, newlog, reject);
divergent_priors_t newdp, rejectdp;
filter_divergent_priors(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace,
ms.divergent_priors, newdp, rejectdp);
ms.divergent_priors = newdp;
ms.missing.filter_objects([&](const hobject_t &obj) {
if (obj.nspace == g_ceph_context->_conf->osd_hit_set_namespace)
return false;
ceph_assert(!obj.is_temp());
object_t oid = obj.oid;
object_locator_t loc(obj);
pg_t raw_pgid = ms.osdmap.object_locator_to_pg(oid, loc);
pg_t _pgid = ms.osdmap.raw_pg_to_pg(raw_pgid);
return pgid.pgid != _pgid;
});
// Just like a split invalidate stats since the object count is changed
if (skipped_objects)
ms.info.stats.stats_invalid = true;
// Persist the filtered PG metadata (info, log, past intervals, missing).
ret = write_pg(
t,
ms.map_epoch,
ms.info,
newlog,
ms.past_intervals,
ms.divergent_priors,
ms.missing);
if (ret) return ret;
}
if (!dry_run) {
t.omap_rmkey(coll, pgid.make_pgmeta_oid(), "_remove");
wait_until_done(&t, [&] {
store->queue_transaction(ch, std::move(t));
// make sure we flush onreadable items before mapper/driver are destroyed.
ch->flush();
});
}
return 0;
}
// Read the full OSDMap for epoch e from the meta collection into bl and
// decode it into osdmap.
// Returns 0 on success, -ENOENT if no OSDMap object exists for that epoch.
int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist &bl) {
// Open a handle to the meta collection; plain reads need no transaction.
// (OSD::get_inc_osdmap_pobject_name(e) would name the incremental map.)
ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta());
bool found = store->read(
ch, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
if (!found) {
// Bail out before decode(): previously `found` was ignored and an
// empty bufferlist was decoded after a failed read.
cerr << "Can't find OSDMap for epoch " << e << std::endl;
return -ENOENT;
}
osdmap.decode(bl);
return 0;
}
if (op == "set-osdmap") {
bufferlist bl;
// Read the osdmap file that is to be installed into the store.
ret = get_fd_data(file_fd, bl);
if (ret < 0) {
cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl;
} else {
// Write it as the full osdmap for `epoch`.
ret = set_osdmap(fs, epoch, bl, force);
}
goto out;
}
// Install a full OSDMap (already serialized in bl) into the store's meta
// collection for epoch e. If e is 0, the epoch embedded in the map is
// used; a mismatch between e and the decoded map's epoch is rejected.
// Creating a map object that does not yet exist requires --force.
// Returns 0 on success or -EINVAL on validation failure.
int set_osdmap(ObjectStore *store, epoch_t e, bufferlist &bl, bool force) {
OSDMap osdmap;
osdmap.decode(bl);
// Sanity-check the requested epoch against the map actually decoded.
// (Previously there was no check and `force` was silently ignored.)
if (e == 0) {
e = osdmap.get_epoch();
} else if (e != osdmap.get_epoch()) {
cerr << "osdmap.get_epoch() " << osdmap.get_epoch()
<< " != epoch " << e << std::endl;
return -EINVAL;
}
// Open a handle to the meta collection.
auto ch = store->open_collection(coll_t::meta());
// Object name of the full map for this epoch
// (OSD::get_inc_osdmap_pobject_name(e) names the incremental map).
const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e);
if (!store->exists(ch, full_oid)) {
cerr << "osdmap (" << full_oid << ") does not exist, will create" << std::endl;
if (!force) {
cerr << "Please use --force to proceed" << std::endl;
return -EINVAL;
}
}
// Write the new map, then truncate in case a larger map is overwritten.
ObjectStore::Transaction t;
t.write(coll_t::meta(), full_oid, 0, bl.length(), bl);
t.truncate(coll_t::meta(), full_oid, bl.length());
store->queue_transaction(ch, std::move(t));
return 0;
}
// Rebuild the pieces of a monitor DB (auth/keyring, osdmaps, monitor
// metadata) from this OSD's local store, writing into store_path.
// Returns 0 on success or the first failing step's negative errno.
int update_mon_db(ObjectStore& fs, OSDSuperblock& sb,
const string& keyring,
const string& store_path)
{
MonitorDBStore ms(store_path);
// Open (creating if needed) the mon store at store_path.
int r = ms.create_and_open(cerr);
if (r < 0) {
cerr << "unable to open mon store: " << store_path << std::endl;
return r;
}
// Run the three update steps in order — keyring, then osdmaps, then
// monitor metadata — stopping at the first failure; the store is
// closed on every path.
if ((r = update_auth(keyring, sb, ms)) >= 0 &&
(r = update_osdmap(fs, sb, ms)) >= 0) {
r = update_monitor(sb, ms);
}
ms.close();
return r;
}
// Prefer export-remove; plain removal requires the --force option.
// Mark the PG r_pgid for removal and reap it, together with any PGs
// already flagged from earlier runs. In dry-run mode nothing is changed.
int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) {
const coll_t cid(r_pgid);
// Reap leftovers from previous removal attempts before touching this PG.
if (!dry_run) {
finish_remove_pgs(store);
}
if (!store->collection_exists(cid)) {
return -ENOENT;
}
if (dry_run) {
// Dry run: the PG exists, so report success without modifying anything.
return 0;
}
ObjectStore::Transaction txn;
int ret = mark_pg_for_removal(store, r_pgid, &txn);
if (ret < 0) {
return ret;
}
ObjectStore::CollectionHandle ch = store->open_collection(cid);
store->queue_transaction(ch, std::move(txn));
finish_remove_pgs(store);
return ret;
}
/* fixme: using full features */
// List objects matching `object` (optionally restricted to namespace
// `nspace` and/or head objects), dumping the matches via `formatter`.
// With a non-empty pgidstr only that PG's collection is scanned;
// otherwise all collections are. Internally this boils down to
// store->collection_list(ch, next, ghobject_t::get_max(), ...) over each
// PG, collecting every matching ghobject_t and its info.
int do_list(ObjectStore *store, string pgidstr, string object, boost::optional <std::string> nspace,
Formatter *formatter, bool debug, bool human_readable, bool head) {
lookup_ghobject lookup(object, nspace, head);
const bool single_pg = pgidstr.length() > 0;
int ret = single_pg
? action_on_all_objects_in_pg(store, pgidstr, lookup, debug)
: action_on_all_objects(store, lookup, debug);
if (ret) {
return ret;
}
lookup.dump(formatter, human_readable);
formatter->flush(cout);
return 0;
}
// Same mechanism as `list`: iterate all collections looking for the
// requested PG. The special name "meta" selects the metadata collection
// coll_t::meta().
ret = fs->list_collections(ls);
// Find pg
for (it = ls.begin(); it != ls.end(); ++it) {
spg_t tmppgid;
if (pgidstr == "meta") {
if (it->to_str() == "meta")
break;
else
continue;
}
// Skip collections that are not PGs at all.
if (!it->is_pg(&tmppgid)) {
continue;
}
// Skip temporary PG collections.
if (it->is_temp(&tmppgid)) {
continue;
}
if (op != "list-pgs" && tmppgid != pgid) {
continue;
}
if (op != "list-pgs") {
//Found!
break;
}
// list-pgs: print every PG id instead of stopping at a match.
cout << tmppgid << std::endl;
}