WAL is short for write-ahead log: as the name implies, a log record is written before the actual write operation is carried out.
The wal directory stores these write-ahead logs. Its most important role is to keep the complete history of data changes: in etcd, every modification must be written to the WAL before it is committed.
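For reference, a typical etcd data directory looks roughly like the sketch below (path and file names are illustrative, based on etcd's default layout):
/var/lib/etcd/
└── member
    ├── snap    # periodic snapshots of the keyspace (the boltdb backend file "db" lives here)
    └── wal     # write-ahead log segments; every change is appended here before it is applied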
Backing up data with the v3 API (single node):
# Back up
[root@etcd1 ~]# ETCDCTL_API=3 etcdctl snapshot save snapshop.db
# Restore into a specified directory
[root@etcd1 ~]# ETCDCTL_API=3 etcdctl snapshot restore snapshop.db --data-dir=/opt/etcd-testdir-2
Deprecated: Use `etcdutl snapshot restore` instead.
2022-05-05T20:38:13+08:00 info snapshot/v3_snapshot.go:251 restoring snapshot {"path": "snapshop.db", "wal-dir": "/opt/etcd-testdir-2/member/wal", "data-dir": "/opt/etcd-testdir-2", "snap-dir": "/opt/etcd-testdir-2/member/snap", "stack": "go.etcd.io/etcd/etcdutl/v3/snapshot.(*v3Manager).Restore\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/etcdutl/snapshot/v3_snapshot.go:257\ngo.etcd.io/etcd/etcdutl/v3/etcdutl.SnapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/etcdutl/etcdutl/snapshot_command.go:147\ngo.etcd.io/etcd/etcdctl/v3/ctlv3/command.snapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/etcdctl/ctlv3/command/snapshot_command.go:128\ngithub.com/spf13/cobra.(*Command).execute\n\t/home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:856\ngithub.com/spf13/cobra.(*Command).ExecuteC\n\t/home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:960\ngithub.com/spf13/cobra.(*Command).Execute\n\t/home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:897\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.Start\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/etcdctl/ctlv3/ctl.go:107\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.MustStart\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/etcdctl/ctlv3/ctl.go:111\nmain.main\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/etcdctl/main.go:59\nruntime.main\n\t/home/remote/sbatsche/.gvm/gos/go1.16.3/src/runtime/proc.go:225"}
2022-05-05T20:38:13+08:00 info membership/store.go:141 Trimming membership information from the backend...
2022-05-05T20:38:13+08:00 info membership/cluster.go:421 added member {"cluster-id": "cdf818194e3a8c32", "local-member-id": "0", "added-peer-id": "8e9e05c52164694d", "added-peer-peer-urls": ["http://localhost:2380"]}
2022-05-05T20:38:13+08:00 info snapshot/v3_snapshot.go:272 restored snapshot {"path": "snapshop.db", "wal-dir": "/opt/etcd-testdir-2/member/wal", "data-dir": "/opt/etcd-testdir-2", "snap-dir": "/opt/etcd-testdir-2/member/snap"}
[root@etcd1 ~]#
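The deprecation notice above points at etcdutl, which performs the same offline restore without going through etcdctl. A minimal sketch of the equivalent command, assuming etcd v3.5+ and that the target directory does not exist yet:
[root@etcd1 ~]# etcdutl snapshot restore snapshop.db --data-dir=/opt/etcd-testdir-2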
[root@etcd1 ~]# vim /etc/systemd/system/etcd.service
[root@etcd1 ~]#
[root@etcd1 ~]# cat /etc/systemd/system/etcd.service
[Unit]
Description=Etcd Server
After=network.target
After=network-online.target
Wants=network-online.target
Documentation=https://github.com/coreos
[Service]
Type=notify
# WorkingDirectory must be changed to the new data directory generated by the restore
WorkingDirectory=/var/lib/etcd
ExecStart=/usr/local/kubernetes/bin/etcd \
--name=etcd-172.16.92.150 \
--cert-file=/etc/kubernetes/ssl/etcd.pem \
--key-file=/etc/kubernetes/ssl/etcd-key.pem \
--peer-cert-file=/etc/kubernetes/ssl/etcd.pem \
--peer-key-file=/etc/kubernetes/ssl/etcd-key.pem \
--trusted-ca-file=/etc/kubernetes/ssl/ca.pem \
--peer-trusted-ca-file=/etc/kubernetes/ssl/ca.pem \
--initial-advertise-peer-urls=https://172.16.92.150:2380 \
--listen-peer-urls=https://172.16.92.150:2380 \
--listen-client-urls=https://172.16.92.150:2379,http://127.0.0.1:2379 \
--advertise-client-urls=https://172.16.92.150:2379 \
--initial-cluster-token=etcd-cluster-0 \
--initial-cluster=etcd-172.16.92.150=https://172.16.92.150:2380,etcd-172.16.92.151=https://172.16.92.151:2380,etcd-172.16.92.152=https://172.16.92.152:2380 \
--initial-cluster-state=new \
# --data-dir must also be changed to the newly generated directory
--data-dir=/var/lib/etcd \
--wal-dir= \
--snapshot-count=50000 \
--auto-compaction-retention=1 \
--auto-compaction-mode=periodic \
--max-request-bytes=10485760 \
--quota-backend-bytes=8589934592
Restart=always
RestartSec=15
LimitNOFILE=65536
OOMScoreAdjust=-999
[Install]
WantedBy=multi-user.target
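If the snapshot was restored into a new directory (e.g. /opt/etcd-testdir-2 from the standalone example above), a minimal follow-up sketch is to point the unit file at that directory and restart the service (values are illustrative):
# edited lines in /etc/systemd/system/etcd.service
WorkingDirectory=/opt/etcd-testdir-2
--data-dir=/opt/etcd-testdir-2 \
# reload the unit and restart etcd
[root@etcd1 ~]# systemctl daemon-reload
[root@etcd1 ~]# systemctl restart etcd
[root@etcd1 ~]# systemctl status etcd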
[root@haproxy1 kubeasz]# ./ezctl backup k8s-cluster
ansible-playbook -i clusters/k8s-cluster/hosts -e @clusters/k8s-cluster/config.yml playbooks/94.backup.yml
2022-05-07 05:59:05 INFO cluster:k8s-cluster backup begins in 5s, press any key to abort:
PLAY [localhost] **************************************************************************************************************************************************************************************************************
TASK [Gathering Facts] ********************************************************************************************************************************************************************************************************
ok: [localhost]
TASK [set NODE_IPS of the etcd cluster] ***************************************************************************************************************************************************************************************
ok: [localhost]
TASK [get etcd cluster status] ************************************************************************************************************************************************************************************************
changed: [localhost]
TASK [debug] ******************************************************************************************************************************************************************************************************************
ok: [localhost] => {
"ETCD_CLUSTER_STATUS": {
"changed": true,
"cmd": "for ip in 172.16.92.150 172.16.92.151 172.16.92.152 ;do ETCDCTL_API=3 /etc/kubeasz/bin/etcdctl --endpoints=https://\"$ip\":2379 --cacert=/etc/kubeasz/clusters/k8s-cluster/ssl/ca.pem --cert=/etc/kubeasz/clusters/k8s-cluster/ssl/etcd.pem --key=/etc/kubeasz/clusters/k8s-cluster/ssl/etcd-key.pem endpoint health; done",
"delta": "0:00:00.347998",
"end": "2022-05-07 05:59:25.515497",
"failed": false,
"rc": 0,
"start": "2022-05-07 05:59:25.167499",
"stderr": "",
"stderr_lines": [],
"stdout": "https://172.16.92.150:2379 is healthy: successfully committed proposal: took = 32.036126ms\nhttps://172.16.92.151:2379 is healthy: successfully committed proposal: took = 28.272971ms\nhttps://172.16.92.152:2379 is healthy: successfully committed proposal: took = 22.756272ms",
"stdout_lines": [
"https://172.16.92.150:2379 is healthy: successfully committed proposal: took = 32.036126ms",
"https://172.16.92.151:2379 is healthy: successfully committed proposal: took = 28.272971ms",
"https://172.16.92.152:2379 is healthy: successfully committed proposal: took = 22.756272ms"
]
}
}
TASK [get a running ectd node] ************************************************************************************************************************************************************************************************
changed: [localhost]
TASK [debug] ******************************************************************************************************************************************************************************************************************
ok: [localhost] => {
"RUNNING_NODE.stdout": "172.16.92.150"
}
TASK [get current time] *******************************************************************************************************************************************************************************************************
changed: [localhost]
TASK [make a backup on the etcd node] *****************************************************************************************************************************************************************************************
changed: [localhost -> 172.16.92.150]
TASK [fetch the backup data] **************************************************************************************************************************************************************************************************
changed: [localhost -> 172.16.92.150]
TASK [update the latest backup] ***********************************************************************************************************************************************************************************************
changed: [localhost]
PLAY RECAP ********************************************************************************************************************************************************************************************************************
localhost : ok=10 changed=6 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
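Under the hood the "make a backup on the etcd node" task boils down to a snapshot save against one healthy member. A rough equivalent, run by hand (the output path is an assumption; the endpoint and certificate paths match the debug output above):
ETCDCTL_API=3 /etc/kubeasz/bin/etcdctl --endpoints=https://172.16.92.150:2379 \
  --cacert=/etc/kubeasz/clusters/k8s-cluster/ssl/ca.pem \
  --cert=/etc/kubeasz/clusters/k8s-cluster/ssl/etcd.pem \
  --key=/etc/kubeasz/clusters/k8s-cluster/ssl/etcd-key.pem \
  snapshot save /tmp/snapshot_$(date +%Y%m%d%H%M).db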
After the backup finishes, the snapshot files are stored in the directory below, named with a timestamp; the latest one is also kept as snapshot.db. To restore from a specific backup, simply cp the corresponding timestamped file over snapshot.db.
[root@haproxy1 kubeasz]# ll clusters/k8s-cluster/backup/
total 8124
-rw-------. 1 root root 2768928 May  5 20:47 snapshot_202205052047.db
-rw-------. 1 root root 2768928 May  7 05:59 snapshot_202205070559.db
-rw-------. 1 root root 2768928 May  7 05:59 snapshot.db
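For example, to have the next restore use the May 5 backup instead of the most recent one:
[root@haproxy1 kubeasz]# cp clusters/k8s-cluster/backup/snapshot_202205052047.db clusters/k8s-cluster/backup/snapshot.db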
[root@haproxy1 kubeasz]# kubectl get pods
NAME READY STATUS RESTARTS AGE
net-test 1/1 Running 0 33h
[root@haproxy1 kubeasz]# kubectl delete pods net-test
pod "net-test" deleted
[root@haproxy1 kubeasz]# kubectl get pods
No resources found in default namespace.
Restore:
[root@haproxy1 kubeasz]# ./ezctl restore k8s-cluster
ansible-playbook -i clusters/k8s-cluster/hosts -e @clusters/k8s-cluster/config.yml playbooks/95.restore.yml
2022-05-07 06:04:52 INFO cluster:k8s-cluster restore begins in 5s, press any key to abort:
PLAY [kube_master] ************************************************************************************************************************************************************************************************************
TASK [Gathering Facts] ********************************************************************************************************************************************************************************************************
ok: [172.16.92.131]
ok: [172.16.92.130]
TASK [stopping kube_master services] ******************************************************************************************************************************************************************************************
changed: [172.16.92.130] => (item=kube-apiserver)
changed: [172.16.92.131] => (item=kube-apiserver)
changed: [172.16.92.130] => (item=kube-controller-manager)
changed: [172.16.92.131] => (item=kube-controller-manager)
changed: [172.16.92.130] => (item=kube-scheduler)
changed: [172.16.92.131] => (item=kube-scheduler)
PLAY [kube_master,kube_node] **************************************************************************************************************************************************************************************************
TASK [Gathering Facts] ********************************************************************************************************************************************************************************************************
ok: [172.16.92.141]
ok: [172.16.92.142]
ok: [172.16.92.140]
TASK [stopping kube_node services] ********************************************************************************************************************************************************************************************
changed: [172.16.92.130] => (item=kubelet)
changed: [172.16.92.131] => (item=kubelet)
changed: [172.16.92.140] => (item=kubelet)
changed: [172.16.92.141] => (item=kubelet)
changed: [172.16.92.142] => (item=kubelet)
changed: [172.16.92.130] => (item=kube-proxy)
changed: [172.16.92.131] => (item=kube-proxy)
changed: [172.16.92.140] => (item=kube-proxy)
changed: [172.16.92.141] => (item=kube-proxy)
changed: [172.16.92.142] => (item=kube-proxy)
PLAY [etcd] *******************************************************************************************************************************************************************************************************************
TASK [Gathering Facts] ********************************************************************************************************************************************************************************************************
ok: [172.16.92.150]
ok: [172.16.92.152]
ok: [172.16.92.151]
TASK [cluster-restore : 停止ectd 服务] ********************************************************************************************************************************************************************************************
changed: [172.16.92.152]
changed: [172.16.92.151]
changed: [172.16.92.150]
TASK [cluster-restore : 清除etcd 数据目录] ******************************************************************************************************************************************************************************************
changed: [172.16.92.151]
changed: [172.16.92.152]
changed: [172.16.92.150]
TASK [cluster-restore : 生成备份目录] ***********************************************************************************************************************************************************************************************
ok: [172.16.92.150]
ok: [172.16.92.151]
ok: [172.16.92.152]
TASK [cluster-restore : 准备指定的备份etcd 数据] ***************************************************************************************************************************************************************************************
changed: [172.16.92.151]
changed: [172.16.92.152]
changed: [172.16.92.150]
TASK [cluster-restore : 清理上次备份恢复数据] *******************************************************************************************************************************************************************************************
changed: [172.16.92.150]
changed: [172.16.92.151]
changed: [172.16.92.152]
TASK [cluster-restore : etcd 数据恢复] ********************************************************************************************************************************************************************************************
changed: [172.16.92.150]
changed: [172.16.92.152]
changed: [172.16.92.151]
TASK [cluster-restore : 恢复数据至etcd 数据目录] ***************************************************************************************************************************************************************************************
changed: [172.16.92.150]
changed: [172.16.92.151]
changed: [172.16.92.152]
TASK [cluster-restore : 重启etcd 服务] ********************************************************************************************************************************************************************************************
changed: [172.16.92.150]
changed: [172.16.92.151]
changed: [172.16.92.152]
TASK [cluster-restore : 以轮询的方式等待服务同步完成] ***************************************************************************************************************************************************************************************
changed: [172.16.92.150]
changed: [172.16.92.151]
changed: [172.16.92.152]
PLAY [kube_master] ************************************************************************************************************************************************************************************************************
TASK [starting kube_master services] ******************************************************************************************************************************************************************************************
changed: [172.16.92.130] => (item=kube-apiserver)
changed: [172.16.92.131] => (item=kube-apiserver)
changed: [172.16.92.130] => (item=kube-controller-manager)
changed: [172.16.92.131] => (item=kube-controller-manager)
changed: [172.16.92.130] => (item=kube-scheduler)
changed: [172.16.92.131] => (item=kube-scheduler)
PLAY [kube_master,kube_node] **************************************************************************************************************************************************************************************************
TASK [starting kube_node services] ********************************************************************************************************************************************************************************************
changed: [172.16.92.130] => (item=kubelet)
changed: [172.16.92.131] => (item=kubelet)
changed: [172.16.92.140] => (item=kubelet)
changed: [172.16.92.141] => (item=kubelet)
changed: [172.16.92.142] => (item=kubelet)
changed: [172.16.92.130] => (item=kube-proxy)
changed: [172.16.92.140] => (item=kube-proxy)
changed: [172.16.92.131] => (item=kube-proxy)
changed: [172.16.92.141] => (item=kube-proxy)
changed: [172.16.92.142] => (item=kube-proxy)
PLAY RECAP ********************************************************************************************************************************************************************************************************************
172.16.92.130 : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.131 : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.140 : ok=3 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.141 : ok=3 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.142 : ok=3 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.150 : ok=10 changed=8 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.151 : ok=10 changed=8 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
172.16.92.152 : ok=10 changed=8 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
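The cluster-restore tasks above boil down to the following steps on every etcd node. A hand-rolled sketch for a single member (member name, URLs and paths are illustrative and must match each node's etcd.service):
# stop the service and clear the old data directory
systemctl stop etcd && rm -rf /var/lib/etcd
# rebuild this member's data directory from the fetched snapshot.db,
# re-declaring the full cluster membership
etcdutl snapshot restore snapshot.db \
  --name etcd-172.16.92.150 \
  --initial-cluster etcd-172.16.92.150=https://172.16.92.150:2380,etcd-172.16.92.151=https://172.16.92.151:2380,etcd-172.16.92.152=https://172.16.92.152:2380 \
  --initial-cluster-token etcd-cluster-0 \
  --initial-advertise-peer-urls https://172.16.92.150:2380 \
  --data-dir /var/lib/etcd
# start etcd again and wait until all endpoints report healthy
systemctl start etcd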
Check that the pod has been restored:
[root@haproxy1 kubeasz]# kubectl get pods
NAME READY STATUS RESTARTS AGE
net-test 1/1 Running 0 33h
Summary of the backup and restore workflow:
When more than half of the etcd cluster's nodes are down (for example two out of three), the whole cluster becomes unavailable and its data has to be restored afterwards. The restore workflow is as follows: