环境: centos7.6, openvswitch 2.11.0
安装 openvswitch,需要安装 openstack yum 源
yum install -y openvswitch
systemctl start openvswitch
k8s01 创建、配置 vmA1 接口
[root@k8s01 ~]# ip link add dev vmA1-sw type veth peer name vmA1
[root@k8s01 ~]# ip link set vmA1-sw up
[root@k8s01 ~]# ip link set vmA1 up
[root@k8s01 ~]# ip addr add 192.168.60.11/24 dev vmA1
k8s01 创建、配置 vmB1 接口
[root@k8s01 ~]# ip link add dev vmB1-sw type veth peer name vmB1
[root@k8s01 ~]# ip link set vmB1-sw up
[root@k8s01 ~]# ip link set vmB1 up
[root@k8s01 ~]# ip addr add 192.168.70.11/24 dev vmB1
创建 tenantA 和 tenantB 网桥,它们代表虚拟交换机
[root@k8s01 31956]# ovs-vsctl add-br tenantA
[root@k8s01 31956]# ovs-vsctl add-br tenantB
将模拟接口插到网桥上
[root@k8s01 ~]# ovs-vsctl add-port tenantA vmA1-sw
[root@k8s01 ~]# ovs-vsctl add-port tenantB vmB1-sw
[root@k8s01 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
Bridge tenantA
Port tenantA
Interface tenantA
type: internal
Port "vmA1-sw"
Interface "vmA1-sw"
Bridge tenantB
Port tenantB
Interface tenantB
type: internal
Port "vmB1-sw"
Interface "vmB1-sw"
ovs_version: "2.11.0"
安装 openvswitch,需要安装 openstack yum 源
yum install -y openvswitch
systemctl start openvswitch
k8s02 创建、配置 vmA2 接口
[root@k8s02 ~]# ip link add dev vmA2-sw type veth peer name vmA2
[root@k8s02 ~]# ip link set vmA2-sw up
[root@k8s02 ~]# ip link set vmA2 up
[root@k8s02 ~]# ip addr add 192.168.60.12/24 dev vmA2
k8s02 创建、配置 vmB2 接口
[root@k8s02 ~]# ip link add dev vmB2-sw type veth peer name vmB2
[root@k8s02 ~]# ip link set vmB2-sw up
[root@k8s02 ~]# ip link set vmB2 up
[root@k8s02 ~]# ip addr add 192.168.70.12/24 dev vmB2
创建 tenantA 和 tenantB 网桥,它们代表虚拟交换机
[root@k8s02 31956]# ovs-vsctl add-br tenantA
[root@k8s02 31956]# ovs-vsctl add-br tenantB
将模拟接口插到网桥上
[root@k8s02 ~]# ovs-vsctl add-port tenantA vmA2-sw
[root@k8s02 ~]# ovs-vsctl add-port tenantB vmB2-sw
[root@k8s02 ~]# ovs-vsctl show
c3b22dc3-7884-41cc-98c8-da60451491c3
Bridge tenantA
Port tenantA
Interface tenantA
type: internal
Port "vmA2-sw"
Interface "vmA2-sw"
Bridge tenantB
Port tenantB
Interface tenantB
type: internal
Port "vmB2-sw"
Interface "vmB2-sw"
ovs_version: "2.11.0"
[root@k8s02 ~]# ping -c 2 192.168.60.11
PING 192.168.60.11 (192.168.60.11) 56(84) bytes of data.
From 192.168.60.12 icmp_seq=1 Destination Host Unreachable
From 192.168.60.12 icmp_seq=2 Destination Host Unreachable
--- 192.168.60.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2
[root@k8s02 ~]# ping -c 2 192.168.70.11
PING 192.168.70.11 (192.168.70.11) 56(84) bytes of data.
From 192.168.70.12 icmp_seq=1 Destination Host Unreachable
From 192.168.70.12 icmp_seq=2 Destination Host Unreachable
--- 192.168.70.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2
注意事项:
1、使用 ovs-vsctl 报错
[root@k8s02 ~]# ovs-vsctl add-br tenantA
net_mlx5: cannot load glue library: /lib64/libmlx5.so.1: version `MLX5_1.6' not found (required by /usr/lib64/dpdk-pmds-glue/librte_pmd_mlx5_glue.so.18.11.0)
net_mlx5: cannot initialize PMD due to missing run-time dependency on rdma-core libraries (libibverbs, libmlx5)
解决方法
yum install libmlx5
[root@k8s01 ~]# ovs-vsctl add-port tenantA vxlanA -- set interface vxlanA type=vxlan options:remote_ip=10.2.7.201 options:key=5000
[root@k8s01 ~]# ovs-vsctl add-port tenantB vxlanB -- set interface vxlanB type=vxlan options:remote_ip=10.2.7.201 options:key=6000
[root@k8s01 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
Bridge tenantA
Port tenantA
Interface tenantA
type: internal
Port "vmA1-sw"
Interface "vmA1-sw"
Port vxlanA
Interface vxlanA
type: vxlan
options: {key="5000", remote_ip="10.2.7.201"}
Bridge tenantB
Port tenantB
Interface tenantB
type: internal
Port vxlanB
Interface vxlanB
type: vxlan
options: {key="6000", remote_ip="10.2.7.201"}
Port "vmB1-sw"
Interface "vmB1-sw"
ovs_version: "2.11.0"
[root@k8s02 ~]# ovs-vsctl add-port tenantA vxlanA -- set interface vxlanA type=vxlan options:remote_ip=10.2.7.200 options:key=5000
[root@k8s02 ~]# ovs-vsctl add-port tenantB vxlanB -- set interface vxlanB type=vxlan options:remote_ip=10.2.7.200 options:key=6000
[root@k8s02 ~]# ovs-vsctl show
c3b22dc3-7884-41cc-98c8-da60451491c3
Bridge tenantA
Port vxlanA
Interface vxlanA
type: vxlan
options: {key="5000", remote_ip="10.2.7.200"}
Port tenantA
Interface tenantA
type: internal
Port "vmA2-sw"
Interface "vmA2-sw"
Bridge tenantB
Port vxlanB
Interface vxlanB
type: vxlan
options: {key="6000", remote_ip="10.2.7.200"}
Port "vmB2-sw"
Interface "vmB2-sw"
Port tenantB
Interface tenantB
type: internal
ovs_version: "2.11.0"
[root@k8s02 ~]# ping -c 2 192.168.60.11
PING 192.168.60.11 (192.168.60.11) 56(84) bytes of data.
64 bytes from 192.168.60.11: icmp_seq=1 ttl=64 time=1.81 ms
64 bytes from 192.168.60.11: icmp_seq=2 ttl=64 time=0.279 ms
--- 192.168.60.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.279/1.048/1.818/0.770 ms
[root@k8s02 ~]# ping -c 2 192.168.70.11
PING 192.168.70.11 (192.168.70.11) 56(84) bytes of data.
64 bytes from 192.168.70.11: icmp_seq=1 ttl=64 time=1.21 ms
64 bytes from 192.168.70.11: icmp_seq=2 ttl=64 time=0.204 ms
--- 192.168.70.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1000ms
rtt min/avg/max/mdev = 0.204/0.710/1.217/0.507 ms
ovs-vsctl 是配置 openvswitch 最重要的命令,ovs-vsctl 与 ovsdb-server 通信,ovsdb-server 管理着 openvswitch 的配置信息数据库。
ovs-vsctl 一次可以执行一条或多条命令,多条命令用 “--” 分开
帮助信息:ovs-vsctl --help、man 5 ovs-vswitchd.conf.db、ovs-vsctl list
显示已存在网桥的 db 信息
[root@k8s01 ~]# ovs-vsctl list bridge
_uuid : ed1cf240-72b2-4d0d-80ba-34fe4319805f
auto_attach : []
controller : []
datapath_id : "000042f21ced0d4d"
datapath_type : ""
datapath_version : "<unknown>"
external_ids : {}
fail_mode : []
flood_vlans : []
flow_tables : {}
ipfix : []
mcast_snooping_enable: false
mirrors : []
name : tenantB
netflow : []
other_config : {}
ports : [05e579af-f158-4a27-b046-3d45a63f463f, a33d043c-2a82-4f52-8c10-4a28eab38345, b17ade3a-31bf-4cd2-8f1c-8819b82c53aa]
protocols : []
rstp_enable : false
rstp_status : {}
sflow : []
status : {}
stp_enable : false
_uuid : af7cbcf0-95c4-4f3e-a89a-6eb5a2a44154
auto_attach : []
controller : []
datapath_id : "0000f2bc7caf3e4f"
datapath_type : ""
datapath_version : "<unknown>"
external_ids : {}
fail_mode : []
flood_vlans : []
flow_tables : {}
ipfix : []
mcast_snooping_enable: false
mirrors : []
name : tenantA
netflow : []
other_config : {}
ports : [a631ac6c-6039-4171-812a-aefc24487ac0, f26cfa5d-fd35-439a-a934-86bdfe632d5e, f87dd940-1d71-437a-8694-a003c25c7c7b]
protocols : []
rstp_enable : false
rstp_status : {}
sflow : []
status : {}
stp_enable : false
显示网桥 tenantA 的 db 信息
[root@k8s01 ~]# ovs-vsctl list bridge tenantA
_uuid : af7cbcf0-95c4-4f3e-a89a-6eb5a2a44154
auto_attach : []
controller : []
datapath_id : "0000f2bc7caf3e4f"
datapath_type : ""
datapath_version : "<unknown>"
external_ids : {}
fail_mode : []
flood_vlans : []
flow_tables : {}
ipfix : []
mcast_snooping_enable: false
mirrors : []
name : tenantA
netflow : []
other_config : {}
ports : [a631ac6c-6039-4171-812a-aefc24487ac0, f26cfa5d-fd35-439a-a934-86bdfe632d5e, f87dd940-1d71-437a-8694-a003c25c7c7b]
protocols : []
rstp_enable : false
rstp_status : {}
sflow : []
status : {}
stp_enable : false
传统的交换机端口是 2 层端口,流量在该交换机端口之间传输,这些端口没有 3 层的 ip 配置。即使是 linux bridge 也具有这些特性,比如 eth0 配置了 ip,再把它加入到一个 bridge,那么你将失去 eth0 的连接,因为 eth0 此时只作为 2 层端口,可以将 ip 移到连接 eth0 的 bridge 接口(比如 br0)
openvswitch 用 internal port 提供一个解决方法。 internal port 是一个 3 层端口,能够暴露到 openvswitch 外面,所以能够进行 ip 设置
k8s01 上配置
[root@k8s01 ~]# ovs-vsctl add-port tenantA internalPort -- set interface internalPort type=internal
[root@k8s01 ~]# ip addr add 192.168.60.50/24 dev internalPort
[root@k8s01 ~]# ip link set internalPort up
k8s02 上 ping 192.168.60.50
[root@k8s02 ~]# ping -c2 192.168.60.50
PING 192.168.60.50 (192.168.60.50) 56(84) bytes of data.
64 bytes from 192.168.60.50: icmp_seq=1 ttl=64 time=1.36 ms
64 bytes from 192.168.60.50: icmp_seq=2 ttl=64 time=0.162 ms
--- 192.168.60.50 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.162/0.763/1.364/0.601 ms
与网桥同名的端口(如 tenantA、tenantB)也是 internal port
[root@k8s01 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
Bridge tenantA
Port tenantA
Interface tenantA
type: internal
Port internalPort
Interface internalPort
type: internal
Port "vmA1-sw"
Interface "vmA1-sw"
Port vxlanA
Interface vxlanA
type: vxlan
options: {key="5000", remote_ip="10.2.7.201"}
Bridge tenantB
Port tenantB
Interface tenantB
type: internal
Port vxlanB
Interface vxlanB
type: vxlan
options: {key="6000", remote_ip="10.2.7.201"}
Port "vmB1-sw"
Interface "vmB1-sw"
ovs_version: "2.11.0"
[root@k8s01 ~]# ip link | grep tenant
197: tenantA: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
198: tenantB: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
交换机有 mirror port 和 span port,该 port 可以作为特定流量的镜像,用于排错。 openvswitch 也支持 mirror port
创建一个内部的 mirror port,我们需要一个端口并在上面进行 wireshark/tcpdump 监听
[root@k8s01 ~]# ovs-vsctl add-port tenantA mirrorPort -- set interface mirrorPort type=internal
创建 mirror 配置
[root@k8s01 ~]# ovs-vsctl --id=@vmA1-sw get port vmA1-sw -- \
> --id=@mirrorPort get port mirrorPort -- \
> --id=@mirror create mirror name=mirror \
> select-dst-port=@vmA1-sw select-src-port=@vmA1-sw output-port=@mirrorPort -- \
> set bridge tenantA mirrors=@mirror
8d73fe4f-b111-4f8f-a3d1-b8672dbe037a
安装 wireshark
[root@k8s01 ~]# yum -y install wireshark
k8s02 ping
[root@k8s02 ~]# ping 192.168.60.11
PING 192.168.60.11 (192.168.60.11) 56(84) bytes of data.
64 bytes from 192.168.60.11: icmp_seq=1 ttl=64 time=0.966 ms
64 bytes from 192.168.60.11: icmp_seq=2 ttl=64 time=0.211 ms
64 bytes from 192.168.60.11: icmp_seq=3 ttl=64 time=0.193 ms
64 bytes from 192.168.60.11: icmp_seq=4 ttl=64 time=0.235 ms
64 bytes from 192.168.60.11: icmp_seq=5 ttl=64 time=0.277 ms
...
监听 mirror 端口
[root@k8s01 ~]# ip link set mirrorPort up
[root@k8s01 ~]# tshark -c 6 -i mirrorPort
Running as user "root" and group "root". This could be dangerous.
Capturing on 'mirrorPort'
1 0.000000000 192.168.60.12 -> 192.168.60.11 ICMP 98 Echo (ping) request id=0x7aec, seq=1/256, ttl=64
2 0.000140651 192.168.60.11 -> 192.168.60.12 ICMP 98 Echo (ping) reply id=0x7aec, seq=1/256, ttl=64 (request in 1)
3 0.999594777 192.168.60.12 -> 192.168.60.11 ICMP 98 Echo (ping) request id=0x7aec, seq=2/512, ttl=64
4 0.999626499 192.168.60.11 -> 192.168.60.12 ICMP 98 Echo (ping) reply id=0x7aec, seq=2/512, ttl=64 (request in 3)
5 1.999676670 192.168.60.12 -> 192.168.60.11 ICMP 98 Echo (ping) request id=0x7aec, seq=3/768, ttl=64
6 1.999707851 192.168.60.11 -> 192.168.60.12 ICMP 98 Echo (ping) reply id=0x7aec, seq=3/768, ttl=64 (request in 5)
6 packets captured
监听 eth0 端口,发现 192.168.60.12 > 192.168.60.11 流量是封装成 vxlan 通过 eth0 发送
[root@k8s01 ~]# tcpdump -i eth0 host 10.2.7.201 -n | grep -C2 192.168.60.11
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
09:09:44.905533 IP 10.2.7.200.51404 > 10.2.7.201.2380: Flags [.], ack 1949, win 1424, options [nop,nop,TS val 2565202554 ecr 1863325088], length 0
09:09:44.906178 IP 10.2.7.201.53192 > 10.2.7.200.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.60.12 > 192.168.60.11: ICMP echo request, id 31344, seq 9, length 64
09:09:44.906281 IP 10.2.7.200.39328 > 10.2.7.201.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.60.11 > 192.168.60.12: ICMP echo reply, id 31344, seq 9, length 64
09:09:44.906463 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 3157118280:3157118380, ack 17960795, win 386, options [nop,nop,TS val 1863325089 ecr 4229146211], length 100
09:09:44.906569 IP 10.2.7.107.55590 > 10.2.7.201.ssh: Flags [.], ack 100, win 1424, options [nop,nop,TS val 4229147211 ecr 1863325089], length 0
--
...
清理 mirror
[root@k8s01 ~]# ovs-vsctl clear bridge tenantA mirrors
[root@k8s01 ~]# ovs-vsctl del-port mirrorPort
注意:mirror 设置中 --id 命令提供接口的别名,需要接口的 id 而不是名称。不能给 patch ports 创建 mirror,如果需要 mirror patch ports 流量,请使用 veth pairs。
patch ports 就像连接两个交换机之间的电缆,或者插入 openvswitch 网桥的 veth pair
创建 tenantC 网桥以及设置 tenantC internal port
k8s01 节点
[root@k8s01 ~]# ovs-vsctl add-br tenantC
[root@k8s01 ~]# ip addr add 192.168.80.11/24 dev tenantC
[root@k8s01 ~]# ip link set tenantC up
k8s02 节点
[root@k8s02 ~]# ovs-vsctl add-br tenantC
[root@k8s02 ~]# ip addr add 192.168.80.12/24 dev tenantC
[root@k8s02 ~]# ip link set tenantC up
k8s02 ping 192.168.80.11,没有 patch 连接,所以不通
[root@k8s02 ~]# ping -c2 192.168.80.11
PING 192.168.80.11 (192.168.80.11) 56(84) bytes of data.
From 192.168.80.12 icmp_seq=1 Destination Host Unreachable
From 192.168.80.12 icmp_seq=2 Destination Host Unreachable
--- 192.168.80.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2
建立 patch 连接
k8s01 节点
[root@k8s01 ~]# ovs-vsctl add-port tenantC patchC -- \
> add-port tenantA patchA -- \
> set interface patchC type=patch options:peer=patchA -- \
> set interface patchA type=patch options:peer=patchC
k8s02 节点
[root@k8s02 ~]# ovs-vsctl add-port tenantC patchC -- \
> add-port tenantA patchA -- \
> set interface patchC type=patch options:peer=patchA -- \
> set interface patchA type=patch options:peer=patchC
k8s02 ping 192.168.80.11
[root@k8s02 ~]# ping -c2 192.168.80.11
PING 192.168.80.11 (192.168.80.11) 56(84) bytes of data.
64 bytes from 192.168.80.11: icmp_seq=1 ttl=64 time=1.47 ms
64 bytes from 192.168.80.11: icmp_seq=2 ttl=64 time=0.189 ms
--- 192.168.80.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.189/0.833/1.478/0.645 ms
k8s01 节点
[root@k8s01 ~]# ovs-vsctl add-port tenantC vlanPortC tag=10 -- \
> set interface vlanPortC type=internal
[root@k8s01 ~]# ip addr add 192.168.90.11/24 dev vlanPortC
[root@k8s01 ~]# ip link set vlanPortC up
k8s02 节点
[root@k8s02 ~]# ovs-vsctl add-port tenantC vlanPortC tag=10 -- \
> set interface vlanPortC type=internal
[root@k8s02 ~]# ip addr add 192.168.90.12/24 dev vlanPortC
[root@k8s02 ~]# ip link set vlanPortC up
k8s02 ping 192.168.90.11
[root@k8s02 ~]# ping -c2 192.168.90.11
PING 192.168.90.11 (192.168.90.11) 56(84) bytes of data.
64 bytes from 192.168.90.11: icmp_seq=1 ttl=64 time=1.67 ms
64 bytes from 192.168.90.11: icmp_seq=2 ttl=64 time=0.178 ms
--- 192.168.90.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.178/0.927/1.676/0.749 ms
监听 eth0 端口,发现 192.168.90.12 > 192.168.90.11 流量是封装成 vxlan 通过 eth0 发送
[root@k8s01 ~]# tcpdump -i eth0 -n| grep -C2 192.168.90.12
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
10:28:34.066375 IP 10.2.7.202.46040 > 10.2.7.200.2380: Flags [.], ack 886, win 1424, options [nop,nop,TS val 1172533648 ecr 2569931715], length 0
10:28:34.068319 IP 10.2.7.201.42349 > 10.2.7.200.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.90.12 > 192.168.90.11: ICMP echo request, id 19832, seq 134, length 64
10:28:34.068389 IP 10.2.7.200.49438 > 10.2.7.201.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.90.11 > 192.168.90.12: ICMP echo reply, id 19832, seq 134, length 64
10:28:34.068592 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 3157289604:3157289704, ack 17976739, win 431, options [nop,nop,TS val 1868054235 ecr 4233875337], length 100
..
vxlan 和 gre 很相似。移除 tenantA 和 tenantC 之间的 patch,gre 管道将处理 tenantC 网桥之间的流量信息
删除 patch ports
k8s01 节点
[root@k8s01 ~]# ovs-vsctl del-port patchC
[root@k8s01 ~]# ovs-vsctl del-port patchA
k8s02 节点
[root@k8s02 ~]# ovs-vsctl del-port patchC
[root@k8s02 ~]# ovs-vsctl del-port patchA
k8s02 ping 192.168.90.11,不通
[root@k8s02 ~]# ping -c2 192.168.90.11
PING 192.168.90.11 (192.168.90.11) 56(84) bytes of data.
From 192.168.90.12 icmp_seq=1 Destination Host Unreachable
From 192.168.90.12 icmp_seq=2 Destination Host Unreachable
--- 192.168.90.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2
增加 gre 管道
k8s01 节点
[root@k8s01 ~]# ovs-vsctl add-port tenantC greC -- set interface greC type=gre options:remote_ip=10.2.7.201
k8s02 节点
[root@k8s02 ~]# ovs-vsctl add-port tenantC greC -- set interface greC type=gre options:remote_ip=10.2.7.200
k8s02 ping 192.168.90.11
[root@k8s02 ~]# ping -c2 192.168.90.11
PING 192.168.90.11 (192.168.90.11) 56(84) bytes of data.
64 bytes from 192.168.90.11: icmp_seq=1 ttl=64 time=1.28 ms
64 bytes from 192.168.90.11: icmp_seq=2 ttl=64 time=0.173 ms
--- 192.168.90.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.173/0.729/1.286/0.557 ms
监听 eth0 端口,发现 192.168.90.12 > 192.168.90.11 流量是封装成 GREv0 通过 eth0 发送
[root@k8s01 ~]# tcpdump -n -i eth0 | grep -C2 192.168.90.12
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
10:50:38.842380 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 1988:2080, ack 393, win 431, options [nop,nop,TS val 1869379008 ecr 4235201072], length 92
10:50:38.842472 IP 10.2.7.107.55590 > 10.2.7.201.ssh: Flags [.], ack 2080, win 1424, options [nop,nop,TS val 4235201101 ecr 1869379008], length 0
10:50:38.842498 IP 10.2.7.201 > 10.2.7.200: GREv0, length 106: IP 192.168.90.12 > 192.168.90.11: ICMP echo request, id 3857, seq 1, length 64
10:50:38.842607 IP 10.2.7.107.ssh > 10.3.57.11.34344: Flags [P.], seq 2640:2752, ack 641, win 399, options [nop,nop,TS val 4235201101 ecr 3872193733], length 112
10:50:38.842731 IP 10.3.57.11.34344 > 10.2.7.107.ssh: Flags [.], ack 2752, win 2440, options [nop,nop,TS val 3872193762 ecr 4235201101], length 0
10:50:38.842902 IP 10.2.7.200 > 10.2.7.201: GREv0, length 106: IP 192.168.90.11 > 192.168.90.12: ICMP echo reply, id 3857, seq 1, length 64
10:50:38.843231 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 2080:2180, ack 393, win 431, options [nop,nop,TS val 1869379009 ecr 4235201101], length 100
10:50:38.843267 IP 10.2.7.107.55590 > 10.2.7.201.ssh: Flags [.], ack 2180, win 1424, options [nop,nop,TS val 4235201101 ecr 1869379009], length 0
--
...
监听 gre_sys 端口,发现 tenantC 网桥之间的流量(如下面的 192.168.80.12 > 192.168.80.11)会通过 gre_sys 接口
[root@k8s01 ~]# tcpdump -n -i gre_sys
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on gre_sys, link-type EN10MB (Ethernet), capture size 262144 bytes
10:49:28.236776 IP 192.168.80.12 > 192.168.80.11: ICMP echo request, id 2707, seq 36, length 64
10:49:28.236827 IP 192.168.80.11 > 192.168.80.12: ICMP echo reply, id 2707, seq 36, length 64
10:49:29.236759 IP 192.168.80.12 > 192.168.80.11: ICMP echo request, id 2707, seq 37, length 64
10:49:29.236809 IP 192.168.80.11 > 192.168.80.12: ICMP echo reply, id 2707, seq 37, length 64
10:49:30.236728 IP 192.168.80.12 > 192.168.80.11: ICMP echo request, id 2707, seq 38, length 64
10:49:30.236773 IP 192.168.80.11 > 192.168.80.12: ICMP echo reply, id 2707, seq 38, length 64
...
mtu 最大传输单元,一般最大值为 1500,超过就会切分。vxlan 使用封装技术,传输单元可能会超过底层网络最大传输单元 1500,这会导致问题,比如下面。
k8s02 作为server
[root@k8s02 ~]# iperf3 -s
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.70.11, port 40230
[ 5] local 192.168.70.12 port 5201 connected to 192.168.70.11 port 40232
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-1.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 1.00-2.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 2.00-3.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 3.00-4.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 4.00-5.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 5.00-6.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 6.00-7.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 7.00-8.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 8.00-9.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 9.00-10.00 sec 0.00 Bytes 0.00 bits/sec
[ 5] 10.00-10.04 sec 0.00 Bytes 0.00 bits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-10.04 sec 0.00 Bytes 0.00 bits/sec sender
[ 5] 0.00-10.04 sec 0.00 Bytes 0.00 bits/sec receiver
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
...
k8s01 作为 client
[root@k8s01 ~]# iperf3 -c 192.168.70.12
Connecting to host 192.168.70.12, port 5201
[ 4] local 192.168.70.11 port 40232 connected to 192.168.70.12 port 5201
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 84.8 KBytes 694 Kbits/sec 2 1.41 KBytes
[ 4] 1.00-2.00 sec 0.00 Bytes 0.00 bits/sec 1 1.41 KBytes
[ 4] 2.00-3.00 sec 0.00 Bytes 0.00 bits/sec 0 1.41 KBytes
[ 4] 3.00-4.00 sec 0.00 Bytes 0.00 bits/sec 1 1.41 KBytes
[ 4] 4.00-5.00 sec 0.00 Bytes 0.00 bits/sec 0 1.41 KBytes
[ 4] 5.00-6.00 sec 0.00 Bytes 0.00 bits/sec 0 1.41 KBytes
[ 4] 6.00-7.00 sec 0.00 Bytes 0.00 bits/sec 1 1.41 KBytes
[ 4] 7.00-8.00 sec 0.00 Bytes 0.00 bits/sec 0 1.41 KBytes
[ 4] 8.00-9.00 sec 0.00 Bytes 0.00 bits/sec 0 1.41 KBytes
[ 4] 9.00-10.00 sec 0.00 Bytes 0.00 bits/sec 0 1.41 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-10.00 sec 84.8 KBytes 69.5 Kbits/sec 5 sender
[ 4] 0.00-10.00 sec 0.00 Bytes 0.00 bits/sec receiver
iperf Done.
为了避免这样的问题,必须调整底层网络的 mtu。通过计算额外的 header,VXLAN + UDP + IP + Ethernet,得到 mtu 1554
[root@k8s02 ~]# ip link set eth0 mtu 1554
[root@k8s02 ~]# iperf3 -s
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.70.11, port 45934
[ 5] local 192.168.70.12 port 5201 connected to 192.168.70.11 port 45936
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-1.00 sec 316 MBytes 2.65 Gbits/sec
[ 5] 1.00-2.00 sec 329 MBytes 2.76 Gbits/sec
[ 5] 2.00-3.00 sec 356 MBytes 2.99 Gbits/sec
[ 5] 3.00-4.00 sec 353 MBytes 2.96 Gbits/sec
[ 5] 4.00-5.00 sec 353 MBytes 2.96 Gbits/sec
[ 5] 5.00-6.00 sec 344 MBytes 2.88 Gbits/sec
[ 5] 6.00-7.00 sec 348 MBytes 2.92 Gbits/sec
[ 5] 7.00-8.00 sec 348 MBytes 2.92 Gbits/sec
[ 5] 8.00-9.00 sec 342 MBytes 2.87 Gbits/sec
[ 5] 9.00-10.00 sec 344 MBytes 2.89 Gbits/sec
[ 5] 10.00-10.04 sec 15.1 MBytes 2.99 Gbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-10.04 sec 0.00 Bytes 0.00 bits/sec sender
[ 5] 0.00-10.04 sec 3.37 GBytes 2.88 Gbits/sec receiver
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
[root@k8s01 ~]# ip link set eth0 mtu 1554
[root@k8s01 ~]# iperf3 -c 192.168.70.12
Connecting to host 192.168.70.12, port 5201
[ 4] local 192.168.70.11 port 45936 connected to 192.168.70.12 port 5201
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 332 MBytes 2.79 Gbits/sec 8 730 KBytes
[ 4] 1.00-2.00 sec 330 MBytes 2.77 Gbits/sec 0 1022 KBytes
[ 4] 2.00-3.00 sec 356 MBytes 2.99 Gbits/sec 0 1.23 MBytes
[ 4] 3.00-4.00 sec 352 MBytes 2.96 Gbits/sec 32 1.42 MBytes
[ 4] 4.00-5.00 sec 352 MBytes 2.96 Gbits/sec 0 1.59 MBytes
[ 4] 5.00-6.00 sec 345 MBytes 2.89 Gbits/sec 14 1.30 MBytes
[ 4] 6.00-7.00 sec 348 MBytes 2.92 Gbits/sec 0 1.49 MBytes
[ 4] 7.00-8.00 sec 349 MBytes 2.93 Gbits/sec 9 1.19 MBytes
[ 4] 8.00-9.00 sec 341 MBytes 2.86 Gbits/sec 0 1.39 MBytes
[ 4] 9.00-10.00 sec 345 MBytes 2.90 Gbits/sec 32 1.55 MBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-10.00 sec 3.37 GBytes 2.89 Gbits/sec 95 sender
[ 4] 0.00-10.00 sec 3.37 GBytes 2.89 Gbits/sec receiver
iperf Done.
也可以减少 vmB1、vmB2(及其对端 -sw 接口)的 mtu
[root@k8s02 ~]# ip link set vmB2 mtu 1450
[root@k8s02 ~]# ip link set vmB2-sw mtu 1450
[root@k8s02 ~]# iperf3 -s
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.70.11, port 52206
[ 5] local 192.168.70.12 port 5201 connected to 192.168.70.11 port 52208
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-1.00 sec 298 MBytes 2.50 Gbits/sec
[ 5] 1.00-2.00 sec 326 MBytes 2.73 Gbits/sec
[ 5] 2.00-3.00 sec 327 MBytes 2.75 Gbits/sec
[ 5] 3.00-4.00 sec 323 MBytes 2.71 Gbits/sec
[ 5] 4.00-5.00 sec 334 MBytes 2.80 Gbits/sec
[ 5] 5.00-6.00 sec 341 MBytes 2.86 Gbits/sec
[ 5] 6.00-7.00 sec 342 MBytes 2.87 Gbits/sec
[ 5] 7.00-8.00 sec 344 MBytes 2.89 Gbits/sec
[ 5] 8.00-9.00 sec 345 MBytes 2.89 Gbits/sec
[ 5] 9.00-10.00 sec 341 MBytes 2.86 Gbits/sec
[ 5] 10.00-10.04 sec 13.9 MBytes 2.86 Gbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-10.04 sec 0.00 Bytes 0.00 bits/sec sender
[ 5] 0.00-10.04 sec 3.26 GBytes 2.79 Gbits/sec receiver
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
[root@k8s01 ~]# ip link set vmB1 mtu 1450
[root@k8s01 ~]# ip link set vmB1-sw mtu 1450
[root@k8s01 ~]# iperf3 -c 192.168.70.12
Connecting to host 192.168.70.12, port 5201
[ 4] local 192.168.70.11 port 52208 connected to 192.168.70.12 port 5201
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 312 MBytes 2.61 Gbits/sec 47 754 KBytes
[ 4] 1.00-2.00 sec 328 MBytes 2.75 Gbits/sec 0 1.00 MBytes
[ 4] 2.00-3.00 sec 327 MBytes 2.75 Gbits/sec 0 1.21 MBytes
[ 4] 3.00-4.00 sec 322 MBytes 2.71 Gbits/sec 0 1.39 MBytes
[ 4] 4.00-5.00 sec 334 MBytes 2.80 Gbits/sec 0 1.55 MBytes
[ 4] 5.00-6.00 sec 341 MBytes 2.86 Gbits/sec 23 1.28 MBytes
[ 4] 6.00-7.00 sec 342 MBytes 2.87 Gbits/sec 33 1.45 MBytes
[ 4] 7.00-8.00 sec 345 MBytes 2.89 Gbits/sec 1 1.17 MBytes
[ 4] 8.00-9.00 sec 344 MBytes 2.88 Gbits/sec 0 1.37 MBytes
[ 4] 9.00-10.00 sec 341 MBytes 2.86 Gbits/sec 0 1.53 MBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-10.00 sec 3.26 GBytes 2.80 Gbits/sec 104 sender
[ 4] 0.00-10.00 sec 3.26 GBytes 2.80 Gbits/sec receiver
iperf Done.
参考文章: