
Debug OpenvSwitch (by quqi99)


Author: Zhang Hua   Published: 2020-12-28
Copyright: this article may be freely reproduced, provided the original source, author information and this copyright notice are clearly indicated with a hyperlink.

GDB Debugging

# will stop service
sysctl kernel.core_pattern
sudo service apport start force_start=1 enabled=1
grep enabled /etc/default/apport
sudo killall -SIGSEGV ovs-vswitchd
sudo cat /var/log/apport.log

# won't stop service
gdb -ex "set pagination 0" -ex "thread apply all bt" -batch -p $(pidof ovs-vswitchd) #for call trace
gcore $(pidof ovs-vswitchd)
sudo apt install openvswitch-dbg cgdb
#cgdb $(which ovs-vswitchd) $(pidof ovs-vswitchd)
cgdb ovs-vswitchd /var/crash/_usr_lib_openvswitch-switch_ovs-vswitchd.0.crash
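As a hedged sketch (the .crash file name and paths are taken from the example above and may differ on your system), the apport crash report can also be unpacked first so that gdb/cgdb loads a plain core file:
sudo apt install apport openvswitch-dbg cgdb
# unpack the report; the target directory will contain CoreDump, ExecutablePath, etc.
sudo apport-unpack /var/crash/_usr_lib_openvswitch-switch_ovs-vswitchd.0.crash /tmp/ovs-crash
cgdb $(which ovs-vswitchd) /tmp/ovs-crash/CoreDump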

Debug Log

# log levels: off, emer, err, warn, info, dbg
# log destinations: console, syslog, file
sudo ovs-appctl vlog/list |grep vswitchd
sudo ovs-appctl vlog/set vswitchd:file:dbg
sudo ovs-appctl vlog/set vswitchd:file:OFF
# use this one, we can see the realtime log
sudo ovs-appctl vlog/set ANY:ANY:dbg
sudo ovs-appctl vlog/set ANY:file:dbg
tail -f /var/log/openvswitch/ovs-vswitchd.log

eg: ovn-controller.log is too big

ls /var/run/openvswitch/*ctl
ls /var/run/ovn/*.ctl
#sudo ovs-appctl -t ovsdb-server list-commands
sudo ovs-appctl -t /var/run/openvswitch/ovsdb-server.21606.ctl list-commands
sudo ovs-appctl -t /var/run/openvswitch/ovs-vswitchd.21623.ctl list-commands
sudo ovs-appctl -t /var/run/ovn/ovnnb_db.ctl list-commands
sudo ovs-appctl -t /var/run/ovn/ovn-northd.21745.ctl list-commands
sudo ovs-appctl -t /var/run/ovn/ovnnb_db.ctl list-commands
sudo ovs-appctl -t /var/run/ovn/ovnsb_db.ctl list-commands
sudo ovs-appctl -t /var/run/ovn/ovn-controller.21762.ctl list-commands
# ovn-controller.log is too big
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl list-commands
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/list-pattern
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/list
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/list |grep chassis
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/set chassis:file:dbg
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/set ANY:ANY:err
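After lowering the verbosity, a hedged way to shrink an already huge log without restarting ovn-controller is to truncate the file and ask the daemon to reopen it (the log path is an assumption; depending on the package it may be /var/log/ovn/ovn-controller.log or /var/log/openvswitch/ovn-controller.log):
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/set ANY:file:info
sudo truncate -s 0 /var/log/ovn/ovn-controller.log
# vlog/reopen makes the daemon reopen the (now truncated/rotated) log file
sudo ovs-appctl -t /var/run/ovn/ovn-controller*.ctl vlog/reopen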

CLI Usage

# bridge status
ovs-vsctl show
# list bridges
ovs-vsctl list-br
# list ports on a bridge
ovs-vsctl list-ports br-int
# list interfaces on a bridge
ovs-vsctl list-ifaces br-int
# find which bridge a port/interface belongs to
ovs-vsctl port-to-br tap30580aa5-b0
ovs-vsctl iface-to-br tap30580aa5-b0
# dump a bridge's flow table
ovs-ofctl dump-flows br-int
# show bridge information
ovs-ofctl show br-int
# datapath statistics: hit = datapath cache hits, missed = misses (sent to userspace), lost = dropped before reaching userspace
ovs-dpctl show
# detailed per-port statistics
ovs-dpctl show -s
# statistics for a specific port
ovs-ofctl dump-ports br-int 1
# bridge forwarding (MAC learning) table
ovs-appctl fdb/show br-int
# ovsdb transaction log
ovsdb-tool show-log -m /var/lib/openvswitch/conf.db
# flow matching / tracing
ovs-appctl ofproto/trace br-tun dl_vlan=1
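When feeding ofproto/trace, in_port must be the OpenFlow port number rather than the interface name. A minimal sketch for looking it up (the tap name is just an example from above):
# map an interface name to its OpenFlow port number
ovs-vsctl get Interface tap30580aa5-b0 ofport
# or list all name/ofport pairs, or grep the bridge description
ovs-vsctl -- --columns=name,ofport list Interface
ovs-ofctl show br-int | grep tap30580aa5-b0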

tcpdump

* On the compute node
tcpdump -vvv -neli tapd76aea08-db port 67 or port 68 or arp -w `hostname`-tapd76aea08-db-vmport.pcap
* On the dhcp-agent node
sudo ip netns exec qdhcp-2be2d2e6-a691-49fb-b260-c4e8abe86fd7 tcpdump -neli any port 67 or port 68 or arp -w `hostname`-dhcpport.pcap
* Run dhclient <interface> on the VM
* Check whether the instance got an IP

Flow Debug - ovs-stat

Another example: https://pastebin.ubuntu.com/p/MHNVf8wXtb/

snap install ovs-stat
snap connect ovs-stat:network-control
snap connect ovs-stat:openvswitch
ovs-stat --compress
# https://snapcraft.io/ovs-stat
juju ssh nova-compute/0 -- sudo -s
#sudo snap install snapd --edge
sudo snap install ovs-stat   # for analysing ovs itself
sudo snap connect ovs-stat:openvswitch
sudo snap connect ovs-stat:network-control
# for analysing sosreport ovs data where the sosreport is on a separate filesystem
#sudo snap connect ovs-stat:removable-media
#ovs-stat -p /tmp/results --tree ./sosreport-015 --openstack   # don't use sudo
ovs-stat -p /tmp/results --tree --openstack
root@juju-7c33c2-bionic-7:~# readlink -f /tmp/snap.ovs-stat/tmp/results/juju-7c33c2-bionic-7/ovs/bridges/br-int/ports/*
/tmp/snap.ovs-stat/tmp/results/juju-7c33c2-bionic-7/ovs/ports/int-br-data
/tmp/snap.ovs-stat/tmp/results/juju-7c33c2-bionic-7/ovs/ports/patch-tun
/tmp/snap.ovs-stat/tmp/results/juju-7c33c2-bionic-7/ovs/ports/tapcdda0c13-5e
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "openstack.ports"
int-br-data  patch-int  patch-tun  phy-br-data  tapcdda0c13-5e  vxlan-0a05008d
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "openstack.bridge br-int"
int-br-data     patch-tun       tapcdda0c13-5e

# demo help
root@juju-7c33c2-bionic-7:~# sudo ls /tmp/snap.ovs-stat/tmp/results
juju-7c33c2-bionic-7
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query ""
Choose one of the following:
ofproto-trace
openstack
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "openstack"
Choose one of the following:
openstack.bridge
openstack.bridge.ports
openstack.bridges
openstack.bridges.list
openstack.dvr
openstack.l2pop
openstack.local-vlans
openstack.local-vlans.info
openstack.port
openstack.port.flows
openstack.port.info
openstack.port.list
openstack.ports
openstack.ports.list
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "openstack.port tapcdda0c13-5e"
Port tapcdda0c13-5e:
- bridge: br-int
- id: 6
- local-vlan: 3
- mac address: fe:16:3e:c4:58:9c
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "openstack.l2pop 3"
10.5.0.141
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "ofproto-trace"
Choose one of the following:
ofproto-trace.port
ofproto-trace.port.all
root@juju-7c33c2-bionic-7:~# ovs-stat -p /tmp/results --host juju-7c33c2-bionic-7 --query "ofproto-trace.port tapcdda0c13-5e"
IMPORTANT: it looks like this port is attached to a vm so mac address has been converted from fe:16:3e:c4:58:9c to fa:16:3e:c4:58:9c
[arp]
sudo ovs-appctl ofproto/trace br-int in_port=6,arp,arp_spa=192.168.21.7,dl_src=fa:16:3e:c4:58:9c
[icmp]
sudo ovs-appctl ofproto/trace br-int in_port=6,ip,nw_proto=1,nw_src=192.168.21.7,nw_dst=1.1.1.1,dl_src=fa:16:3e:c4:58:9c
[dhcp]
sudo ovs-appctl ofproto/trace br-int udp,in_port=6,dl_src=fa:16:3e:c4:58:9c,dl_dst=ff:ff:ff:ff:ff:ff,nw_src=0.0.0.0,nw_dst=255.255.255.255,udp_src=68,udp_dst=67
[vm-to-vm]
sudo ovs-appctl ofproto/trace br-int in_port=6,tcp,dl_src=fa:16:3e:c4:58:9c,dl_dst=MAC_OF_REMOTE_INSTANCE
sudo ovs-appctl ofproto/trace br-int in_port=6,dl_vlan=3,dl_src=fa:16:3e:c4:58:9c,dl_dst=MAC_OF_REMOTE_INSTANCE

root@juju-7c33c2-bionic-7:~# sudo ovs-appctl ofproto/trace br-int in_port=6,arp,arp_spa=192.168.21.7,dl_src=fa:16:3e:c4:58:9c
Flow: arp,in_port=6,vlan_tci=0x0000,dl_src=fa:16:3e:c4:58:9c,dl_dst=00:00:00:00:00:00,arp_spa=192.168.21.7,arp_tpa=0.0.0.0,arp_op=0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
bridge("br-int")
----------------
 0. priority 0, cookie 0xe1185b8d7fdb39b4
    goto_table:60
60. in_port=6, priority 100, cookie 0xe1185b8d7fdb39b4
    set_field:0x6->reg5
    set_field:0x3->reg6
    resubmit(,71)
71. arp,reg5=0x6,in_port=6,dl_src=fa:16:3e:c4:58:9c,arp_spa=192.168.21.7, priority 95, cookie 0xe1185b8d7fdb39b4
    resubmit(,94)
94. priority 1, cookie 0xe1185b8d7fdb39b4
    NORMAL
     -> no learned MAC for destination, flooding

bridge("br-tun")
----------------
 0. in_port=1, priority 1, cookie 0x95591ec120b6fb4b
    goto_table:2
 2. dl_dst=00:00:00:00:00:00/01:00:00:00:00:00, priority 0, cookie 0x95591ec120b6fb4b
    goto_table:20
20. priority 0, cookie 0x95591ec120b6fb4b
    goto_table:22
22. dl_vlan=3, priority 1, cookie 0x95591ec120b6fb4b
    pop_vlan
    set_field:0x4d1->tun_id
    output:3
     -> output to kernel tunnel

bridge("br-data")
-----------------
 0. in_port=1, priority 2, cookie 0xf521d3d0416ecc5b
    drop

Final flow: arp,reg5=0x6,reg6=0x3,in_port=6,vlan_tci=0x0000,dl_src=fa:16:3e:c4:58:9c,dl_dst=00:00:00:00:00:00,arp_spa=192.168.21.7,arp_tpa=0.0.0.0,arp_op=0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
Megaflow: pkt_mark=0,recirc_id=0,ct_state=-trk,eth,arp,in_port=6,vlan_tci=0x0000/0x1fff,dl_src=fa:16:3e:c4:58:9c,dl_dst=00:00:00:00:00:00,arp_spa=192.168.21.7
Datapath actions: set(tunnel(tun_id=0x4d1,src=10.5.2.207,dst=10.5.0.141,ttl=64,tp_dst=4789,flags(df|key))),6,push_vlan(vid=3,pcp=0),1

If you hit a packet-drop problem, how do you capture it? For example:

Further analysis using ovs-stat:
# Get id from tap device
$ ovs-stat -p /var/snap/ovs-stat/common/tmp/tmp.KEKB4UNSpk --host ps5-ra2-n1 --query "openstack.port tapc96ec0d2-57"
Port tapc96ec0d2-57:
- bridge: br-int
- id: 118
- local-vlan:
- mac address: fe:16:3e:0a:60:3e

# Get metadata value from table=0 flow
$ grep in_port=118 /var/snap/ovs-stat/common/tmp/tmp.KEKB4UNSpk/ps5-ra2-n1/ovs/bridges/br-int/flows.stripped | grep table=0
table=0, priority=100,in_port=118 actions=set_field:0x48->reg13,set_field:0x20->reg11,set_field:0x21->reg12,set_field:0x3d3->metadata,set_field:0x3->reg14,resubmit(,8)

metadata = 0x3d3

# Get all DROP rules with n_packets > 0
$ grep metadata=0x3d3 /var/snap/ovs-stat/common/tmp/tmp.KEKB4UNSpk/ps5-ra2-n1/ovs/bridges/br-int/flows | grep -v n_packets=0 | grep -i drop
cookie=0x50ac02e5, duration=136901.013s, table=14, n_packets=174365, n_bytes=14228750, priority=65535,ct_state=+inv+trk,metadata=0x3d3 actions=drop
cookie=0x6d966244, duration=136907.908s, table=44, n_packets=4, n_bytes=216, priority=65535,ct_state=+inv+trk,metadata=0x3d3 actions=drop

The packets on the network are getting dropped due to invalid connections (ct_state=+inv+trk).
However ./sos_commands/openvswitch/ovs-appctl_dpctl.dump-conntrack_-m_system_ovs-system does not show any conntrack rules with mark state.
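If the live system is still available, a hedged follow-up is to inspect the conntrack tables directly (assuming, as OVN does, that reg13 in the table=0 flow above - 0x48 = 72 - holds the port's conntrack zone; <backend-ip> is a placeholder):
# dump only that zone from the OVS datapath conntrack table
sudo ovs-appctl dpctl/dump-conntrack zone=72
# or search the full dumps for the suspect address
sudo ovs-appctl dpctl/dump-conntrack -m | grep <backend-ip>
sudo conntrack -L | grep <backend-ip>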

Collect flows before and after the problem

ovs-stat --openstack --compress --archive-tag before
<restart neutron-openvswitch-agent>
ovs-stat --openstack --compress --archive-tag after

Example - LB FIP unreachable in a dvr + l3ha environment

In a dvr + l3ha environment the LB FIP is unreachable (because on the compute nodes hosting the service VM there are flows with multiple cookies, which breaks the port used by the backend). With dvr+l3ha, compute nodes have qrouter-xxx and the l3 gateway nodes have snat-xxx; step (3) in the path below is an unbound port.

(1) curl > (2) lb floating ip > (3) lb vip > (4) amphora master > (5) backend vm

1, Find the l3 master:
neutron l3-agent-list-hosting-router <tenant-router-uuid>

2, Then capture packets on both the l3 master and the compute node:
sudo ip netns exec snat-<router-uuid> tcpdump -enli any host <lb-vip>
sudo ip netns exec qrouter-<router-uuid> tcpdump -enli any host <lb-vip>

3, If the previous step captures no packets for the lb-vip, the problem is east-west, so check whether the l2-pop tunnel between the compute node and the l3 node exists:
grep -r local_ip /etc/neutron  # make a note of the address from each host
sudo snap install ovs-stat
sudo snap connect ovs-stat:openvswitch
sudo snap connect ovs-stat:network-control
sudo ovs-stat --openstack
sudo ovs-stat -p <result-path-from-above> --query "openstack.l2pop <namespace>"  # snat- on gateway, qrouter- on compute

4, If the l2-pop of the previous step is fine, capture on br-tun on both nodes:
sudo ovs-tcpdump -enl -i br-tun host <lb-vip>
grep -r local_ip /etc/neutron
ip r get <address> | grep dev | cut -d ' ' -f 3
tcpdump -enl -i <iface> host <lb-vip>

5, If that is fine, capture on the physical NIC on both nodes:
grep -r local_ip /etc/neutron
ip r get <address> | grep dev | cut -d ' ' -f 3
tcpdump -enl -i <iface> host <lb-vip>

Finally, on the compute node running the amphora master VM we saw the following multi-cookie error:
$ ovs-stat -p /var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW --show-neutron-errors
Data source: localhost (hostname=xxx)
Results root: /var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW
Read-only: true
Searching for errors related to Openstack Neutron usage of Openvswitch...
INFO: the following bridges have more than one cookie. Depending on which neutron plugin you are using this may or may not be a problem i.e. if you are using the openvswitch ML2 plugin there is only supposed to be one cookie per bridge but if you are using the OVN plugin there will be many cookies.
br-int (725) - run the following to see full list of cookies:
ls /var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW/xxx/ovs/bridges/br-int/flowinfo/cookies/*
broken links run:
find /var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW/xxx -xtype l

Normally an ovs-agent has only one cookie; the cookie may change when the agent restarts or when ovs flows are updated (it uses a temp cookie to switch out old flows with new ones).
neutron/agent/linux/openvswitch_firewall/firewall.py#OVSFirewallDriver.update_port_filter()
https://github.com/openstack/neutron/blob/stable/queens/neutron/agent/linux/openvswitch_firewall/firewall.py#L637
When a SG is updated, the firewall rules used to be deleted first and then re-created; the time gap in between is what introduced the problem. The current approach is cookie based:
new flows use an updated cookie, while existing flows keep the default cookie
flows with the old default cookie associated with the port are deleted; the new flows remain
the new flows are then switched back to the old default cookie
OVSFWPortNotFound is caused by this issue - https://bugs.launchpad.net/neutron/+bug/1907491 (or 1708731)
$ grep -lr 192.168.0.51 "/var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW" | grep cookies
/var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW/xxx/ovs/bridges/br-tun/flowinfo/cookies/7e6866b8372d488f
/var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW/xxx/ovs/bridges/br-int/flowinfo/cookies/7dff6ae312da9a4
/var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW/xxx/ovs/bridges/br-int/flowinfo/cookies/743dffeefc0f90fc
/var/snap/ovs-stat/common/tmp/tmp.gsf2G4QWQW/xxx/ovs/bridges/br-int/flowinfo/cookies/3a0831b9d4780409

In future, flows can be compared like this:

sudo ovs-stat --openstack --compress --archive-tag before
# restart neutron-openvswitch-agent
sudo ovs-stat --openstack --compress --archive-tag after
# diff old and new flows without cookies
diff <(sort before/<host>/ovs/bridges/br-int/flows.stripped) <(sort final/<host>/ovs/bridges/br-int/flows.stripped) |& tee br-int-diff

Example - debug flow

A VM 172.16.13.36 (on host=014) cannot ping 172.16.13.24 (on host=013); tcpdump on 172.16.13.36 shows the arp request but not the arp reply. The topology is as follows:

172.16.13.36 fa:16:3e:db:8e:ae 997e3544-6ee5-4841-a53c-6015e373179a
172.16.13.24 fa:16:3e:2e:e9:3f 8ab536e4-3e46-4c1f-9592-3c62a3709a9a
1, hyperconverge014 (192.168.10.16)

Port "vxlan-c0a80a14"
Interface "vxlan-c0a80a14"
type: vxlan
options: {df_default="true", in_key=flow, local_ip="192.168.10.16", out_key=flow, remote_ip="192.168.10.20"}

$ ovs-stat -p /tmp/results --host hyperconverge014 --query "openstack.port tap997e3544-6e"
Port tap997e3544-6e:
- bridge: br-int
- id: 60
- local-vlan: 18
- mac address: fe:16:3e:db:8e:ae

2, hyperconverge013 (192.168.10.20)

Port "vxlan-c0a80a10"
Interface "vxlan-c0a80a10"
type: vxlan
options: {df_default="true", in_key=flow, local_ip="192.168.10.20", out_key=flow, remote_ip="192.168.10.16"}

$ ovs-stat -p /tmp/results --host hyperconverge013 --query "openstack.port tap8ab536e4-3e"
Port tap8ab536e4-3e:
- bridge: br-int
- id: 112
- local-vlan: 13
- mac address: fe:16:3e:2e:e9:3f

1, First ask for 'openstack port list' so we can determine the tap names used by the two VMs, and then rule out the possibility that the tunnel between the two compute nodes was never built.

$ ovs-stat -p /tmp/results --host hyperconverge014 --query "openstack.l2pop 18" |grep 192.168.10.20
192.168.10.20
$ ovs-stat -p /tmp/results --host hyperconverge013 --query "openstack.l2pop 13" |grep 192.168.10.16
192.168.10.16

Note: in the end this did turn out to be the cause. Although hyperconverge014 had a tunnel to 192.168.10.20, its total number of tunnels differed from the other nodes; restarting ovs on that node fixed it.

2, Using ovs-stat to help generate the following ofproto/trace, we checked the traffic between the two VMs and could not see a problem.

$ sudo ovs-appctl ofproto/trace br-int in_port=60,dl_vlan=18,dl_src=fa:16:3e:db:8e:ae,dl_dst=fa:16:3e:2e:e9:3f
Flow: in_port=60,dl_vlan=18,dl_vlan_pcp=0,vlan_tci1=0x0000,dl_src=fa:16:3e:db:8e:ae,dl_dst=fa:16:3e:2e:e9:3f,dl_type=0x0000
bridge("br-int")
----------------
 0. priority 0, cookie 0xe78fd7af9952e341
    goto_table:60
60. in_port=60, priority 100, cookie 0xe78fd7af9952e341
    set_field:0x3c->reg5
    set_field:0x12->reg6
    resubmit(,71)
71. reg5=0x3c,in_port=60, priority 10, cookie 0xe78fd7af9952e341
    ct_clear
    resubmit(,93)
93. priority 0, cookie 0xe78fd7af9952e341
    drop
Final flow: reg5=0x3c,reg6=0x12,in_port=60,dl_vlan=18,dl_vlan_pcp=0,vlan_tci1=0x0000,dl_src=fa:16:3e:db:8e:ae,dl_dst=fa:16:3e:2e:e9:3f,dl_type=0x0000
Megaflow: recirc_id=0,ct_state=-trk,eth,in_port=60,dl_src=fa:16:3e:db:8e:ae,dl_type=0x0000
Datapath actions: ct_clear

3, The cookies also look fine; one cookie has 13477 flows, which indicates it is the one currently in use.

$ sudo ls /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-int/flowinfo/cookies
daaefb0341f466c3  e78fd7af9952e341

$ sudo grep -r 'daaefb0341f466c3' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-int/flows
cookie=0xdaaefb0341f466c3, duration=91564.916s, table=82, n_packets=0, n_bytes=0, idle_age=65534, hard_age=65534, priority=70,ct_state=+est-rel-rpl,ip,reg6=0x12,nw_src=172.16.13.29 actions=conjunction(112,1/2)
cookie=0xdaaefb0341f466c3, duration=91564.916s, table=82, n_packets=0, n_bytes=0, idle_age=65534, hard_age=65534, priority=70,ct_state=+new-est,ip,reg6=0x12,nw_src=172.16.13.29 actions=conjunction(113,1/2)

$ sudo grep -r 'e78fd7af9952e341' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-int/flows |wc -l
13477

The two VMs are on the same subnet, so neither north-south nor cross-subnet east-west traffic is involved; the path is simply:

 vm tap -> br-int -> br-tun -- br-tun -> br-int -> tap

So far:

  • the l2pop tunnels look fine
  • the cookies look fine
  • the flows look fine when checked with ofproto/trace, but they show n_packets=0, which suggests the customer generated the sosreport before running the ping.
    So let's examine the flows more carefully:
    1, First check table=1 (DVR_TO_SRC_MAC=1 and ARP_DVR_MAC_TO_DST_MAC=3); surprisingly n_packets is 0 here - why is it 0?

$ sudo grep -r 'table=1' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-int/flows  |grep fa:16:3e:db:8e:ae
cookie=0xe78fd7af9952e341, duration=91548.413s, table=1, n_packets=0, n_bytes=0, idle_age=65534, hard_age=65534, priority=4,dl_vlan=18,dl_dst=fa:16:3e:db:8e:ae actions=mod_dl_src:fa:16:3e:20:d5:2b,resubmit(,60)

2, Check the arp traffic in table=71

$ sudo grep -r 'table=71' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-int/flows |grep fa:16:3e:db:8e:ae |grep arp
cookie=0xe78fd7af9952e341, duration=91562.991s, table=71, n_packets=113305, n_bytes=4758810, idle_age=1, hard_age=65534, priority=95,arp,reg5=0x3c,in_port=60,dl_src=fa:16:3e:db:8e:ae,arp_spa=172.16.13.36 actions=resubmit(,94)

3, Check the arp-responder traffic in table=21. It is empty, which means the arp-responder feature is not enabled, so arp goes via broadcast - hence the arp flow seen in step 2 above.

$ sudo grep -r 'table=21' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-int/flows
$ sudo grep -r 'table=21' /tmp/snap.ovs-stat/tmp/results/hyperconverge013/ovs/bridges/br-int/flows
$

4, Since the ping fails, something must be dropping the packets, so we should focus on the drop flows and combine that with whether n_packets is 0.

$ sudo grep -r -i 'drop' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-tun/flows |grep dl_vlan=18
cookie=0x9d9b4c5e3e2bd9b2, duration=91550.002s, table=1, n_packets=4, n_bytes=168, idle_age=65534, hard_age=65534, priority=3,arp,dl_vlan=18,arp_tpa=172.16.13.1 actions=drop
cookie=0x9d9b4c5e3e2bd9b2, duration=91550.001s, table=1, n_packets=3379223, n_bytes=493601225, idle_age=0, hard_age=65534, priority=2,dl_vlan=18,dl_dst=fa:16:3e:20:d5:2b actions=drop

Normally vlan-tagged traffic should not be dropped; why are there so many drop flows here with non-zero n_packets:

$ sudo grep -r -i 'drop' /tmp/snap.ovs-stat/tmp/results/hyperconverge014/ovs/bridges/br-tun/flows |grep -v vlan
cookie=0x9d9b4c5e3e2bd9b2, duration=230948.347s, table=0, n_packets=111180, n_bytes=4730883, idle_age=127, hard_age=65534, priority=0 actions=drop
cookie=0x9d9b4c5e3e2bd9b2, duration=230948.345s, table=3, n_packets=0, n_bytes=0, idle_age=65534, hard_age=65534, priority=0 actions=drop
cookie=0x9d9b4c5e3e2bd9b2, duration=230948.346s, table=4, n_packets=1308763, n_bytes=186255809, idle_age=0, hard_age=65534, priority=0 actions=drop
cookie=0x9d9b4c5e3e2bd9b2, duration=230948.345s, table=6, n_packets=0, n_bytes=0, idle_age=65534, hard_age=65534, priority=0 actions=drop
cookie=0x9d9b4c5e3e2bd9b2, duration=230948.344s, table=22, n_packets=8461624, n_bytes=1007117125, idle_age=0, hard_age=65534, priority=0 actions=drop
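On a live system (rather than an ovs-stat tree), a similar sweep for drop flows that are actually matching traffic can be done with a short loop; a minimal sketch:
# list drop flows with non-zero packet counts on every bridge
for br in $(sudo ovs-vsctl list-br); do
    echo "== $br =="
    sudo ovs-ofctl dump-flows $br | grep -i drop | grep -v n_packets=0
done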

Packet capture practice

1, Capture traffic on the physical NIC (on the VM 192.168.21.123 run 'ping 192.168.21.44 -c1'); the tunnel IP should be used here (grep -r local_ip /etc/neutron && ip r get <address> | grep dev | cut -d ' ' -f 3)
root@juju-41c350-octavia-9:~# tcpdump -enl -i ens3 "((arp or icmp) and (host 10.5.0.239))"
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes
09:01:05.419697 fa:16:3e:10:65:e9 > fa:16:3e:0c:6d:a5, ethertype ARP (0x0806), length 42: Request who-has 10.5.0.239 tell 10.5.0.64, length 28
09:01:05.419750 fa:16:3e:0c:6d:a5 > fa:16:3e:10:65:e9, ethertype ARP (0x0806), length 42: Reply 10.5.0.239 is-at fa:16:3e:0c:6d:a5, length 28
09:01:05.496131 fa:16:3e:0c:6d:a5 > fa:16:3e:10:65:e9, ethertype ARP (0x0806), length 42: Request who-has 10.5.0.64 tell 10.5.0.239, length 28
09:01:05.496551 fa:16:3e:10:65:e9 > fa:16:3e:0c:6d:a5, ethertype ARP (0x0806), length 42: Reply 10.5.0.64 is-at fa:16:3e:10:65:e9, length 28
09:01:11.348061 fa:16:3e:2f:75:70 > fa:16:3e:0c:6d:a5, ethertype ARP (0x0806), length 42: Request who-has 10.5.0.239 tell 10.5.2.51, length 28
09:01:11.348110 fa:16:3e:0c:6d:a5 > fa:16:3e:2f:75:70, ethertype ARP (0x0806), length 42: Reply 10.5.0.239 is-at fa:16:3e:0c:6d:a5, length 28root@juju-41c350-octavia-10:~# tcpdump -enl -i ens3 "((arp or icmp) and (host 10.5.0.239))"
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes
09:01:05.418929 fa:16:3e:10:65:e9 > fa:16:3e:0c:6d:a5, ethertype ARP (0x0806), length 42: Request who-has 10.5.0.239 tell 10.5.0.64, length 28
09:01:05.420072 fa:16:3e:0c:6d:a5 > fa:16:3e:10:65:e9, ethertype ARP (0x0806), length 42: Reply 10.5.0.239 is-at fa:16:3e:0c:6d:a5, length 28
09:01:05.496142 fa:16:3e:0c:6d:a5 > fa:16:3e:10:65:e9, ethertype ARP (0x0806), length 42: Request who-has 10.5.0.64 tell 10.5.0.239, length 28
09:01:05.496203 fa:16:3e:10:65:e9 > fa:16:3e:0c:6d:a5, ethertype ARP (0x0806), length 42: Reply 10.5.0.64 is-at fa:16:3e:10:65:e9, length 28
09:01:35.141597 fa:16:3e:c9:07:83 > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 10.5.0.239 tell 10.5.0.1, length 28

2, Capture on the VM's tap device.
root@juju-41c350-octavia-9:~# sudo ovs-tcpdump -enl -i tapdcf14412-d2 "((arp or icmp) and (host 192.168.21.44))"
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on ovsmi401790, link-type EN10MB (Ethernet), capture size 262144 bytes
08:59:31.469378 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype 802.1Q (0x8100), length 102: vlan 4, p 0, ethertype IPv4, 192.168.21.123 > 192.168.21.44: ICMP echo request, id 15714, seq 1, length 64
08:59:31.471163 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype 802.1Q (0x8100), length 102: vlan 4, p 0, ethertype IPv4, 192.168.21.44 > 192.168.21.123: ICMP echo reply, id 15714, seq 1, length 64
08:59:36.472576 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype 802.1Q (0x8100), length 46: vlan 4, p 0, ethertype ARP, Request who-has 192.168.21.123 tell 192.168.21.44, length 28
08:59:36.474225 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype 802.1Q (0x8100), length 46: vlan 4, p 0, ethertype ARP, Request who-has 192.168.21.44 tell 192.168.21.123, length 28
08:59:36.474231 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype 802.1Q (0x8100), length 46: vlan 4, p 0, ethertype ARP, Reply 192.168.21.123 is-at fa:16:3e:b4:cb:26, length 28
08:59:36.474712 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype 802.1Q (0x8100), length 46: vlan 4, p 0, ethertype ARP, Reply 192.168.21.44 is-at fa:16:3e:c1:ea:64, length 28

root@juju-41c350-octavia-10:~# sudo ovs-tcpdump -enl -i tap2d6c945f-f4 "((arp or icmp) and (host 192.168.21.44))"
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on ovsmi749395, link-type EN10MB (Ethernet), capture size 262144 bytes
08:59:31.467771 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype 802.1Q (0x8100), length 102: vlan 2, p 0, ethertype IPv4, 192.168.21.123 > 192.168.21.44: ICMP echo request, id 15714, seq 1, length 64
08:59:31.472073 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype 802.1Q (0x8100), length 102: vlan 2, p 0, ethertype IPv4, 192.168.21.44 > 192.168.21.123: ICMP echo reply, id 15714, seq 1, length 64
08:59:36.473080 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype 802.1Q (0x8100), length 46: vlan 2, p 0, ethertype ARP, Request who-has 192.168.21.123 tell 192.168.21.44, length 28
08:59:36.473525 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype 802.1Q (0x8100), length 46: vlan 2, p 0, ethertype ARP, Reply 192.168.21.123 is-at fa:16:3e:b4:cb:26, length 28
08:59:36.473808 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype 802.1Q (0x8100), length 46: vlan 2, p 0, ethertype ARP, Request who-has 192.168.21.44 tell 192.168.21.123, length 28
08:59:36.474698 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype 802.1Q (0x8100), length 46: vlan 2, p 0, ethertype ARP, Reply 192.168.21.44 is-at fa:16:3e:c1:ea:64, length 28

3, You cannot capture on br-tun directly, but you can capture patch-tun on br-int.
root@juju-41c350-octavia-9:~# ovs-tcpdump -en -i patch-tun "(arp or icmp)"
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on mipatch-tun, link-type EN10MB (Ethernet), capture size 262144 bytes
09:20:29.668464 fa:16:3e:b4:cb:26 > fa:16:3e:c1:ea:64, ethertype IPv4 (0x0800), length 98: 192.168.21.123 > 192.168.21.44: ICMP echo request, id 15728, seq 1, length 64

root@juju-41c350-octavia-10:~# ovs-tcpdump -en -i patch-tun "(arp or icmp)"
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on mipatch-tun, link-type EN10MB (Ethernet), capture size 262144 bytes
09:20:11.381840 fa:16:3e:c1:ea:64 > fa:16:3e:b4:cb:26, ethertype IPv4 (0x0800), length 98: 192.168.21.44 > 192.168.21.123: ICMP echo reply, id 15727, seq 1, length 64

openstack-toolkit - viewing tunnels

snap install openstack-toolkit
source adminrc
openstack-toolkit.neutron --get-l2pop-map
ovs-stat --openstack --query "openstack.l2pop <port-name>"
ovs-appctl dpif/show

Example - HW offload - the VM tap does not receive the icmp reply

We hit a problem: after creating a VM we could ssh into it via the FIP, and when pinging the gateway from inside the VM, qrouter-xxx on the gateway saw both the icmp request and reply, but the VM tap never received the icmp reply.
ovs-appctl ofproto/trace supports testing with an actual captured packet.

ovs-appctl vlog/set dpif_netlink:DBG
ovs-appctl vlog/set ofproto_dpif_xlate:DBG
#ethtool -K ens7 hw-tc-offload on
2021-06-07T01:40:05.823Z|00023|ofproto_dpif_xlate(handler11)|DBG|bridge br-data: learned that fa:16:3e:b9:27:aa is on port ens7 in VLAN 0
2021-06-07T01:40:05.823Z|00024|ofproto_dpif_xlate(handler11)|DBG|bridge br-int: learned that fa:16:3e:b9:27:aa is on port int-br-data in VLAN 3

# VM: private=192.168.21.43, 10.5.151.172
ssh -i ~/testkey.priv ubuntu@10.5.151.172 ping -c 1 192.168.21.1
tcpdump -xx -enl -i tap921d2de4-70 icmp > icmp.out
hex=`sed -rn "s/.+0x[0-9]+:\s+//p" icmp.out | tr -d ' ' | tr -d '\n'`
# ovs-ofctl show br-int | grep tap921d2de4-70
#   8(tap921d2de4-70): addr:fe:16:3e:4d:0e:5b
sudo ovs-appctl ofproto/trace br-int in_port=8 $hex

An important point: it is best to use 'ovs-appctl ofproto/trace' to observe the icmp reply packet. That means running the tcpdump above inside qrouter-xxx; the resulting icmp.out then contains two packets, one icmp request and one icmp reply, and the icmp request must be deleted so that only one packet remains (ofproto/trace can only trace a single packet). The result then looks like:

ubuntu@cloud1:~$ sudo ovs-appctl ofproto/trace br-int in_port=14 $hex
...
82. ct_state=+new-est,icmp,reg5=0x36, priority 75, cookie 0x1d89cb165a36d2d7
    ct(commit,zone=NXM_NX_REG6[0..15])
    drop
     -> Sets the packet to an untracked state, and clears all the conntrack fields.
    output:54
    resubmit(,92)
92. priority 0, cookie 0x1d89cb165a36d2d7
    drop
Final flow: recirc_id=0x3942,eth,icmp,reg5=0x36,reg6=0x7,in_port=14,vlan_tci=0x0000,dl_src=fa:16:3e:89:64:d5,dl_dst=fa:16:3e:06:25:82,nw_src=192.168.102.1,nw_dst=192.168.102.46,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=0,icmp_code=0
Megaflow: recirc_id=0x3942,ct_state=+new-est-rel-rpl,eth,icmp,in_port=14,nw_frag=no
Datapath actions: ct(commit,zone=7),21
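To avoid hand-editing icmp.out, one could capture only the reply in the first place; a hedged sketch (interface name taken from the earlier example, filter syntax is standard tcpdump):
# capture a single ICMP echo reply only, then build the hex string as before
tcpdump -xx -enl -i tap921d2de4-70 -c 1 'icmp[icmptype] == icmp-echoreply' > icmp.out
hex=`sed -rn "s/.+0x[0-9]+:\s+//p" icmp.out | tr -d ' ' | tr -d '\n'`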

For how to interpret the result 'Datapath actions: ct(commit,zone=7),21' see:
OpenvSwitch Conntrack & NAT
OVS conntrack and NAT, mainly about the NAT patches for the DPDK case - https://zhaozhanxu.com/2017/02/10/SDN/OVS/2017-02-10-conntrack/
'icmp_type=0,icmp_code=0' indicates an icmp reply packet; it is handed to conntrack next and then to the VM port (21). zone=7 is the internal vlan of this network, so the path 'br-int -> pop_vlan -> tap interface' is correct.
However, this does not prove much: we only injected a single icmp reply packet with no context, so there is no conntrack entry, and this result is expected. At least it shows that, as long as conntrack is fine, the flow table in br-int appears fine. But it does not seem to have been offloaded.

Now we need to look at the TC flow table in the NIC. That table is also dynamic, so we set up the following watch command while simultaneously pinging the gateway from inside the VM.

watch -t -n 1 "(date '+TIME:%H:%M:%S' ; ovs-appctl dpctl/dump-flows type=offloaded ; echo '-----------------' ; ovs-appctl dpctl/dump-flows type=ovs) | tee -a result.txt"

We found that the following flow appears only in the 'ovs-appctl dpctl/dump-flows type=ovs' section, not in the 'ovs-appctl dpctl/dump-flows type=offloaded' section.

$ grep -r 'src=fa:16:3e:89:64' result.txt |tail -n1
recirc_id(0x3953),in_port(19),ct_state(-new+est-rel+rpl),ct_zone(0x7),ct_mark(0),eth(src=fa:16:3e:89:64:d5,dst=fa:16:3e:06:25:82),eth_type(0x0800),ipv4(frag=no), packets:33, bytes:3234, used:0.492s, actions:drop

Some theory:

The openflow controller pushes flow rules to ovsdb-server/ovs-vswitchd, and they are then cached into the kernel datapath via netlink (the cached rules can be viewed with ovs-appctl dpctl/dump-flows type=ovs). Afterwards the kernel datapath forwards packets directly according to these cached rules; if it does not know how to forward a packet it has to go back up via netlink (the slow path). The OVS design combines the slow path and fast path to forward network data efficiently.
Similarly, if the NIC supports hw offload, flow rules can also be cached in the NIC hardware via TC to improve performance. Originally Linux TC was only used for QoS: it only added hooks on the ingress and egress side of a netdev to control traffic rate, latency, priority and so on. TC later gained the Classifier-Action subsystem, which can identify packets by their headers and execute the corresponding Action. The Classifier-Action subsystem is a plugin framework and now supports OpenFlow: all OpenFlow rules can be mapped to TC rules. In kernels 4.9~4.14, Linux finally added hardware offload support for TC Flower, which means OpenFlow rules can be forwarded in hardware (mainly the NIC) via TC Flower's hardware offload capability.
TC Flower hardware offload works in a fairly simple way. When a TC Flower rule is added, Linux TC checks whether the NIC the rule is attached to supports and has enabled the NETIF_F_HW_TC flag, and whether it implements ndo_setup_tc (the TC hardware offload hook). If both are true, the TC Flower rule is passed to the NIC's ndo_setup_tc function and downloaded into the NIC.
The NIC's NETIF_F_HW_TC flag can be toggled with ethtool:
# ethtool -K eth0 hw-tc-offload on
# ethtool -K eth0 hw-tc-offload off
OVS added TC Flower support in 2018 [1], so the OVS datapath can now be offloaded to the NIC. When OVS initialises, it installs a default rule into the eSwitch; a packet that matches no other rule matches this default rule, whose action is to send the packet to the eSwitch's managing host, i.e. to the datapath in the Linux kernel. If it is the first packet of a flow then, as described above, the kernel OVS datapath sends it up to ovs-vswitchd in user space. Since ovs-vswitchd holds the OpenFlow rules, it can still forward the packet. The difference is that ovs-vswitchd also decides whether the rule for the current flow can be offloaded to the NIC; if so, ovs-vswitchd pushes the flow rule down to the hardware via the TC interface.
Strictly speaking, with OVS-TC there are now three datapaths: the original OVS kernel datapath, the TC datapath in the kernel, and the TC datapath in the NIC.
OVS-TC can deliver higher network performance than DPDK: the forwarding path no longer enters the operating system at all, so it is shorter, and a NIC, as a dedicated network device, generally forwards better than DPDK emulated on general-purpose hardware. Furthermore, the NIC's TC Flower offload comes with the NIC driver, so its operational cost is far lower than DPDK's.
But OVS-TC has its own problems. First, it requires specific NICs; unsurprisingly, NICs supporting this feature are more expensive, which raises cost, although the CPU and memory freed by not using DPDK offsets part of it. Second, OVS-TC is not yet feature complete; for example, connection tracking is not well supported. Third, similar to DPDK, because traffic bypasses the Linux kernel, some tools no longer work, which makes monitoring harder. For TC Flower rules pushed down into the NIC, the NIC must contain a virtual switch; Mellanox calls this virtual switch the eSwitch.
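A hedged way (the NIC name follows the earlier ethtool example) to confirm that rules actually reached the TC datapath or the hardware is to check the flag and list the TC ingress filters on the device:
# confirm the NETIF_F_HW_TC flag is on
ethtool -k ens7 | grep hw-tc-offload
# list TC flower rules on the device's ingress qdisc; rules installed in
# hardware are reported with in_hw, software-only ones with not_in_hw
tc -s filter show dev ens7 ingress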
1, Change the PF device's eswitch mode from legacy to switchdev (assuming the pci id is 04:00.0)
echo switchdev > /sys/class/net/enp6s0f0/compat/devlink/mode
or use the devlink tool, as follows:
devlink dev switch set pci/0000:04:00.0 mode switchdev
2, Enable the NIC's hardware offload feature
ethtool -K enp6s0f0 hw-tc-offload on
3, Enable Open vSwitch's hardware offload feature
systemctl start openvswitch
ovs-vsctl set Open_vSwitch . other_config:hw-offload=true
systemctl restart openvswitch
4, Other commands
ovs-dpctl show
# type=offloaded is just an indication that a flow is handled by the TC datapath.
# This does not guarantee that it has been offloaded to the NIC
ovs-appctl dpctl/dump-flows type=offloaded
# View non-offloaded datapath flows
# sos_commands/openvswitch/ovs-appctl_dpctl.dump-flows_-m_system_ovs-system
# ovs-appctl dpctl/dump-conntrack
ovs-appctl dpctl/dump-flows type=ovs
egrep -B 1 "dropped [^0]+[0-9]+" sos_commands/networking/tc_-s_qdisc_show
5, When creating a port in an Openstack environment, note that the capability:switchdev parameter must be passed
openstack port create --network net_1 --vnic-type=direct --binding-profile '{"capabilities": ["switchdev"]}' sriov_port1
6, Once it works, tcpdump only shows the first arp request/reply, because subsequent traffic between the VMs is offloaded to the NIC and forwarded there instead of going through OVS.
7, View the flow forwarding rules: you can see the flows that have been offloaded to the NIC:
ovs-dpctl dump-flows type=offloaded

[1] https://github.com/openvswitch/ovs/commit/576126a931cdf96d43443916d922462c7a16e350
[2] OpenVSwitch 硬件加速浅谈 - https://blog.csdn.net/cpongo2ppp1/article/details/90172431
[3] 基于智能网卡(Smart Nic)的Open vSwitch卸载方案简介 - https://www.cnblogs.com/dream397/p/14432472.html
[4] OVS 流表offload - https://www.dazhuanlan.com/2019/12/31/5e0af1858dada/

20220817 update - more on switchdev, offload, SmartNIC and DPU

TCP Offloading Engine (TOE): TCP/IP protocol processing (segmentation, reassembly, checksum, etc.) is moved from the CPU to the NIC.
Remote Direct Memory Access (RDMA): RDMA bypasses the kernel via the dedicated Verbs interface and accesses the RDMA NIC directly from user space to reach local and remote host memory, so it is also called a host-offload (hardware processing), host-bypass (kernel bypass) technology.
TOE NICs, TC Flower offload NICs and RDMA NICs are all SmartNICs. A traditional NIC handles L1-L2, while L3 and above are handled by the host CPU.
To keep up with high-speed networks, modern NICs commonly offload part of the L3-L4 processing (e.g. checksum calculation, transport-layer segmentation and reassembly) to reduce the load on the host CPU.
Some NICs (e.g. RDMA NICs) even offload the whole L4 processing to hardware to completely free the host CPU.
RDMA requires RDMA-capable NICs on both ends to be effective, whereas TOE only needs one end. The "workload" here is not only Networking; it can also be Storage, Security and so on.
SmartNICs implemented with ASICs are the first generation (based on Ethernet, solving offload). DPUs, data-processing chips implemented with FPGAs, are the second generation (besides Ethernet and offload they add traffic processing and cloud-native control) - infrastructure chips. Intel's solution uses an FPGA to offload traffic, plus a small programmable general-purpose CPU in place of the host CPU for packet pre-processing, put together on one PCB as the FPGA+Xeon-D model.
switchdev is a generic kernel interface for switch offload (offloading the switching fabric - and optionally flows - to a hardware switch), so a switch's control and management planes can be built on this standard kernel switchdev interface instead of dealing with each vendor's proprietary SDK.
Under the switchdev driver framework, every physical port of the hardware switch is registered in the kernel as a net_device, so existing tools (bridge, ip, iproute) can be used to bond, bridge or tunnel the ports, or configure vlans on them.
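A minimal sketch (the pci address and interface name are examples from the steps above) for checking the eswitch mode and the representor netdevs that switchdev exposes:
# query the current eswitch mode of the PF
devlink dev eswitch show pci/0000:04:00.0
# representor ports carry phys_switch_id / phys_port_name in their details
ip -d link show enp6s0f0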

20230316 - DPU

try:

systemctl stop mlnx-switchdev-mode
ethtool -K enp59s0f0 hw-tc-offload off
ethtool -K enp59s0f1 hw-tc-offload off
ovs-vsctl set Open_vSwitch . other_config:hw-offload=false
systemctl restart openvswitch.service
sleep 10
systemctl restart neutron-openvswitch-agent

This case was eventually proven to be caused by the neutron ovs firewall driver not supporting hw-offload, so either disabling hw-offload or disabling SGs works (but octavia requires SGs, so after they upgraded and added octavia they had SGs, and the problem appeared).

https://hareshkhandelwal.blog/2020/03/11/lets-understand-the-openvswitch-hardware-offload/
https://access.redhat.com/articles/4023401
Limitations:
“openvswitch” firewall driver cannot be used with offload as connection tracking properties of the flows are not supported in offload path. The default “iptables_hybrid” or “none” is supported. Patches have been merged to Ovs 2.13 to support connection tracking.

For how to set up an hw-offload environment see: https://blog.csdn.net/quqi99/article/details/116893909?spm=1001.2014.3001.5501

20210623 - network packet loss

First check whether br-data and br-ex use different NICs; second, after switching to different NICs, check whether the tunnel has moved to the corresponding subnet.

juju config neutron-openvswitch disable-mlockall=true
juju config neutron-openvswitch firewall-driver=iptables_hybrid

# the flow statistics
ovs-appctl dpctl/show -s
# memory and averages
ovs-appctl memory/show
ovs-appctl upcall/show
ovs-appctl coverage/show
# ovs conntrack table statistics
ovs-appctl dpctl/ct-stats-show
# show CT table
ovs-appctl dpctl/dump-conntrack -s
# disable megaflows (wildcard flows)
ovs-appctl upcall/disable-megaflows
# disable UFID
ovs-appctl upcall/disable-ufid
Both the kernel datapath and userspace identify flows by a key, and the key size built by userspace may differ from the kernel's (e.g. userspace might not understand ipv6), hence the two key-matching modes, megaflows and UFID (see https://docs.openvswitch.org/en/latest/topics/datapath/): the former uses wildcards, the latter exact match. Wildcarding means fewer packets have to go from the kernel to userspace for classification, which improves performance. With vlan support, the kernel adds a vlan key attribute to the key:

eth(...), vlan(vid=10, pcp=0), eth_type(0x0800), ip(proto=6, ...), tcp(...)

But this could make userspace unable to parse the key, so the eth_type, ip and tcp attributes are moved into an encap attribute; if userspace does not understand vlan it will not see the encap either, so it cannot misinterpret it:

eth(...), eth_type(0x8100), vlan(vid=10, pcp=0), encap(eth_type(0x0800), ip(proto=6, ...), tcp(...)))

timeout 10 strace -c -p $(cat /var/run/openvswitch/ovs-vswitchd.pid)
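A hedged sketch for observing these two mechanisms on a running system (standard ovs-appctl commands only):
# upcall statistics: how many flows the revalidators keep in the datapath
ovs-appctl upcall/show
# -m prints each datapath flow together with its UFID and the megaflow mask
ovs-appctl dpctl/dump-flows -m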
cat /sys/class/net/ens3f0/statistics/{tx,rx}_packets
date; iperf3 -c 192.168.10.127 -t 1800 -b 10G; date
# iperf3 -s -V
# iperf3 -c 10.100.0.1 -u -i 10 -t 30 -b 20G --get-server-output
# date; netperf -H node1 -l 30 -t TCP_RR -D-1 -j -- -O min_latency,mean_latency,max_latency,stddev_latency,p99_latency,transaction_rate -p5555; date
# TCP_RR's send size is 0; to really measure throughput it is better to use TCP_STREAM, but TCP_RR shows latency spikes
netperf -H node1 -t TCP_STREAM -fM -l 10 -D-1 -j -- -O min_latency,mean_latency,max_latency,stddev_latency,p99_latency,transaction_rate,throughput,throughput_units

XML dump from guest VM, to correlate with host NICs (e.g. `virsh dumpxml <domain>`)
sar output from hypervisor
ovs-appctl dpctl/show -s
ovs-appctl memory/show
ovs-appctl upcall/show
ip -s -d link show eth0
grep -E 'bondA|bondB' sos_commands/networking/tc_-s_qdisc_show -A1
ovs-appctl dpif/show
ovs-appctl dpctl/dump-conntrack system@ovs-system

# https://blog.csdn.net/quqi99/article/details/109340806
sysctl -p
net.core.rmem_max=268435456
net.core.wmem_max=268435456
net.core.rmem_default=536870912
net.core.wmem_default=536870912
net.ipv4.tcp_rmem=4096 5242880 33554432
net.ipv4.tcp_wmem=4096 5242880 33554432
net.ipv4.tcp_moderate_rcvbuf=1
net.ipv4.tcp_no_metrics_save=1
net.ipv4.tcp_congestion_control=htcp
net.ipv4.tcp_mtu_probing=1
net.core.netdev_budget=50000
net.core.netdev_budget_usecs=12000
net.core.somaxconn=300000
net.ipv4.tcp_max_syn_backlog=2048
net.core.netdev_max_backlog=600000
vm.vfs_cache_pressure=100
net.ipv4.route.flush=1
vm.min_free_kbytes=524288

/sbin/ethtool -G ens3f0 rx 8192 tx 8192
/sbin/ethtool -G ens3f1 rx 8192 tx 8192
/sbin/ethtool -G ens9f0 rx 2047 tx 511
/sbin/ethtool -G ens2f0 rx 8192 tx 8192
/sbin/ethtool -G ens9f1 rx 2047 tx 511
/sbin/ethtool -G ens9f2 rx 2047 tx 511
/sbin/ethtool -G ens9f3 rx 2047 tx 511
/sbin/ethtool -G ens2f1 rx 8192 tx 8192
ethtool -K bondB lro on

$ cat pps.sh
#!/bin/bash
INTERVAL="1"  # update interval in seconds
if [ -z "$1" ]; then
    echo
    echo "usage: $0 [network-interface]"
    echo
    echo "e.g. $0 eth0"
    echo
    echo "shows packets-per-second"
    exit
fi
IF=$1
while true
do
    R1=`cat /sys/class/net/$1/statistics/rx_packets`
    T1=`cat /sys/class/net/$1/statistics/tx_packets`
    sleep $INTERVAL
    R2=`cat /sys/class/net/$1/statistics/rx_packets`
    T2=`cat /sys/class/net/$1/statistics/tx_packets`
    TXPPS=`expr $T2 - $T1`
    RXPPS=`expr $R2 - $R1`
    echo "TX $1: $TXPPS pkts/s  RX $1: $RXPPS pkts/s"
done

Is it this bug? (1732067). A vlan environment was also built for this, see - https://blog.csdn.net/quqi99/article/details/118341936?

Regarding bug https://bugs.launchpad.net/neutron/+bug/1732067: stateful traffic going into the VM passes through table=82, see "OpenStack OVS firewall driver" - https://blog.csdn.net/sinat_20184565/article/details/95161981

table=82, priority=71,conj_id=18,ct_state=+est-rel-rpl,ip,reg5=0x2 actions=strip_vlan,output:2
table=82, priority=71,conj_id=19,ct_state=+new-est,ip,reg5=0x2 actions=ct(commit,zone=NXM_NX_REG6[0..15]),strip_vlan,output:2,resubmit(,92)

What the bug actually talks about is this one (ct_state=+est-rel-rpl,tcp):
cookie=0x94ebb7913c37a0ec, duration=415.490s, table=82, n_packets=5, n_bytes=424, idle_age=31, priority=70,ct_state=+est-rel-rpl,tcp,reg5=0xd,dl_dst=fa:16:3e:80:cb:0a,tp_dst=80 actions=strip_vlan,output:13

It uses output:2, which does not do MAC learning (see: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-August/042276.html). Changing 'output:2' to 'actions=mod_vlan_vid:2,NORMAL' (or 'actions=NORMAL' when there is no vlan) fixes it; the VM's MAC (for this ingress traffic) should then appear in the output below, and there will be no flooding (tcpdump on other, non-server/client nodes sees no traffic). See: https://bugs.launchpad.net/neutron/+bug/1732067/comments/9
If the VM is 192.168.111.18/fa:16:3e:b2:c2:84 (FIP=172.24.0.157) and the external machine is 172.24.0.3/e6:73:51:97:74:4e:
1, The external machine has no arp cache entry for 172.24.0.157; when pinging the VM's FIP from the external machine, it broadcasts.
2, When the traffic enters the VM, br-int learns the remote address e6:73:51:97:74:4e:
[root@node-2 ~]# ovs-appctl fdb/show br-int | grep e6:73:51:97:74:4e
    2     3  e6:73:51:97:74:4e    3
3, With output:2 the ingress packet is just handed to the VM without MAC learning to update the fdb on br-int, even if ping keeps running from the external machine. So once the ARP cache expires, the ICMP reply packets are flooded across the whole br-int (this does not affect the original ping, but the flood affects other VMs on this host; if other VMs have ingress QoS, the flood affects their network connectivity).
4, Therefore 'output:2' should be changed to 'actions=NORMAL' to do MAC learning. In other words, ingress traffic to the VM using 'action=output:PORT' does no MAC learning, so br-int has no remote arp entry (ovs-appctl fdb/show br-int); when traffic later leaves the VM it uses action=NORMAL, which floods when there is no dst mac entry in the fdb table, see ofproto-dpif-xlate.c:

xlate_normal(struct xlate_ctx *ctx)
{
    ...
    if (mcast_snooping_enabled(ctx->xbridge->ms)
        && !eth_addr_is_broadcast(flow->dl_dst)
        && eth_addr_is_multicast(flow->dl_dst)
        && is_ip_any(flow)) {
        ...
    } else {
        if (is_ip_local_multicast(flow, wc)) {
            xlate_normal_flood(ctx, in_xbundle, &xvlan);
        }
    ...

Also, an ARP request triggers MAC learning - https://bugs.launchpad.net/neutron/+bug/1732067/comments/12

Finally, it was solved with the configuration below. With tcpdump (tcpdump -w /tmp/xxx.pcap -i tap2799f757-4e -s 96) the traffic into and out of ovs was roughly the same; during the problem we saw the window size change but no retransmissions, so that looked fine. We then considered buffering: multi-queue was enabled via hw_vif_multiqueue_enabled, and in particular increasing rx_queue_size solved the problem.

hw_vif_multiqueue_enabled=true
rx_queue_size = 1024
tx_queue_size = 1024

Note: it was not caused by the above either. Reducing the number of revalidator threads helped but did not completely eliminate it; most likely the ovs process also needs to be pinned to specific cpu cores.

25% * number_of_cores + 1 = revalidators
number_of_cores - revalidators = handlers
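A small sketch applying the two formulas above (the 47/17 values used below correspond to a 64-core host; adjust to your own core count):
cores=$(nproc)
revalidators=$((cores / 4 + 1))
handlers=$((cores - revalidators))
sudo ovs-vsctl --no-wait set Open_vSwitch . other_config:n-revalidator-threads=$revalidators
sudo ovs-vsctl --no-wait set Open_vSwitch . other_config:n-handler-threads=$handlers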
sudo ovs-vsctl --no-wait set Open_vSwitch . other_config:n-handler-threads=47
sudo ovs-vsctl --no-wait set Open_vSwitch . other_config:n-revalidator-threads=17

sudo vi /etc/systemd/system.conf
# add "CPUAffinity=0-16"
sudo vi /etc/default/grub
# Remove (not Add) "isolcpus=16-127" from GRUB_CMDLINE_LINUX_DEFAULT
# then do: sudo update-grub && sudo reboot
cat /proc/cmdline
cat /sys/devices/system/cpu/isolated
numactl --hardware
openstack flavor set <flavor> --property hw:cpu_policy=shared
juju config nova-compute --reset cpu-shared-set
juju config nova-compute --reset cpu-dedicated-set
juju config nova-compute vcpu-pin-set="16-127"
juju config nova-compute cpu-shared-set="8-11" cpu-dedicated-set="12-27"
for i in `ps aux|grep qemu| awk '{print $2}'`; do taskset -cp $i; done
ps H -o 'tid comm psr' $(pidof qemu-system-x86_64)| sort -nk 3| grep KVM
ps H -o 'tid comm psr' $(pidof ovs-vswitchd)| sort -nk 3

Test scenario: hw:cpu_policy=dedicated
two compute nodes each with 28 cores (SMT enabled)
systemd CPUAffinity=0-7
isolcpus disabled
juju config nova-compute cpu-shared-set="8-11" cpu-dedicated-set="12-27"
flavor: properties | hw:cpu_policy='dedicated', hw:emulator_threads_policy='share' |

# virsh dumpxml instance-00000013| egrep "emulatorpin|vcpupin"
<vcpupin vcpu='0' cpuset='25'/>
<vcpupin vcpu='1' cpuset='14'/>
<vcpupin vcpu='2' cpuset='22'/>
<vcpupin vcpu='3' cpuset='18'/>
<vcpupin vcpu='4' cpuset='16'/>
<vcpupin vcpu='5' cpuset='15'/>
<vcpupin vcpu='6' cpuset='19'/>
<vcpupin vcpu='7' cpuset='23'/>
<emulatorpin cpuset='8-11'/>

Run stress on all cores used by vms on both hypervisors:
stress-ng --taskset 8-27 --netlink-proc 20 --aggressive
Run iperf3 between two vms with small packet size, parallel tasks and bi-directional:
iperf3 -u -t 10000 -c 192.168.21.204 -b 0 -l 16 --bidir -P 32

mpstat from hypervisor:
Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
Average:     all    2.67    0.00   31.85    0.00    0.00    3.40    0.00    3.02    0.00   59.06

Take a sample of the number of irq-dropped packets:
awk '$2!="00000000" {print $0}' /proc/net/softnet_stat| cut -d ' ' -f 2- > 1
sleep 10
awk '$2!="00000000" {print $0}' /proc/net/softnet_stat| cut -d ' ' -f 2- > 2; diff 1 2
#repeat

20210810 - debugging a dvr-snat+vrrp problem

https://bugs.launchpad.net/neutron/+bug/1945306

In a dvr-snat+vrrp environment (./generate-bundle.sh -s focal --dvr-snat-l3ha --name dvr --num-compute 3 --run), when the VM and the router (neutron l3-agent-list-hosting-router provider-router) are not on the same node, the VM cannot ping the sg-xxx interface (all three router nodes have qrouter-xxx/qr-xxx, but only the master router has snat-xxx/sg-xxx).
The VM (192.168.21.245) is on host-8; the router master node is host-10 (sg-xxx=192.168.21.41).
The following flows were captured with "ovs-dpctl dump-flows".
Note: while capturing with "ovs-dpctl dump-flows" the traffic must keep running. Ping may not reproduce the problem well; in practice I ran (sudo python3 -m http.server) on the bastion (10.5.0.8) and kept running (while true; do curl --connect-timeout 3 http://10.5.0.8:8000; done) inside the VM:

recirc_id(0x167),in_port(10),ct_state(+est-rel-rpl),eth(),eth_type(0x0800),ipv4(proto=6,frag=no),tcp(dst=22), packets:79, bytes:5250, used:0.829s, flags:P., actions:12
recirc_id(0),in_port(10),ct_state(-new-est),eth(dst=fa:16:3e:4c:b2:57),eth_type(0x0800),ipv4(frag=no), packets:12, bytes:888, used:0.817s, flags:S, actions:11
recirc_id(0),in_port(10),ct_state(-trk),eth(dst=fa:16:3e:c0:d6:96),eth_type(0x0800),ipv4(proto=6,frag=no), packets:79, bytes:5250, used:0.829s, flags:P., actions:ct(zone=3),recirc(0x167)

# Note: use 'ovs-dpctl show' to look up the port ids above - they are kernel datapath ports, not the userspace ofports from "sudo ovs-vsctl -- --columns=name,ofport list Interface tap281c13e7-5e"
root@juju-c25ce5-dvr-8:/home/ubuntu# ovs-dpctl show
system@ovs-system:
...
port 1: br-int (internal)
port 4: br-tun (internal)
port 7: vxlan_sys_4789 (vxlan: packet_type=ptap)
port 8: fg-28633e19-49 (internal)
port 9: ha-50132af2-79 (internal)
port 10: qr-c3d5af9a-1c (internal)
port 11: sg-82f77fa5-db (internal)
port 12: tap4eebd189-b3
port 14: qg-c95dde7f-00 (internal)

$ openstack port list |grep b2:57
| 82f77fa5-db8c-4f03-980e-1f2a665357cc |                                                 | fa:16:3e:4c:b2:57 | ip_address='192.168.21.41', subnet_id='42cc2bee-110c-4ec7-88e2-aa74ecacfecb'   | ACTIVE |

This shows:
fa:16:3e:4c:b2:57 is the mac of sg-xxx, so the presence of the 'actions:11' rule means sg-xxx can only be reached from the local host (pinging sg-xxx from a node other than the router master does not work).
The following line also seems normal:

recirc_id(0x167),in_port(10),ct_state(+est-rel-rpl),eth(),eth_type(0x0800),ipv4(proto=6,frag=no),tcp(dst=22), packets:79, bytes:5250, used:0.829s, flags:P., actions:12

The normal path should be:
vm -> qrouter-xxx -> br-int -> br-tun -> vxlan-0a0503d
Running "ovs-tcpdump -ne -i vxlan-0a0503dd -l" on host-8 confirms there is no output on the vxlan interface.
Running (ip netns exec qrouter-0da4bf10-5fda-4987-a612-f8711b73c259 tcpdump -ne -i qr-c3d5af9a-1c -l "ether dst e8:2a:ea:44:55:66") on host-8 also gives no output (e8:2a:ea:44:55:66 is the mac of sg-xxx).
"sudo conntrack -L | grep mark=1" shows no dropped flow either, so it looks like the traffic is simply not forwarded into br-int (it goes to the local snat-xxx instead).
Testing on the latest wallaby shows the same problem; the next step is to downgrade and check whether older versions have it too. A downgraded bionic deployment does not have the problem; there the in_port(9) traffic coming out of the GW (qr-74e56da3-d3) ends up at:
Note: fa:16:3f:a1:ca:99 is 192.168.21.1 and fa:16:3e:a4:80:66 is the mac of sg-xxx

recirc_id(0x13),in_port(9),ct_state(+est-rel-rpl),eth(),eth_type(0x0800),ipv4(proto=6,frag=no),tcp(dst=22), packets:5898, bytes:389772, used:0.004s, flags:P., actions:13
recirc_id(0),in_port(9),skb_mark(0x4000000),ct_state(-new-est),eth(src=fa:16:3e:3a:5c:b8,dst=fa:16:3e:a4:80:66),eth_type(0x0800),ipv4(tos=0/0x3,frag=no), packets:1464, bytes:117364, used:0.004s, flags:SFP., actions:push_vlan(vid=2,pcp=0),1,set(tunnel(tun_id=0x4d1,src=10.5.3.217,dst=10.5.3.163,ttl=64,tp_dst=4789,flags(df|key))),set(eth(src=fa:16:3f:a1:ca:99,dst=fa:16:3e:a4:80:66)),pop_vlan,set(skb_mark(0)),8,set(eth(src=fa:16:3e:3a:5c:b8,dst=fa:16:3e:a4:80:66)),set(skb_mark(0x4000000)),10,13
recirc_id(0),in_port(9),ct_state(-trk),eth(dst=fa:16:3e:f2:0b:e8),eth_type(0x0800),ipv4(proto=6,frag=no), packets:5898, bytes:389772, used:0.004s, flags:P., actions:ct(zone=2),recirc(0x13)

root@juju-f327de-dvrbionic-7:~# ovs-dpctl show
system@ovs-system:
port 0: ovs-system (internal)
port 1: br-int (internal)
...
port 4: br-tun (internal)
port 5: gre_sys (gre: packet_type=ptap)
port 8: vxlan_sys_4789 (vxlan: packet_type=ptap)
port 9: qr-74e56da3-d3 (internal)
port 10: sg-fa90c9bb-1f (internal)
port 11: fg-7863fe79-31 (internal)
port 12: qg-cf0f77c3-9d (internal)
port 13: tapbb121137-8f

Compared with the flows further above, the problem node is missing this line:

recirc_id(0),in_port(9),skb_mark(0x4000000),ct_state(-new-est),eth(src=fa:16:3e:3a:5c:b8,dst=fa:16:3e:a4:80:66),eth_type(0x0800),ipv4(tos=0/0x3,frag=no), packets:1464, bytes:117364, used:0.004s, flags:SFP., actions:push_vlan(vid=2,pcp=0),1,set(tunnel(tun_id=0x4d1,src=10.5.3.217,dst=10.5.3.163,ttl=64,tp_dst=4789,flags(df|key))),set(eth(src=fa:16:3f:a1:ca:99,dst=fa:16:3e:a4:80:66)),pop_vlan,set(skb_mark(0)),8,set(eth(src=fa:16:3e:3a:5c:b8,dst=fa:16:3e:a4:80:66)),set(skb_mark(0x4000000)),10,13

Comparing the strip_vlan-related flows also shows nothing wrong.
The good (bionic) node:
root@juju-f327de-dvrbionic-7:~# ovs-ofctl dump-flows br-int | grep -r 'strip_vlan'
# to sg-xxx (fa:16:3e:a4:80:66), 'ovs-ofctl show br-int' shows 5 is sg-xxx
cookie=0xe0ec949fc5d474e0, duration=3218.405s, table=60, n_packets=4, n_bytes=354, idle_age=1871, priority=4,dl_vlan=2,dl_dst=fa:16:3e:a4:80:66 actions=strip_vlan,output:5
# to VM (fa:16:3e:f2:0b:e8)
cookie=0xe0ec949fc5d474e0, duration=2305.720s, table=60, n_packets=0, n_bytes=0, idle_age=2307, priority=4,dl_vlan=2,dl_dst=fa:16:3e:f2:0b:e8 actions=strip_vlan,output:8
cookie=0xe0ec949fc5d474e0, duration=2304.534s, table=60, n_packets=3020, n_bytes=1582208, idle_age=244, priority=90,dl_vlan=2,dl_dst=fa:16:3e:f2:0b:e8 actions=load:0x8->NXM_NX_REG5[],load:0x2->NXM_NX_REG6[],strip_vlan,resubmit(,81)

The bad node:
root@juju-c25ce5-dvr-8:/home/ubuntu# ovs-ofctl dump-flows br-int | grep -r 'strip_vlan'
# fa:16:3e:ec:de:47 is 192.168.21.1 (GW), 11 is qr-xxx (ovs-ofctl show br-int)
cookie=0x2cb74c913a070f53, duration=9337.512s, table=3, n_packets=0, n_bytes=0, idle_age=65534, priority=5,dl_vlan=3,dl_dst=fa:16:3f:4c:33:9e actions=mod_dl_dst:fa:16:3e:ec:de:47,strip_vlan,output:11
# to sg-xxx (fa:16:3e:4c:b2:57), 12 is sg-xxx
cookie=0x2cb74c913a070f53, duration=9337.499s, table=60, n_packets=0, n_bytes=0, idle_age=65534, priority=20,dl_vlan=3,dl_dst=fa:16:3e:4c:b2:57 actions=strip_vlan,output:12
# to VM (fa:16:3e:c0:d6:96), 8 is tap
cookie=0x2cb74c913a070f53, duration=9334.490s, table=60, n_packets=0, n_bytes=0, idle_age=65534, priority=20,dl_vlan=3,dl_dst=fa:16:3e:c0:d6:96 actions=strip_vlan,output:8
cookie=0x2cb74c913a070f53, duration=9333.196s, table=60, n_packets=1696, n_bytes=145105, idle_age=661, priority=90,dl_vlan=3,dl_dst=fa:16:3e:c0:d6:96 actions=load:0x8->NXM_NX_REG5[],load:0x3->NXM_NX_REG6[],strip_vlan,resubmit(,81)

In fact, while continuously pinging the VM from snat-xxx and running 'ovs-dpctl dump-flows' on the compute node, the output below is the same on both ussuri and stein (proto=6 means TCP), so this seems unrelated to the flow table.

# ussuri
recirc_id(0),in_port(13),ct_state(-trk),eth(src=fa:16:3e:d3:6f:80),eth_type(0x0800),ipv4(src=192.168.21.151,proto=6,frag=no), packets:24, bytes:2560, used:0.396s, flags:SP., actions:ct(zone=3),recirc(0x4b)

# stein
recirc_id(0),in_port(12),ct_state(-trk),eth(src=fa:16:3e:4c:29:6d),eth_type(0x0800),ipv4(src=192.168.21.5,proto=6,frag=no), packets:3271, bytes:307846, used:1.656s, flags:SP., actions:ct(zone=3),recirc(0x1)

While the ping kept running, "conntrack -L | grep mark=1 | grep 192.168.21" on the compute node showed nothing either, so conntrack also seems fine.
We then found that after adding a new nova-compute on which none of the 3 routers lives - i.e. a node with no snat-xxx namespace - a VM created there can be pinged.

juju add-unit nova-compute
openstack --debug server create --wait --image bionic --flavor m1.small --key-name testkey --nic net-id=$(openstack net show private -f value -c id) --availability-zone=nova:juju-21f0ba-focal-13.cloud.sts nosnatvm

Run the trace on the compute node. Note: include the mac addresses, so the result corresponds to the state after MAC learning; without the macs you will see the flooding result instead.

VM:             192.168.21.151 fa:16:3e:d3:6f:80
sg-5931e702-bc: 192.168.21.168 fa:16:3e:34:9e:64
https://pastebin.ubuntu.com/p/kc7pZxXQnc/
ovs-appctl ofproto/trace br-int in_port=9,ip,nw_proto=1,nw_src=192.168.21.151,nw_dst=192.168.21.168,dl_src=fa:16:3e:d3:6f:80,dl_dst=fa:16:3e:34:9e:64

But the output above says "recirc(0x6d) - resume conntrack with default ct_state=trk|new (use --ct-next to customize)", so we use:

ovs-appctl ofproto/trace br-int 'in_port=9,ip,nw_proto=1,nw_src=192.168.21.151,nw_dst=192.168.21.168,dl_src=fa:16:3e:d3:6f:80,dl_dst=fa:16:3e:34:9e:64' --ct-next 'trk,est'

The result: https://pastebin.ubuntu.com/p/tSQXQFfPBw/
The working result on stein: https://pastebin.ubuntu.com/p/ZTfXd6rVZ9/
But on ussuri the following table=94 rule seems to prevent the packet from entering br-tun:

94. reg6=0x3,dl_dst=fa:16:3e:34:9e:64, priority 12, cookie 0x8a4738b01717a42e
    output:8

We then checked the flows towards sg-xxx on the compute node and confirmed that the table=94 flows exist on ussuri.

root@juju-21f0ba-focal-10:/home/ubuntu# ovs-ofctl dump-flows br-int | grep fa:16:3e:34:9e:64
cookie=0x8a4738b01717a42e, duration=13204.575s, table=1, n_packets=0, n_bytes=0, idle_age=65534, priority=20,dl_vlan=3,dl_dst=fa:16:3e:34:9e:64 actions=mod_dl_src:fa:16:3e:5e:d6:96,resubmit(,60)
cookie=0x8a4738b01717a42e, duration=13204.573s, table=60, n_packets=0, n_bytes=0, idle_age=65534, priority=20,dl_vlan=3,dl_dst=fa:16:3e:34:9e:64 actions=strip_vlan,output:8
cookie=0x8a4738b01717a42e, duration=13202.646s, table=94, n_packets=12485, n_bytes=1063359, idle_age=0, priority=12,reg6=0x3,dl_dst=fa:16:3e:34:9e:64 actions=output:8
cookie=0x8a4738b01717a42e, duration=13202.646s, table=94, n_packets=0, n_bytes=0, idle_age=65534, priority=10,reg6=0x3,dl_src=fa:16:3e:34:9e:64,dl_dst=00:00:00:00:00:00/01:00:00:00:00:00 actions=mod_vlan_vid:3,output:2

# br-int(patch-tun) <-> br-tun(patch-int)
# sudo ovs-vsctl -- --columns=name,ofport list Interface |grep ': 8' -B1
name                : sg-5931e702-bc
ofport              : 8
# sudo ovs-vsctl -- --columns=name,ofport list Interface |grep ': 2' -B1
name                : ens7
ofport              : 2
--
name                : vxlan-0a050015
ofport              : 2
--
name                : patch-tun
ofport              : 2

root@juju-824e75-train2-8:~# ovs-ofctl dump-flows br-int | grep fa:16:3e:6b:60:7d
cookie=0x4580a22bf3824b00, duration=13818.613s, table=1, n_packets=0, n_bytes=0, idle_age=65534, priority=20,dl_vlan=3,dl_dst=fa:16:3e:6b:60:7d actions=mod_dl_src:fa:16:3e:4b:f5:19,resubmit(,60)
cookie=0x4580a22bf3824b00, duration=13818.611s, table=60, n_packets=0, n_bytes=0, idle_age=65534, priority=20,dl_vlan=3,dl_dst=fa:16:3e:6b:60:7d actions=strip_vlan,output:7

table=94 is ACCEPTED_EGRESS_TRAFFIC_NORMAL_TABLE and is only related to the firewall; the relevant patches are:

hua@t440p:/bak/openstack/neutron$ git log --oneline 1c2e10f859...16.0.0 neutron/agent/linux/openvswitch_firewall/firewall.py
6dbba8d5ce Check SG members instead of ports to skip flow update
efa8dd0895 Add accepted egress direct flow
991126eb6e Merge "[OVS FW] Clean port rules if port not found in ovsdb"
b01e0c2aa9 [OVS FW] Clean port rules if port not found in ovsdb
5cb0ff418a Add more condition to check sg member exist
a94cb83e18 Merge "Handle OVSFWPortNotFound and OVSFWTagNotFound in ovs firewall"
e801159003 Handle OVSFWPortNotFound and OVSFWTagNotFound in ovs firewall
4b67a06403 Log OVS firewall conjunction creation

After deleting the table=94 rule above, it works again.

sudo ovs-ofctl -O openflow13 --strict del-flows br-int "table=94,priority=12,reg6=0x3,dl_dst=fa:16:3e:34:9e:64"

The table=94 flows are created when explicitly_egress_direct=false - https://review.opendev.org/c/openstack/neutron/+/704506/1/neutron/agent/linux/openvswitch_firewall/firewall.py
Search this article for 1732067 to see the detailed description of that bug (https://bugs.launchpad.net/neutron/+bug/1732067)
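For reference, a hedged sketch of where this option lives (the neutron ml2/ovs agent config; the value shown is only a placeholder - check the bug and your neutron version before changing it):
# /etc/neutron/plugins/ml2/openvswitch_agent.ini
# [agent]
# explicitly_egress_direct = False
# then restart the agent:
# sudo systemctl restart neutron-openvswitch-agent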

In other words, this table=94 flow may be created in a dvr environment, but in a vrrp + dvr environment it should only be created on non-standby nodes. L3HARouterAgentPortBinding(port_id, router_id, l3_agent_id, state) can tell whether an l3_agent_id is standby, and ./neutron/db/l3_hamode_db.py#get_active_host_for_ha_router can also tell whether it is standby.

For the continued code analysis see: https://zhhuabj.blog.csdn.net/article/details/78435072

20220826 - comparing flows before and after the problem occurs

for i in 0 1 2; do juju run -u nova-compute/$i -- "sudo ovs-ofctl -O OpenFlow15 dump-flows --no-stats --rsort br-int" > flows-$i; done