文章目录
A、背景介绍
因前期规划不充分,将 ceph 的 public 网络、cluster 网络以及 pve 的 corosync 网络混合共用了同一个网络。后因 corosync 的 UDP 端口 5405 报文速率过快,导致网络丢包和 ping 延迟抖动,进而导致 ceph OSD 出现 down 和 slow,以及 pve 集群管理出现故障。如下图所示
B、整改规划
- 启用专用物理接口承载pve的集群流量(corosync走专用网络-千兆电口)
- ceph的public网络和cluster网络 继续沿用之前的高带宽网络
C、pve集群(corosync)网络要求
- 所有节点之间的 LAN 延迟应小于 5ms
- corosync不会占用大量带宽,但是对延迟非常敏感
- 最好是将corosync与存储网络分开
- corosync 使用 UDP 单播(端口 5405-5412)
D、实施修改pve集群网络(每个集群节点都要执行)
1、先查看这两个文件的内容,正常情况下两者应完全一致
cat /etc/pve/corosync.conf #这个文件在集群正常运行时,默认是没有权限进行修改的
cat /etc/corosync/corosync.conf
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: pve-ceph01
nodeid: 1
quorum_votes: 1
ring0_addr: 10.99.99.1
}
node {
name: pve-ceph02
nodeid: 2
quorum_votes: 1
ring0_addr: 10.99.99.2
}
node {
name: pve-ceph03
nodeid: 3
quorum_votes: 1
ring0_addr: 10.99.99.3
}
node {
name: pve-ceph04
nodeid: 4
quorum_votes: 1
ring0_addr: 10.99.99.4
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: LT-TEST-CEPH
config_version: 6 #注意这里的版本号,修改后要加1
interface {
linknumber: 0
}
ip_version: ipv4-6
link_mode: passive
secauth: on
version: 2
}
2、修改 /etc/corosync/corosync.conf 文件的内容如下(注意 config_version 必须加 1,此处已从 6 改为 7)
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: pve-ceph01
nodeid: 1
quorum_votes: 1
ring0_addr: 10.15.11.198
}
node {
name: pve-ceph02
nodeid: 2
quorum_votes: 1
ring0_addr: 10.15.11.109
}
node {
name: pve-ceph03
nodeid: 3
quorum_votes: 1
ring0_addr: 10.15.11.78
}
node {
name: pve-ceph04
nodeid: 4
quorum_votes: 1
ring0_addr: 10.15.11.137
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: LT-TEST-CEPH
config_version: 7
interface {
linknumber: 0
}
ip_version: ipv4-6
link_mode: passive
secauth: on
version: 2
}
3、停止该节点的集群服务
systemctl stop pve-cluster.service
systemctl stop corosync.service
4、将集群文件系统设为本地模式
pmxcfs -l
5、将新配置同步到 /etc/pve/corosync.conf(先删除旧文件,再复制新文件)
rm /etc/pve/corosync.conf
cp /etc/corosync/corosync.conf /etc/pve/
6、重启pve集群
killall pmxcfs
systemctl start pve-cluster.service
E、验证
1、查看pve集群节点状态
pvecm status
2、查看 corosync 集群当前节点连接状态
root@pve-ceph01:/var/log# corosync-cfgtool -s #现场输出未保留,以下是正常集群的状态;异常时会有随机节点显示 disconnected
Local node ID 1, transport knet
LINK ID 0 udp
addr = 10.15.11.198
status:
nodeid: 1: localhost
nodeid: 2: connected
nodeid: 3: connected
nodeid: 4: connected
3、查看 corosync 当前已连接的节点和链路
root@pve-ceph01:/var/log# corosync-cfgtool -n
Local node ID 1, transport knet
nodeid: 2 reachable
LINK: 0 udp (10.15.11.198->10.15.11.109) enabled connected mtu: 1397
nodeid: 3 reachable
LINK: 0 udp (10.15.11.198->10.15.11.78) enabled connected mtu: 1397
nodeid: 4 reachable
LINK: 0 udp (10.15.11.198->10.15.11.137) enabled connected mtu: 1397
4、查看corosync 集群选举状态
root@pve-ceph01:/var/log# corosync-quorumtool -s
Quorum information
------------------
Date: Fri May 5 15:06:45 2023
Quorum provider: corosync_votequorum
Nodes: 4
Node ID: 1
Ring ID: 1.4d6
Quorate: Yes
Votequorum information
----------------------
Expected votes: 4
Highest expected: 4
Total votes: 4
Quorum: 3
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
1 1 pve-ceph01 (local)
2 1 pve-ceph02
3 1 pve-ceph03
4 1 pve-ceph04
5、查看corosync的相关日志信息
journalctl -b -u corosync
附、参考文章
如果文章对你有帮助,欢迎点击上方按钮打赏作者
暂无评论