lspci | grep -i Mell
[root@localhost ~]# lspci | grep -i Mell
4b:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
4b:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
[root@localhost ~]#
处理本地yum源,安装依赖包
vi /etc/yum.repos.d/cndba.repo
[cndba.cn]
name = CNDBA.CN YUM Server
baseurl=file:///cndba
gpgcheck=0
enabled=1
#################
mkdir /cndba
mount -o loop CentOS-7-x86_64-DVD-2009.iso /cndba/
yum clean all
yum -y install tcl tk
下载驱动
根据操作系统版本,下载驱动
https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
安装驱动
tar -xvf MLNX_OFED_LINUX-23.10-4.0.9.1-rhel7.9-x86_64.tar
./MLNX_OFED_LINUX-23.10-4.0.9.1-rhel7.9-x86_64/mlnxofedinstall
使用 ibv_devinfo 检查端口的链路类型(link_layer):是IB模式,还是以太网模式
ibv_devinfo
link_layer: InfiniBand # IB模式 LINK_TYPE_P1=1
link_layer: Ethernet # 网络模式 LINK_TYPE_P1=2
# 如果是网络模式就需要修改模式,并重启服务器
mlxconfig -d 4b:00.0 set LINK_TYPE_P1=1
ibnodes命令,会发现端口连接的信息
ibnodes |grep localhost
[root@localhost ~]# ibnodes |grep loca
Ca : 0x9c63c003005a2852 ports 1 "localhost HCA-1"
Ca : 0xa088c2030058bc6c ports 1 "localhost HCA-1"
Ca : 0xa088c2030058c29c ports 1 "localhost HCA-1"
Ca : 0xa088c2030058be0c ports 1 "localhost HCA-1"
Ca : 0xa088c2030058c2ec ports 1 "localhost HCA-1"
Ca : 0x9c63c003005a12c2 ports 1 "localhost HCA-1"
ibstatus可以查看网卡状态
[root@localhost ~]# ibstatus
Infiniband device 'mlx5_0' port 1 status:
default gid: fe80:0000:0000:0000:9c63:c003:005f:9202
base lid: 0x47
sm lid: 0x7
state: 4: ACTIVE
phys state: 5: LinkUp
rate: 100 Gb/sec (2X HDR)
link_layer: InfiniBand
Infiniband device 'mlx5_1' port 1 status:
default gid: fe80:0000:0000:0000:9c63:c003:005f:9203
base lid: 0xffff
sm lid: 0x0
state: 1: DOWN
phys state: 3: Disabled
rate: 10 Gb/sec (4X SDR)
link_layer: InfiniBand
[root@localhost ~]#
安装日志
[root@localhost MLNX_OFED_LINUX-23.10-4.0.9.1-rhel7.9-x86_64]# ./mlnxofedinstall
Logs dir: /tmp/MLNX_OFED_LINUX.110625.logs
General log file: /tmp/MLNX_OFED_LINUX.110625.logs/general.log
Verifying KMP rpms compatibility with target kernel...
This program will install the MLNX_OFED_LINUX package on your machine.
Note that all other Mellanox, OEM, OFED, RDMA or Distribution IB packages will be removed.
Those packages are removed due to conflicts with MLNX_OFED_LINUX, do not reinstall them.
Do you want to continue?[y/N]:y
Uninstalling MLNX_EN driver
rpm --nosignature -e --allmatches --nodeps libibverbs librdmacm libibverbs librdmacm
Starting MLNX_OFED_LINUX-23.10-4.0.9.1 installation ...
Preparing... ########################################
mlnx-tools-23.10-0.2310409 ########################################
Preparing... ########################################
libibverbs-2307mlnx47-1.2310409 ########################################
Installing mlnx-ofa_kernel RPM
。。。。。。
Device (4b:00.0):
4b:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
Link Width: x16
PCI Link Speed: 16GT/s
Device (4b:00.1):
4b:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
Link Width: x16
PCI Link Speed: 16GT/s
Installation finished successfully.
Preparing... ################################# [100%]
Updating / installing...
1:mlnx-fw-updater-23.10-4.0.9.1 ################################# [100%]
Added 'RUN_FW_UPDATER_ONBOOT=no to /etc/infiniband/openib.conf
Initializing...
Attempting to perform Firmware update...
Querying Mellanox devices firmware ...
Device #1:
----------
Device Type: ConnectX6
Part Number: MCX653106A-ECA_Ax
Description: ConnectX-6 VPI adapter card; H100Gb/s (HDR100; EDR IB and 100GbE); dual-port QSFP56; PCIe3.0 x16; tall bracket; ROHS R6
PSID: MT_0000000224
PCI Device Name: 4b:00.0
Base MAC: a088c28d8342
Versions: Current Available
FW 20.39.3004 20.39.4082
PXE 3.7.0300 3.7.0300
UEFI 14.32.0017 14.33.0012
Status: Update required
---------
Found 1 device(s) requiring firmware update...
Device #1: Updating FW ...
FSMST_INITIALIZE - OK
Writing Boot image component - OK
Done
Restart needed for updates to take effect.
Log File: /tmp/sL4B4OGlnS
Real log file: /tmp/MLNX_OFED_LINUX.110625.logs/fw_update.log
To load the new driver, run:
/etc/init.d/openibd restart
[root@localhost MLNX_OFED_LINUX-23.10-4.0.9.1-rhel7.9-x86_64]# /etc/init.d/openibd restart
client_loop: send disconnect: Broken pipe
##############
[root@localhost ~]# mlnx_tune
Mellanox Technologies - System Report
Operation System Status
CentOS Linux7.9.2009
3.10.0-1160.el7.x86_64
CPU Status
GenuineIntel Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz N/A
OK: Frequency 799.963MHz >>> CPU frequency is below maximum. Install cpupowerutils and run x86_energy_perf_policy performance.
Memory Status
Total: 755.11 GB
Free: 731.36 GB
Hugepages Status
On NUMA 1:
Transparent enabled: always
Transparent defrag: always
Hyper Threading Status
ACTIVE
IRQ Balancer Status
NOT PRESENT
Firewall Status
NOT PRESENT
IP table Status
NOT PRESENT
IPv6 table Status
NOT PRESENT
Driver Status
OK: MLNX_OFED_LINUX-23.10-4.0.9.1 (OFED-23.10-4.0.9)
ConnectX-6 Device Status on PCI 4b:00.0
FW version 20.39.4082
OK: PCI Width x16
OK: PCI Speed 16GT/s
PCI Max Payload Size 512
PCI Max Read Request 4096
Local CPUs list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]
ib0 (Port 1) Status
Link Type ib
OK: Link status Up
Speed EDR
MTU 2044
ConnectX-6 Device Status on PCI 4b:00.1
FW version 20.39.4082
OK: PCI Width x16
OK: PCI Speed 16GT/s
PCI Max Payload Size 512
PCI Max Read Request 4096
Local CPUs list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]
ens2f1 (Port 1) Status
Link Type eth
Warning: Link status Down >>> Check your port configuration (Physical connection, SM, IP).
Speed N/A
MTU 1500
OK: TX nocache copy 'off'
###########
[root@localhost ~]# ibv_devinfo
hca_id: mlx5_0
transport: InfiniBand (0)
fw_ver: 20.39.4082
node_guid: a088:c203:008d:8342
sys_image_guid: a088:c203:008d:8342
vendor_id: 0x02c9
vendor_part_id: 4123
hw_ver: 0x0
board_id: MT_0000000224
phys_port_cnt: 1
port: 1
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 7
port_lid: 74
port_lmc: 0x00
link_layer: InfiniBand
hca_id: mlx5_1
transport: InfiniBand (0)
fw_ver: 20.39.4082
node_guid: a088:c203:008d:8343
sys_image_guid: a088:c203:008d:8342
vendor_id: 0x02c9
vendor_part_id: 4123
hw_ver: 0x0
board_id: MT_0000000224
phys_port_cnt: 1
port: 1
state: PORT_DOWN (1)
max_mtu: 4096 (5)
active_mtu: 1024 (3)
sm_lid: 0
port_lid: 0
port_lmc: 0x00
link_layer: Ethernet