HEX
Server: Apache/2.4.52 (Ubuntu)
System: Linux WebLive 5.15.0-79-generic #86-Ubuntu SMP Mon Jul 10 16:07:21 UTC 2023 x86_64
User: ubuntu (1000)
PHP: 7.4.33
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Upload Files
File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/rdma_monitor.py
import resource
import re
import sys
import os, time

sys.path.append(os.getcwd() + '/../../../comm/')
import constant
import datetime, subprocess
from plugin_base import VmBaseCollector
from utils.metric_handler import MetricHandler
from pynvml.pynvml import *
import cutils 
import json
import re
from Queue import Queue

# Seconds a shell command may run before ExecuteTimeoutCmd kills it.
CMD_TIMEOUT_SEC = 3
# Placeholder stored for any metric that could not be read or parsed.
VAL_ERR         = -1
# Sentinel string ExecuteTimeoutCmd returns instead of output on timeout.
CMD_TIMEOUT_VAL = "-1"
# Number of Queue objects created per bonded netdev in get_rdma_devices.
HPBW_NUM = 10
# Maximum size passed to each of those Queue objects.
QUEUE_DEPTH=10

class NetworkCollect():
    def __init__(self):
        """Discover the bonded RDMA devices and remember the current uptime."""
        found = self.get_rdma_devices()
        # Example contents: [mln5_bond_0], [bond0], [[0,1]]
        self.ib_devs, self.rdma_netdevs, self.netdev_slave_index = found[0], found[1], found[2]
        # Example contents: [eth0, eth1], [5e:00.0, 5e:00.1]
        self.eth_ifs, self.eth_bdfs = found[3], found[4]
        # hpbw queues per netdev; elements in order: 'rx_hpbw_max','rx_hpbw_min',
        # 'rx_hpbw_p50','rx_hpbw_p90','rx_hpbw_avg','tx_hpbw_max','tx_hpbw_min',
        # 'tx_hpbw_p50','tx_hpbw_p90','tx_hpbw_avg'
        self.hpbw_stats = found[5]
        # Alarms are held back until the system has been up at least 15 minutes.
        self.uptime_alarm_interval = 15 * 60
        self.last_read_uptime = self.uptime = self.read_uptime_secs()

    def check_cmd_exist(self, cmd):
        """Report whether an entry named ``cmd`` exists in a directory on PATH.

        Args:
            cmd: bare command name (no directory part).

        Returns:
            1 if some PATH directory lists an entry called ``cmd``, else 0.
            An unset PATH or an unreadable directory counts as "not found"
            instead of raising.
        """
        for cmdpath in os.environ.get('PATH', '').split(':'):
            if not os.path.isdir(cmdpath):
                continue
            try:
                entries = os.listdir(cmdpath)
            except OSError:
                # Directory vanished or is not readable by this user.
                continue
            if cmd in entries:
                return 1
        return 0

    # different from utils because we need high precision
    def ExecuteTimeoutCmd(self, command):
        """Run ``command`` through the shell and return what it wrote to stdout.

        The child is polled every 0.1 ms. Once it has been running for more
        than CMD_TIMEOUT_SEC seconds it is killed and CMD_TIMEOUT_VAL is
        returned instead of any output. stderr is discarded.

        Args:
            command: shell command line (executed with shell=True).

        Returns:
            The child's complete stdout, or CMD_TIMEOUT_VAL on timeout.

        Note:
            stdout is only read after the child exits, so a command that
            writes more than the pipe can buffer will block and be reported
            as a timeout.
        """
        started = time.time()
        # stderr is never consumed by any caller; sending it to /dev/null
        # keeps a chatty child from blocking on a full stderr pipe.
        devnull = open(os.devnull, 'w')
        try:
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=devnull, shell=True)
        finally:
            devnull.close()  # the child holds its own copy of the descriptor
        try:
            while process.poll() is None:
                time.sleep(0.0001)
                # Compare fractional elapsed time so the limit really is
                # CMD_TIMEOUT_SEC (whole-second truncation made it ~1s longer).
                if time.time() - started > CMD_TIMEOUT_SEC:
                    # kill + wait reaps the child (historical fix for an
                    # fd leak with the Python 2.6 subprocess module).
                    process.kill()
                    process.wait()
                    return CMD_TIMEOUT_VAL   # timeout
            return process.stdout.read()
        finally:
            # Release the pipe right away instead of waiting for GC.
            process.stdout.close()

    def valueCast(self, value):
        """Convert ``value`` to a number.

        Tries an int first, then a float rounded to 3 decimals; returns
        VAL_ERR when neither conversion accepts the value.
        """
        try:
            return int(value)
        except ValueError:
            pass  # not an integer literal, fall through to float
        try:
            as_float = float(value)
            return round(as_float, 3)
        except ValueError:
            return VAL_ERR
    
    def read_uptime_secs(self):
        """Return the integer part of /proc/uptime, or VAL_ERR on timeout."""
        raw_uptime = self.ExecuteTimeoutCmd("cut -d '.' -f 1 /proc/uptime")
        if raw_uptime == CMD_TIMEOUT_VAL:
            return VAL_ERR
        return self.valueCast(raw_uptime)

    # Only supports files whose first line is a single number.
    def read_file_contents(self, path):
        try:
            if os.path.exists(path):
                with open(path) as fd:
                    contents = fd.readlines()
                    for line in contents:
                        return self.valueCast(line)
        except Exception:
            pass
        return VAL_ERR

    # Only supports command output whose lines look like:
    # name : val
    # This function extracts val from such a line.
    def read_cmd_contents(self, cmd):
        """Run ``cmd`` and return the value after the first ':' of its output.

        Lines without a ':' are skipped; the first line that has one decides
        the result. VAL_ERR is returned on timeout or when no line qualifies.
        """
        output = self.ExecuteTimeoutCmd(cmd)
        if output == CMD_TIMEOUT_VAL:
            return VAL_ERR
        for row in output.splitlines():
            try:
                fields = row.split(":")
                return self.valueCast(fields[1])
            except Exception:
                continue
        return VAL_ERR

    def check_pcie_width(self, bdf_list):
        """Return the BDFs whose max or current PCIe link width is not '16'.

        Any exception stops the scan; the BDFs collected so far are returned.
        """
        degraded = []
        try:
            for bdf in bdf_list:
                max_width_file = "/sys/bus/pci/devices/" + bdf + "/max_link_width"
                max_width = cutils.FileQuickOp(max_width_file).get_match_line(match="", single_match=True)
                cur_width_file = "/sys/bus/pci/devices/" + bdf + "/current_link_width"
                cur_width = cutils.FileQuickOp(cur_width_file).get_match_line(match="", single_match=True)
                if max_width != '16' or cur_width != '16':
                    degraded.append(bdf)
        except Exception:
            pass
        return degraded

    def check_pcie_speed(self, bdf_list):
        """Return the BDFs whose current PCIe link speed differs from the max.

        A current speed of '32' is always accepted, and a device is only
        flagged when both speeds were read as non-empty strings. Any exception
        stops the scan; the BDFs collected so far are returned.
        """
        slow_devices = []
        try:
            for bdf in bdf_list:
                max_speed_file = "/sys/bus/pci/devices/" + bdf + "/max_link_speed"
                max_speed = cutils.FileQuickOp(max_speed_file).get_column_value(match="GT/s", split=" ", count=1)
                cur_speed_file = "/sys/bus/pci/devices/" + bdf + "/current_link_speed"
                cur_speed = cutils.FileQuickOp(cur_speed_file).get_column_value(match="GT/s", split=" ", count=1)
                if cur_speed == '32':
                    continue
                both_known = cur_speed != '' and max_speed != ''
                if both_known and cur_speed != max_speed:
                    slow_devices.append(bdf)
        except Exception:
            pass
        return slow_devices

    def check_pcie_aer(self, bdf_list):
        """Return the distinct BDFs whose aer_dev_fatal has a non-'0' counter.

        Each row is split on a single space and its second field is compared
        with '0'. Any exception stops the scan; the BDFs collected so far are
        returned (de-duplicated, order not guaranteed).
        """
        flagged = []
        try:
            for bdf in bdf_list:
                aer_file = "/sys/bus/pci/devices/" + bdf + "/aer_dev_fatal"
                rows = cutils.FileQuickOp(aer_file).get_raw_data()
                for row in rows:
                    count_field = row.strip().split(' ')[1]
                    if count_field != '0':
                        flagged.append(bdf)
        except Exception:
            pass
        return list(set(flagged))

    def get_rdma_devices(self):
        """Discover bonded RDMA devices from the output of ``ibdev2netdev``.

        Only output lines containing "==>" are considered, and only when the
        netdev name on the right-hand side starts with "bond" and
        /sys/class/net/<netdev>/bonding/slaves exists.

        Returns:
            A six-element list:
              [0] ib device names
              [1] bond netdev names (parallel to [0])
              [2] per bond, the list of indexes of its slaves inside [3]
              [3] slave interface names of all accepted bonds
              [4] "bus-info" values reported by ``ethtool -i`` for the slaves
              [5] dict netdev -> list of HPBW_NUM Queue(QUEUE_DEPTH) objects

        NOTE(review): [4] lines up with [3] only when every slave yields
        exactly one bus-info line; nothing here enforces that.
        """
        ibdevs = []
        netdevs = []
        slave_indexs = []
        ethifs = []
        ethbdfs = []
        hpbwstats = {}
        slave_num = 0
        linkfd = self.ExecuteTimeoutCmd('ibdev2netdev')
        for dev_line in linkfd.splitlines():
            if dev_line.find("==>") == -1:
                continue
            try:
                ibdev = dev_line.split("==>")[0].strip().split()[0].strip()
                netdev = dev_line.split("==>")[1].strip().split()[0].strip()
                if not netdev.startswith("bond"):
                    continue
                slave_path = '/sys/class/net/' + netdev + '/bonding/slaves'
                if not os.path.exists(slave_path):
                    continue
                slave_index = []
                with open(slave_path) as slaves_fd:
                    slave_lines = slaves_fd.readlines()
                for slave_line in slave_lines:
                    for slave in slave_line.split():
                        slave_index.append(slave_num)
                        slave_num += 1
                        ethifs.append(slave)
                        bdffd = self.ExecuteTimeoutCmd('ethtool -i ' + slave)
                        for info_line in bdffd.splitlines():
                            if info_line.find("bus-info") != -1:
                                ethbdfs.append(info_line.split("bus-info:")[1].strip())
                ibdevs.append(ibdev)
                netdevs.append(netdev)
                slave_indexs.append(slave_index)
                hpbwstats[netdev] = [Queue(QUEUE_DEPTH) for _ in range(HPBW_NUM)]
            except Exception:
                # Best effort: a malformed line or unreadable sysfs entry skips
                # this bond. Narrowed from a bare except so KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                pass
        return [ibdevs, netdevs, slave_indexs, ethifs, ethbdfs, hpbwstats]

    def get_monitor_info(self):
        monitor_info = []
        netdev_monitor_info = []
        ethif_monitor_info = []
        for i in range(len(self.eth_ifs)):
            ethif_info_key = [
                "link_detected",
                'link_events_down_phy',
                'dcqcn_enable',
                'max_link_speed',
                'max_link_width',
                'current_link_speed',
                'current_link_width',
                'eth_promisc',
                'aer_has_fatal',
                'device_id',
                'switch_ports',
                "lacp_id",
                'lacp_status',
                "rpg_time_reset",
                "rpg_ai_rate",
                "rpg_byte_reset",
                "rate_reduce_monitor_period",
                "cnp_dscp",
                "cnp_802p_prio"
            ]
            ethif_info_value = [VAL_ERR] * len(ethif_info_key)
            ethiffd = self.ExecuteTimeoutCmd('ethtool ' + self.eth_ifs[i] + ' | grep detected')
            if ethiffd != CMD_TIMEOUT_VAL:
                ethif_info_value[0] = 0
                for line in ethiffd.splitlines():
                    if line.find('yes') != -1:
                        ethif_info_value[0] = 1
                if self.uptime < self.uptime_alarm_interval and ethif_info_value[0] == 0:
                    ethif_info_value[0] = VAL_ERR
            ethif_info_value[1] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep -i link')
            rp = str(self.read_file_contents('/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_rp/enable/5')) 
            np = str(self.read_file_contents('/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_np/enable/5')) 
            try:
                ethif_info_value[2] = self.valueCast(rp + np) if rp != str(VAL_ERR) and np != str(VAL_ERR) else VAL_ERR
            except Exception:
                pass
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/device/max_link_speed'
            if os.path.exists(ethif_path):
                with open(ethif_path) as ethif_fd:
                    contents = ethif_fd.readlines()
                    for line in contents:
                        try:
                            ethif_info_value[3] = self.valueCast(line.split()[0])
                        except Exception:
                            pass
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/device/max_link_width'
            ethif_info_value[4] = self.read_file_contents(ethif_path)
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/device/current_link_speed'
            if os.path.exists(ethif_path):
                with open(ethif_path) as ethif_fd:
                    contents = ethif_fd.readlines()
                    for line in contents:
                        try:
                            ethif_info_value[5] = self.valueCast(line.split()[0])
                        except Exception:
                            pass
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/device/current_link_width'
            ethif_info_value[6] = self.read_file_contents(ethif_path)
            ethiffd = self.ExecuteTimeoutCmd('ifconfig -a ' + self.eth_ifs[i] + ' | grep -i promisc')
            if ethiffd == "":
                ethif_info_value[7] = 0
            elif ethiffd != CMD_TIMEOUT_VAL:
                ethif_info_value[7] = 1
            ethif_path = '/sys/bus/pci/devices/' + self.eth_bdfs[i] + '/aer_dev_fatal'
            if os.path.exists(ethif_path):
                with open(ethif_path) as ethif_fd:
                    contents = ethif_fd.readlines()
                    for line in contents:
                        try:
                            if line.strip().split(' ')[1] != '0':
                                ethif_info_value[8] = 1
                        except Exception:
                            pass
                ethif_info_value[8] = 0
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/device/device'
            if os.path.exists(ethif_path):
                with open(ethif_path) as ethif_fd:
                    contents = ethif_fd.readlines()
                    for line in contents:
                        ethif_info_value[9] = line.strip('\n')
            ethif_path = '/sys/bus/pci/devices/' + self.eth_bdfs[i]
            if os.path.exists(ethif_path):
                ethif_info_value[10] = []
                link = os.readlink(ethif_path)
                for b in link.split('/'):
                    if b == '..' or b == self.eth_bdfs[i] or 'pci' in b or 'devices' in b:
                        continue
                    ethif_info_value[10].append(b)
            #ethif_info_value[11] lacp processed below!
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_rp/rpg_time_reset'
            ethif_info_value[13] = self.read_file_contents(ethif_path)
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_rp/rpg_ai_rate'
            ethif_info_value[14] = self.read_file_contents(ethif_path)
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_rp/rpg_byte_reset'
            ethif_info_value[15] = self.read_file_contents(ethif_path)
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_rp/rate_reduce_monitor_period'
            ethif_info_value[16] = self.read_file_contents(ethif_path)
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_np/cnp_dscp'
            ethif_info_value[17] = self.read_file_contents(ethif_path)
            ethif_path = '/sys/class/net/' + self.eth_ifs[i] + '/ecn/roce_np/cnp_802p_prio'
            ethif_info_value[18] = self.read_file_contents(ethif_path)
            ethif_monitor_info.append(dict(zip(ethif_info_key, ethif_info_value)))
        for i in range(len(self.ib_devs)):
            ibdev_info_key = [
                'link_state',
                'traffic_class',
                'qp_num',
                'cq_num',
                'mr_num',
                'pd_num',
                'cmid_num',
                'ctx_num',
                'ports_num',
            ]
            ibdev_info_value = [VAL_ERR] * len(ibdev_info_key)
            ibdev_path = '/sys/class/infiniband/' + self.ib_devs[i] + '/ports/1/state'
            if os.path.exists(ibdev_path):
                with open(ibdev_path) as ibdev_fd:
                    contents = ibdev_fd.readlines()
                    ibdev_info_value[0] = 0
                    for line in contents:
                        if line.find("ACTIVE") != -1:
                            ibdev_info_value[0] = 1
                            break
                    if ibdev_info_value[0] == 0 and self.uptime < self.uptime_alarm_interval: #abnormal in start period
                        ibdev_fd.flush()
                        os.fsync(ibdev_fd.fileno())
                        ibdev_info_value[0] = VAL_ERR
                if ibdev_info_value[0] == 0 and self.last_read_uptime < self.uptime_alarm_interval and self.uptime > self.uptime_alarm_interval:
                    self.ExecuteTimeoutCmd('/usr/local/qcloud/monitor/barad/admin/stop.sh && /usr/local/qcloud/monitor/barad/admin/trystart.sh')
            ibdev_path = '/sys/class/infiniband/' + self.ib_devs[i] + '/tc/1/traffic_class'
            if os.path.exists(ibdev_path):
                with open(ibdev_path) as ibdev_fd:
                    contents = ibdev_fd.readlines()
                    for line in contents:
                        try:
                            ibdev_info_value[1] = self.valueCast(line.split('=')[1])
                        except Exception:
                            pass
                    if ibdev_info_value[1] == 0 and self.uptime < self.uptime_alarm_interval: #abnormal in start period
                        ibdev_fd.flush()
                        os.fsync(ibdev_fd.fileno())
                        ibdev_info_value[1] = VAL_ERR 
                if ibdev_info_value[1] == 0 and self.last_read_uptime < self.uptime_alarm_interval and self.uptime > self.uptime_alarm_interval:
                    self.ExecuteTimeoutCmd('/usr/local/qcloud/monitor/barad/admin/stop.sh && /usr/local/qcloud/monitor/barad/admin/trystart.sh')
            ibdevfd = self.ExecuteTimeoutCmd('/opt/mellanox/iproute2/sbin/rdma -d resource show')
            if ibdevfd != CMD_TIMEOUT_VAL:
                for line in ibdevfd.splitlines():
                    if line.find(self.ib_devs[i]) != -1:
                        try:
                            ibdev_info_value[2] = self.valueCast(line.split("qp ")[1].split(" ")[0])
                            ibdev_info_value[3] = self.valueCast(line.split("cq ")[1].split(" ")[0])
                            ibdev_info_value[4] = self.valueCast(line.split("mr ")[1].split(" ")[0])
                            ibdev_info_value[5] = self.valueCast(line.split("pd ")[1].split(" ")[0])
                            ibdev_info_value[6] = self.valueCast(line.split("cm_id ")[1].split(" ")[0])
                            ibdev_info_value[7] = self.valueCast(line.split("ctx ")[1].split(" ")[0])
                            break
                        except Exception:
                            pass
            ibdevfd = self.ExecuteTimeoutCmd('cat /proc/net/bonding/' + self.rdma_netdevs[i] + " | grep \"Number of ports\"")
            if ibdevfd != CMD_TIMEOUT_VAL:
                for line in ibdevfd.splitlines():
                    try:
                        if line.find('Number of ports:') != -1:
                            ibdev_info_value[8] = self.valueCast(line.split(":")[1].strip)
                    except:
                        pass
            ibdevfd = self.ExecuteTimeoutCmd('cat /proc/net/bonding/' + self.rdma_netdevs[i] + " | egrep \"Aggregator active|Aggregator ID\" | grep -P \"^\S\"")
            if ibdevfd != CMD_TIMEOUT_VAL:
                id = [0]*2
                ididx = 0
                status = [0]*2
                statusid = 0
                for line in ibdevfd.splitlines():
                    try:
                        if line.find('Aggregator ID:') != -1 and ididx < 2:
                            id[ididx] = self.valueCast(line.split(':')[1].strip())
                            ididx += 1
                        elif line.find('Aggregator active:') != -1 and statusid < 2:
                            status[statusid] = self.valueCast(line.split(':')[1].strip())
                            statusid += 1
                    except Exception:
                        pass
                for idx in self.netdev_slave_index[i]:
                    ethif_monitor_info[idx]['lacp_status'] = status[idx%2]
                    ethif_monitor_info[idx]['lacp_id'] = id[idx%2]
            netdev_monitor_info.append(dict(zip(ibdev_info_key, ibdev_info_value)))
        monitor_info.append(netdev_monitor_info)
        monitor_info.append(ethif_monitor_info)
        return monitor_info

    def get_config_info(self):
        config_info = []
        netdev_config_info = []
        ethif_config_info = []
        bond_ip_set = []

        for i in range(len(self.ib_devs)):
            ibdevfd = self.ExecuteTimeoutCmd('ip addr show dev ' + self.rdma_netdevs[i] + ' | grep inet')
            for line in ibdevfd.splitlines():
                if line.find('inet') != -1 and line.find('inet6') == -1:
                    try:
                        bond_ip_set.append(line.split()[1].split('/')[0])
                    except Exception:
                        pass

        for i in range(len(self.ib_devs)):
            ibdev_info_key = [
                'bonding_mode',
                'active_mtu',
                'ofed_version',
                'nv_peer_mem_state',
                'nv_peer_mem_ver',
                'rdma_ip',
                'bond_mac',
                'bond_dev_order',
                'bond_duplicate_ip_dect'
            ]
            ibdev_info_value = [VAL_ERR] * len(ibdev_info_key)
            ibdev_path = '/sys/class/net/' + self.rdma_netdevs[i] + '/bonding/mode'
            if os.path.exists(ibdev_path):
                with open(ibdev_path) as ibdev_fd:
                    contents = ibdev_fd.readlines()
                    for line in contents:
                        try:
                            ibdev_info_value[0] = self.valueCast(line.split()[1])
                        except Exception:
                            pass
            ibdev_info_value[1] = self.read_file_contents('/sys/class/net/' + self.rdma_netdevs[i] +'/mtu')
            ibdevfd = self.ExecuteTimeoutCmd('ofed_info -s')
            for line in ibdevfd.splitlines():
                try:
                    ver_str = line.split('-')[1].replace('.', '') + line.split('-')[2].replace('.', '').replace(':', '')
                    ibdev_info_value[2] = self.valueCast(ver_str)
                except Exception:
                    pass
            ibdevfd = self.ExecuteTimeoutCmd('lsmod | grep nvidia_peermem')
            if ibdevfd == "":
                ibdev_info_value[3] = 0
            else:
                for line in ibdevfd.splitlines():
                    if line.find('nvidia_peermem') != -1:
                        ibdev_info_value[3] = 1
            if ibdev_info_value[3] != VAL_ERR:
                ibdevfd = self.ExecuteTimeoutCmd('modinfo nvidia_peermem')
                for line in ibdevfd.splitlines():
                    if line.find('version') != -1:
                        try:
                            ver_str = line.split(':')[1].strip().replace('.', '')
                            ibdev_info_value[4] = self.valueCast(ver_str)
                            break
                        except:
                            pass
            ibdevfd = self.ExecuteTimeoutCmd('ip addr show dev ' + self.rdma_netdevs[i] + ' | grep inet')
            for line in ibdevfd.splitlines():
                if line.find('inet') != -1 and line.find('inet6') == -1:
                    try:
                        ibdev_info_value[5] = line.split()[1].split('/')[0]
                    except Exception:
                        pass
            path = '/sys/class/net/' + self.rdma_netdevs[i] + '/address'
            if os.path.exists(path):
                with open(path) as fd:
                    contents = fd.readlines()
                    for line in contents:
                        try:
                            ibdev_info_value[6] = line.replace('\n', '')
                        except Exception:
                            pass
            lspci_fd = self.ExecuteTimeoutCmd('lspci -D -d 15b3: | awk \'{print $1}\'')
            if lspci_fd == CMD_TIMEOUT_VAL:
                ibdev_info_value[7] = VAL_ERR
            else:
                lspci_fd_len = len(lspci_fd.split())
                if i * 2 >= lspci_fd_len:
                    ibdev_info_value[7] = VAL_ERR
                else:
                    path = '/sys/bus/pci/devices/' + lspci_fd.split()[i*2] + '/infiniband'
                    if os.path.exists(path):
                        bond_dev_fd = self.ExecuteTimeoutCmd('ls  ' + path)
                    else:
                        bond_dev_fd = CMD_TIMEOUT_VAL
                    if bond_dev_fd == CMD_TIMEOUT_VAL:
                        ibdev_info_value[7] = VAL_ERR
                    else:
                        try:
                            if str(i) == bond_dev_fd.strip().split('_')[2]:
                                ibdev_info_value[7] = 1
                            else:
                                ibdev_info_value[7] = 0
                        except Exception:
                            pass
            if ibdev_info_value[5] != VAL_ERR:
                if bond_ip_set.count(ibdev_info_value[5]) > 1:
                    ibdev_info_value[8] = 1
                else:
                    ibdev_info_value[8] = 0
            netdev_config_info.append(dict(zip(ibdev_info_key, ibdev_info_value)))
        for i in range(len(self.eth_ifs)):
            ethif_info_key = [
                "q5_pfc_enabled",
                "prio_trust_state",
                'fw_ver',
                'ats_enabled',
                'acs',
                "serial",
                'mrss',
                'eth_mac'
            ]
            ethif_info_value = [VAL_ERR] * len(ethif_info_key)
            ethiffd = self.ExecuteTimeoutCmd('mlnx_qos -i ' + self.eth_ifs[i] + ' | grep enabled | awk \'/ / {print $7}\'')
            if ethiffd != CMD_TIMEOUT_VAL:
                for line in ethiffd.splitlines():
                    ethif_info_value[0] = self.valueCast(line)
            if ethif_info_value[0] == 0 and self.uptime < self.uptime_alarm_interval:
                ethif_info_value[0] = VAL_ERR #do not report
            ethiffd = self.ExecuteTimeoutCmd('mlnx_qos -i ' + self.eth_ifs[i] + ' | grep Priority')
            if ethiffd != CMD_TIMEOUT_VAL:
                ethif_info_value[1] = 0
                for line in ethiffd.splitlines():
                    if line.find("dscp") != -1:
                        ethif_info_value[1] = 1
                    elif line.find("pcp") != -1:
                        ethif_info_value[1] = 2
            if ethif_info_value[1] != 1 and self.uptime < self.uptime_alarm_interval:
                ethif_info_value[1] = VAL_ERR #do not report
            ethiffd = self.ExecuteTimeoutCmd('ethtool -i ' + self.eth_ifs[i] + ' | grep firmware')
            for line in ethiffd.splitlines():
                try:
                    ver_str = line.split(":")[1].split('(')[0].strip()
                    ethif_info_value[2] = self.valueCast(ver_str.replace('.', ''))
                except Exception:
                    pass
            ethiffd = self.ExecuteTimeoutCmd('lspci -s ' + self.eth_bdfs[i] + ' -v |grep Cap |grep \"\[1b0\]\"')
            if ethiffd == "":
                ethif_info_value[3] = 0
            elif ethiffd != CMD_TIMEOUT_VAL:
                ethif_info_value[3] = 1
            ethiffd = self.ExecuteTimeoutCmd('setpci -v -s ' + self.eth_bdfs[i] + ' f2a.w | awk \'{print $4}\'')
            if ethiffd != CMD_TIMEOUT_VAL:
                for line in ethiffd.splitlines():
                    try:
                        ethif_info_value[4] = int('0x' + line, 16)
                    except Exception:
                        pass
            ethiffd = self.ExecuteTimeoutCmd('lspci -vvv -s ' + self.eth_bdfs[i] + ' | grep Serial')
            if ethiffd != CMD_TIMEOUT_VAL:
                for line in ethiffd.splitlines():
                    try:
                        ethif_info_value[5] = line.split(":")[1].strip()
                    except Exception:
                        pass
            ethiffd = self.ExecuteTimeoutCmd('lspci -s ' + self.eth_bdfs[i] + ' -vv | grep -i MaxReadReq | awk \'{print $5}\'')
            if ethiffd != CMD_TIMEOUT_VAL:
                for line in ethiffd.splitlines():
                    ethif_info_value[6] = self.valueCast(line)
            path = '/sys/class/net/' + self.eth_ifs[i] + '/address'
            if os.path.exists(path):
                try:
                    with open(path) as fd:
                        contents = fd.readlines()
                        for line in contents:
                            ethif_info_value[7] = line.replace('\n', '')
                except Exception:
                    pass
            ethif_config_info.append(dict(zip(ethif_info_key, ethif_info_value)))
        config_info.append(netdev_config_info)
        config_info.append(ethif_config_info)
        return config_info
    
    def get_ethflow_stats(self):
        ethif_stats = []
        for i in range(len(self.eth_ifs)):
            ethif_key = [
                'tx_prio5_bytes',
                'rx_prio5_bytes',
                'tx_prio5_packets',
                'rx_prio5_packets',
                'rdma_outtraffic',
                'rdma_intraffic',
                'rdma_outpkg',
                'rdma_inpkg',
                'tx_prio5_pause',
                'rx_prio5_pause',
                'rx_prio5_discards',
                'rx_prio5_discards_ratio'
            ]
            ethif_value = [VAL_ERR] * len(ethif_key)
            ethif_value[0] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep tx_prio5_bytes:')
            ethif_value[1] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep rx_prio5_bytes:')
            ethif_value[2] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep tx_prio5_packets:')
            ethif_value[3] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep rx_prio5_packets:')
            net_path = "/proc/net/dev"
            if os.path.exists(net_path):
                with open(net_path) as net_file:
                    contents = net_file.readlines()
                    for line in contents:
                        if line.find(":") < 0:
                            continue
                        if line.find(self.eth_ifs[i]) != -1:
                            try:
                                line_fields = line.split(":")[1].lstrip().split()
                                ethif_value[4] = self.valueCast(line_fields[8])
                                ethif_value[5] = self.valueCast(line_fields[0])
                                ethif_value[6] = self.valueCast(line_fields[9])
                                ethif_value[7] = self.valueCast(line_fields[1])
                            except Exception:
                                pass
            ethif_value[8] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep tx_prio5_pause:')
            ethif_value[9] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep rx_prio5_pause:')
            ethif_value[10] = self.read_cmd_contents('ethtool -S ' + self.eth_ifs[i] + ' | grep rx_prio5_discards:')
            try:
                if ethif_value[10] != VAL_ERR and ethif_value[3] != VAL_ERR:
                    ethif_value[11] = float(ethif_value[10]) / ethif_value[3]
            except Exception:
                pass
            ethif_stats.append(dict(zip(ethif_key, ethif_value)))
        return ethif_stats

    def get_netdevflow_stats(self):
        """Read the RDMA port counters of every bonded RDMA device from sysfs.

        One dict is produced per entry of ``self.rdma_netdevs``; the sysfs
        directory name comes from the entry at the same index of
        ``self.ib_devs``. Every dict maps a counter name to whatever
        ``self.read_file_contents`` returns for that counter's file below
        ``/sys/class/infiniband/<ib_dev>/ports/1/``.

        Returns:
            list of dict, ordered like ``self.rdma_netdevs``.
        """
        # (counter name, sysfs sub-directory). The counter name is both the
        # file name on disk and the key in the resulting dict.
        counter_table = (
            ('port_rcv_data', 'counters'),
            ('port_rcv_packets', 'counters'),
            ('np_ecn_marked_roce_packets', 'hw_counters'),
            ('rp_cnp_handled', 'hw_counters'),
            ('req_cqe_error', 'hw_counters'),
            ('req_remote_access_errors', 'hw_counters'),
            ('req_remote_invalid_request', 'hw_counters'),
            ('duplicate_request', 'hw_counters'),
            ('resp_cqe_error', 'hw_counters'),
            ('resp_local_length_error', 'hw_counters'),
            ('resp_remote_access_errors', 'hw_counters'),
            ('rx_icrc_encapsulated', 'hw_counters'),
            ('port_xmit_wait', 'counters'),
            ('rnr_nak_retry_err', 'hw_counters'),
            ('out_of_buffer', 'hw_counters'),
            ('out_of_sequence', 'hw_counters'),
            ('packet_seq_err', 'hw_counters'),
            ('local_ack_timeout_err', 'hw_counters'),
        )
        counter_names = [entry[0] for entry in counter_table]

        per_device = []
        # The loop is driven by rdma_netdevs while the path uses ib_devs; the
        # two lists are indexed in parallel.
        for dev_idx in range(len(self.rdma_netdevs)):
            port_dir = '/sys/class/infiniband/' + self.ib_devs[dev_idx] + '/ports/1/'
            # Files are read in table order, one read per counter.
            readings = [
                self.read_file_contents(port_dir + subdir + '/' + counter)
                for counter, subdir in counter_table
            ]
            per_device.append(dict(zip(counter_names, readings)))
        return per_device

    def get_hpbw_stats(self):
        """Parse `monitor_client -o` output and enqueue the figures per device.

        The output is cut into sections at every "Device:" marker (text in
        front of the first marker is discarded). Inside a section:

        * a line containing "bond" sets the current device name (the
          stripped line); the name is kept across sections until replaced;
        * "Received data points" / "Transmitted data points" select the
          direction for the following data line; each section starts in
          the transmit direction;
        * a line containing "Data points" supplies Max/Min/Mean/P50/P90,
          which are converted with ``self.valueCast`` and put into
          ``self.hpbw_stats[devname][slot]`` unless that queue is full.

        Slots 0-4 are used for receive and 5-9 for transmit, each in the
        order max, min, mean, p50, p90.
        NOTE(review): comments elsewhere in this file list the slot order
        as max, min, p50, p90, avg -- confirm which order the consumer
        expects.

        Nothing is done when the command returns ``CMD_TIMEOUT_VAL``.
        Errors while handling a data line are swallowed; values already
        enqueued from that line stay enqueued.

        Returns:
            None.
        """
        report = self.ExecuteTimeoutCmd("monitor_client -o")
        if report == CMD_TIMEOUT_VAL:
            return

        # One pattern per slot, in slot order.
        slot_patterns = (
            r"Max:\s*(-?\d+(?:\.\d+)?)",
            r"Min:\s*(-?\d+(?:\.\d+)?)",
            r"Mean:\s*(-?\d+(?:\.\d+)?)",
            r"P50:\s*(-?\d+(?:\.\d+)?)",
            r"P90:\s*(-?\d+(?:\.\d+)?)",
        )
        rx_first_slot = 0
        tx_first_slot = 5

        devname = None  # no device line seen yet
        for section in report.split("Device:")[1:]:
            first_slot = tx_first_slot  # every section starts as "transmit"
            for line in section.splitlines():
                if "bond" in line:
                    devname = line.strip()
                elif "Received data points" in line:
                    first_slot = rx_first_slot
                elif "Transmitted data points" in line:
                    first_slot = tx_first_slot
                elif "Data points" in line and devname is not None:
                    try:
                        for offset, pattern in enumerate(slot_patterns):
                            slot_queue = self.hpbw_stats[devname][first_slot + offset]
                            # A full queue is left untouched and its field
                            # is not even parsed.
                            if not slot_queue.full():
                                slot_queue.put(self.valueCast(re.search(pattern, line).group(1)))
                    except Exception:
                        # Unknown device, missing field, ...: give up on
                        # the rest of this line and move on.
                        pass
        return
    
    def get_phy_err_stats(self):
        """Collect the physical-layer RX error counters of every slave interface.

        For each name in ``self.eth_ifs`` the command
        ``ethtool -S <if> | grep <counter>:`` is handed to
        ``self.read_cmd_contents`` once per counter, and the results are
        stored under the counter's name.

        Returns:
            list of dict, ordered like ``self.eth_ifs``.
        """
        phy_counters = (
            'rx_crc_errors_phy',
            'rx_symbol_err_phy',
            'rx_pcs_symbol_err_phy',
        )
        per_interface = []
        for ifname in self.eth_ifs:
            grep_prefix = 'ethtool -S ' + ifname + ' | grep '
            # The trailing ':' is part of the grep pattern.
            readings = [self.read_cmd_contents(grep_prefix + counter + ':')
                        for counter in phy_counters]
            per_interface.append(dict(zip(phy_counters, readings)))
        return per_interface

class RDMACollector(VmBaseCollector):
    def init(self):
        """Set up the collector: device discovery, identity, baselines and timers.

        Creates the ``NetworkCollect`` helper and the ``MetricHandler``,
        tries to resolve the VM uuid / ip, takes the first counter
        snapshots that later reports are diffed against, and resets every
        sampling and alarm timestamp.
        """
        self.set_frequency(10)
        collector = NetworkCollect()
        self.collector = collector
        self.handler = MetricHandler()
        self.handler.namespace = 'qce/cvm'

        # Optimistically flag identity lookup as working; any exception from
        # either lookup clears the flag so do_collect can retry later.
        self.is_dns_normal = 1
        try:
            self.uuid = self.get_vm_uuid()
            self.vmip = self.get_vmip()
        except Exception:
            self.is_dns_normal = 0

        # Time of the last report to barad. Only hpbw may be reported every
        # 10s; everything else is reported about once a minute. The interval
        # is 50 rather than 60 (original note: "Error elimination").
        self.last_report_time, self.report_interval = 0, 50

        # Baseline snapshots and cached device information.
        self.last_eth_stats = collector.get_ethflow_stats()
        self.last_netdev_stats = collector.get_netdevflow_stats()
        self.last_phy_err_stats = collector.get_phy_err_stats()
        self.config_info = collector.get_config_info()
        self.monitor_info = collector.get_monitor_info()

        # Monitor info is re-sampled every 60s, config info every 5 minutes.
        self.monitor_info_interval, self.last_monitor_info_time = 60, 0
        self.config_info_interval, self.last_config_info_time = 300, 0

        # The same alarm is repeated at most once per hour.
        self.alarm_interval = 1 * 60 * 60

        # One "last alarm" timestamp per bond device / per slave interface;
        # 0 means no alarm is currently pending.
        bond_total = len(collector.rdma_netdevs)
        slave_total = len(collector.eth_ifs)
        self.last_link_state_alarm_time = [0] * bond_total
        self.last_bond_dev_order_alarm_time = [0] * bond_total
        self.last_lacp_alarm_time = [0] * bond_total
        self.last_link_detected_alarm_time = [0] * slave_total
        self.last_rdma_pcie_alarm_time = [0] * slave_total
        self.last_pcie_switch_alarm_time = [0] * slave_total

       
    def is_rdma_ready(self):
        """Tell whether the cached RDMA device lists still match `ibdev2netdev`.

        Every output line of the form ``<ibdev> ... ==> <netdev> ...`` whose
        net device name starts with "bond" contributes one (ibdev, netdev)
        pair. The collected pairs are compared, in order, with
        ``self.collector.ib_devs`` and ``self.collector.rdma_netdevs``.

        Returns:
            1 when both lists are identical to the cached ones, 0 otherwise.
        """
        listing = self.collector.ExecuteTimeoutCmd("ibdev2netdev")
        ibdevs = []
        netdevs = []
        for line in listing.splitlines():
            if "==>" not in line:
                continue
            sides = line.split("==>")
            ib_fields = sides[0].split()
            net_fields = sides[1].split()
            # A line with nothing on one side of the arrow is malformed;
            # skip it explicitly instead of relying on a catch-all handler
            # that would also hide unrelated errors and interrupts.
            if not ib_fields or not net_fields:
                continue
            netdev = net_fields[0]
            if not netdev.startswith("bond"):
                continue
            ibdevs.append(ib_fields[0])
            netdevs.append(netdev)
        if ibdevs != self.collector.ib_devs or netdevs != self.collector.rdma_netdevs:
            return 0
        return 1

    def rdma_dev_reinit(self):
        """Re-run device discovery and rebuild everything derived from it.

        Copies the six results of ``get_rdma_devices`` onto the collector,
        takes fresh counter / config / monitor snapshots, and resets all
        per-device alarm timestamps to 0.
        """
        nc = self.collector
        discovered = nc.get_rdma_devices()
        nc.ib_devs = discovered[0]             # e.g. [mln5_bond_0]
        nc.rdma_netdevs = discovered[1]        # e.g. [bond0]
        nc.netdev_slave_index = discovered[2]  # e.g. [[0,1]]
        nc.eth_ifs = discovered[3]             # e.g. [eth0, eth1]
        nc.eth_bdfs = discovered[4]            # e.g. [5e:00.0, 5e:00.1]
        # hpbw with queue elements: 'rx_hpbw_max','rx_hpbw_min','rx_hpbw_p50',
        # 'rx_hpbw_p90','rx_hpbw_avg','tx_hpbw_max','tx_hpbw_min','tx_hpbw_p50',
        # 'tx_hpbw_p90','tx_hpbw_avg'
        nc.hpbw_stats = discovered[5]

        # New baselines, so the next deltas are computed against the
        # re-discovered devices.
        self.last_eth_stats = nc.get_ethflow_stats()
        self.last_netdev_stats = nc.get_netdevflow_stats()
        self.last_phy_err_stats = nc.get_phy_err_stats()
        self.config_info = nc.get_config_info()
        self.monitor_info = nc.get_monitor_info()

        # Alarm bookkeeping is sized to the new device lists; each attribute
        # gets its own list.
        bond_total = len(nc.rdma_netdevs)
        slave_total = len(nc.eth_ifs)
        self.last_link_state_alarm_time = [0] * bond_total
        self.last_bond_dev_order_alarm_time = [0] * bond_total
        self.last_lacp_alarm_time = [0] * bond_total
        self.last_link_detected_alarm_time = [0] * slave_total
        self.last_rdma_pcie_alarm_time = [0] * slave_total
        self.last_pcie_switch_alarm_time = [0] * slave_total

    
    def check_NO_CARRIER(self, device_name):
        """Report whether `ip link show <device_name>` marks the link NO-CARRIER.

        Args:
            device_name: interface name appended to the `ip link show` command.

        Returns:
            True when the command output contains 'NO-CARRIER', and also when
            building, running or inspecting the command raises; False
            otherwise.
        """
        try:
            link_report = self.collector.ExecuteTimeoutCmd('ip link show ' + device_name)
            carrier_lost = 'NO-CARRIER' in link_report
        except Exception:
            # A failed check is treated the same as a missing carrier.
            carrier_lost = True
        return carrier_lost


    def do_collect(self):
        #not rdma device or bonding device
        rdma_ready = self.is_rdma_ready()
        if rdma_ready == 0:
            self.rdma_dev_reinit() #reinit if rdma kernel module does not probe when barad start
        if not rdma_ready or not self.is_dns_normal:
            self.is_dns_normal = 1
            try:
                self.uuid = self.get_vm_uuid()
                self.vmip = self.get_vmip()
            except Exception:
                self.is_dns_normal = 0
            pass
            return
        if self.collector.uptime < self.collector.uptime_alarm_interval:
            self.collector.uptime = self.collector.read_uptime_secs()
        current_time = time.time()
        # alarming if condition triggered
        for i in range(len(self.collector.rdma_netdevs)):
            first_slave_idx = self.collector.netdev_slave_index[i][0]
            bond_mac = self.config_info[0][i]['bond_mac']
            eth_mac = self.config_info[1][first_slave_idx]['eth_mac']
            rdma_ip = self.config_info[0][i]['rdma_ip']
            serial = self.config_info[1][first_slave_idx]['serial']
            #uuid is absolutely str with "" or "xxxx-xxx"
            alarm_dimensions = [{"Key" : "Uuid", "Value" :self.uuid}, {"Key" : "BondMac", "Value": "0" if bond_mac == VAL_ERR else bond_mac }, {"Key" : "EthMac", "Value" : "0" if eth_mac == VAL_ERR else eth_mac}, {"Key" : "RdmaIp", "Value" : "0" if rdma_ip == VAL_ERR else rdma_ip}, {"Key" : "Bdf", "Value" : "0" if len(self.collector.eth_bdfs) == 0 else self.collector.eth_bdfs[first_slave_idx]}, {"Key" : "Sn", "Value": "0" if serial == VAL_ERR else serial}]
            try:
                if self.monitor_info[0][i]['link_state'] == VAL_ERR:
                    continue
                # link state
                if self.monitor_info[0][i]['link_state'] != 1:
                    if current_time - self.last_link_state_alarm_time[i] > self.alarm_interval or self.last_link_state_alarm_time[i] == 0:
                        alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(current_time))
                        alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions, "DeviceName":self.collector.rdma_netdevs[i], "DeviceId": "", 'Slot':self.collector.eth_bdfs[first_slave_idx], 'SN': self.config_info[1][first_slave_idx]['serial']}
                        alarmproxy_event = {"AlarmId":1118, "EventName":"link_state", "FaultType": "Hardware", "FaultDesc":"rdma bond link_state" }
                        data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                        self.put_data(data_alarmproxy)
                        self.last_link_state_alarm_time[i] = current_time
                else:
                    self.last_link_state_alarm_time[i] = 0
            except Exception:
                pass
            try:
                if self.config_info[0][i]['bond_dev_order'] == VAL_ERR:
                    continue
                # if not one-to-one correspondence, do alarm
                if self.config_info[0][i]['bond_dev_order'] != 1:
                    if current_time - self.last_bond_dev_order_alarm_time[i] > self.alarm_interval or self.last_bond_dev_order_alarm_time[i] == 0:
                        alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(current_time))
                        alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions, "DeviceName":self.collector.rdma_netdevs[i], "DeviceId": "", 'Slot':self.collector.eth_bdfs[first_slave_idx], 'SN': self.config_info[1][first_slave_idx]['serial']}
                        alarmproxy_event = {"AlarmId":1126, "EventName":"bond_dev_out_of_order", "FaultType": "Software", "FaultDesc":"rdma bond dev out of order" }
                        data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                        self.put_data(data_alarmproxy)
                        self.last_bond_dev_order_alarm_time[i] = current_time
                else:
                    self.last_bond_dev_order_alarm_time[i] = 0
            except Exception:
                pass
            # link_detected
            for slave_idx in self.collector.netdev_slave_index[i]:
                # change from 1 to 0, do alarm imm
                bond_mac = self.config_info[0][i]['bond_mac']
                eth_mac = self.config_info[1][slave_idx]['eth_mac']
                rdma_ip = self.config_info[0][i]['rdma_ip']
                serial = self.config_info[1][slave_idx]['serial']
                alarm_dimensions = [{"Key" : "Uuid", "Value" :self.uuid}, {"Key" : "BondMac", "Value": "0" if bond_mac == VAL_ERR else bond_mac}, {"Key" : "EthMac", "Value" : "0" if eth_mac == VAL_ERR else eth_mac}, {"Key" : "RdmaIp", "Value" : "0" if rdma_ip == VAL_ERR else rdma_ip}, {"Key" : "Bdf", "Value" : "0" if len(self.collector.eth_bdfs) == 0 else self.collector.eth_bdfs[slave_idx]}, {"Key" : "Sn", "Value": "0" if serial == VAL_ERR else serial}]
                try:    
                    if self.monitor_info[1][slave_idx]['link_detected'] == VAL_ERR:
                        continue
                    if self.monitor_info[1][slave_idx]['link_detected'] != 1 and self.check_NO_CARRIER(self.collector.eth_ifs[slave_idx]) == True:
                        if current_time - self.last_link_detected_alarm_time[slave_idx] > self.alarm_interval or self.last_link_detected_alarm_time[slave_idx] == 0:
                            alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(current_time))
                            alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions, "DeviceName":self.collector.eth_ifs[slave_idx], "DeviceId": "", 'Slot':self.collector.eth_bdfs[slave_idx], 'SN': self.config_info[1][slave_idx]['serial']}
                            alarmproxy_event = {"AlarmId":1117, "EventName":"link_detected", "FaultType": "Hardware", "FaultDesc":"rdma eth interface link detected"}
                            data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                            self.put_data(data_alarmproxy)
                            self.last_link_detected_alarm_time[slave_idx] = current_time
                    else:
                        self.last_link_detected_alarm_time[slave_idx] = 0
                except Exception:
                    pass
                try:
                    #lacp
                    if self.monitor_info[1][slave_idx]['lacp_status'] == VAL_ERR or self.monitor_info[0][i]['ports_num'] == VAL_ERR:
                        continue
                    if self.monitor_info[1][slave_idx]['lacp_status'] != 1 or self.monitor_info[0][i]['ports_num'] != 2 or (slave_idx % 2 == 0 and self.monitor_info[1][slave_idx]['lacp_id'] != self.monitor_info[1][slave_idx + 1]['lacp_id']):
                        if current_time - self.last_lacp_alarm_time[slave_idx] > self.alarm_interval or self.last_lacp_alarm_time[slave_idx] == 0:
                            alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(current_time))
                            alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions, "DeviceName":self.collector.eth_ifs[slave_idx], "DeviceId": "", 'Slot':self.collector.eth_bdfs[slave_idx], 'SN': self.config_info[1][slave_idx]['serial']}
                            alarmproxy_event = {"AlarmId":1131, "EventName":"lacp_status", "FaultType": "Hardware", "FaultDesc":"rdma eth interface lacp error"}
                            data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                            self.put_data(data_alarmproxy)
                            self.last_lacp_alarm_time[slave_idx] = current_time
                    else:
                        self.last_lacp_alarm_time[slave_idx] = 0
                except Exception:
                    pass
                try:
                    # rdma pcie 
                    if cutils.is_metal() != 1:
                        continue
                    if self.monitor_info[1][slave_idx]['aer_has_fatal'] == 1 or \
                        self.monitor_info[1][slave_idx]['current_link_width'] != VAL_ERR and self.monitor_info[1][slave_idx]['current_link_width'] != 16 or\
                        self.monitor_info[1][slave_idx]['max_link_width'] != VAL_ERR and self.monitor_info[1][slave_idx]['max_link_width'] != 16 or \
                        (self.monitor_info[1][slave_idx]['device_id'] == '0x1021' and \
                            ((self.monitor_info[1][slave_idx]['current_link_speed'] != VAL_ERR and self.monitor_info[1][slave_idx]['current_link_speed'] != 32) or \
                            (self.monitor_info[1][slave_idx]['max_link_speed'] != VAL_ERR and self.monitor_info[1][slave_idx]['max_link_speed'] != 32))) or \
                        (self.monitor_info[1][slave_idx]['device_id'] != '0x1021' and self.monitor_info[1][slave_idx]['device_id'] != VAL_ERR and \
                            ((self.monitor_info[1][slave_idx]['current_link_speed'] != VAL_ERR and self.monitor_info[1][slave_idx]['current_link_speed'] != 16) or \
                            (self.monitor_info[1][slave_idx]['max_link_speed'] != VAL_ERR and self.monitor_info[1][slave_idx]['max_link_speed'] != 16))):
                        if current_time - self.last_rdma_pcie_alarm_time[slave_idx] > self.alarm_interval or self.last_rdma_pcie_alarm_time[slave_idx] == 0:
                            alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(current_time))
                            alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions, "DeviceName":self.collector.eth_ifs[slave_idx], "DeviceId": "", 'Slot':self.collector.eth_bdfs[slave_idx], 'SN': self.config_info[1][slave_idx]['serial']}
                            alarmproxy_event = {"AlarmId":1129, "EventName":"rdma_pcie_link", "FaultType": "Hardware", "FaultDesc":"rdma pcie link error"}
                            data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                            self.put_data(data_alarmproxy)
                            self.last_rdma_pcie_alarm_time[slave_idx] = current_time
                    else:
                        self.last_rdma_pcie_alarm_time[slave_idx] = 0
                except Exception:
                    pass
                try:
                    #pcie switch
                    if len(self.collector.check_pcie_width(self.monitor_info[1][slave_idx]['switch_ports'])) != 0 or len(self.collector.check_pcie_speed(self.monitor_info[1][slave_idx]['switch_ports'])) != 0 or\
                        len(self.collector.check_pcie_aer(self.monitor_info[1][slave_idx]['switch_ports'])) != 0:
                        if current_time - self.last_pcie_switch_alarm_time[slave_idx] > self.alarm_interval or self.last_pcie_switch_alarm_time[slave_idx] == 0:
                            alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(current_time))
                            alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions, "DeviceName":self.collector.eth_ifs[slave_idx], "DeviceId": "", 'Slot':self.collector.eth_bdfs[slave_idx], 'SN': self.config_info[1][slave_idx]['serial']}
                            alarmproxy_event = {"AlarmId":1130, "EventName":"pcie_switch_link", "FaultType": "Hardware", "FaultDesc":"pcie switch error"}
                            data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                            self.put_data(data_alarmproxy)
                            self.last_pcie_switch_alarm_time[slave_idx] = current_time
                    else:
                        self.last_pcie_switch_alarm_time[slave_idx] = 0
                except Exception:
                    pass

        if current_time - self.last_monitor_info_time > self.monitor_info_interval:
            self.monitor_info = self.collector.get_monitor_info()
            self.last_monitor_info_time = current_time
        if current_time - self.last_config_info_time > self.config_info_interval:
            self.config_info = self.collector.get_config_info()
            self.last_config_info_time = current_time
        if current_time - self.last_report_time >= self.report_interval:
            eth_stats = self.collector.get_ethflow_stats()
            netdev_stats = self.collector.get_netdevflow_stats()
            phy_err_stats = self.collector.get_phy_err_stats()
        
        for i in range(len(self.collector.rdma_netdevs)):
            delta_time = current_time - self.last_report_time
            bond_tx_prio5_bytes = float(0)
            bond_rx_prio5_bytes = float(0)
            bond_tx_prio5_packets = float(0)
            bond_rx_prio5_packets = float(0)
            bond_tx_prio5_pause = float(0)
            bond_rx_prio5_pause = float(0)
            bond_rx_prio5_discards = float(0)
            #report or not, VAL_ERR do not report, 1 report
            bond_tx_prio5_bytes_report = VAL_ERR 
            bond_rx_prio5_bytes_report = VAL_ERR
            bond_tx_prio5_packets_report = VAL_ERR
            bond_rx_prio5_packets_report = VAL_ERR
            bond_tx_prio5_pause_report = VAL_ERR
            bond_rx_prio5_pause_report = VAL_ERR
            bond_rx_prio5_discards_report = VAL_ERR
            # eth interface monitor
            for slave_idx in self.collector.netdev_slave_index[i]:
                # rdma ip is not regarded as abnormal
                if self.collector.eth_ifs[slave_idx] == VAL_ERR or self.config_info[1][slave_idx]['serial'] == VAL_ERR or self.collector.eth_bdfs[slave_idx] == VAL_ERR:
                    continue
                try:
                    dimensions = {  'name': self.collector.eth_ifs[slave_idx], 
                                'vmip': self.vmip, 
                                'uuid': self.uuid, 
                                'rdma_ip': self.config_info[0][i]['rdma_ip'] if self.config_info[0][i]['rdma_ip'] != VAL_ERR else "0", 
                                'serial': self.config_info[1][slave_idx]['serial'],
                                'rdma_bdf': self.collector.eth_bdfs[slave_idx] }
                    if current_time - self.last_report_time >= self.report_interval:
                        tx_prio5_byte_val = float(eth_stats[slave_idx]['tx_prio5_bytes'] - self.last_eth_stats[slave_idx]['tx_prio5_bytes']) * 8 / delta_time
                        rx_prio5_bytes_val = float(eth_stats[slave_idx]['rx_prio5_bytes'] - self.last_eth_stats[slave_idx]['rx_prio5_bytes']) * 8 / delta_time
                        tx_prio5_packets_val = float(eth_stats[slave_idx]['tx_prio5_packets'] - self.last_eth_stats[slave_idx]['tx_prio5_packets']) / delta_time
                        rx_prio5_packets_val = float(eth_stats[slave_idx]['rx_prio5_packets'] - self.last_eth_stats[slave_idx]['rx_prio5_packets']) / delta_time
                        tx_prio5_pause_val = float(eth_stats[slave_idx]['tx_prio5_pause'] - self.last_eth_stats[slave_idx]['tx_prio5_pause']) / delta_time
                        rx_prio5_pause_val = float(eth_stats[slave_idx]['rx_prio5_pause'] - self.last_eth_stats[slave_idx]['rx_prio5_pause']) / delta_time
                        rx_prio5_discards_val = float(eth_stats[slave_idx]['rx_prio5_discards'] - self.last_eth_stats[slave_idx]['rx_prio5_discards']) / delta_time
                        tx_prio5_bytes = format(tx_prio5_byte_val, '.3f')
                        rx_prio5_bytes = format(rx_prio5_bytes_val, '.3f')
                        tx_prio5_packets = format(tx_prio5_packets_val, '.3f')
                        rx_prio5_packets = format(rx_prio5_packets_val, '.3f')
                        rdma_outtraffic = format(float(eth_stats[slave_idx]['rdma_outtraffic'] - self.last_eth_stats[slave_idx]['rdma_outtraffic']) * 8 / delta_time, '.3f')
                        rdma_intraffic = format(float(eth_stats[slave_idx]['rdma_intraffic'] - self.last_eth_stats[slave_idx]['rdma_intraffic']) * 8 / delta_time, '.3f')
                        rdma_outpkg = format(float(eth_stats[slave_idx]['rdma_outpkg'] - self.last_eth_stats[slave_idx]['rdma_outpkg']) / delta_time, '.3f')
                        rdma_inpkg = format(float(eth_stats[slave_idx]['rdma_inpkg'] - self.last_eth_stats[slave_idx]['rdma_inpkg']) / delta_time, '.3f')
                        tx_prio5_pause = format(tx_prio5_pause_val, '.3f')
                        rx_prio5_pause = format(rx_prio5_pause_val, '.3f')
                        rx_prio5_discards = format(rx_prio5_discards_val, '.3f')
                        if eth_stats[slave_idx]['tx_prio5_bytes'] != VAL_ERR:
                            bond_tx_prio5_bytes += tx_prio5_byte_val
                            bond_tx_prio5_bytes_report = 1
                        if eth_stats[slave_idx]['rx_prio5_bytes'] != VAL_ERR:
                            bond_rx_prio5_bytes += rx_prio5_bytes_val
                            bond_rx_prio5_bytes_report = 1
                        if eth_stats[slave_idx]['tx_prio5_packets'] != VAL_ERR:
                            bond_tx_prio5_packets += tx_prio5_packets_val
                            bond_tx_prio5_packets_report = 1
                        if eth_stats[slave_idx]['rx_prio5_packets'] != VAL_ERR:
                            bond_rx_prio5_packets += rx_prio5_packets_val
                            bond_rx_prio5_packets_report = 1
                        if eth_stats[slave_idx]['tx_prio5_pause'] != VAL_ERR:
                            bond_tx_prio5_pause += tx_prio5_pause_val
                            bond_tx_prio5_pause_report = 1
                        if eth_stats[slave_idx]['rx_prio5_pause'] != VAL_ERR:
                            bond_rx_prio5_pause += rx_prio5_pause_val
                            bond_rx_prio5_pause_report = 1
                        if eth_stats[slave_idx]['rx_prio5_discards'] != VAL_ERR:
                            bond_rx_prio5_discards += rx_prio5_discards_val
                            bond_rx_prio5_discards_report = 1
                except Exception:
                    continue
                try:
                    batch_metric_part = []
                    # eth dev basic info
                    if current_time - self.last_report_time >= self.report_interval:
                        #if self.monitor_info[1][slave_idx]['max_link_speed'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[1][slave_idx]['max_link_speed'] != 16):
                        #    batch_metric_part.append({'name': 'max_link_speed', 'value': self.monitor_info[1][slave_idx]['max_link_speed']})
                        #if self.monitor_info[1][slave_idx]['max_link_width'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[1][slave_idx]['max_link_width'] != 16):
                        #    batch_metric_part.append({'name': 'max_link_width', 'value': self.monitor_info[1][slave_idx]['max_link_width']})
                        if self.config_info[1][slave_idx]['fw_ver'] != VAL_ERR:
                            batch_metric_part.append({'name': 'fw_ver', 'value': self.config_info[1][slave_idx]['fw_ver']})
                        if self.monitor_info[1][slave_idx]['link_detected'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[1][slave_idx]['link_detected'] != 1):
                            batch_metric_part.append({'name': 'link_detected', 'value': self.monitor_info[1][slave_idx]['link_detected']})
                        if self.config_info[1][slave_idx]['q5_pfc_enabled'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[1][slave_idx]['q5_pfc_enabled'] != 1):
                            batch_metric_part.append({'name': 'q5_pfc_enabled', 'value': self.config_info[1][slave_idx]['q5_pfc_enabled']})
                        if self.config_info[1][slave_idx]['prio_trust_state'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[1][slave_idx]['prio_trust_state'] != 1):
                            batch_metric_part.append({'name': 'prio_trust_state', 'value': self.config_info[1][slave_idx]['prio_trust_state']})
                        if self.config_info[1][slave_idx]['ats_enabled'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[1][slave_idx]['ats_enabled'] != 0):
                            batch_metric_part.append({'name': 'ats_enabled', 'value': self.config_info[1][slave_idx]['ats_enabled']})
                        if self.config_info[1][slave_idx]['acs'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[1][slave_idx]['acs'] != 0):
                            batch_metric_part.append({'name': 'acs', 'value': self.config_info[1][slave_idx]['acs']})
                        if self.monitor_info[1][slave_idx]['dcqcn_enable'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[1][slave_idx]['dcqcn_enable'] != 11):
                            batch_metric_part.append({'name': 'dcqcn_enable', 'value': self.monitor_info[1][slave_idx]['dcqcn_enable']})
                        if self.monitor_info[1][slave_idx]['rpg_time_reset'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rpg_time_reset', 'value': self.monitor_info[1][slave_idx]['rpg_time_reset']})
                        if self.monitor_info[1][slave_idx]['rpg_ai_rate'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rpg_ai_rate', 'value': self.monitor_info[1][slave_idx]['rpg_ai_rate']})
                        if self.monitor_info[1][slave_idx]['rpg_byte_reset'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rpg_byte_reset', 'value': self.monitor_info[1][slave_idx]['rpg_byte_reset']})
                        if self.monitor_info[1][slave_idx]['rate_reduce_monitor_period'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rate_reduce_monitor_period', 'value': self.monitor_info[1][slave_idx]['rate_reduce_monitor_period']})
                        if self.monitor_info[1][slave_idx]['link_events_down_phy'] != VAL_ERR:
                            batch_metric_part.append({'name': 'link_events_down_phy', 'value': self.monitor_info[1][slave_idx]['link_events_down_phy']})
                        if self.monitor_info[1][slave_idx]['cnp_dscp'] != VAL_ERR:
                            batch_metric_part.append({'name': 'cnp_dscp', 'value': self.monitor_info[1][slave_idx]['cnp_dscp']})
                        if self.monitor_info[1][slave_idx]['cnp_802p_prio'] != VAL_ERR:
                            batch_metric_part.append({'name': 'cnp_802p_prio', 'value': self.monitor_info[1][slave_idx]['cnp_802p_prio']})
                        if self.config_info[1][slave_idx]['mrss'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[1][slave_idx]['mrss'] != 4096):
                            batch_metric_part.append({'name': 'mrss', 'value': self.config_info[1][slave_idx]['mrss']})
                        if self.monitor_info[1][slave_idx]['eth_promisc'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[1][slave_idx]['eth_promisc'] != 0):
                            batch_metric_part.append({'name': 'eth_promisc', 'value': self.monitor_info[1][slave_idx]['eth_promisc']})
                        # eth dev statistics
                        if eth_stats[slave_idx]['tx_prio5_bytes'] != VAL_ERR:
                            batch_metric_part.append({'name': 'tx_prio5_bytes', 'value': tx_prio5_bytes})
                        if eth_stats[slave_idx]['rx_prio5_bytes'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rx_prio5_bytes', 'value': rx_prio5_bytes})
                        if eth_stats[slave_idx]['tx_prio5_packets'] != VAL_ERR:
                            batch_metric_part.append({'name': 'tx_prio5_packets', 'value': tx_prio5_packets})
                        if eth_stats[slave_idx]['rx_prio5_packets'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rx_prio5_packets', 'value': rx_prio5_packets})
                        if eth_stats[slave_idx]['tx_prio5_pause'] != VAL_ERR:
                            batch_metric_part.append({'name': 'tx_prio5_pause', 'value': tx_prio5_pause})
                        if eth_stats[slave_idx]['rx_prio5_pause'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rx_prio5_pause', 'value': rx_prio5_pause})
                        if eth_stats[slave_idx]['rx_prio5_discards'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rx_prio5_discards', 'value': rx_prio5_discards})
                        if eth_stats[slave_idx]['rx_prio5_discards_ratio'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rx_prio5_discards_ratio', 'value': eth_stats[slave_idx]['rx_prio5_discards_ratio']})
                        if eth_stats[slave_idx]['rdma_outtraffic'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rdma_outtraffic', 'value': rdma_outtraffic})
                        if eth_stats[slave_idx]['rdma_intraffic'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rdma_intraffic', 'value': rdma_intraffic})
                        if eth_stats[slave_idx]['rdma_outpkg'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rdma_outpkg', 'value': rdma_outpkg})
                        if eth_stats[slave_idx]['rdma_inpkg'] != VAL_ERR:
                            batch_metric_part.append({'name': 'rdma_inpkg', 'value': rdma_inpkg})
                        # eth err stats
                        if phy_err_stats[slave_idx]['rx_crc_errors_phy'] != VAL_ERR and self.last_phy_err_stats[slave_idx]['rx_crc_errors_phy'] != VAL_ERR:
                            value = format(float(phy_err_stats[slave_idx]['rx_crc_errors_phy'] - self.last_phy_err_stats[slave_idx]['rx_crc_errors_phy'])*60/delta_time, '.3f')
                            batch_metric_part.append({'name': 'rx_crc_errors_phy', 'value': value})
                        if phy_err_stats[slave_idx]['rx_symbol_err_phy'] != VAL_ERR and self.last_phy_err_stats[slave_idx]['rx_symbol_err_phy'] != VAL_ERR:
                            value = format(float(phy_err_stats[slave_idx]['rx_symbol_err_phy'] - self.last_phy_err_stats[slave_idx]['rx_symbol_err_phy'])*60/delta_time, '.3f')
                            batch_metric_part.append({'name': 'rx_symbol_err_phy', 'value': value})
                        if phy_err_stats[slave_idx]['rx_pcs_symbol_err_phy'] != VAL_ERR and self.last_phy_err_stats[slave_idx]['rx_pcs_symbol_err_phy'] != VAL_ERR:
                            value = format(float(phy_err_stats[slave_idx]['rx_pcs_symbol_err_phy'] - self.last_phy_err_stats[slave_idx]['rx_pcs_symbol_err_phy'])*60/delta_time, '.3f')
                            batch_metric_part.append({'name': 'rx_pcs_symbol_err_phy', 'value': value})
                        self.handler.add_batch_metric(batch=batch_metric_part, dimensions=dimensions, timestamp=self.collector.valueCast(current_time))
                except Exception:
                    continue
            # netdev monitor
            try:
                first_slave_idx = self.collector.netdev_slave_index[i][0]
                # rdma ip is not regarded as abnormal
                if self.collector.rdma_netdevs[i] == VAL_ERR or self.config_info[1][first_slave_idx]['serial'] == VAL_ERR or self.collector.eth_bdfs[first_slave_idx] == VAL_ERR:
                    continue
                dimensions = {  'name': self.collector.rdma_netdevs[i], 
                            'vmip': self.vmip, 
                            'uuid': self.uuid, 
                            'rdma_ip': self.config_info[0][i]['rdma_ip'] if self.config_info[0][i]['rdma_ip'] != VAL_ERR else "0", 
                            'serial': self.config_info[1][first_slave_idx]['serial'],
                            'rdma_bdf': self.collector.eth_bdfs[first_slave_idx]}
                if current_time - self.last_report_time >= self.report_interval:
                    port_rcv_data = format(float(netdev_stats[i]['port_rcv_data'] - self.last_netdev_stats[i]['port_rcv_data']) * 8 / delta_time, '.3f')
                    port_rcv_packets = format(float(netdev_stats[i]['port_rcv_packets'] - self.last_netdev_stats[i]['port_rcv_packets']) / delta_time, '.3f')
                    np_ecn_marked_roce_packets = format(float(netdev_stats[i]['np_ecn_marked_roce_packets'] - self.last_netdev_stats[i]['np_ecn_marked_roce_packets']) / delta_time, '.3f')
                    rp_cnp_handled = format(float(netdev_stats[i]['rp_cnp_handled'] - self.last_netdev_stats[i]['rp_cnp_handled']) / delta_time, '.3f')
                    req_cqe_error = format(float(netdev_stats[i]['req_cqe_error'] - self.last_netdev_stats[i]['req_cqe_error']) / delta_time, '.3f')
                    req_remote_access_errors = format(float(netdev_stats[i]['req_remote_access_errors'] - self.last_netdev_stats[i]['req_remote_access_errors']) / delta_time, '.3f')
                    req_remote_invalid_request = format(float(netdev_stats[i]['req_remote_invalid_request'] - self.last_netdev_stats[i]['req_remote_invalid_request']) / delta_time, '.3f')
                    duplicate_request = format(float(netdev_stats[i]['duplicate_request'] - self.last_netdev_stats[i]['duplicate_request']) / delta_time, '.3f')
                    resp_cqe_error = format(float(netdev_stats[i]['resp_cqe_error'] - self.last_netdev_stats[i]['resp_cqe_error']) / delta_time, '.3f')
                    resp_local_length_error = format(float(netdev_stats[i]['resp_local_length_error'] - self.last_netdev_stats[i]['resp_local_length_error']) / delta_time, '.3f')
                    resp_remote_access_errors = format(float(netdev_stats[i]['resp_remote_access_errors'] - self.last_netdev_stats[i]['resp_remote_access_errors']) / delta_time, '.3f')
                    rx_icrc_encapsulated = format(float(netdev_stats[i]['rx_icrc_encapsulated'] - self.last_netdev_stats[i]['rx_icrc_encapsulated']) / delta_time, '.3f')
                    port_xmit_wait = format(float(netdev_stats[i]['port_xmit_wait'] - self.last_netdev_stats[i]['port_xmit_wait']) / delta_time, '.3f')
                    rnr_nak_retry_err = format(float(netdev_stats[i]['rnr_nak_retry_err'] - self.last_netdev_stats[i]['rnr_nak_retry_err']) / delta_time, '.3f')
                    out_of_buffer = format(float(netdev_stats[i]['out_of_buffer'] - self.last_netdev_stats[i]['out_of_buffer']) / delta_time, '.3f')
                    out_of_sequence = format(float(netdev_stats[i]['out_of_sequence'] - self.last_netdev_stats[i]['out_of_sequence']) / delta_time, '.3f')
                    packet_seq_err = format(float(netdev_stats[i]['packet_seq_err'] - self.last_netdev_stats[i]['packet_seq_err']) / delta_time, '.3f')
                    local_ack_timeout_err = format(float(netdev_stats[i]['local_ack_timeout_err'] - self.last_netdev_stats[i]['local_ack_timeout_err']) / delta_time, '.3f')
            except Exception:
                continue
            try:
                batch_metric_part = []
                if current_time - self.last_report_time >= self.report_interval:
                    #netdev basic info
                    if self.config_info[0][i]['bonding_mode'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[0][i]['bonding_mode'] != 4):
                        batch_metric_part.append({'name': 'bonding_mode', 'value': self.config_info[0][i]['bonding_mode']})
                    if self.monitor_info[0][i]['link_state'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[0][i]['link_state'] != 1):
                        batch_metric_part.append({'name': 'link_state', 'value': self.monitor_info[0][i]['link_state']})
                    if self.config_info[0][i]['active_mtu'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[0][i]['active_mtu'] != 9100):
                        batch_metric_part.append({'name': 'active_mtu', 'value': self.config_info[0][i]['active_mtu']})
                    if self.config_info[0][i]['ofed_version'] != VAL_ERR:
                        batch_metric_part.append({'name': 'ofed_version', 'value': self.config_info[0][i]['ofed_version']})
                    if self.config_info[0][i]['nv_peer_mem_state'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.config_info[0][i]['nv_peer_mem_state'] != 1):
                        batch_metric_part.append({'name': 'nv_peer_mem_state', 'value': self.config_info[0][i]['nv_peer_mem_state']})
                    if self.config_info[0][i]['nv_peer_mem_ver'] != VAL_ERR:
                        batch_metric_part.append({'name': 'nv_peer_mem_ver', 'value': self.config_info[0][i]['nv_peer_mem_ver']})
                    if self.config_info[0][i]['bond_duplicate_ip_dect'] != VAL_ERR:
                        batch_metric_part.append({'name': 'bond_duplicate_ip_dect', 'value': self.config_info[0][i]['bond_duplicate_ip_dect']})
                    if self.monitor_info[0][i]['traffic_class'] != VAL_ERR and not(self.collector.uptime < self.collector.uptime_alarm_interval and self.monitor_info[0][i]['traffic_class'] != 160):
                        batch_metric_part.append({'name': 'traffic_class', 'value': self.monitor_info[0][i]['traffic_class']})
                    if self.monitor_info[0][i]['qp_num'] != VAL_ERR:
                        batch_metric_part.append({'name': 'qp_num', 'value': self.monitor_info[0][i]['qp_num']})
                    if self.monitor_info[0][i]['cq_num'] != VAL_ERR:
                        batch_metric_part.append({'name': 'cq_num', 'value': self.monitor_info[0][i]['cq_num']})
                    if self.monitor_info[0][i]['mr_num'] != VAL_ERR:
                        batch_metric_part.append({'name': 'mr_num', 'value': self.monitor_info[0][i]['mr_num']})
                    if self.monitor_info[0][i]['pd_num'] != VAL_ERR:
                        batch_metric_part.append({'name': 'pd_num', 'value': self.monitor_info[0][i]['pd_num']})
                    if self.monitor_info[0][i]['cmid_num'] != VAL_ERR:
                        batch_metric_part.append({'name': 'cmid_num', 'value': self.monitor_info[0][i]['cmid_num']})
                    if self.monitor_info[0][i]['ctx_num'] != VAL_ERR:
                        batch_metric_part.append({'name': 'ctx_num', 'value': self.monitor_info[0][i]['ctx_num']})
                    #netdev statistics
                    if netdev_stats[i]['port_rcv_data'] != VAL_ERR:
                        batch_metric_part.append({'name': 'port_rcv_data', 'value': port_rcv_data})
                    if netdev_stats[i]['port_rcv_packets'] != VAL_ERR:
                        batch_metric_part.append({'name': 'port_rcv_packets', 'value': port_rcv_packets})
                    if netdev_stats[i]['np_ecn_marked_roce_packets'] != VAL_ERR:
                        batch_metric_part.append({'name': 'np_ecn_marked_roce_packets', 'value': np_ecn_marked_roce_packets})
                    if netdev_stats[i]['rp_cnp_handled'] != VAL_ERR:
                        batch_metric_part.append({'name': 'rp_cnp_handled', 'value': rp_cnp_handled})
                    if netdev_stats[i]['req_cqe_error'] != VAL_ERR:
                        batch_metric_part.append({'name': 'req_cqe_error', 'value': req_cqe_error})
                    if netdev_stats[i]['req_remote_access_errors'] != VAL_ERR:
                        batch_metric_part.append({'name': 'req_remote_access_errors', 'value': req_remote_access_errors})
                    if netdev_stats[i]['req_remote_invalid_request'] != VAL_ERR:
                        batch_metric_part.append({'name': 'req_remote_invalid_request', 'value': req_remote_invalid_request})
                    if netdev_stats[i]['duplicate_request'] != VAL_ERR:
                        batch_metric_part.append({'name': 'duplicate_request', 'value': duplicate_request})
                    if netdev_stats[i]['resp_cqe_error'] != VAL_ERR:
                        batch_metric_part.append({'name': 'resp_cqe_error', 'value': resp_cqe_error})
                    if netdev_stats[i]['resp_local_length_error'] != VAL_ERR:
                        batch_metric_part.append({'name': 'resp_local_length_error', 'value': resp_local_length_error})
                    if netdev_stats[i]['resp_remote_access_errors'] != VAL_ERR:
                        batch_metric_part.append({'name': 'resp_remote_access_errors', 'value': resp_remote_access_errors})
                    if netdev_stats[i]['rx_icrc_encapsulated'] != VAL_ERR:
                        batch_metric_part.append({'name': 'rx_icrc_encapsulated', 'value': rx_icrc_encapsulated})
                    if netdev_stats[i]['port_xmit_wait'] != VAL_ERR:
                        batch_metric_part.append({'name': 'port_xmit_wait', 'value': port_xmit_wait})
                    if netdev_stats[i]['rnr_nak_retry_err'] != VAL_ERR:
                        batch_metric_part.append({'name': 'rnr_nak_retry_err', 'value': rnr_nak_retry_err})
                    if netdev_stats[i]['out_of_buffer'] != VAL_ERR:
                        batch_metric_part.append({'name': 'out_of_buffer', 'value': out_of_buffer})
                    if netdev_stats[i]['out_of_sequence'] != VAL_ERR:
                        batch_metric_part.append({'name': 'out_of_sequence', 'value': out_of_sequence})
                    if netdev_stats[i]['packet_seq_err'] != VAL_ERR:
                        batch_metric_part.append({'name': 'packet_seq_err', 'value': packet_seq_err})
                    if netdev_stats[i]['local_ack_timeout_err'] != VAL_ERR:
                        batch_metric_part.append({'name': 'local_ack_timeout_err', 'value': local_ack_timeout_err})
                    # additional metrics for bond
                    if bond_tx_prio5_bytes_report != VAL_ERR:
                        batch_metric_part.append({'name': 'tx_prio5_bytes', 'value': format(bond_tx_prio5_bytes, '.3f')})
                    if bond_rx_prio5_bytes_report != VAL_ERR:
                        batch_metric_part.append({'name': 'rx_prio5_bytes', 'value': format(bond_rx_prio5_bytes, '.3f')})
                    if bond_tx_prio5_packets_report != VAL_ERR:
                        batch_metric_part.append({'name': 'tx_prio5_packets', 'value': format(bond_tx_prio5_packets, '.3f')})
                    if bond_rx_prio5_packets_report != VAL_ERR:
                        batch_metric_part.append({'name': 'rx_prio5_packets', 'value': format(bond_rx_prio5_packets, '.3f')})
                    if bond_tx_prio5_pause_report != VAL_ERR:
                        batch_metric_part.append({'name': 'tx_prio5_pause', 'value': format(bond_tx_prio5_pause, '.3f')})
                    if bond_rx_prio5_pause_report != VAL_ERR:
                        batch_metric_part.append({'name': 'rx_prio5_pause', 'value': format(bond_rx_prio5_pause, '.3f')})
                    if bond_rx_prio5_discards_report != VAL_ERR:
                        batch_metric_part.append({'name': 'rx_prio5_discards', 'value': format(bond_rx_prio5_discards, '.3f')})
                netdev = self.collector.rdma_netdevs[i]
                self.collector.get_hpbw_stats()
                for j in range(HPBW_NUM):
                    hpbw_val = VAL_ERR
                    try:
                        hpbw_val = self.collector.hpbw_stats[netdev][j].get_nowait()
                    except Exception:
                        hpbw_val = VAL_ERR
                    if hpbw_val == VAL_ERR:
                        continue
                    if j == 0:
                        batch_metric_part.append({'name': 'rx_hpbw_max', 'value': hpbw_val})
                    elif j == 1:
                        batch_metric_part.append({'name': 'rx_hpbw_min', 'value': hpbw_val})
                    elif j == 2:
                        batch_metric_part.append({'name': 'rx_hpbw_avg', 'value': hpbw_val})
                    elif j == 3:
                        batch_metric_part.append({'name': 'rx_hpbw_p50', 'value': hpbw_val})
                    elif j == 4:
                        batch_metric_part.append({'name': 'rx_hpbw_p90', 'value': hpbw_val})
                    elif j == 5:
                        batch_metric_part.append({'name': 'tx_hpbw_max', 'value': hpbw_val})
                    elif j == 6:
                        batch_metric_part.append({'name': 'tx_hpbw_min', 'value': hpbw_val})
                    elif j == 7:
                        batch_metric_part.append({'name': 'tx_hpbw_avg', 'value': hpbw_val})
                    elif j == 8:
                        batch_metric_part.append({'name': 'tx_hpbw_p50', 'value': hpbw_val})
                    elif j == 9:
                        batch_metric_part.append({'name': 'tx_hpbw_p90', 'value': hpbw_val})
                self.handler.add_batch_metric(batch=batch_metric_part, dimensions=dimensions, timestamp=self.collector.valueCast(current_time))
            except Exception:
                continue
        if current_time - self.last_report_time >= self.report_interval:
            self.last_eth_stats = self.collector.get_ethflow_stats()
            self.last_netdev_stats = self.collector.get_netdevflow_stats()
            self.last_phy_err_stats = self.collector.get_phy_err_stats()
            self.last_report_time = current_time
        if (len(self.handler.get_metrics()) > 0):
            data = {'sender': 'nws_sender', 'datas': self.handler.pop_metrics()}
            self.put_data(data)

def main():
    """Run the RDMA collector in an endless once-per-minute loop.

    Builds a single RDMACollector and then, forever, calls its
    ``collect()`` followed by ``dump_data()`` and sleeps 60 seconds
    before the next round. The function never returns on its own; no
    exception is caught here, so anything raised by the collector
    propagates to the caller and ends the loop.
    """
    rdma_collector = RDMACollector()
    while True:
        rdma_collector.collect()
        rdma_collector.dump_data()
        # Fixed pause between two collection rounds.
        time.sleep(60)

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()