File: //usr/local/qcloud/monitor/barad/plugin/collector/utils/collect_tool_gpu.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'hetiulin'
import time
import os
import sys
import re
import traceback
import logging
sys.path.append(os.getcwd() + "/../../../comm")
import constant
import subprocess
import commands
import psutil
import urllib2
from pynvml.pynvml import *
from pyzxml.pyzxml import *
from pydcmi.pydcmi import *
from cutils import console_logger, generate_config, is_metal, CommUtils
from base_process import BaseProcess
'''GPU info collector'''
class GpuCollect(BaseProcess):
gpu_init_count = 0
gpu_load_driver = 0
# 0: nvidia gpu (default) 1: vqGPU
gpu_type = 0
consumer_gpu = 0
handle_list = []
bdf_list = ""
gpu_virt_mode = 0
def __init__(self, logger = None ):
BaseProcess.__init__(self, constant.PLUGIN_CONFIG_PATH, self.__class__.__name__)
try:
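            # Probe for consumer NVIDIA cards first (see the class-code notes below);
            # any match marks this host as carrying a consumer GPU.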
cmd = "lspci -d 10de::000300 | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
if (count != 0):
GpuCollect.consumer_gpu = 1
count = 0
            # Queries with an explicit {vendor:device} pair come first;
            # vendor-only queries come after them.
if (count == 0):
                # vqGPU (1ea0:2aaa)
cmd = "lspci -d 1ea0:2aaa -Dn | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
if (count != 0):
cmd = "lspci -d 1ea0:2aaa -Dn | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
GpuCollect.bdf_list = bdf.split("\n")[:-1]
GpuCollect.gpu_init_count = count
GpuCollect.gpu_type = 1
            # -s .0 : look at function 0 of each device only.
            # CVM:       00:08.0 3D controller: NVIDIA Corporation
            # multifunc: 00:08.1 Audio device: NVIDIA Corporation High Definition Audio Controller
            # grep -iE 'VGA|3D'
            # server card:   class code 0302 (PCI_CLASS_DISPLAY_3D)
            # consumer card: class code 0300 (PCI_CLASS_DISPLAY_VGA)
if (count == 0):
cmd = "lspci -d 10de: -Dnn -s .0 | grep -iE 'VGA|3D' | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
if (count != 0):
cmd = "lspci -d 10de: -Dnn -s .0 | grep -iE 'VGA|3D' | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
GpuCollect.bdf_list = bdf.split("\n")[:-1]
GpuCollect.gpu_init_count = count
if GpuCollect.gpu_init_count > 0:
nvmlInit()
GpuCollect.gpu_load_driver = 1
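                # For NVIDIA GPUs the virtualization mode of device 0 decides whether the
                # BDF list and device count are rebuilt from lspci below; vqGPU (gpu_type 1)
                # skips the mode query and keeps mode = 0xff.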
mode = 0xff
if GpuCollect.gpu_type == 0:
handle = nvmlDeviceGetHandleByIndex(0)
mode = nvmlDeviceGetVirtualizationMode(handle)
GpuCollect.gpu_virt_mode = mode
if mode == NVML_GPU_VIRTUALIZATION_MODE_VGPU:
GpuCollect.consumer_gpu = 0
# Bare Metal GPU or vGPU/vSGA mode
if (mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU
or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA):
cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
GpuCollect.bdf_list = bdf.split("\n")[:-1]
GpuCollect.gpu_init_count = count
for i in range(0, GpuCollect.gpu_init_count):
handle = nvmlDeviceGetHandleByIndex(i)
GpuCollect.handle_list.append(handle)
else:
self.logger().info("lspci find no nvida gpu")
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def __del__(self):
try:
if GpuCollect.gpu_init_count > 0 and GpuCollect.gpu_load_driver == 1:
nvmlShutdown()
GpuCollect.gpu_load_driver = 0
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def handleError(self,err):
if (err.value == NVML_ERROR_NOT_SUPPORTED):
return "N/A"
else:
return err.__str__()
def get_gpu_count(self):
gpu_count = 0
if GpuCollect.gpu_init_count > 0 :
try :
gpu_count = int(nvmlDeviceGetCount())
except Exception as e:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return gpu_count
def get_gpu_init_count(self):
return self.gpu_init_count
def get_lspci_bdf(self):
return self.bdf_list
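    # Lazily (re)initialize NVML once nvidia-smi and the nvidia kernel module are
    # present; repeats the virtualization-mode handling done in __init__ and
    # returns gpu_load_driver (1 once NVML is initialized).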
def gpu_drives_check(self):
if GpuCollect.gpu_load_driver == 0:
cmd1 = 'which nvidia-smi | wc -l'
cmd2 = 'lsmod | grep nvidia | wc -l'
try :
rtn1 = CommUtils.ExecuteTimeoutCommand(cmd1, 3)
rtn2 = CommUtils.ExecuteTimeoutCommand(cmd2, 3)
if int(rtn1) > 0 and int(rtn2) > 0:
nvmlInit()
GpuCollect.gpu_load_driver = 1
mode = 0xff
if GpuCollect.gpu_type == 0:
handle = nvmlDeviceGetHandleByIndex(0)
mode = nvmlDeviceGetVirtualizationMode(handle)
GpuCollect.gpu_virt_mode = mode
if mode == NVML_GPU_VIRTUALIZATION_MODE_VGPU:
GpuCollect.consumer_gpu = 0
# Bare Metal GPU or vGPU/vSGA mode
if (mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU
or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA):
cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
GpuCollect.bdf_list = bdf.split("\n")[:-1]
GpuCollect.gpu_init_count = count
for i in range(0, GpuCollect.gpu_init_count):
handle = nvmlDeviceGetHandleByIndex(i)
GpuCollect.handle_list.append(handle)
except Exception as e:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return GpuCollect.gpu_load_driver
def get_device_info_inforom(self,i):
gpu_inforom = 0
handle = self.handle_list[i]
try:
gpu_inforom = nvmlDeviceValidateInforom(handle)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return gpu_inforom
def get_device_info_pcideviceid(self,i):
pcideviceid = 0
handle = self.handle_list[i]
try:
pciInfo = nvmlDeviceGetPciInfo(handle)
pcideviceid = pciInfo.pciDeviceId
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return pcideviceid
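    # Static device info: minor number, driver version, PCI bus id, name, serial,
    # UUID, board part number and virtualization mode.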
def get_device_info(self,i):
handle = self.handle_list[i]
        # vqGPU: the minor number defaults to 0
if GpuCollect.gpu_type == 1:
minor_number = 0
else:
minor_number = nvmlDeviceGetMinorNumber(handle)
pciInfo = nvmlDeviceGetPciInfo(handle)
gpu_id = pciInfo.busId
gpu_name = nvmlDeviceGetName(handle)
gpu_uuid = nvmlDeviceGetUUID(handle)
driver_ver = nvmlSystemGetDriverVersion()
gpu_virtual_mode = nvmlDeviceGetVirtualizationMode(handle)
serial = gpu_part_number = -1
        # passthrough, bare-metal GPU, or vGPU host
if (GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH
or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU):
try:
serial = nvmlDeviceGetSerial(handle)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
gpu_part_number = nvmlDeviceGetBoardPartNumber(handle)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return (minor_number,driver_ver,gpu_id,gpu_name,serial,gpu_uuid,gpu_part_number,gpu_virtual_mode)
    # caller must ensure ECC is enabled
def get_device_retired_page_info(self, i):
        pages_dbe, timestamps_dbe, pages_sbe, timestamps_sbe = [], [], [], []
handle = self.handle_list[i]
try:
pages_dbe,timestamps_dbe = nvmlDeviceGetRetiredPages_v2(handle, NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR)
pages_sbe,timestamps_sbe = nvmlDeviceGetRetiredPages_v2(handle, NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return pages_dbe,timestamps_dbe,pages_sbe,timestamps_sbe
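    # Runtime metrics: utilization, memory, temperature/power, performance state,
    # ECC counters and retired pages. Optional fields that cannot be read
    # (unsupported, or not visible in the current virtualization mode) stay at -1.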
def get_device_run_info(self,i):
handle = self.handle_list[i]
nvidia_smi_error = 0
util = nvmlDeviceGetUtilizationRates(handle)
gpu_util = int(util.gpu)
memInfo = nvmlDeviceGetMemoryInfo(handle)
mem_total = memInfo.total / 1024 / 1024
mem_used = memInfo.used / 1024 / 1024
mem_usage = float(mem_used) / mem_total * 100
enc_util = nvmlDeviceGetEncoderUtilization(handle)
dec_util = nvmlDeviceGetDecoderUtilization(handle)
arch = self.get_gpu_device_arch(i)
temp = powDraw = powLimit = pow_usage = -1
        # passthrough, bare-metal GPU, or vGPU host
if (GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH
or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU):
try:
temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000)
powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000)
pow_usage = float(powDraw) / powLimit * 100
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
nvidia_smi_error = 1
perf_stat = nvmlDeviceGetPowerState(handle)
persistence_mode = nvmlDeviceGetPersistenceMode(handle)
ecc_current=ecc_pending=ecc_agg_single_total_err=ecc_agg_double_total_err=-1
retired_page_single_ecc=retired_page_double_ecc=retired_page_pending=-1
if self.consumer_gpu == 0:
try:
ecc_current, ecc_pending = nvmlDeviceGetEccMode(handle)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
if ecc_current == 1:
try:
ecc_agg_single_total_err = nvmlDeviceGetTotalEccErrors(handle,NVML_MEMORY_ERROR_TYPE_CORRECTED,NVML_AGGREGATE_ECC)
ecc_agg_double_total_err = nvmlDeviceGetTotalEccErrors(handle,NVML_MEMORY_ERROR_TYPE_UNCORRECTED,NVML_AGGREGATE_ECC)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
if arch < NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
for idx in range(NVML_PAGE_RETIREMENT_CAUSE_COUNT):
pages = nvmlDeviceGetRetiredPages(handle, idx)
count = len(pages)
if idx == 0:
retired_page_single_ecc = count
if idx == 1:
retired_page_double_ecc = count
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
if arch < NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
retired_page_pending = 0 if NVML_FEATURE_DISABLED == nvmlDeviceGetRetiredPagesPendingStatus(handle) else 1
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
smclk = memclk = -1
try:
smclk = nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)
memclk = nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
nvidia_smi_error = 1
        # query the name only to verify that NVML (nvidia-smi) can still read device info
try:
gpu_name = nvmlDeviceGetName(handle)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
nvidia_smi_error = 1
        # passthrough or bare-metal consumer GPU
if (GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH):
if self.consumer_gpu == 1:
try:
gpu_fan = nvmlDeviceGetFanSpeedv2(handle, 0)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
nvidia_smi_error = 1
return (gpu_util,enc_util[0],dec_util[0],smclk,memclk,mem_total,mem_used,mem_usage,temp,powDraw,powLimit,pow_usage,\
perf_stat,persistence_mode,ecc_current, ecc_pending,\
ecc_agg_single_total_err,ecc_agg_double_total_err,\
retired_page_single_ecc,retired_page_double_ecc,retired_page_pending,nvidia_smi_error)
    # only for vGPU
def get_gpu_process_info(self, i):
handle = self.handle_list[i]
utils=[]
try:
utils = nvmlDeviceGetProcessUtilization(handle)
except NVMLError :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return utils
    # caller must ensure ECC is enabled
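    # Return 1 if any aggregate double-bit ECC counter (L1 cache, L2 cache or
    # register file) is non-zero, otherwise 0.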
def check_field(self, i):
handle = self.handle_list[i]
#define NVML_FI_DEV_ECC_DBE_AGG_L1 19
#L1 cache double bit aggregate (persistent) ECC errors.
#define NVML_FI_DEV_ECC_DBE_AGG_L2 21
#L2 cache double bit aggregate (persistent) ECC errors.
#define NVML_FI_DEV_ECC_DBE_AGG_REG 25
#Register File double bit aggregate (persistent) ECC errors.
fieldIds = [19, 21, 25]
try :
values = nvmlDeviceGetFieldValues(handle, fieldIds)
            for value in values:
if value.nvmlReturn == NVML_SUCCESS:
if value.valueType == NVML_VALUE_TYPE_DOUBLE:
if value.value.dVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_INT:
if value.value.uiVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG:
if value.value.ulVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
if value.value.ullVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_SIGNED_LONG_LONG:
if value.value.sllVal != 0:
return 1
else:
return 0
except Exception as e:
pass
return 0
    # caller must ensure ECC is enabled
def check_field_row_remapping_failure(self, i):
handle = self.handle_list[i]
#define NVML_FI_DEV_REMAPPED_FAILURE 145
#If any rows failed to be remapped 1=yes 0=no.
fieldIds = [145]
try :
values = nvmlDeviceGetFieldValues(handle, fieldIds)
            for value in values:
if value.nvmlReturn == NVML_SUCCESS:
if value.valueType == NVML_VALUE_TYPE_DOUBLE:
if value.value.dVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_INT:
if value.value.uiVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG:
if value.value.ulVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
if value.value.ullVal != 0:
return 1
elif value.valueType == NVML_VALUE_TYPE_SIGNED_LONG_LONG:
if value.value.sllVal != 0:
return 1
else:
return 0
except Exception as e:
pass
return 0
    # caller must ensure ECC is enabled
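    # Return 1 when uncorrectable SRAM ECC errors are detected: any aggregate
    # uncorrected error in L1/L2/register file, or at least 10 uncorrected errors
    # counted at the SRAM memory location; otherwise 0.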
def check_sram_ecc(self, i):
try :
handle = self.handle_list[i]
#eccCounts = nvmlDeviceGetDetailedEccErrors(handle, NVML_MEMORY_ERROR_TYPE_CORRECTED, NVML_AGGREGATE_ECC)
#if eccCounts.l1Cache !=0 or eccCounts.l2Cache != 0 or eccCounts.registerFile !=0 :
# return 1
eccCounts = nvmlDeviceGetDetailedEccErrors(handle, NVML_MEMORY_ERROR_TYPE_UNCORRECTED, NVML_AGGREGATE_ECC)
if eccCounts.l1Cache !=0 or eccCounts.l2Cache != 0 or eccCounts.registerFile !=0 :
return 1
uceCount = nvmlDeviceGetMemoryErrorCounter(handle, NVML_MEMORY_ERROR_TYPE_UNCORRECTED, NVML_AGGREGATE_ECC, NVML_MEMORY_LOCATION_SRAM)
if uceCount >= 10:
return 1
except Exception as e:
pass
return 0
    # Return the row-remapping pending status and the failure-occurred status.
    # If row remapping is not supported, return (0, 0).
def get_row_remapping_status(self, i):
try :
handle = self.handle_list[i]
pending,failure_occurred = nvmlDeviceGetRemappedRowsStatus(handle)
return pending,failure_occurred
except Exception as e:
pass
return 0,0
    # caller must ensure ECC is enabled
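    # Return the (low, none) buckets of the row remapper availability histogram;
    # (0, 0) when the query is unsupported or fails.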
def get_row_remapping_histogram(self, i):
rr_low = rr_none = 0
try :
handle = self.handle_list[i]
val = nvmlDeviceGetRowRemapperHistogram(handle)
rr_low = val.low
rr_none = val.none
except Exception as e:
pass
return rr_low,rr_none
def get_gpu_pcie_link_width(self, i):
width_max = width_curr = 0
try :
handle = self.handle_list[i]
width_curr = nvmlDeviceGetCurrPcieLinkWidth(handle)
width_max = nvmlDeviceGetMaxPcieLinkWidth(handle)
except Exception as e:
pass
return width_max,width_curr
def get_gpu_device_arch(self, i):
arch = NVML_DEVICE_ARCH_TURING
try :
handle = self.handle_list[i]
arch = nvmlDeviceGetArchitecture(handle)
except Exception as e:
pass
return arch
def get_nvlink_err_count(self, i, linkidx, erridx):
try:
handle = self.handle_list[i]
return nvmlDeviceGetNvLinkErrorCounter(handle, linkidx, erridx)
except Exception as e:
pass
return 0
def get_nvlink_state(self, i, linkidx):
try:
handle = self.handle_list[i]
return nvmlDeviceGetNvLinkState(handle, linkidx)
except Exception as e:
pass
return 0
    # not supported on consumer GPUs
def check_gpu_clock_slowdown(self, i):
slow_down_hw = slow_down_sw = 0
if self.consumer_gpu == 1:
return slow_down_hw,slow_down_sw
try:
handle = self.handle_list[i]
supportedClocksThrottleReasons = nvmlDeviceGetSupportedClocksThrottleReasons(handle)
clocksThrottleReasons = nvmlDeviceGetCurrentClocksThrottleReasons(handle)
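            # Report a slowdown only when the reason is both supported by the device
            # and currently active.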
if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonHwSlowdown != 0:
slow_down_hw = 1
if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonHwThermalSlowdown != 0:
slow_down_hw = 1
if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonHwPowerBrakeSlowdown != 0:
slow_down_hw = 1
if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonSwThermalSlowdown != 0:
slow_down_sw = 1
#if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonGpuIdle != 0:
# slow_down_hw = slow_down_sw = 1
except Exception as e:
pass
return slow_down_hw,slow_down_sw
def test_gpu_collect():
collector = GpuCollect()
deviceCount = collector.get_gpu_count()
print collector.get_gpu_init_count()
for i in range(0, deviceCount):
print collector.get_device_info(i)
print collector.get_device_run_info(i)
print collector.get_gpu_process_info(i)
print collector.get_device_retired_page_info(i)
print collector.check_sram_ecc(i)
print collector.check_field(i)
print collector.check_field_row_remapping_failure(i)
print collector.get_row_remapping_status(i)
print collector.get_row_remapping_histogram(i)
print collector.get_gpu_pcie_link_width(i)
print collector.get_gpu_device_arch(i)
print collector.check_gpu_clock_slowdown(i)
print collector.get_device_info_pcideviceid(i)
print collector.get_device_info_inforom(i)
for linkidx in range(12):
print collector.get_nvlink_state(i, linkidx)
for erridx in range(NVML_NVLINK_ERROR_COUNT):
print linkidx, erridx, collector.get_nvlink_err_count(i, linkidx, erridx)
'''Zixiao NPU info collector'''
class NpuCollect(BaseProcess):
npu_init_count = 0
npu_load_driver = 0
bdf_list = ""
npu_type = 0
zixiao_ver = 0
    # For Zixiao v2 only (for now).
    # Old Zixiao v2 drivers have an issue: zxmlInit installs a process-name whitelist,
    # so after the barad process renames itself the init call returns an error and all
    # other zxml calls return invalid results. Newer drivers fix this.
    # To avoid flooding the log on old drivers, attempt init at most 10 times.
def __init__(self, logger = None ):
BaseProcess.__init__(self, constant.PLUGIN_CONFIG_PATH, self.__class__.__name__)
try:
            # Zixiao v1 NPU (1ea0:2a16)
cmd = "lspci -d 1ea0:2a16 -Dn | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
if count > 0:
cmd = "lspci -d 1ea0:2a16 -Dn | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
self.bdf_list = bdf.split("\n")[:-1]
self.npu_init_count = count
self.zixiao_ver = 1
if count == 0:
                # Zixiao v2 NPU (1ea0:2a22)
cmd = "lspci -d 1ea0:2a22 -Dn | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
if count > 0:
cmd = "lspci -d 1ea0:2a22 -Dn | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
self.bdf_list = bdf.split("\n")[:-1]
self.npu_init_count = count
self.zixiao_ver = 2
self.logger().info("npu init count %d", count)
if self.npu_init_count > 0:
zxmlInit()
self.npu_load_driver = 1
self.logger().info("npu init ok")
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def __del__(self):
try:
if self.npu_init_count > 0 and self.npu_load_driver == 1:
zxmlShutdown()
self.npu_load_driver = 0
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def get_npu_type(self):
return self.npu_type
def get_npu_count(self):
count = 0
if self.npu_init_count > 0 :
try :
count = zxmlGetDevCount()
except Exception:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return count
def get_npu_init_count(self):
return self.npu_init_count
def get_lspci_bdf(self):
return self.bdf_list
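    # Lazily initialize zxml once a zixiao-smi/zx-smi binary and the zixiao kernel
    # module are present; for Zixiao v2, give up after 10 attempts (see the note on
    # zixiaov2_try_init_cnt above).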
def npu_drives_check(self):
if self.npu_load_driver == 0:
if self.zixiao_ver == 2:
if self.zixiaov2_try_init_cnt >= 10:
return 0
cmd1 = "which zixiao-smi | wc -l"
cmd11 = "which zx-smi | wc -l"
cmd2 = "lsmod | grep zixiao | wc -l"
try :
rtn1 = CommUtils.ExecuteTimeoutCommand(cmd1, 3)
rtn11 = CommUtils.ExecuteTimeoutCommand(cmd11, 3)
rtn2 = CommUtils.ExecuteTimeoutCommand(cmd2, 3)
if (int(rtn1) > 0 and int(rtn2) > 0) or (int(rtn11) > 0 and int(rtn2) > 0):
if self.zixiao_ver == 2:
self.zixiaov2_try_init_cnt += 1
zxmlInit()
self.npu_load_driver = 1
self.logger().info("npu check init ok")
except Exception:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return self.npu_load_driver
def get_device_info(self,i):
minor_number = -1
serial = part_number = did = name = uuid = driver_ver = ''
try:
minor_number = i
uuid = zxmlGetDevUuid(i)
driver_ver = zxmlDriverVer()
serial = zxmlGetDevSn(i).strip()
part_number = zxmlGetDevPn(i).strip()
dev_info = zxmlGetDevInfo(i)
did = "%04x:%02x:%02x.%x" % (dev_info.domain_id, dev_info.bus_id, dev_info.dev_id, dev_info.func_id)
did = did.upper()
name = dev_info.name
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return (minor_number,driver_ver,did,name,serial,uuid,part_number)
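    # Runtime metrics via zxml, returned in the same field order as
    # GpuCollect.get_device_run_info (minus the trailing nvidia_smi_error flag);
    # metrics the NPU does not expose stay at -1.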
def get_device_run_info(self,i):
gpu_util = enc_util = dec_util = smclk = memclk = mem_total = mem_used = mem_usage = -1
temp = powDraw = powLimit = pow_usage = perf_stat = persistence_mode = -1
ecc_current = ecc_pending = -1
ecc_agg_single_total_err = ecc_agg_double_total_err = -1
retired_page_single_ecc = retired_page_double_ecc = retired_page_pending = -1
try:
#gpu_util = zxmlGetDevDtuUsage(i)
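            # The DTU usage counter is read twice and the second async sample is kept.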
gpu_util = zxmlGetDevDtuUsageAsync(i)
gpu_util = zxmlGetDevDtuUsageAsync(i)
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
memInfo = zxmlGetDevMem(i)
            # mem_total_size is reported in bytes (convert to MiB);
            # mem_used is already in MiB.
mem_total = memInfo.mem_total_size / 1024 / 1024
mem_used = memInfo.mem_used
mem_usage = float(mem_used) / mem_total * 100
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
if self.zixiao_ver == 1:
temp_list = zxmlGetDevTemp(i)
temp = temp_list.cur_dev_temp
elif self.zixiao_ver == 2:
temp_list = zxmlGetDevTempV2(i)
temp = temp_list.cur_asic_temp
else:
temp = -1
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
pow_list = zxmlGetDevPwr(i)
powDraw = pow_list.cur_pwr_consumption
powLimit = pow_list.pwr_capability
pow_usage = float(powDraw) / powLimit * 100
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
if self.zixiao_ver == 1:
perf_stat = zxmlGetDevDpmLevel(i)
elif self.zixiao_ver == 2:
                # not supported on Zixiao v2
perf_stat = -1
else:
perf_stat = -1
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
ecc_current = zxmlGetEccStatus(i)
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
try:
if self.zixiao_ver == 1:
clk_info = zxmlGetDevClk(i)
smclk = clk_info.cur_dtu_clock
memclk = clk_info.cur_hbm_clock
elif self.zixiao_ver == 2:
                # not supported on Zixiao v2
smclk = -1
memclk = -1
else:
smclk = -1
memclk = -1
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return (gpu_util,enc_util,dec_util, \
smclk,memclk,mem_total,mem_used,mem_usage, \
temp,powDraw,powLimit,pow_usage,\
perf_stat,persistence_mode,ecc_current, ecc_pending,\
ecc_agg_single_total_err,ecc_agg_double_total_err,\
retired_page_single_ecc,retired_page_double_ecc,retired_page_pending)
def test_npu_collect():
collector = NpuCollect()
deviceCount = collector.get_npu_count()
print collector.get_npu_init_count()
for i in range(0, deviceCount):
print collector.get_device_info(i)
print collector.get_device_run_info(i)
'''Huawei NPU info collector'''
class HwNpuCollect(BaseProcess):
npu_init_count = 0
npu_load_driver = 0
bdf_list = ""
npu_type = 1
card_num = 0
card_id_list = []
def __init__(self, logger = None ):
BaseProcess.__init__(self, constant.PLUGIN_CONFIG_PATH, self.__class__.__name__)
try:
            # Huawei 910B NPU (19e5:d802)
cmd = "lspci -d 19e5:d802 -Dn | wc -l"
count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
if count > 0:
cmd = "lspci -d 19e5:d802 -Dn | awk '{print $1}'"
bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
self.bdf_list = bdf.split("\n")[:-1]
self.npu_init_count = count
self.logger().info("hw npu init count %d", count)
if self.npu_init_count > 0:
dcmiInit()
self.npu_load_driver = 1
self.logger().info("hw npu init ok")
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def __del__(self):
try:
if self.npu_init_count > 0 and self.npu_load_driver == 1:
dcmiShutdown()
self.npu_load_driver = 0
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def get_npu_type(self):
return self.npu_type
def get_npu_count(self):
count = 0
if self.npu_init_count > 0 :
try :
self.card_num, self.card_id_list = dcmiGetCardList()
count = self.card_num
except Exception:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return count
def get_npu_init_count(self):
return self.npu_init_count
def get_lspci_bdf(self):
return self.bdf_list
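    # Lazily initialize DCMI once npu-smi is found in PATH (the lsmod check is
    # currently disabled).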
def npu_drives_check(self):
if self.npu_load_driver == 0:
cmd1 = "which npu-smi | wc -l"
#cmd2 = "lsmod | grep xxx | wc -l"
try :
rtn1 = CommUtils.ExecuteTimeoutCommand(cmd1, 3)
#rtn2 = CommUtils.ExecuteTimeoutCommand(cmd2, 3)
rtn2 = 1
if int(rtn1) > 0 and int(rtn2) > 0:
dcmiInit()
self.npu_load_driver = 1
self.logger().info("hw npu check init ok")
except Exception:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return self.npu_load_driver
def get_cardid_by_index(self, i):
#self.logger().info("hw npu index:%d card id:%d", i, self.card_id_list[i])
return self.card_id_list[i]
def get_device_info(self,i):
minor_number = -1
serial = part_number = did = name = uuid = driver_ver = ''
try:
card_id = self.get_cardid_by_index(i)
device_id = 0
minor_number = i
driver_ver = dcmiGetDriverVersion()
b,d,f = dcmiGetDevicePcieInfo(card_id, device_id)
did = "0000:%02x:%02x.%x" % (b,d,f)
did = did.upper()
name = dcmiGetDeviceChipInfoName(card_id, device_id)
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return (minor_number,driver_ver,did,name,serial,uuid,part_number)
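    # Runtime metrics via DCMI, in the same field order as the other collectors;
    # the raw power reading is scaled by 1/10 and the power limit is hard-coded
    # to 400 W, so pow_usage is relative to that fixed limit.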
def get_device_run_info(self,i):
gpu_util = enc_util = dec_util = smclk = memclk = mem_total = mem_used = mem_usage = -1
temp = powDraw = powLimit = pow_usage = perf_stat = persistence_mode = -1
ecc_current = ecc_pending = -1
ecc_agg_single_total_err = ecc_agg_double_total_err = -1
retired_page_single_ecc = retired_page_double_ecc = retired_page_pending = -1
try:
card_id = self.get_cardid_by_index(i)
device_id = 0
gpu_util = dcmiGetDeviceUtilRateAICore(card_id, device_id)
smclk = dcmiGetDeviceFrequency(card_id, device_id, DCMI_FREQ_TYPE_CPU)
mem_total, memclk, mem_used = dcmiGetDeviceHbmInfo(card_id, device_id)
mem_usage = float(mem_used) * 100 / mem_total
temp = dcmiGetDeviceTemp(card_id, device_id)
powDraw = dcmiGetDevicePowerInfo(card_id, device_id)
powDraw = float(powDraw) / 10
powLimit = 400.0
pow_usage = float(powDraw) / powLimit * 100
ecc_current, ecc_agg_single_total_err, ecc_agg_double_total_err, retired_page_single_ecc, retired_page_double_ecc = dcmiGetDeviceEccInfo(card_id, device_id ,DCMI_DEVICE_TYPE_HBM)
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
return (gpu_util,enc_util,dec_util, \
smclk,memclk,mem_total,mem_used,mem_usage, \
temp,powDraw,powLimit,pow_usage,\
perf_stat,persistence_mode,ecc_current, ecc_pending,\
ecc_agg_single_total_err,ecc_agg_double_total_err,\
retired_page_single_ecc,retired_page_double_ecc,retired_page_pending)
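    # Map the DCMI health status to a binary flag: 1 for Alarm/Critical, 0 otherwise.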
def check_health_status(self,i):
health = 0
try:
card_id = self.get_cardid_by_index(i)
device_id = 0
health = dcmiGetDeviceHealthStatus(card_id, device_id)
except Exception :
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        # 0: OK  1: Warning  2: Alarm  3: Critical
if health == 2 or health == 3:
return 1
return 0
def test_hwnpu_collect():
collector = HwNpuCollect()
deviceCount = collector.get_npu_count()
print collector.get_npu_init_count()
for i in range(0, deviceCount):
print collector.get_device_info(i)
print collector.get_device_run_info(i)
print collector.check_health_status(i)
if __name__ == '__main__':
test_gpu_collect()