# File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/npu.py
import sys
import os,time
sys.path.append(os.getcwd() + '/../../../comm/')
import constant
from plugin_base import VmBaseCollector
from utils.collect_tool_gpu import NpuCollect
from utils.collect_tool_gpu import HwNpuCollect
from utils.metric_handler import MetricHandler
from pyzxml.pyzxml import *
from operator import itemgetter, attrgetter
class NpuCollector(VmBaseCollector):
    """Collect per-device NPU metrics and raise hardware alarms.

    ``init`` probes ``NpuCollect`` first and ``HwNpuCollect`` second and keeps
    the first backend that reports a non-zero initial device count.
    ``do_collect`` then queries every device through that backend, pushes the
    metrics to the ``nws_sender`` and, for NPU type 1, pushes alarms to the
    ``alarmproxy_sender``.

    Attributes:
        device_info: list of dicts, one per device whose identity was read
            successfully (keys: minor_number, driver_ver, gpu_id, gpu_name,
            serial, gpu_uuid, gpu_part_number, index).
        tmp_device_info: the device list as it was before the most recent
            cache reset; used to look up PN/SN of a device that disappeared.
        alarm_time: maps a device id to the timestamps of the last alarm sent
            per alarm kind (0 means "never sent").
        init_count: device count reported by the backend at init time;
            0 means no supported NPU was found.
        collector: the selected backend, or None when init_count is 0.
    """

    # Values compared against collector.get_npu_type().
    _NPU_TYPE_ZIXIAO = 0  # tencent zixiao
    _NPU_TYPE_ASCEND = 1  # huawei ascend

    # Minimum number of seconds between two alarms of the same kind for the
    # same device.
    _ECC_ALARM_INTERVAL = 60 * 60 * 24
    _HEALTH_ALARM_INTERVAL = 60 * 60
    _LOST_ALARM_INTERVAL = 60 * 60

    # retired_page_double_ecc value at which the ECC alarm fires.
    _DOUBLE_ECC_RETIRED_LIMIT = 64

    _ALARM_CALLER_NAME = "barad_agent"
    # NOTE(review): credential-like constant embedded in source; consider
    # moving it to configuration.
    _ALARM_CALLER_KEY = "PSbhht7wQLEbH6OjDXTayQ=="

    def init(self):
        """Set up the metric handler and select the NPU backend."""
        self.set_frequency(10)
        self.handler = MetricHandler()
        self.handler.namespace = 'qce/cvm'
        self.handler.dimensions = ['vm_uuid', 'vmip']
        self.device_info = []
        self.tmp_device_info = []
        self.alarm_time = {}
        # Defaults for "no NPU found"; overwritten below when a backend
        # reports devices.
        self.init_count = 0
        self.collector = None

        self.zxcollector = NpuCollect()
        zx_count = self.zxcollector.get_npu_init_count()
        if zx_count != 0:
            self.init_count = zx_count
            self.collector = self.zxcollector
            return

        self.hwcollector = HwNpuCollect()
        hw_count = self.hwcollector.get_npu_init_count()
        if hw_count != 0:
            self.init_count = hw_count
            self.collector = self.hwcollector

    def do_collect(self):
        """Run one collection cycle: metrics for every device, then alarms.

        Returns None. Does nothing when no NPU was found at init time, when
        the driver check returns 0, or when the backend reports an NPU type
        other than 0 or 1.
        """
        # not npu cvm
        if self.init_count == 0:
            return
        # 0 : drives do not install
        if self.collector.npu_drives_check() == 0:
            return

        npu_type = self.collector.get_npu_type()
        device_count = self.collector.get_npu_count()
        if npu_type == self._NPU_TYPE_ZIXIAO:
            max_index = device_count
        elif npu_type == self._NPU_TYPE_ASCEND:
            # Keep the previous list so a vanished device can still be
            # described in the "lost" alarm, then rebuild the cache whenever
            # the live count disagrees with it.
            self.tmp_device_info = self.device_info
            if len(self.device_info) != device_count:
                self.device_info = []
            max_index = self.init_count
        else:
            # Unknown type: there is no device range to iterate over.
            return

        now = int(time.time())
        vm_uuid = self.get_vm_uuid()
        vmip = self.get_vmip()
        barad_version = self.get_barad_version()

        # Cache lookup is by device index, not list position: the list has
        # gaps whenever a device could not be queried.
        cached = dict((info.get('index'), info) for info in self.device_info)
        last_driver_ver = ''
        for i in range(0, max_index):
            info = cached.get(i)
            if info is None:
                info = self._query_device_info(i)
                if info is None:
                    # Identity unknown: reporting would attribute data to the
                    # wrong device, so skip it for this cycle.
                    continue
                self.device_info.append(info)
            last_driver_ver = info['driver_ver']

            # setdefault keeps existing timestamps so that a cache rebuild
            # does not reset the alarm throttling.
            alarm_state = self.alarm_time.setdefault(
                info['gpu_id'], self._new_alarm_state())

            retired_page_double_ecc = None
            run_info = self._query_run_info(i)
            if run_info is not None:
                batch_metric, retired_page_double_ecc = run_info
                dimensions = {
                    'vm_uuid': vm_uuid,
                    'vmip': vmip,
                    'minor_number': info['minor_number'],
                    'gpu_name': info['gpu_name'],
                    'gpu_id': info['gpu_id'],
                    'gpu_uuid': info['gpu_uuid'],
                    'driver_ver': info['driver_ver'],
                    'serial': info['serial'],
                    'gpu_part_number': info['gpu_part_number'],
                }
                self.handler.add_batch_metric(batch=batch_metric,
                                              dimensions=dimensions)

            # tencent zixiao alarm, nothing until now
            if npu_type == self._NPU_TYPE_ASCEND:
                self._check_ascend_alarms(i, info, alarm_state,
                                          retired_page_double_ecc,
                                          now, vm_uuid, barad_version)

        # send monitor data
        if len(self.handler.get_metrics()) > 0:
            data = {'sender': 'nws_sender', 'datas': self.handler.pop_metrics()}
            self.put_data(data)

        # tencent zixiao alarm, nothing until now
        if npu_type == self._NPU_TYPE_ASCEND:
            self._report_lost_devices(now, vm_uuid, barad_version,
                                      last_driver_ver)

    @staticmethod
    def _new_alarm_state():
        """Return a fresh per-device alarm record (0 = alarm never sent)."""
        return {'ecc_error_standard_time': 0,
                'health_status_abnormal_time': 0,
                'lost_time': 0}

    def _query_device_info(self, index):
        """Read the identity of device ``index``; return None on failure."""
        try:
            (minor_number, driver_ver, gpu_id, gpu_name, serial, gpu_uuid,
             gpu_part_number) = self.collector.get_device_info(index)
        except Exception:
            # Best effort: an unreadable device must not abort the cycle.
            return None
        return {
            'minor_number': minor_number,
            'driver_ver': driver_ver,
            'gpu_id': gpu_id,
            'gpu_name': gpu_name,
            'serial': serial,
            'gpu_uuid': gpu_uuid,
            'gpu_part_number': gpu_part_number,
            # Remembers which device this entry describes.
            'index': index,
        }

    def _query_run_info(self, index):
        """Read the runtime counters of device ``index``.

        Returns a ``(batch_metric, retired_page_double_ecc)`` tuple, or None
        when the backend call fails or returns an unexpected shape.
        """
        try:
            (gpu_util, enc_util, dec_util, smclk, memclk, mem_total, mem_used,
             mem_usage, temp, pow_draw, pow_limit, pow_usage, perf_stat,
             persistence_mode, ecc_current, ecc_pending,
             ecc_agg_single_total_err, ecc_agg_double_total_err,
             retired_page_single_ecc, retired_page_double_ecc,
             retired_page_pending) = self.collector.get_device_run_info(index)
        except Exception:
            # Best effort: skip this device's metrics rather than report
            # another device's values.
            return None

        batch_metric = [
            {'name': 'gpu_util', 'value': gpu_util},
            {'name': 'gpu_enc_util', 'value': enc_util},
            {'name': 'gpu_dec_util', 'value': dec_util},
            {'name': 'gpu_smclk', 'value': smclk},
            {'name': 'gpu_memclk', 'value': memclk},
            # The gpu_p_* metrics are always reported as 0.
            {'name': 'gpu_p_smutil', 'value': 0},
            {'name': 'gpu_p_memutil', 'value': 0},
            {'name': 'gpu_p_encutil', 'value': 0},
            {'name': 'gpu_p_decutil', 'value': 0},
            {'name': 'gpu_mem_usage', 'value': mem_usage},
            {'name': 'gpu_mem_total', 'value': mem_total},
            {'name': 'gpu_mem_used', 'value': mem_used},
            {'name': 'gpu_pow_usage', 'value': pow_usage},
            {'name': 'gpu_pow_limit', 'value': pow_limit},
            {'name': 'gpu_pow_draw', 'value': pow_draw},
            {'name': 'gpu_temp', 'value': temp},
            {'name': 'gpu_perf_stat', 'value': perf_stat},
            {'name': 'gpu_persistence_mode', 'value': persistence_mode},
            {'name': 'gpu_ecc_current', 'value': ecc_current},
            {'name': 'gpu_ecc_pending', 'value': ecc_pending},
            {'name': 'gpu_ecc_agg_single_total_err', 'value': ecc_agg_single_total_err},
            {'name': 'gpu_ecc_agg_double_total_err', 'value': ecc_agg_double_total_err},
            {'name': 'gpu_retired_page_single_ecc', 'value': retired_page_single_ecc},
            {'name': 'gpu_retired_page_double_ecc', 'value': retired_page_double_ecc},
            {'name': 'gpu_retired_page_pending', 'value': retired_page_pending},
            {'name': 'gpu_inforom', 'value': 0},
            {'name': 'gpu_ecc_terminate_app', 'value': 0},
            {'name': 'gpu_xid_terminate_app', 'value': 0},
        ]
        return batch_metric, retired_page_double_ecc

    def _send_alarm(self, now, vm_uuid, barad_version, slot, part_number,
                    serial, driver_ver, event):
        """Queue one alarm for the alarmproxy sender.

        ``event`` supplies AlarmId / EventName / FaultType / FaultDesc and is
        merged over the common device description built here.
        """
        # NOTE(review): the "+0800" offset is a literal while the time is
        # formatted with time.localtime(); confirm the host timezone matches.
        alarm = {
            "CallerName": self._ALARM_CALLER_NAME,
            "CallerKey": self._ALARM_CALLER_KEY,
            "AlarmTime": time.strftime("%Y-%m-%d %H:%M:%S+0800",
                                       time.localtime(now)),
            "Dimensions": [{"Key": "Uuid", "Value": vm_uuid}],
            "DeviceName": "GPU",
            "DeviceId": "",
            'Slot': slot,
            'PN': part_number,
            'SN': serial,
            "DriverVersion": driver_ver,
            "BaradVersion": barad_version,
        }
        alarm.update(event)
        self.put_data({'sender': 'alarmproxy_sender', 'datas': alarm})

    def _check_ascend_alarms(self, index, info, alarm_state,
                             retired_page_double_ecc, now, vm_uuid,
                             barad_version):
        """Send the ECC and health alarms for one huawei ascend device.

        ``retired_page_double_ecc`` is None when the runtime counters could
        not be read; the ECC check is skipped in that case.
        """
        last_ecc = alarm_state['ecc_error_standard_time']
        ecc_due = last_ecc == 0 or now - last_ecc >= self._ECC_ALARM_INTERVAL
        if (ecc_due and retired_page_double_ecc is not None
                and retired_page_double_ecc >= self._DOUBLE_ECC_RETIRED_LIMIT):
            event = {"AlarmId": 1103, "EventName": "gpu_ecc_error_standard",
                     "FaultType": "Hardware",
                     "FaultDesc": "hw npu HBM db isolated >=64"}
            self._send_alarm(now, vm_uuid, barad_version, info['gpu_id'],
                             info['gpu_part_number'], info['serial'],
                             info['driver_ver'], event)
            alarm_state['ecc_error_standard_time'] = now

        last_health = alarm_state['health_status_abnormal_time']
        if last_health == 0 or now - last_health >= self._HEALTH_ALARM_INTERVAL:
            # Only query the health status when an alarm could be sent.
            if self.collector.check_health_status(index) == 1:
                event = {"AlarmId": 19000, "EventName": "npu_health_abnormal",
                         "FaultType": "Hardware",
                         "FaultDesc": "hw npu health status abnormal"}
                self._send_alarm(now, vm_uuid, barad_version, info['gpu_id'],
                                 info['gpu_part_number'], info['serial'],
                                 info['driver_ver'], event)
                alarm_state['health_status_abnormal_time'] = now

    def _report_lost_devices(self, now, vm_uuid, barad_version, driver_ver):
        """Alarm for every huawei ascend device on the PCI bus that is missing
        from the current device list.

        ``driver_ver`` is the driver version of the last device read in this
        cycle ('' when none was read).
        """
        # huawei ascend npu lost
        bdf_init = set(item.upper() for item in self.collector.get_lspci_bdf())
        bdf_now = set(item['gpu_id'].upper() for item in self.device_info)
        for bdf in bdf_init.difference(bdf_now):
            alarm_state = self.alarm_time.setdefault(bdf,
                                                     self._new_alarm_state())
            lost_time = alarm_state['lost_time']
            if lost_time != 0 and now - lost_time < self._LOST_ALARM_INTERVAL:
                continue

            # PN/SN are only known if the device was seen before it vanished.
            device = {}
            for candidate in self.tmp_device_info:
                if bdf == candidate['gpu_id'].upper():
                    device = candidate
                    break

            event = {"AlarmId": 1112, "EventName": "gpu_init_fail",
                     "FaultType": "Hardware", "FaultDesc": "init fail"}
            self._send_alarm(now, vm_uuid, barad_version, bdf,
                             device.get('gpu_part_number', ''),
                             device.get('serial', ''), driver_ver, event)
            alarm_state['lost_time'] = now
def main():
    """Drive an NpuCollector forever: collect, dump the data, sleep 60s."""
    pause_seconds = 60
    npu_collector = NpuCollector()
    while True:
        npu_collector.collect()
        npu_collector.dump_data()
        time.sleep(pause_seconds)


# Run the collection loop only when executed as a script.
if __name__ == '__main__':
    main()