# HEX
#
# File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/npu.py
import sys
import os,time
sys.path.append(os.getcwd() + '/../../../comm/')
import constant
from plugin_base import VmBaseCollector
from utils.collect_tool_gpu import NpuCollect
from utils.collect_tool_gpu import HwNpuCollect
from utils.metric_handler import MetricHandler
from pyzxml.pyzxml import *
from operator import itemgetter, attrgetter

class NpuCollector(VmBaseCollector):
    """Report per-device NPU metrics and NPU hardware alarms.

    ``init`` probes ``NpuCollect`` first and ``HwNpuCollect`` second and keeps
    the first one that reports a non-zero initial device count.  ``do_collect``
    then pushes one metric batch per device through ``self.handler`` and, for
    the Huawei Ascend backend, emits ECC / health / lost-device alarms through
    the ``alarmproxy_sender`` channel.

    Instance state:
        device_info      list of cached static device descriptions (dicts).
        tmp_device_info  the cache as it was at the start of the current
                         cycle; used to recover PN/SN of a lost device.
        alarm_time       maps a device id to the time each alarm kind was
                         last sent, so alarms are rate limited.
    """

    # Values of collector.get_npu_type() handled by this plugin.
    NPU_TYPE_ZIXIAO = 0  # tencent zixiao
    NPU_TYPE_ASCEND = 1  # huawei ascend

    # Minimum number of seconds between two alarms of the same kind for the
    # same device.
    ECC_ALARM_INTERVAL = 60 * 60 * 24
    HEALTH_ALARM_INTERVAL = 60 * 60
    LOST_ALARM_INTERVAL = 60 * 60

    # retired_page_double_ecc value at which the ECC alarm is raised.
    ECC_DOUBLE_RETIRED_THRESHOLD = 64

    def init(self):
        """Set up the metric handler and pick the collector backend."""
        self.set_frequency(10)
        self.handler = MetricHandler()
        self.handler.namespace = 'qce/cvm'
        self.handler.dimensions = [ 'vm_uuid', 'vmip']
        self.device_info = []
        self.tmp_device_info = []
        self.alarm_time = {}

        self.zxcollector = NpuCollect()
        zx_count = self.zxcollector.get_npu_init_count()
        if zx_count != 0:
            self.init_count = zx_count
            self.collector = self.zxcollector
            return

        self.hwcollector = HwNpuCollect()
        hw_count = self.hwcollector.get_npu_init_count()
        if hw_count != 0:
            self.init_count = hw_count
            self.collector = self.hwcollector
            return

        # Neither backend found a device: do_collect() becomes a no-op.
        self.init_count = 0

    def do_collect(self):
        """Collect metrics for every NPU and send any due alarms.

        Returns early (sending nothing) when the machine has no NPU, when the
        driver check returns 0, or when the NPU type is not one this plugin
        knows.  A device whose static info or runtime info cannot be read is
        skipped for this cycle instead of being reported with another
        device's values.
        """
        # not npu cvm
        if self.init_count == 0:
            return

        # 0 : driver is not installed
        if self.collector.npu_drives_check() == 0:
            return

        npu_type = self.collector.get_npu_type()
        deviceCount = self.collector.get_npu_count()

        if npu_type == self.NPU_TYPE_ZIXIAO:
            maxnpu_index = deviceCount
        elif npu_type == self.NPU_TYPE_ASCEND:
            # Keep the previous cache so a device that just disappeared can
            # still be described in the lost-device alarm.
            self.tmp_device_info = self.device_info
            if len(self.device_info) != deviceCount:
                self.device_info = []
            maxnpu_index = self.init_count
        else:
            # Unknown NPU type: there is no valid device range to iterate.
            return

        now = int(time.time())
        alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(now))
        vm_uuid = self.get_vm_uuid()
        vmip = self.get_vmip()
        barad_version = self.get_barad_version()
        # Driver version of the last device read successfully; reused by the
        # lost-device alarm.  Stays empty when no device could be read.
        driver_ver = ''

        for i in range(0, maxnpu_index):
            info = self._npu_device_info(i)
            if info is None:
                continue
            driver_ver = info['driver_ver']

            try:
                gpu_util,enc_util,dec_util,smclk,memclk,mem_total,mem_used,mem_usage,temp,powDraw,powLimit,pow_usage,\
                perf_stat,persistence_mode,ecc_current, ecc_pending, ecc_agg_single_total_err,ecc_agg_double_total_err,\
                retired_page_single_ecc,retired_page_double_ecc,retired_page_pending = self.collector.get_device_run_info(i)
            except Exception:
                # Runtime counters unavailable for this device right now:
                # skip it rather than re-send the previous device's values.
                continue

            # Per-process utilisation is always reported as 0 here.
            p_smutil = p_memutil = p_encutil = p_decutil = 0

            dimensions = {'vm_uuid': vm_uuid, 'vmip': vmip, 'minor_number': info['minor_number'],
                          'gpu_name' : info['gpu_name'], 'gpu_id' : info['gpu_id'],
                          'gpu_uuid' : info['gpu_uuid'], 'driver_ver': info['driver_ver'],
                          'serial': info['serial'], 'gpu_part_number': info['gpu_part_number']}
            batch_metric = [
                {'name':'gpu_util',     'value':gpu_util},
                {'name':'gpu_enc_util', 'value':enc_util},
                {'name':'gpu_dec_util', 'value':dec_util},
                {'name':'gpu_smclk',    'value':smclk},
                {'name':'gpu_memclk',   'value':memclk},
                {'name':'gpu_p_smutil', 'value':p_smutil},
                {'name':'gpu_p_memutil', 'value':p_memutil},
                {'name':'gpu_p_encutil', 'value':p_encutil},
                {'name':'gpu_p_decutil', 'value':p_decutil},
                {'name':'gpu_mem_usage', 'value': mem_usage},
                {'name':'gpu_mem_total', 'value':mem_total},
                {'name':'gpu_mem_used', 'value':mem_used},
                {'name':'gpu_pow_usage', 'value': pow_usage},
                {'name':'gpu_pow_limit', 'value':powLimit},
                {'name':'gpu_pow_draw', 'value':powDraw},
                {'name':'gpu_temp', 'value':temp},
                {'name':'gpu_perf_stat', 'value':perf_stat},
                {'name':'gpu_persistence_mode', 'value':persistence_mode},
                {'name':'gpu_ecc_current', 'value':ecc_current},
                {'name':'gpu_ecc_pending', 'value':ecc_pending},
                {'name':'gpu_ecc_agg_single_total_err', 'value':ecc_agg_single_total_err},
                {'name':'gpu_ecc_agg_double_total_err', 'value':ecc_agg_double_total_err},
                {'name':'gpu_retired_page_single_ecc', 'value':retired_page_single_ecc},
                {'name':'gpu_retired_page_double_ecc', 'value':retired_page_double_ecc},
                {'name':'gpu_retired_page_pending', 'value':retired_page_pending},
                {'name':'gpu_inforom','value':0},
                {'name':'gpu_ecc_terminate_app','value':0},
                {'name':'gpu_xid_terminate_app','value':0}
            ]
            self.handler.add_batch_metric(batch = batch_metric, dimensions = dimensions)

            # tencent zixiao has no per-device alarm so far.
            if npu_type == self.NPU_TYPE_ASCEND:
                alarm_base = self._npu_alarm_base(
                    alarmtime, vm_uuid, barad_version, info['gpu_id'],
                    info['gpu_part_number'], info['serial'], info['driver_ver'])
                self._npu_check_device_alarms(
                    i, info['gpu_id'], retired_page_double_ecc, now, alarm_base)

        # send monitor data
        if len(self.handler.get_metrics()) > 0:
            self.put_data({'sender':'nws_sender', 'datas': self.handler.pop_metrics()})

        # tencent zixiao has no lost-device alarm so far.
        if npu_type == self.NPU_TYPE_ASCEND:
            self._npu_report_lost(now, alarmtime, vm_uuid, barad_version, driver_ver)

    @staticmethod
    def _npu_new_alarm_state():
        """Return a fresh "never alarmed" record for one device."""
        return {'ecc_error_standard_time':0,
                'health_status_abnormal_time':0,
                'lost_time':0}

    def _npu_device_info(self, index):
        """Return the static description of device ``index`` or None.

        The description is served from ``self.device_info`` when the device
        was already queried; otherwise it is read from the collector, cached
        and its alarm timestamps are initialised.  Entries are matched on the
        device index stored in them, so a device that fails to answer never
        shifts the cache entries of the devices after it.  Returns None when
        the collector cannot describe the device.
        """
        for cached in self.device_info:
            if cached.get('index') == index:
                return cached

        try:
            minor_number,driver_ver,gpu_id,gpu_name,serial,gpu_uuid,gpu_part_number = \
                self.collector.get_device_info(index)
        except Exception:
            # Device not readable right now; the caller skips it this cycle.
            return None

        info = {
            'index': index,
            'minor_number': minor_number,
            'driver_ver': driver_ver,
            'gpu_id': gpu_id,
            'gpu_name': gpu_name,
            'serial': serial,
            'gpu_uuid': gpu_uuid,
            'gpu_part_number': gpu_part_number,
        }
        self.device_info.append(info)
        self.alarm_time[gpu_id] = self._npu_new_alarm_state()
        return info

    def _npu_alarm_base(self, alarmtime, vm_uuid, barad_version, slot, part_number, serial, driver_ver):
        """Build the fields shared by every alarm sent to alarmproxy."""
        # NOTE(review): CallerKey is a credential hard-coded in source;
        # consider loading it from configuration.
        return {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime,
                "Dimensions":[{"Key" : "Uuid", "Value" :vm_uuid}],
                "DeviceName":"GPU", "DeviceId": "", 'Slot':slot, 'PN':part_number, 'SN': serial,
                "DriverVersion": driver_ver, "BaradVersion": barad_version }

    def _npu_send_alarm(self, alarm_base, alarm_event):
        """Queue one alarm: the shared fields merged with the event fields."""
        self.put_data({'sender':'alarmproxy_sender', 'datas': dict(alarm_base, **alarm_event)})

    def _npu_check_device_alarms(self, index, gpu_id, retired_page_double_ecc, now, alarm_base):
        """Send the huawei ascend ECC and health alarms that are due."""
        state = self.alarm_time.setdefault(gpu_id, self._npu_new_alarm_state())
        last_ecc = state['ecc_error_standard_time']
        last_health = state['health_status_abnormal_time']

        ecc_due = last_ecc == 0 or now - last_ecc >= self.ECC_ALARM_INTERVAL
        if ecc_due and retired_page_double_ecc >= self.ECC_DOUBLE_RETIRED_THRESHOLD:
            self._npu_send_alarm(alarm_base, {
                "AlarmId":1103, "EventName":"gpu_ecc_error_standard",
                "FaultType": "Hardware", "FaultDesc":"hw npu HBM db isolated >=64" })
            state['ecc_error_standard_time'] = now

        # The health probe is only run when an alarm would be allowed.
        if last_health == 0 or now - last_health >= self.HEALTH_ALARM_INTERVAL:
            if self.collector.check_health_status(index) == 1:
                self._npu_send_alarm(alarm_base, {
                    "AlarmId":19000, "EventName":"npu_health_abnormal",
                    "FaultType": "Hardware", "FaultDesc":"hw npu health status abnormal" })
                state['health_status_abnormal_time'] = now

    def _npu_report_lost(self, now, alarmtime, vm_uuid, barad_version, driver_ver):
        """Alarm on huawei ascend devices listed by lspci but not readable.

        A device counts as lost when its BDF is returned by
        ``collector.get_lspci_bdf()`` but no cached device has that
        ``gpu_id`` (compared case-insensitively).
        """
        bdf_init = [item.upper() for item in self.collector.get_lspci_bdf()]
        bdf_now = [item['gpu_id'].upper() for item in self.device_info]
        bdf_diff = list(set(bdf_init).difference(set(bdf_now)))

        for bdf in bdf_diff:
            state = self.alarm_time.setdefault(bdf, self._npu_new_alarm_state())
            lost_time = state['lost_time']
            if lost_time != 0 and now - lost_time < self.LOST_ALARM_INTERVAL:
                continue

            # PN/SN come from the cache as it was before this cycle, if the
            # device was known then; otherwise they are sent empty.
            device = {}
            for cached in self.tmp_device_info:
                if bdf == cached['gpu_id'].upper():
                    device = cached
                    break

            alarm_base = self._npu_alarm_base(
                alarmtime, vm_uuid, barad_version, bdf,
                device.get('gpu_part_number', ''), device.get('serial', ''), driver_ver)
            self._npu_send_alarm(alarm_base, {
                "AlarmId":1112, "EventName":"gpu_init_fail",
                "FaultType": "Hardware", "FaultDesc":"init fail" })
            state['lost_time'] = now

def main():
    """Run the NPU collector forever: collect, dump, then sleep 60 seconds."""
    pause_seconds = 60
    npu_collector = NpuCollector()
    while True:
        npu_collector.collect()
        npu_collector.dump_data()
        time.sleep(pause_seconds)

# Script entry point: start the collector loop only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()