HEX

File: //usr/local/qcloud/monitor/barad/plugin/collector/utils/collect_tool_gpu.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'hetiulin'

import time
import os
import sys
import re
import traceback
import logging
sys.path.append(os.getcwd() + "/../../../comm")
import constant
import subprocess
import commands
import psutil
import urllib2
from pynvml.pynvml import *
from pyzxml.pyzxml import *
from pydcmi.pydcmi import *
from cutils import console_logger, generate_config, is_metal, CommUtils
from base_process import BaseProcess


'''gpu info collect'''
class GpuCollect(BaseProcess):
    gpu_init_count = 0
    gpu_load_driver = 0
    # 0: nvidia gpu (default)  1: vqGPU
    gpu_type = 0 
    consumer_gpu = 0
    handle_list = []
    bdf_list = ""
    gpu_virt_mode = 0
    def __init__(self, logger = None ):
        BaseProcess.__init__(self, constant.PLUGIN_CONFIG_PATH, self.__class__.__name__)
        try:
            cmd = "lspci -d 10de::000300 | wc -l"
            count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3)) 
            if (count != 0):
                GpuCollect.consumer_gpu = 1

            count = 0
            # query by {vendorid: deviceid} ,put it in front,
            # query by {deviceid:} , put behind

            if (count == 0):
                #vqgpu  1ea0:2aaa
                cmd = "lspci -d 1ea0:2aaa -Dn | wc -l"
                count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3)) 
                if (count != 0):
                    cmd = "lspci -d 1ea0:2aaa -Dn | awk '{print $1}'"
                    bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                    GpuCollect.bdf_list = bdf.split("\n")[:-1]
                    GpuCollect.gpu_init_count = count
                    GpuCollect.gpu_type = 1

            # -s .0
            # CVM      :  00:08.0 3D controller: NVIDIA Corporation
            # multifunc:  00:08.1 Audio device: NVIDIA Corporation High Definition Audio Controller
            # grep -iE 'VGA|3D'
            # server card:   class code: 00302(PCI_CLASS_DISPLAY_3D)
            # customer card: class code: 00300(PCI_CLASS_DISPLAY_VGA) 
            if (count == 0):
                cmd = "lspci -d 10de: -Dnn -s .0 | grep -iE 'VGA|3D' | wc -l"
                count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3)) 
                if (count != 0):
                    cmd = "lspci -d 10de: -Dnn -s .0 | grep -iE 'VGA|3D' | awk '{print $1}'"
                    bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                    GpuCollect.bdf_list = bdf.split("\n")[:-1]
                    GpuCollect.gpu_init_count = count

            if GpuCollect.gpu_init_count > 0:
                nvmlInit()
                GpuCollect.gpu_load_driver = 1
                mode = 0xff
                if GpuCollect.gpu_type == 0:
                    handle = nvmlDeviceGetHandleByIndex(0)
                    mode = nvmlDeviceGetVirtualizationMode(handle)
                    GpuCollect.gpu_virt_mode = mode
                    if mode == NVML_GPU_VIRTUALIZATION_MODE_VGPU:
                        GpuCollect.consumer_gpu = 0
                # Bare Metal GPU or vGPU/vSGA mode
                if (mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
                    or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU
                    or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA):
                    cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | wc -l"
                    count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))                     
                    cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | awk '{print $1}'"
                    bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                    GpuCollect.bdf_list = bdf.split("\n")[:-1]
                    GpuCollect.gpu_init_count = count
                for i in range(0, GpuCollect.gpu_init_count):
                    handle = nvmlDeviceGetHandleByIndex(i)
                    GpuCollect.handle_list.append(handle)
            else:
                self.logger().info("lspci find no nvida gpu")
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

    def __del__(self):
        try:
            if GpuCollect.gpu_init_count > 0 and GpuCollect.gpu_load_driver == 1:
                nvmlShutdown()
                GpuCollect.gpu_load_driver = 0
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

    def handleError(self,err):
        if (err.value == NVML_ERROR_NOT_SUPPORTED):
            return "N/A"
        else:
            return err.__str__()

    def get_gpu_count(self):
        gpu_count = 0
        if GpuCollect.gpu_init_count > 0 :
            try :
                gpu_count = int(nvmlDeviceGetCount())
            except Exception as e:
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        return gpu_count
    
    def get_gpu_init_count(self):
        return self.gpu_init_count

    def get_lspci_bdf(self):
        return self.bdf_list 
       
    def gpu_drives_check(self):
        if GpuCollect.gpu_load_driver == 0:
            cmd1 = 'which nvidia-smi | wc -l'
            cmd2 = 'lsmod | grep nvidia | wc -l'
            try :
                rtn1 = CommUtils.ExecuteTimeoutCommand(cmd1, 3)
                rtn2 = CommUtils.ExecuteTimeoutCommand(cmd2, 3)
                if int(rtn1) > 0 and int(rtn2) > 0:
                    nvmlInit()
                    GpuCollect.gpu_load_driver = 1
                    mode = 0xff
                    if GpuCollect.gpu_type == 0:
                        handle = nvmlDeviceGetHandleByIndex(0)
                        mode = nvmlDeviceGetVirtualizationMode(handle)
                        GpuCollect.gpu_virt_mode = mode
                        if mode == NVML_GPU_VIRTUALIZATION_MODE_VGPU:
                            GpuCollect.consumer_gpu = 0
                    # Bare Metal GPU or vGPU/vSGA mode
                    if (mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
                        or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU
                        or mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA):
                        cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | wc -l"
                        count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))                     
                        cmd = "lspci -d 10de: -Dnn -s 00.0 | grep -iE 'VGA|3D' | awk '{print $1}'"
                        bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                        GpuCollect.bdf_list = bdf.split("\n")[:-1]
                        GpuCollect.gpu_init_count = count
                    for i in range(0, GpuCollect.gpu_init_count):
                        handle = nvmlDeviceGetHandleByIndex(i)
                        GpuCollect.handle_list.append(handle)
            except Exception as e:
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return GpuCollect.gpu_load_driver

    def get_device_info_inforom(self,i):
        gpu_inforom = 0
        handle = self.handle_list[i]
        try:
            gpu_inforom = nvmlDeviceValidateInforom(handle)
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return gpu_inforom

    def get_device_info_pcideviceid(self,i):
        pcideviceid = 0
        handle = self.handle_list[i]
        try:
            pciInfo = nvmlDeviceGetPciInfo(handle)
            pcideviceid = pciInfo.pciDeviceId
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return pcideviceid       
    
    def get_device_info(self,i):
        handle = self.handle_list[i]
        # vqGPU, minor number default set 0
        if GpuCollect.gpu_type == 1:
            minor_number = 0
        else:
            minor_number = nvmlDeviceGetMinorNumber(handle)
        pciInfo = nvmlDeviceGetPciInfo(handle) 
        gpu_id = pciInfo.busId
        gpu_name = nvmlDeviceGetName(handle)
        gpu_uuid = nvmlDeviceGetUUID(handle)
        driver_ver = nvmlSystemGetDriverVersion()
        gpu_virtual_mode = nvmlDeviceGetVirtualizationMode(handle)
        serial = gpu_part_number = -1
        # passthrough and baremetal gpu and vgpu host
        if (GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
            or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH
            or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU):
            try:
                serial = nvmlDeviceGetSerial(handle)
            except NVMLError :
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))
            try:
                gpu_part_number = nvmlDeviceGetBoardPartNumber(handle)
            except NVMLError :
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return (minor_number,driver_ver,gpu_id,gpu_name,serial,gpu_uuid,gpu_part_number,gpu_virtual_mode)

    # caller ensure ecc=on
    def get_device_retired_page_info(self, i):
        pages_dbe=timestamps_dbe=pages_sbe=timestamps_sbe=[]
        handle = self.handle_list[i]
        try:
            pages_dbe,timestamps_dbe = nvmlDeviceGetRetiredPages_v2(handle, NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR)
            pages_sbe,timestamps_sbe = nvmlDeviceGetRetiredPages_v2(handle, NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS)
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        return pages_dbe,timestamps_dbe,pages_sbe,timestamps_sbe

    def get_device_run_info(self,i):
        handle = self.handle_list[i]
        nvidia_smi_error = 0
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = int(util.gpu)
        memInfo = nvmlDeviceGetMemoryInfo(handle)
        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024
        mem_usage = float(mem_used) / mem_total * 100
        enc_util = nvmlDeviceGetEncoderUtilization(handle)
        dec_util = nvmlDeviceGetDecoderUtilization(handle)
        arch = self.get_gpu_device_arch(i)

        temp = powDraw = powLimit = pow_usage = -1
        # passthrough and baremetal gpu and vgpu host
        if (GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE
            or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH
            or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU):
            try:
                temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
                powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000)
                powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000)
                pow_usage = float(powDraw) / powLimit * 100
            except NVMLError :
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))
                nvidia_smi_error = 1

        perf_stat = nvmlDeviceGetPowerState(handle)
        persistence_mode = nvmlDeviceGetPersistenceMode(handle)

        ecc_current=ecc_pending=ecc_agg_single_total_err=ecc_agg_double_total_err=-1
        retired_page_single_ecc=retired_page_double_ecc=retired_page_pending=-1
        if self.consumer_gpu == 0:
            try:
                ecc_current, ecc_pending = nvmlDeviceGetEccMode(handle)
            except NVMLError :
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        if ecc_current == 1:
            try:
                ecc_agg_single_total_err = nvmlDeviceGetTotalEccErrors(handle,NVML_MEMORY_ERROR_TYPE_CORRECTED,NVML_AGGREGATE_ECC)
                ecc_agg_double_total_err = nvmlDeviceGetTotalEccErrors(handle,NVML_MEMORY_ERROR_TYPE_UNCORRECTED,NVML_AGGREGATE_ECC)
            except NVMLError :
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        try:
            if arch < NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
                for idx in range(NVML_PAGE_RETIREMENT_CAUSE_COUNT):
                    pages = nvmlDeviceGetRetiredPages(handle, idx)   
                    count = len(pages)
                    if idx == 0:
                        retired_page_single_ecc = count
                    if idx == 1:
                        retired_page_double_ecc = count
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        try:
            if arch < NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
                retired_page_pending = 0 if NVML_FEATURE_DISABLED == nvmlDeviceGetRetiredPagesPendingStatus(handle) else 1
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        smclk = memclk = -1
        try:
            smclk = nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)
            memclk = nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
            nvidia_smi_error = 1

        # only for check nvidia-smi command print info
        try:
            gpu_name = nvmlDeviceGetName(handle)
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
            nvidia_smi_error = 1

        # passthrough or baremetal consumer gpu
        if (GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE 
            or GpuCollect.gpu_virt_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH):
            if self.consumer_gpu == 1:
                try:
                    gpu_fan = nvmlDeviceGetFanSpeedv2(handle, 0)
                except NVMLError :
                    self.logger().error(traceback.format_exc().replace('\n', '\\n'))
                    nvidia_smi_error = 1

        return (gpu_util,enc_util[0],dec_util[0],smclk,memclk,mem_total,mem_used,mem_usage,temp,powDraw,powLimit,pow_usage,\
            perf_stat,persistence_mode,ecc_current, ecc_pending,\
            ecc_agg_single_total_err,ecc_agg_double_total_err,\
            retired_page_single_ecc,retired_page_double_ecc,retired_page_pending,nvidia_smi_error)

    #only for vgpu
    def get_gpu_process_info(self, i):
        handle = self.handle_list[i]
        utils=[]
        try:
            utils = nvmlDeviceGetProcessUtilization(handle)
        except NVMLError :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        return utils

    # caller ensure ecc=on
    def check_field(self, i):
        handle = self.handle_list[i]
        #define NVML_FI_DEV_ECC_DBE_AGG_L1 19
                #L1 cache double bit aggregate (persistent) ECC errors.
        #define NVML_FI_DEV_ECC_DBE_AGG_L2 21
                #L2 cache double bit aggregate (persistent) ECC errors.
        #define NVML_FI_DEV_ECC_DBE_AGG_REG 25
                #Register File double bit aggregate (persistent) ECC errors.
        fieldIds = [19, 21, 25]

        try :
            values = nvmlDeviceGetFieldValues(handle, fieldIds)
            for i, value in enumerate(values):
                if value.nvmlReturn == NVML_SUCCESS:
                    if value.valueType == NVML_VALUE_TYPE_DOUBLE:
                        if value.value.dVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_INT:
                        if value.value.uiVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG:
                        if value.value.ulVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
                        if value.value.ullVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_SIGNED_LONG_LONG:
                        if value.value.sllVal != 0:
                            return 1
                    else:
                        return 0
        except Exception as e:
            pass
        return 0

    # caller ensure ecc=on
    def check_field_row_remapping_failure(self, i):
        handle = self.handle_list[i]
        #define NVML_FI_DEV_REMAPPED_FAILURE 145
                #If any rows failed to be remapped 1=yes 0=no.
        fieldIds = [145]

        try :
            values = nvmlDeviceGetFieldValues(handle, fieldIds)
            for i, value in enumerate(values):
                if value.nvmlReturn == NVML_SUCCESS:
                    if value.valueType == NVML_VALUE_TYPE_DOUBLE:
                        if value.value.dVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_INT:
                        if value.value.uiVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG:
                        if value.value.ulVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
                        if value.value.ullVal != 0:
                            return 1
                    elif value.valueType == NVML_VALUE_TYPE_SIGNED_LONG_LONG:
                        if value.value.sllVal != 0:
                            return 1
                    else:
                        return 0
        except Exception as e:
            pass
        return 0        

    # caller ensure ecc=on
    def check_sram_ecc(self, i):
        try :
            handle = self.handle_list[i]
            #eccCounts = nvmlDeviceGetDetailedEccErrors(handle, NVML_MEMORY_ERROR_TYPE_CORRECTED, NVML_AGGREGATE_ECC)
            #if eccCounts.l1Cache !=0 or eccCounts.l2Cache != 0 or eccCounts.registerFile !=0 :
            #    return 1
            eccCounts = nvmlDeviceGetDetailedEccErrors(handle, NVML_MEMORY_ERROR_TYPE_UNCORRECTED, NVML_AGGREGATE_ECC)
            if eccCounts.l1Cache !=0 or eccCounts.l2Cache != 0 or eccCounts.registerFile !=0 :
                return 1
            uceCount = nvmlDeviceGetMemoryErrorCounter(handle, NVML_MEMORY_ERROR_TYPE_UNCORRECTED, NVML_AGGREGATE_ECC, NVML_MEMORY_LOCATION_SRAM)
            if uceCount >= 10:
                return 1
        except Exception as e:
            pass
        return 0

    # return row remapping pending status and failure occurred status
    # if not support row remapping, return 0,0
    def get_row_remapping_status(self, i):
        try :
            handle = self.handle_list[i]
            pending,failure_occurred = nvmlDeviceGetRemappedRowsStatus(handle)
            return pending,failure_occurred
        except Exception as e:
            pass
        return 0,0

    # caller ensure ecc=on
    def get_row_remapping_histogram(self, i):
        rr_low = rr_none = 0
        try :
            handle = self.handle_list[i]
            val = nvmlDeviceGetRowRemapperHistogram(handle)
            rr_low = val.low
            rr_none = val.none
        except Exception as e:
            pass
        return rr_low,rr_none

    def get_gpu_pcie_link_width(self, i):
        width_max = width_curr = 0
        try :
            handle = self.handle_list[i]
            width_curr = nvmlDeviceGetCurrPcieLinkWidth(handle)
            width_max = nvmlDeviceGetMaxPcieLinkWidth(handle)
        except Exception as e:
            pass
        return width_max,width_curr

    def get_gpu_device_arch(self, i):
        arch = NVML_DEVICE_ARCH_TURING
        try :
            handle = self.handle_list[i]
            arch = nvmlDeviceGetArchitecture(handle)
        except Exception as e:
            pass
        return arch

    def get_nvlink_err_count(self, i, linkidx, erridx):
        try:
            handle = self.handle_list[i]
            return nvmlDeviceGetNvLinkErrorCounter(handle, linkidx, erridx)
        except Exception as e:
            pass
        return 0

    def get_nvlink_state(self, i, linkidx):
        try:
            handle = self.handle_list[i]
            return nvmlDeviceGetNvLinkState(handle, linkidx)
        except Exception as e:
            pass
        return 0

    # consumer gpu not support
    def check_gpu_clock_slowdown(self, i):
        slow_down_hw = slow_down_sw = 0
        if self.consumer_gpu == 1:
            return slow_down_hw,slow_down_sw
        try:
            handle = self.handle_list[i]
            supportedClocksThrottleReasons = nvmlDeviceGetSupportedClocksThrottleReasons(handle)
            clocksThrottleReasons = nvmlDeviceGetCurrentClocksThrottleReasons(handle)

            if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonHwSlowdown != 0:
                slow_down_hw = 1
            if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonHwThermalSlowdown != 0:
                slow_down_hw = 1
            if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonHwPowerBrakeSlowdown != 0:
                slow_down_hw = 1
            if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonSwThermalSlowdown != 0:
                slow_down_sw = 1
            #if supportedClocksThrottleReasons & clocksThrottleReasons & nvmlClocksThrottleReasonGpuIdle != 0:
            #   slow_down_hw = slow_down_sw = 1
        except Exception as e:
            pass  
        return slow_down_hw,slow_down_sw

def test_gpu_collect():
    collector = GpuCollect()
    deviceCount = collector.get_gpu_count()
    print collector.get_gpu_init_count()
    for i in range(0, deviceCount):
        print collector.get_device_info(i)
        print collector.get_device_run_info(i)
        print collector.get_gpu_process_info(i)
        print collector.get_device_retired_page_info(i)
        print collector.check_sram_ecc(i)
        print collector.check_field(i)
        print collector.check_field_row_remapping_failure(i)
        print collector.get_row_remapping_status(i)
        print collector.get_row_remapping_histogram(i)
        print collector.get_gpu_pcie_link_width(i)
        print collector.get_gpu_device_arch(i)
        print collector.check_gpu_clock_slowdown(i)
        print collector.get_device_info_pcideviceid(i)
        print collector.get_device_info_inforom(i)
        for linkidx in range(12):
            print collector.get_nvlink_state(i, linkidx)
            for erridx in range(NVML_NVLINK_ERROR_COUNT):
                print linkidx, erridx, collector.get_nvlink_err_count(i, linkidx, erridx)

'''zixiao npu info collect'''
class NpuCollect(BaseProcess):
    npu_init_count = 0
    npu_load_driver = 0
    bdf_list = ""
    npu_type = 0
    zixiao_ver = 0
    # for zxiaov2 only now
    # An issue on the old driver of Zixiao v2. 
    # When calling zxml init, the driver sets a whitelist. After the barad process modifies its own process name, the zxml init interface returns an exception, 
    # and the results of all other zxml interface calls are invalid. This issue has been fixed in the new version of the driver.
    # To avoid frequent printing of old driver, init a certain number of times (10 times).
    zixiaov2_try_init_cnt = 0
    def __init__(self, logger = None ):
        BaseProcess.__init__(self, constant.PLUGIN_CONFIG_PATH, self.__class__.__name__)
        try:
            #zixiao npu
            cmd = "lspci -d 1ea0:2a16 -Dn | wc -l"
            count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3)) 
            if count > 0:
                cmd = "lspci -d 1ea0:2a16 -Dn | awk '{print $1}'"
                bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                self.bdf_list = bdf.split("\n")[:-1]
                self.npu_init_count = count
                self.zixiao_ver = 1

            if count == 0:
                # zixiao v2
                cmd = "lspci -d 1ea0:2a22 -Dn | wc -l"
                count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3))
                if count > 0:
                    cmd = "lspci -d 1ea0:2a22 -Dn | awk '{print $1}'"
                    bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                    self.bdf_list = bdf.split("\n")[:-1]
                    self.npu_init_count = count
                    self.zixiao_ver = 2

            self.logger().info("npu init count %d", count)

            if self.npu_init_count > 0:
                zxmlInit()                
                self.npu_load_driver = 1
                self.logger().info("npu init ok")
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

    def __del__(self):
        try:
            if self.npu_init_count > 0 and self.npu_load_driver == 1:
                zxmlShutdown()
                self.npu_load_driver = 0
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

    def get_npu_type(self):
        return self.npu_type

    def get_npu_count(self):
        count = 0
        if self.npu_init_count > 0 :
            try :
                count = zxmlGetDevCount()
            except Exception:
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        return count
    
    def get_npu_init_count(self):
        return self.npu_init_count

    def get_lspci_bdf(self):
        return self.bdf_list 
       
    def npu_drives_check(self):
        if self.npu_load_driver == 0:
            if self.zixiao_ver == 2:
                if self.zixiaov2_try_init_cnt >= 10:
                    return 0

            cmd1 = "which zixiao-smi | wc -l"
            cmd11 = "which zx-smi | wc -l"
            cmd2 = "lsmod | grep zixiao | wc -l"
            try :
                rtn1 = CommUtils.ExecuteTimeoutCommand(cmd1, 3)
                rtn11 = CommUtils.ExecuteTimeoutCommand(cmd11, 3)
                rtn2 = CommUtils.ExecuteTimeoutCommand(cmd2, 3)
                if (int(rtn1) > 0 and int(rtn2) > 0) or (int(rtn11) > 0 and int(rtn2) > 0):
                    if self.zixiao_ver == 2:
                        self.zixiaov2_try_init_cnt += 1
                    zxmlInit()
                    self.npu_load_driver = 1
                    self.logger().info("npu check init ok")
            except Exception:
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return self.npu_load_driver
    
    def get_device_info(self,i):
        minor_number = -1
        serial = part_number = did = name = uuid = driver_ver = ''
        try:
            minor_number = i
            uuid = zxmlGetDevUuid(i)
            driver_ver = zxmlDriverVer()
            serial = zxmlGetDevSn(i).strip()
            part_number = zxmlGetDevPn(i).strip()
            dev_info = zxmlGetDevInfo(i)
            did = "%04x:%02x:%02x.%x" % (dev_info.domain_id, dev_info.bus_id, dev_info.dev_id, dev_info.func_id)
            did = did.upper()
            name = dev_info.name
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return (minor_number,driver_ver,did,name,serial,uuid,part_number)

    def get_device_run_info(self,i):
        gpu_util = enc_util = dec_util = smclk = memclk = mem_total = mem_used = mem_usage = -1
        temp = powDraw = powLimit = pow_usage = perf_stat = persistence_mode = -1 
        ecc_current = ecc_pending = -1
        ecc_agg_single_total_err = ecc_agg_double_total_err = -1
        retired_page_single_ecc = retired_page_double_ecc = retired_page_pending = -1

        try:
            #gpu_util = zxmlGetDevDtuUsage(i)
            gpu_util = zxmlGetDevDtuUsageAsync(i)
            gpu_util = zxmlGetDevDtuUsageAsync(i)
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        try:
            memInfo = zxmlGetDevMem(i)
            # mem_total: in bytes
            # mem_used:  in Mbytes
            mem_total = memInfo.mem_total_size  / 1024 / 1024
            mem_used = memInfo.mem_used
            mem_usage = float(mem_used) / mem_total * 100
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        try:
            if self.zixiao_ver == 1:
                temp_list = zxmlGetDevTemp(i)
                temp = temp_list.cur_dev_temp
            elif self.zixiao_ver == 2:
                temp_list = zxmlGetDevTempV2(i)
                temp = temp_list.cur_asic_temp
            else:
                temp = -1
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        try:
            pow_list = zxmlGetDevPwr(i)
            powDraw = pow_list.cur_pwr_consumption
            powLimit = pow_list.pwr_capability
            pow_usage = float(powDraw) / powLimit * 100
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        try:
            if self.zixiao_ver == 1:
                perf_stat = zxmlGetDevDpmLevel(i)
            elif self.zixiao_ver == 2:
                #not support
                perf_stat = -1
            else:
                perf_stat = -1
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        try:
            ecc_current = zxmlGetEccStatus(i)
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        try:
            if self.zixiao_ver == 1:
                clk_info = zxmlGetDevClk(i)
                smclk = clk_info.cur_dtu_clock
                memclk = clk_info.cur_hbm_clock
            elif self.zixiao_ver == 2:
                #not support
                smclk = -1
                memclk = -1
            else:
                smclk = -1
                memclk = -1
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return (gpu_util,enc_util,dec_util, \
            smclk,memclk,mem_total,mem_used,mem_usage, \
            temp,powDraw,powLimit,pow_usage,\
            perf_stat,persistence_mode,ecc_current, ecc_pending,\
            ecc_agg_single_total_err,ecc_agg_double_total_err,\
            retired_page_single_ecc,retired_page_double_ecc,retired_page_pending)

def test_npu_collect():
    collector = NpuCollect()
    deviceCount = collector.get_npu_count()
    print collector.get_npu_init_count()
    for i in range(0, deviceCount):
        print collector.get_device_info(i)
        print collector.get_device_run_info(i)

'''huawei npu info collect'''
class HwNpuCollect(BaseProcess):
    npu_init_count = 0
    npu_load_driver = 0
    bdf_list = ""
    npu_type = 1
    card_num = 0
    card_id_list = []
    def __init__(self, logger = None ):
        BaseProcess.__init__(self, constant.PLUGIN_CONFIG_PATH, self.__class__.__name__)
        try:
            #huawei 910B npu
            cmd = "lspci -d 19e5:d802 -Dn | wc -l"
            count = int(CommUtils.ExecuteTimeoutCommand(cmd, 3)) 
            if count > 0:
                cmd = "lspci -d 19e5:d802 -Dn | awk '{print $1}'"
                bdf = CommUtils.ExecuteTimeoutCommand(cmd, 3)
                self.bdf_list = bdf.split("\n")[:-1]
                self.npu_init_count = count

            self.logger().info("hw npu init count %d", count)

            if self.npu_init_count > 0:
                dcmiInit()
                self.npu_load_driver = 1
                self.logger().info("hw npu init ok")
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

    def __del__(self):
        try:
            if self.npu_init_count > 0 and self.npu_load_driver == 1:
                dcmiShutdown()
                self.npu_load_driver = 0
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

    def get_npu_type(self):
        return self.npu_type

    def get_npu_count(self):
        count = 0
        if self.npu_init_count > 0 :
            try :
                self.card_num, self.card_id_list = dcmiGetCardList()
                count = self.card_num
            except Exception:
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        return count
    
    def get_npu_init_count(self):
        return self.npu_init_count

    def get_lspci_bdf(self):
        return self.bdf_list 
       
    def npu_drives_check(self):
        if self.npu_load_driver == 0:
            cmd1 = "which npu-smi | wc -l"
            #cmd2 = "lsmod | grep xxx | wc -l"
            try :
                rtn1 = CommUtils.ExecuteTimeoutCommand(cmd1, 3)
                #rtn2 = CommUtils.ExecuteTimeoutCommand(cmd2, 3)
                rtn2 = 1
                if int(rtn1) > 0 and int(rtn2) > 0:
                    dcmiInit()
                    self.npu_load_driver = 1
                    self.logger().info("hw npu check init ok")
            except Exception:
                self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return self.npu_load_driver
    
    def get_cardid_by_index(self, i):
        #self.logger().info("hw npu index:%d card id:%d", i, self.card_id_list[i])
        return self.card_id_list[i]

    def get_device_info(self,i):
        minor_number = -1
        serial = part_number = did = name = uuid = driver_ver = ''
        try:
            card_id = self.get_cardid_by_index(i)
            device_id = 0
            minor_number = i
            driver_ver = dcmiGetDriverVersion()
            b,d,f = dcmiGetDevicePcieInfo(card_id, device_id)
            did = "0000:%02x:%02x.%x" % (b,d,f)
            did = did.upper()
            name = dcmiGetDeviceChipInfoName(card_id, device_id)
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return (minor_number,driver_ver,did,name,serial,uuid,part_number)

    def get_device_run_info(self,i):
        gpu_util = enc_util = dec_util = smclk = memclk = mem_total = mem_used = mem_usage = -1
        temp = powDraw = powLimit = pow_usage = perf_stat = persistence_mode = -1 
        ecc_current = ecc_pending = -1
        ecc_agg_single_total_err = ecc_agg_double_total_err = -1
        retired_page_single_ecc = retired_page_double_ecc = retired_page_pending = -1

        try:
            card_id = self.get_cardid_by_index(i)
            device_id = 0
            gpu_util = dcmiGetDeviceUtilRateAICore(card_id, device_id)
            smclk = dcmiGetDeviceFrequency(card_id, device_id, DCMI_FREQ_TYPE_CPU)

            mem_total, memclk, mem_used = dcmiGetDeviceHbmInfo(card_id, device_id)
            mem_usage = float(mem_used) * 100 / mem_total

            temp = dcmiGetDeviceTemp(card_id, device_id)
            powDraw = dcmiGetDevicePowerInfo(card_id, device_id)
            powDraw = float(powDraw) / 10
            powLimit = 400.0
            pow_usage = float(powDraw) / powLimit * 100

            ecc_current, ecc_agg_single_total_err, ecc_agg_double_total_err, retired_page_single_ecc, retired_page_double_ecc = dcmiGetDeviceEccInfo(card_id, device_id ,DCMI_DEVICE_TYPE_HBM)
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))

        return (gpu_util,enc_util,dec_util, \
            smclk,memclk,mem_total,mem_used,mem_usage, \
            temp,powDraw,powLimit,pow_usage,\
            perf_stat,persistence_mode,ecc_current, ecc_pending,\
            ecc_agg_single_total_err,ecc_agg_double_total_err,\
            retired_page_single_ecc,retired_page_double_ecc,retired_page_pending)

    def check_health_status(self,i):
        health = 0
        try:
            card_id = self.get_cardid_by_index(i)
            device_id = 0
            health = dcmiGetDeviceHealthStatus(card_id, device_id)
        except Exception :
            self.logger().error(traceback.format_exc().replace('\n', '\\n'))
        # 0: ok  1: warning  2: Alarm  3: Critical
        if health == 2 or health == 3:
            return 1
        return 0


def test_hwnpu_collect():
    collector = HwNpuCollect()
    deviceCount = collector.get_npu_count()
    print collector.get_npu_init_count()
    for i in range(0, deviceCount):
        print collector.get_device_info(i)
        print collector.get_device_run_info(i)
        print collector.check_health_status(i)

if __name__ == '__main__':
    test_gpu_collect()