File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/gpu.py
import sys
import os
import time
sys.path.append(os.getcwd() + '/../../../comm/')
import constant
import globalvar
from plugin_base import VmBaseCollector
from utils.collect_tool_gpu import GpuCollect
from utils.metric_handler import MetricHandler
from pynvml.pynvml import *
from operator import itemgetter, attrgetter
class GpuCollector(VmBaseCollector):
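# Collects per-GPU runtime metrics through NVML/nvidia-smi and raises alarmproxy hardware events
# (ECC, infoROM, NVLink, PCIe link width, clock slowdown, lost GPU) for GPU CVM instances.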
def init(self):
self.set_frequency(10)
self.collector = GpuCollect()
self.handler = MetricHandler()
self.handler.namespace = 'qce/cvm'
self.handler.dimensions = [ 'vm_uuid', 'vmip']
self.device_info = []
self.tmp_device_info = []
self.alarm_time = {}
self.gpu_init_count = self.collector.get_gpu_init_count()
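# 0xff below means "not probed yet": arch, PCI device id and the NVLink topology are resolved on the
# first collect pass, and inforomcheck limits the infoROM query to that first pass.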
self.checknvlink = 0xff
self.nvlinknumber_pergpu = 0
self.pcideviceid = 0xff
self.arch = 0xff
self.inforomcheck = 0
def do_collect(self):
# Not a GPU CVM; nothing to collect
if self.gpu_init_count == 0:
return
# 0 : GPU driver is not installed
drives = self.collector.gpu_drives_check()
if drives == 0:
return
log_collect = 0
deviceCount = self.collector.get_gpu_count()
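# Keep the previous device snapshot so a lost GPU's SN/PN can still be filled into the lost-GPU alarm below.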
self.tmp_device_info = self.device_info
if len(self.device_info) != deviceCount:
self.device_info = []
now = int(time.time())
vm_uuid = self.get_vm_uuid()
vmip = self.get_vmip()
barad_version = self.get_barad_version()
for i in range(0, deviceCount):
try:
if(len(self.device_info) < deviceCount):
minor_number,driver_ver,gpu_id,gpu_name,serial,gpu_uuid,gpu_part_number,gpu_virtual_mode = self.collector.get_device_info(i)
info = {}
info['minor_number'] = minor_number
info['driver_ver'] = driver_ver
info['gpu_id'] = gpu_id
info['gpu_name'] = gpu_name
info['serial'] = serial
info['gpu_uuid'] = gpu_uuid
info['gpu_part_number'] = gpu_part_number
info['gpu_virtual_mode'] = gpu_virtual_mode
self.device_info.append(info)
self.alarm_time[gpu_id] = {'ecc_page_pending_time':0,
'inforom_failed_time':0,
'ecc_error_standard_time':0,
'ecc_error_tencent_time':0,
'ecc_sram_error_time':0,
'ecc_row_remapping_failure_occurred_time':0,
'ecc_row_remapping_bank_low_time':0,
'gpu_pcie_link_width_time':0,
'nvlink_check_time':0,
'nvlink_state_check_time':0,
'gpu_clock_slowdown_time':0,
'gpu_clock_slowdown_sw_time':0,
'gpu_low_power_sw_thermal_slowdown_active_last_count':0,
'lost_time':0}
tempInfo = {}
tempInfo['bdf'] = gpu_id
tempInfo['sn'] = serial
tempInfo['pn'] = gpu_part_number
if tempInfo not in globalvar.GPUList['gpu']:
globalvar.GPUList['gpu'].append(tempInfo)
else:
minor_number = self.device_info[i]['minor_number']
driver_ver = self.device_info[i]['driver_ver']
gpu_id = self.device_info[i]['gpu_id']
gpu_name = self.device_info[i]['gpu_name']
serial = self.device_info[i]['serial']
gpu_uuid = self.device_info[i]['gpu_uuid']
gpu_part_number = self.device_info[i]['gpu_part_number']
gpu_virtual_mode = self.device_info[i]['gpu_virtual_mode']
except Exception as err:
# Skip this GPU if its static info cannot be read; the fields below would otherwise be undefined
continue
try:
gpu_util,enc_util,dec_util,smclk,memclk,mem_total,mem_used,mem_usage,temp,powDraw,powLimit,pow_usage,\
perf_stat,persistence_mode,ecc_current, ecc_pending, ecc_agg_single_total_err,ecc_agg_double_total_err,\
retired_page_single_ecc,retired_page_double_ecc,retired_page_pending,nvidia_smi_error = self.collector.get_device_run_info(i)
except NVMLError as err:
# Skip this GPU if its runtime metrics cannot be read; the metric variables would otherwise be undefined
continue
p_smutil=p_memutil=p_encutil=p_decutil=0
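# For vGPU-mode devices, additionally aggregate per-process SM/memory/encoder/decoder utilization.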
if gpu_virtual_mode == NVML_GPU_VIRTUALIZATION_MODE_VGPU:
p_smutil,p_memutil,p_encutil,p_decutil = self.get_gpu_process_utils(i)
gpu_inforom = 0
if self.inforomcheck == 0:
gpu_inforom = self.collector.get_device_info_inforom(i)
# A gpu_inforom value of 2 is reported as 0, but still recorded in the log
if gpu_inforom == 2:
gpu_inforom = 0
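# Probe the architecture and PCI device id once, then derive the per-GPU NVLink link count from the device id.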
if self.arch == 0xff:
self.arch = self.collector.get_gpu_device_arch(i)
self.checknvlink = 0
self.pcideviceid = self.collector.get_device_info_pcideviceid(i)
#define PCI_DEVICE_ID_NVIDIA_A100_SXM4_40G 0x20b0
#define PCI_DEVICE_ID_NVIDIA_A100_SXM4_80G 0x20b2
#define PCI_DEVICE_ID_NVIDIA_A800_SXM4_80G 0x20f3
#define PCI_DEVICE_ID_NVIDIA_H800_80G 0x2324
#define PCI_DEVICE_ID_NVIDIA_H20 0x2329
if self.pcideviceid == 0x20B010DE or self.pcideviceid == 0x20B210DE:
self.nvlinknumber_pergpu = 12
self.checknvlink = 1
if self.pcideviceid == 0x20F310DE or self.pcideviceid == 0x232410DE:
self.nvlinknumber_pergpu = 8
self.checknvlink = 1
if self.pcideviceid == 0x232910DE:
self.nvlinknumber_pergpu = 18
self.checknvlink = 1
# Only check NVLink when the VM has more than one GPU; a single-GPU VM (e.g. GT4.4XLARGE96) needs no check
if deviceCount == 1:
self.checknvlink = 0
gpu_ecc_terminate_app = 0
gpu_xid_terminate_app = 0
row_remapping_pending = 0
row_remapping_failure_occurred = 0
# retired_page_pending == -1 indicates page retirement is not supported (Ampere and newer)
if retired_page_pending == -1 and ecc_current == 1:
row_remapping_pending,row_remapping_failure_occurred = self.collector.get_row_remapping_status(i)
# On Ampere and newer architectures, reuse retired_page_pending in barad
if self.check_xid_error_need_reset(gpu_id):
retired_page_pending = 1
if retired_page_pending == 1 or self.check_xid_error_no_need_reset_but_terminate_app(gpu_id):
gpu_ecc_terminate_app = 1
if self.check_xid_error_terminate_app(gpu_id):
gpu_xid_terminate_app = 1
if globalvar.GPUDriverVersion == "" or globalvar.GPUDriverVersion != driver_ver:
globalvar.GPUDriverVersion = driver_ver
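# XID 61/62 occurrences are exported as the separate gpu_xid_61_62 metric.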
xid_61_62 = 0
if self.check_xid_error_61_62(gpu_id):
xid_61_62 = 1
dimensions = {'vm_uuid': vm_uuid, 'vmip': vmip, 'minor_number': minor_number, 'gpu_name' : gpu_name, 'gpu_id' : gpu_id, \
'gpu_uuid' : gpu_uuid, 'driver_ver': driver_ver, 'serial': serial, 'gpu_part_number': gpu_part_number}
batch_metric = [
{'name':'gpu_util', 'value':gpu_util},
{'name':'gpu_enc_util', 'value':enc_util},
{'name':'gpu_dec_util', 'value':dec_util},
{'name':'gpu_smclk', 'value':smclk},
{'name':'gpu_memclk', 'value':memclk},
{'name':'gpu_p_smutil', 'value':p_smutil},
{'name':'gpu_p_memutil', 'value':p_memutil},
{'name':'gpu_p_encutil', 'value':p_encutil},
{'name':'gpu_p_decutil', 'value':p_decutil},
{'name':'gpu_mem_usage', 'value': mem_usage},
{'name':'gpu_mem_total', 'value':mem_total},
{'name':'gpu_mem_used', 'value':mem_used},
{'name':'gpu_pow_usage', 'value': pow_usage},
{'name':'gpu_pow_limit', 'value':powLimit},
{'name':'gpu_pow_draw', 'value':powDraw},
{'name':'gpu_temp', 'value':temp},
{'name':'gpu_perf_stat', 'value':perf_stat},
{'name':'gpu_persistence_mode', 'value':persistence_mode},
{'name':'gpu_ecc_current', 'value':ecc_current},
{'name':'gpu_ecc_pending', 'value':ecc_pending},
{'name':'gpu_ecc_agg_single_total_err', 'value':ecc_agg_single_total_err},
{'name':'gpu_ecc_agg_double_total_err', 'value':ecc_agg_double_total_err},
{'name':'gpu_retired_page_single_ecc', 'value':retired_page_single_ecc},
{'name':'gpu_retired_page_double_ecc', 'value':retired_page_double_ecc},
{'name':'gpu_retired_page_pending', 'value':retired_page_pending},
{'name':'gpu_inforom','value':gpu_inforom},
{'name':'gpu_ecc_terminate_app','value':gpu_ecc_terminate_app},
{'name':'gpu_xid_terminate_app','value':gpu_xid_terminate_app},
{'name':'gpu_nvidia_smi_error','value':nvidia_smi_error},
{'name':'gpu_xid_61_62','value':xid_61_62},
]
self.handler.add_batch_metric(batch = batch_metric, dimensions = dimensions)
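# Common alarmproxy payload shared by all hardware alarms raised for this GPU below.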
alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(now))
alarm_dimensions = [{"Key" : "Uuid", "Value" :vm_uuid}]
alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions,
"DeviceName":"GPU", "DeviceId": "", 'Slot':gpu_id, 'PN':gpu_part_number, 'SN': serial, "DriverVersion": driver_ver,
"BaradVersion": barad_version }
inforom_failed_time = self.alarm_time[gpu_id]['inforom_failed_time']
# Corresponds to errors like "WARNING: infoROM is corrupted at gpu 0000:00:08.0"
if now - inforom_failed_time >= 60*60*24 or inforom_failed_time == 0:
if gpu_inforom == 1 :
alarmproxy_event = {"AlarmId":1107, "EventName":"gpu_inforom_failed", "FaultType": "Hardware", "FaultDesc":"gpu_inforom_failed" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['inforom_failed_time'] = now
log_collect = 1
# passthrough GPU VM or bare metal GPU machine
if gpu_virtual_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE or gpu_virtual_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH:
ecc_page_pending_time = self.alarm_time[gpu_id]['ecc_page_pending_time']
ecc_error_standard_time = self.alarm_time[gpu_id]['ecc_error_standard_time']
ecc_error_tencent_time = self.alarm_time[gpu_id]['ecc_error_tencent_time']
ecc_sram_error_time = self.alarm_time[gpu_id]['ecc_sram_error_time']
ecc_row_remapping_failure_occurred_time = self.alarm_time[gpu_id]['ecc_row_remapping_failure_occurred_time']
ecc_row_remapping_bank_low_time = self.alarm_time[gpu_id]['ecc_row_remapping_bank_low_time']
gpu_pcie_link_width_time = self.alarm_time[gpu_id]['gpu_pcie_link_width_time']
nvlink_check_time = self.alarm_time[gpu_id]['nvlink_check_time']
nvlink_state_check_time = self.alarm_time[gpu_id]['nvlink_state_check_time']
gpu_clock_slowdown_time = self.alarm_time[gpu_id]['gpu_clock_slowdown_time']
gpu_clock_slowdown_sw_time = self.alarm_time[gpu_id]['gpu_clock_slowdown_sw_time']
gpu_low_power_sw_thermal_slowdown_active_last_count = self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count']
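# Pre-Ampere GPUs with ECC enabled: evaluate retired-page DBE/SBE counts against the RMA thresholds below.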
if self.arch < NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
send_alarm = 0
if ecc_error_tencent_time == 0:
pages_dbe,timestamps_dbe,pages_sbe,timestamps_sbe = self.collector.get_device_retired_page_info(i)
dbe_count = len(pages_dbe)
sbe_count = len(pages_sbe)
# NVIDIA RMA criteria: DBE >= 5 || SBE + DBE >= 60
if dbe_count >= 5 or dbe_count + sbe_count >=60:
send_alarm = 1
else:
if now - ecc_error_tencent_time >= 60*60*24:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1102, "EventName":"gpu_ecc_error_tencent", "FaultType": "Hardware", "FaultDesc":"gpu ecc error(DBE >= 5 || SBE + DBE >=60)" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_error_tencent_time'] = now
send_alarm = 0
if ecc_error_standard_time == 0:
pages_dbe,timestamps_dbe,pages_sbe,timestamps_sbe = self.collector.get_device_retired_page_info(i)
dbe_count = len(pages_dbe)
sbe_count = len(pages_sbe)
seconds = 0
timestamps_dbe_sorted = sorted(timestamps_dbe, reverse=True)
if len(timestamps_dbe_sorted) >= 5:
seconds = timestamps_dbe_sorted[0] - timestamps_dbe_sorted[4]
# NVIDIA RMA criteria: (DBE >= 5 within 30 days) || DBE >= 10 || SBE + DBE >= 60
if ((seconds <= 30*24*60*60 and dbe_count >= 5 ) or dbe_count >= 10 or (dbe_count + sbe_count >=60)):
send_alarm = 1
else:
if now - ecc_error_standard_time >= 60*60*24:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1103, "EventName":"gpu_ecc_error_standard", "FaultType": "Hardware", "FaultDesc":"gpu ecc error(30days DBE>=5 || DBE>=10 || SBE+DBE >=60)" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_error_standard_time'] = now
send_alarm = 0
if ecc_page_pending_time == 0:
if retired_page_pending == 1:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1104, "EventName":"gpu_retired_page_pending", "FaultType": "Hardware", "FaultDesc":"gpu retired page pending" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_page_pending_time'] = now
log_collect = 1
send_alarm = 0
if ecc_row_remapping_failure_occurred_time == 0:
if self.arch >= NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
if row_remapping_failure_occurred == 1 or self.collector.check_field_row_remapping_failure(i):
send_alarm = 1
else:
if now - ecc_row_remapping_failure_occurred_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1121, "EventName":"gpu_row_remapping_failure", "FaultType": "Hardware", "FaultDesc":"gpu row remapping failure occurred" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_row_remapping_failure_occurred_time'] = now
log_collect = 1
send_alarm = 0
if ecc_sram_error_time == 0:
if ecc_current == 1:
filed = self.collector.check_field(i)
sram_ecc = self.collector.check_sram_ecc(i)
if filed == 1 or sram_ecc == 1:
send_alarm = 1
if send_alarm == 0:
try:
# NVIDIA field-query bug: the field check may miss SRAM UCE, so also check the SRAM UCE count via nvidia-smi
command = "nvidia-smi -q -d ecc -i " + str(i) + " | grep -i 'SRAM Uncorrectable' | awk -F: '{print $2}'| sed 's/^[ \t]*//;s/[ \t]*$//' "
lines = os.popen(command).readlines()
for line in lines:
# nvidia RMA: SRAM UCE >= 10
if int(line.strip()) >= 10:
send_alarm = 1
except Exception as err:
pass
else:
if now - ecc_sram_error_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1114, "EventName":"gpu_sram_ecc_error", "FaultType": "Hardware", "FaultDesc":"gpu sram ecc error" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_sram_error_time'] = now
log_collect = 1
send_alarm = 0
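# Ampere and newer with ECC enabled: alarm when the row-remapping histogram reports low or no remaining bank availability.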
if ecc_row_remapping_bank_low_time == 0:
if self.arch >= NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
rr_low,rr_none = self.collector.get_row_remapping_histogram(i)
if rr_low != 0 or rr_none != 0:
send_alarm = 1
else:
if now - ecc_row_remapping_bank_low_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1122, "EventName":"gpu_row_remapping_bank_low", "FaultType": "Hardware", "FaultDesc":"gpu row remapping bank low" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_row_remapping_bank_low_time'] = now
send_alarm = 0
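# PCIe link width check for Ampere and newer (A10 excluded): alarm unless both the maximum and current width are x16.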
if gpu_pcie_link_width_time == 0:
# exclude A10
if self.arch >= NVML_DEVICE_ARCH_AMPERE and self.pcideviceid != 0x223610DE:
width_max, width_cur = self.collector.get_gpu_pcie_link_width(i)
if width_max != 16 or width_cur != 16:
send_alarm = 1
else:
if now - gpu_pcie_link_width_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1123, "EventName":"gpu_pcie_link_width_err", "FaultType": "Hardware", "FaultDesc":"gpu pcie link width error" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_pcie_link_width_time'] = now
log_collect = 1
send_alarm = 0
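# NVLink error-counter check for NVLink-capable multi-GPU instances.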
if nvlink_check_time == 0:
if self.checknvlink == 1:
for linkidx in range(self.nvlinknumber_pergpu):
# only error-counter indices 0/1/2 are valid
for erridx in range(3):
if self.collector.get_nvlink_err_count(i, linkidx, erridx) != 0:
send_alarm = 1
else:
if now - nvlink_check_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1125, "EventName":"gpu_nvlink_err", "FaultType": "Hardware", "FaultDesc":"gpu nvlink error" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['nvlink_check_time'] = now
send_alarm = 0
if nvlink_state_check_time == 0 or now - nvlink_state_check_time >= 60*60:
if self.checknvlink == 1:
for linkidx in range(self.nvlinknumber_pergpu):
if self.collector.get_nvlink_state(i, linkidx) == 0:
send_alarm = 1
if send_alarm == 1:
# In rare cases NvLinkDisable=1 is configured; do not send the NVLink-inactive alarm in that case
try:
command = "cat /proc/driver/nvidia/params"
lines = os.popen(command).readlines()
for line in lines:
if line.find("NvLinkDisable") == -1:
continue
v = line.split(":")[1].strip()
if v == "1":
send_alarm = 0
except Exception as err:
pass
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1134, "EventName":"gpu_nvlink_state_inactive", "FaultType": "Hardware", "FaultDesc":"gpu nvlink inactive" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['nvlink_state_check_time'] = now
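# Clock slowdown checks on Ampere and newer: HW slowdown alarms at most once per hour, SW slowdown at most every 5 minutes.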
if self.arch >= NVML_DEVICE_ARCH_AMPERE:
slow_down_hw,slow_down_sw = self.collector.check_gpu_clock_slowdown(i)
if gpu_clock_slowdown_time == 0 or now - gpu_clock_slowdown_time >= 60*60:
if slow_down_hw == 1:
alarmproxy_event = {"AlarmId":1128, "EventName":"gpu_clock_slowdown", "FaultType": "Hardware", "FaultDesc":"gpu clock slowdown HW" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_clock_slowdown_time'] = now
if gpu_clock_slowdown_sw_time == 0 or now - gpu_clock_slowdown_sw_time >= 5*60:
if slow_down_sw == 1:
alarmproxy_event = {"AlarmId":1135, "EventName":"gpu_clock_slowdown_sw", "FaultType": "Hardware", "FaultDesc":"gpu clock slowdown SW" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_clock_slowdown_sw_time'] = now
# Alarm when power usage is low and SW thermal slowdown has stayed active for the last 15 minutes (90 consecutive samples at the 10s collect interval = 900s)
if gpu_low_power_sw_thermal_slowdown_active_last_count >= 90:
alarmproxy_event = {"AlarmId":1137, "EventName":"gpu_low_power_sw_thermal_slowdown_active", "FaultType": "Hardware", "FaultDesc":"gpu low power and sw thermal slowdown active" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count'] = 0
if slow_down_sw == 1 and pow_usage < 20.0:
self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count'] += 1
else:
self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count'] = 0
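# The infoROM is only queried during the first collection pass.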
self.inforomcheck = 1
# Check whether any GPU has been lost: present in lspci but missing from the devices enumerated above
bdf_init = [item.upper() for item in self.collector.get_lspci_bdf()]
bdf_now = [item['gpu_id'].upper() for item in self.device_info]
bdf_diff = list(set(bdf_init).difference(set(bdf_now)))
for bdf in bdf_diff :
if self.alarm_time.get(bdf, '') == '':
self.alarm_time[bdf] = {'ecc_page_pending_time':0,
'inforom_failed_time':0,
'ecc_error_standard_time':0,
'ecc_error_tencent_time':0,
'ecc_sram_error_time':0,
'ecc_row_remapping_failure_occurred_time':0,
'ecc_row_remapping_bank_low_time':0,
'gpu_pcie_link_width_time':0,
'nvlink_check_time':0,
'nvlink_state_check_time':0,
'gpu_clock_slowdown_time':0,
'gpu_clock_slowdown_sw_time':0,
'gpu_low_power_sw_thermal_slowdown_active_last_count':0,
'lost_time':0}
lost_time = self.alarm_time[bdf]['lost_time']
if now - lost_time >= 60*60 or lost_time == 0:
device = {}
for i in range(0,len(self.tmp_device_info)):
if bdf == self.tmp_device_info[i]['gpu_id'].upper():
device = self.tmp_device_info[i]
break
alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(now))
# Rebuild the dimensions here so the alarm also works when no GPU was enumerated above
alarm_dimensions = [{"Key" : "Uuid", "Value" :vm_uuid}]
alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions,
"DeviceName":"GPU", "DeviceId": "", 'Slot':bdf, 'PN':device.get('gpu_part_number', ''), 'SN': device.get('serial', ''),
"DriverVersion": device.get('driver_ver', ''), "BaradVersion": barad_version }
alarmproxy_event = {"AlarmId":1112, "EventName":"gpu_init_fail","FaultType": "Hardware", "FaultDesc":"init fail" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[bdf]['lost_time'] = now
# send gpu monitor data
if (len( self.handler.get_metrics()) > 0) :
data = {'sender':'nws_sender', 'datas': self.handler.pop_metrics()}
self.put_data(data)
if (log_collect == 1):
globalvar.collect_gpu_log()
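# Sums per-process SM/mem/enc/dec utilization for a device; when the total SM or memory utilization
# exceeds 80% the top consumers are sorted (the per-process logging below is currently disabled).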
def get_gpu_process_utils(self, i):
p_smutil=p_memutil=p_encutil=p_decutil=0
try:
utils = self.collector.get_gpu_process_info(i)
count = len(utils)
for u in utils:
p_smutil += u.smutil
p_memutil += u.memutil
p_encutil += u.encutil
p_decutil += u.decutil
outputcount = 0
if p_smutil > 80:
sorted_utils = sorted(utils, key=attrgetter("smutil"), reverse=True)
outputcount = 10
elif p_memutil > 80:
sorted_utils = sorted(utils, key=attrgetter("memutil"), reverse=True)
outputcount = 10
if outputcount != 0:
if outputcount > count:
outputcount = count
#for i in range(outputcount):
#self.logger().info("Top%d process info, No.%d pid:%d timestamp:%d sm:%d mem:%d enc:%d dec:%d", outputcount, i, sorted_utils[i].pid, sorted_utils[i].timestamp, sorted_utils[i].smutil, sorted_utils[i].memutil, sorted_utils[i].encutil, sorted_utils[i].decutil)
except NVMLError as err:
pass
return (p_smutil, p_memutil, p_encutil, p_decutil)
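# The XID helpers below check records accumulated in globalvar.XIDLogInfo['xid_err'], which is
# presumably populated by barad's XID/kernel-log collection elsewhere.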
def check_xid_error_61_62(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "61", 1) or self.get_specific_xid_flag(gpu_id, "62", 1)
def check_xid_error_terminate_app(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "43", 1) or self.get_specific_xid_flag(gpu_id, "45", 1)
# kernel log: XID 95: This XID indicates an uncontained ECC error has occurred
# reset pending flag = yes
# UCE that cannot be recovered automatically; the VM must be rebooted or the GPU reset
# XID 48/92 are handled the same way, i.e. reboot the VM or reset the GPU (experience from TEG)
def check_xid_error_need_reset(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "95", 0) or self.get_specific_xid_flag(gpu_id, "48", 0) or self.get_specific_xid_flag(gpu_id, "92", 0)
# kernel log: XID 94: This XID indicates a contained ECC error has occurred
# Row-remapping pending flag = yes
# UCE the GPU recovers from by itself; NO need to reboot the VM or reset the GPU
def check_xid_error_no_need_reset_but_terminate_app(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "94", 1)
def get_specific_xid_flag(self, gpu_id, xid, remove):
try:
for loginfo in globalvar.XIDLogInfo['xid_err']:
if loginfo['xid'] == xid and self.bdf_eq(loginfo['bdf'], gpu_id) == 1:
if remove == 1:
globalvar.XIDLogInfo['xid_err'].remove(loginfo)
return 1
except Exception as e:
pass
return 0
def display_xid_globalvar(self):
for loginfo in globalvar.XIDLogInfo['xid_err']:
self.logger().info("xid:%s [%s] info:%s", loginfo['xid'], loginfo['bdf'], loginfo['errorinfo'])
# bdf in xid log: NVRM: Xid (0000:03:00)
# bdf in nvml: 0000:03:00.0, sometimes 00000000:03:00.0
def bdf_eq(self, xid_bdf, nvml_bdf):
try:
xid_bus = xid_bdf.split(':')[1].upper()
xid_device = xid_bdf.split(':')[2].upper()
nvml_bdf = nvml_bdf.split('.')[0]
nvml_bus = nvml_bdf.split(':')[1].upper()
nvml_device = nvml_bdf.split(':')[2].upper()
if xid_bus == nvml_bus and xid_device == nvml_device:
return 1
except Exception as e:
pass
return 0
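# Standalone entry point, likely for manual testing: run the collector in a 60-second loop.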
def main():
collector = GpuCollector()
collector.init()
while True:
collector.collect()
collector.dump_data()
time.sleep(60)
if __name__ == '__main__':
main()