File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/gpu.py
import sys
import os
import time
sys.path.append(os.getcwd() + '/../../../comm/')
import constant
import globalvar
from plugin_base import VmBaseCollector
from utils.collect_tool_gpu import GpuCollect
from utils.metric_handler import MetricHandler
from pynvml.pynvml import *
from operator import itemgetter, attrgetter
class GpuCollector(VmBaseCollector):
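# Collects per-GPU runtime metrics through NVML/nvidia-smi and raises alarmproxy hardware events
# (ECC, infoROM, NVLink, PCIe link width, clock slowdown, lost GPU) for GPU CVM instances.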
def init(self):
self.set_frequency(10)
self.collector = GpuCollect()
self.handler = MetricHandler()
self.handler.namespace = 'qce/cvm'
self.handler.dimensions = [ 'vm_uuid', 'vmip']
self.device_info = []
self.tmp_device_info = []
self.alarm_time = {}
self.gpu_init_count = self.collector.get_gpu_init_count()
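# 0xff below means "not probed yet": arch, PCI device id and the NVLink topology are resolved on the
# first collect pass, and inforomcheck limits the infoROM query to that first pass.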
self.checknvlink = 0xff
self.nvlinknumber_pergpu = 0
self.pcideviceid = 0xff
self.arch = 0xff
self.inforomcheck = 0
def do_collect(self):
# Not a GPU CVM; nothing to collect
if self.gpu_init_count == 0:
return
# 0 : GPU driver is not installed
drives = self.collector.gpu_drives_check()
if drives == 0:
return
log_collect = 0
deviceCount = self.collector.get_gpu_count()
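# Keep the previous device snapshot so a lost GPU's SN/PN can still be filled into the lost-GPU alarm below.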
self.tmp_device_info = self.device_info
if len(self.device_info) != deviceCount:
self.device_info = []
now = int(time.time())
vm_uuid = self.get_vm_uuid()
vmip = self.get_vmip()
barad_version = self.get_barad_version()
for i in range(0, deviceCount):
try:
if(len(self.device_info) < deviceCount):
minor_number,driver_ver,gpu_id,gpu_name,serial,gpu_uuid,gpu_part_number,gpu_virtual_mode = self.collector.get_device_info(i)
info = {}
info['minor_number'] = minor_number
info['driver_ver'] = driver_ver
info['gpu_id'] = gpu_id
info['gpu_name'] = gpu_name
info['serial'] = serial
info['gpu_uuid'] = gpu_uuid
info['gpu_part_number'] = gpu_part_number
info['gpu_virtual_mode'] = gpu_virtual_mode
self.device_info.append(info)
self.alarm_time[gpu_id] = {'ecc_page_pending_time':0,
'inforom_failed_time':0,
'ecc_error_standard_time':0,
'ecc_error_tencent_time':0,
'ecc_sram_error_time':0,
'ecc_row_remapping_failure_occurred_time':0,
'ecc_row_remapping_bank_low_time':0,
'gpu_pcie_link_width_time':0,
'nvlink_check_time':0,
'nvlink_state_check_time':0,
'gpu_clock_slowdown_time':0,
'gpu_clock_slowdown_sw_time':0,
'gpu_low_power_sw_thermal_slowdown_active_last_count':0,
'lost_time':0}
tempInfo = {}
tempInfo['bdf'] = gpu_id
tempInfo['sn'] = serial
tempInfo['pn'] = gpu_part_number
if tempInfo not in globalvar.GPUList['gpu']:
globalvar.GPUList['gpu'].append(tempInfo)
else:
minor_number = self.device_info[i]['minor_number']
driver_ver = self.device_info[i]['driver_ver']
gpu_id = self.device_info[i]['gpu_id']
gpu_name = self.device_info[i]['gpu_name']
serial = self.device_info[i]['serial']
gpu_uuid = self.device_info[i]['gpu_uuid']
gpu_part_number = self.device_info[i]['gpu_part_number']
gpu_virtual_mode = self.device_info[i]['gpu_virtual_mode']
except Exception as err:
# Skip this GPU if its static info cannot be read; the fields below would otherwise be undefined
continue
try:
gpu_util,enc_util,dec_util,smclk,memclk,mem_total,mem_used,mem_usage,temp,powDraw,powLimit,pow_usage,\
perf_stat,persistence_mode,ecc_current, ecc_pending, ecc_agg_single_total_err,ecc_agg_double_total_err,\
retired_page_single_ecc,retired_page_double_ecc,retired_page_pending,nvidia_smi_error = self.collector.get_device_run_info(i)
except NVMLError as err:
# Skip this GPU if its runtime metrics cannot be read; the metric variables would otherwise be undefined
continue
p_smutil=p_memutil=p_encutil=p_decutil=0
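# For vGPU-mode devices, additionally aggregate per-process SM/memory/encoder/decoder utilization.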
if gpu_virtual_mode == NVML_GPU_VIRTUALIZATION_MODE_VGPU:
p_smutil,p_memutil,p_encutil,p_decutil = self.get_gpu_process_utils(i)
gpu_inforom = 0
if self.inforomcheck == 0:
gpu_inforom = self.collector.get_device_info_inforom(i)
# A gpu_inforom value of 2 is reported as 0, but still recorded in the log
if gpu_inforom == 2:
gpu_inforom = 0
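# Probe the architecture and PCI device id once, then derive the per-GPU NVLink link count from the device id.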
if self.arch == 0xff:
self.arch = self.collector.get_gpu_device_arch(i)
self.checknvlink = 0
self.pcideviceid = self.collector.get_device_info_pcideviceid(i)
#define PCI_DEVICE_ID_NVIDIA_A100_SXM4_40G 0x20b0
#define PCI_DEVICE_ID_NVIDIA_A100_SXM4_80G 0x20b2
#define PCI_DEVICE_ID_NVIDIA_A800_SXM4_80G 0x20f3
#define PCI_DEVICE_ID_NVIDIA_H800_80G 0x2324
#define PCI_DEVICE_ID_NVIDIA_H20 0x2329
if self.pcideviceid == 0x20B010DE or self.pcideviceid == 0x20B210DE:
self.nvlinknumber_pergpu = 12
self.checknvlink = 1
if self.pcideviceid == 0x20F310DE or self.pcideviceid == 0x232410DE:
self.nvlinknumber_pergpu = 8
self.checknvlink = 1
if self.pcideviceid == 0x232910DE:
self.nvlinknumber_pergpu = 18
self.checknvlink = 1
# Only check NVLink when the VM has more than one GPU; a single-GPU VM (e.g. GT4.4XLARGE96) needs no check
if deviceCount == 1:
self.checknvlink = 0
gpu_ecc_terminate_app = 0
gpu_xid_terminate_app = 0
row_remapping_pending = 0
row_remapping_failure_occurred = 0
# retired_page_pending == -1 indicates page retirement is not supported (Ampere and newer)
if retired_page_pending == -1 and ecc_current == 1:
row_remapping_pending,row_remapping_failure_occurred = self.collector.get_row_remapping_status(i)
# On Ampere and newer architectures, reuse retired_page_pending in barad
if self.check_xid_error_need_reset(gpu_id):
retired_page_pending = 1
if retired_page_pending == 1 or self.check_xid_error_no_need_reset_but_terminate_app(gpu_id):
gpu_ecc_terminate_app = 1
if self.check_xid_error_terminate_app(gpu_id):
gpu_xid_terminate_app = 1
if globalvar.GPUDriverVersion == "" or globalvar.GPUDriverVersion != driver_ver:
globalvar.GPUDriverVersion = driver_ver
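# XID 61/62 occurrences are exported as the separate gpu_xid_61_62 metric.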
xid_61_62 = 0
if self.check_xid_error_61_62(gpu_id):
xid_61_62 = 1
dimensions = {'vm_uuid': vm_uuid, 'vmip': vmip, 'minor_number': minor_number, 'gpu_name' : gpu_name, 'gpu_id' : gpu_id, \
'gpu_uuid' : gpu_uuid, 'driver_ver': driver_ver, 'serial': serial, 'gpu_part_number': gpu_part_number}
batch_metric = [
{'name':'gpu_util', 'value':gpu_util},
{'name':'gpu_enc_util', 'value':enc_util},
{'name':'gpu_dec_util', 'value':dec_util},
{'name':'gpu_smclk', 'value':smclk},
{'name':'gpu_memclk', 'value':memclk},
{'name':'gpu_p_smutil', 'value':p_smutil},
{'name':'gpu_p_memutil', 'value':p_memutil},
{'name':'gpu_p_encutil', 'value':p_encutil},
{'name':'gpu_p_decutil', 'value':p_decutil},
{'name':'gpu_mem_usage', 'value': mem_usage},
{'name':'gpu_mem_total', 'value':mem_total},
{'name':'gpu_mem_used', 'value':mem_used},
{'name':'gpu_pow_usage', 'value': pow_usage},
{'name':'gpu_pow_limit', 'value':powLimit},
{'name':'gpu_pow_draw', 'value':powDraw},
{'name':'gpu_temp', 'value':temp},
{'name':'gpu_perf_stat', 'value':perf_stat},
{'name':'gpu_persistence_mode', 'value':persistence_mode},
{'name':'gpu_ecc_current', 'value':ecc_current},
{'name':'gpu_ecc_pending', 'value':ecc_pending},
{'name':'gpu_ecc_agg_single_total_err', 'value':ecc_agg_single_total_err},
{'name':'gpu_ecc_agg_double_total_err', 'value':ecc_agg_double_total_err},
{'name':'gpu_retired_page_single_ecc', 'value':retired_page_single_ecc},
{'name':'gpu_retired_page_double_ecc', 'value':retired_page_double_ecc},
{'name':'gpu_retired_page_pending', 'value':retired_page_pending},
{'name':'gpu_inforom','value':gpu_inforom},
{'name':'gpu_ecc_terminate_app','value':gpu_ecc_terminate_app},
{'name':'gpu_xid_terminate_app','value':gpu_xid_terminate_app},
{'name':'gpu_nvidia_smi_error','value':nvidia_smi_error},
{'name':'gpu_xid_61_62','value':xid_61_62},
]
self.handler.add_batch_metric(batch = batch_metric, dimensions = dimensions)
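# Common alarmproxy payload shared by all hardware alarms raised for this GPU below.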
alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(now))
alarm_dimensions = [{"Key" : "Uuid", "Value" :vm_uuid}]
alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions,
"DeviceName":"GPU", "DeviceId": "", 'Slot':gpu_id, 'PN':gpu_part_number, 'SN': serial, "DriverVersion": driver_ver,
"BaradVersion": barad_version }
inforom_failed_time = self.alarm_time[gpu_id]['inforom_failed_time']
# Corresponds to errors like "WARNING: infoROM is corrupted at gpu 0000:00:08.0"
if now - inforom_failed_time >= 60*60*24 or inforom_failed_time == 0:
if gpu_inforom == 1 :
alarmproxy_event = {"AlarmId":1107, "EventName":"gpu_inforom_failed", "FaultType": "Hardware", "FaultDesc":"gpu_inforom_failed" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['inforom_failed_time'] = now
log_collect = 1
# passthrough GPU VM or bare metal GPU machine
if gpu_virtual_mode == NVML_GPU_VIRTUALIZATION_MODE_NONE or gpu_virtual_mode == NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH:
ecc_page_pending_time = self.alarm_time[gpu_id]['ecc_page_pending_time']
ecc_error_standard_time = self.alarm_time[gpu_id]['ecc_error_standard_time']
ecc_error_tencent_time = self.alarm_time[gpu_id]['ecc_error_tencent_time']
ecc_sram_error_time = self.alarm_time[gpu_id]['ecc_sram_error_time']
ecc_row_remapping_failure_occurred_time = self.alarm_time[gpu_id]['ecc_row_remapping_failure_occurred_time']
ecc_row_remapping_bank_low_time = self.alarm_time[gpu_id]['ecc_row_remapping_bank_low_time']
gpu_pcie_link_width_time = self.alarm_time[gpu_id]['gpu_pcie_link_width_time']
nvlink_check_time = self.alarm_time[gpu_id]['nvlink_check_time']
nvlink_state_check_time = self.alarm_time[gpu_id]['nvlink_state_check_time']
gpu_clock_slowdown_time = self.alarm_time[gpu_id]['gpu_clock_slowdown_time']
gpu_clock_slowdown_sw_time = self.alarm_time[gpu_id]['gpu_clock_slowdown_sw_time']
gpu_low_power_sw_thermal_slowdown_active_last_count = self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count']
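# Pre-Ampere GPUs with ECC enabled: evaluate retired-page DBE/SBE counts against the RMA thresholds below.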
if self.arch < NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
send_alarm = 0
if ecc_error_tencent_time == 0:
pages_dbe,timestamps_dbe,pages_sbe,timestamps_sbe = self.collector.get_device_retired_page_info(i)
dbe_count = len(pages_dbe)
sbe_count = len(pages_sbe)
# NVIDIA RMA criteria: DBE >= 5 || SBE + DBE >= 60
if dbe_count >= 5 or dbe_count + sbe_count >=60:
send_alarm = 1
else:
if now - ecc_error_tencent_time >= 60*60*24:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1102, "EventName":"gpu_ecc_error_tencent", "FaultType": "Hardware", "FaultDesc":"gpu ecc error(DBE >= 5 || SBE + DBE >=60)" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_error_tencent_time'] = now
send_alarm = 0
if ecc_error_standard_time == 0:
pages_dbe,timestamps_dbe,pages_sbe,timestamps_sbe = self.collector.get_device_retired_page_info(i)
dbe_count = len(pages_dbe)
sbe_count = len(pages_sbe)
seconds = 0
timestamps_dbe_sorted = sorted(timestamps_dbe, reverse=True)
if len(timestamps_dbe_sorted) >= 5:
seconds = timestamps_dbe_sorted[0] - timestamps_dbe_sorted[4]
# NVIDIA RMA criteria: (DBE >= 5 within 30 days) || DBE >= 10 || SBE + DBE >= 60
if ((seconds <= 30*24*60*60 and dbe_count >= 5 ) or dbe_count >= 10 or (dbe_count + sbe_count >=60)):
send_alarm = 1
else:
if now - ecc_error_standard_time >= 60*60*24:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1103, "EventName":"gpu_ecc_error_standard", "FaultType": "Hardware", "FaultDesc":"gpu ecc error(30days DBE>=5 || DBE>=10 || SBE+DBE >=60)" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_error_standard_time'] = now
send_alarm = 0
if ecc_page_pending_time == 0:
if retired_page_pending == 1:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1104, "EventName":"gpu_retired_page_pending", "FaultType": "Hardware", "FaultDesc":"gpu retired page pending" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_page_pending_time'] = now
log_collect = 1
send_alarm = 0
if ecc_row_remapping_failure_occurred_time == 0:
if self.arch >= NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
if row_remapping_failure_occurred == 1 or self.collector.check_field_row_remapping_failure(i):
send_alarm = 1
else:
if now - ecc_row_remapping_failure_occurred_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1121, "EventName":"gpu_row_remapping_failure", "FaultType": "Hardware", "FaultDesc":"gpu row remapping failure occurred" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_row_remapping_failure_occurred_time'] = now
log_collect = 1
send_alarm = 0
if ecc_sram_error_time == 0:
if ecc_current == 1:
filed = self.collector.check_field(i)
sram_ecc = self.collector.check_sram_ecc(i)
if filed == 1 or sram_ecc == 1:
send_alarm = 1
if send_alarm == 0:
try:
# NVIDIA field-query bug: the field check may miss SRAM UCE, so also check the SRAM UCE count via nvidia-smi
command = "nvidia-smi -q -d ecc -i " + str(i) + " | grep -i 'SRAM Uncorrectable' | awk -F: '{print $2}'| sed 's/^[ \t]*//;s/[ \t]*$//' "
lines = os.popen(command).readlines()
for line in lines:
# nvidia RMA: SRAM UCE >= 10
if int(line.strip()) >= 10:
send_alarm = 1
except Exception as err:
pass
else:
if now - ecc_sram_error_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1114, "EventName":"gpu_sram_ecc_error", "FaultType": "Hardware", "FaultDesc":"gpu sram ecc error" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_sram_error_time'] = now
log_collect = 1
send_alarm = 0
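# Ampere and newer with ECC enabled: alarm when the row-remapping histogram reports low or no remaining bank availability.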
if ecc_row_remapping_bank_low_time == 0:
if self.arch >= NVML_DEVICE_ARCH_AMPERE and ecc_current == 1:
rr_low,rr_none = self.collector.get_row_remapping_histogram(i)
if rr_low != 0 or rr_none != 0:
send_alarm = 1
else:
if now - ecc_row_remapping_bank_low_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1122, "EventName":"gpu_row_remapping_bank_low", "FaultType": "Hardware", "FaultDesc":"gpu row remapping bank low" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['ecc_row_remapping_bank_low_time'] = now
send_alarm = 0
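# PCIe link width check for Ampere and newer (A10 excluded): alarm unless both the maximum and current width are x16.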
if gpu_pcie_link_width_time == 0:
# exclude A10
if self.arch >= NVML_DEVICE_ARCH_AMPERE and self.pcideviceid != 0x223610DE:
width_max, width_cur = self.collector.get_gpu_pcie_link_width(i)
if width_max != 16 or width_cur != 16:
send_alarm = 1
else:
if now - gpu_pcie_link_width_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1123, "EventName":"gpu_pcie_link_width_err", "FaultType": "Hardware", "FaultDesc":"gpu pcie link width error" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_pcie_link_width_time'] = now
log_collect = 1
send_alarm = 0
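# NVLink error-counter check for NVLink-capable multi-GPU instances.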
if nvlink_check_time == 0:
if self.checknvlink == 1:
for linkidx in range(self.nvlinknumber_pergpu):
# only error-counter indices 0/1/2 are valid
for erridx in range(3):
if self.collector.get_nvlink_err_count(i, linkidx, erridx) != 0:
send_alarm = 1
else:
if now - nvlink_check_time >= 60*60:
send_alarm = 1
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1125, "EventName":"gpu_nvlink_err", "FaultType": "Hardware", "FaultDesc":"gpu nvlink error" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['nvlink_check_time'] = now
send_alarm = 0
if nvlink_state_check_time == 0 or now - nvlink_state_check_time >= 60*60:
if self.checknvlink == 1:
for linkidx in range(self.nvlinknumber_pergpu):
if self.collector.get_nvlink_state(i, linkidx) == 0:
send_alarm = 1
if send_alarm == 1:
# In rare cases NvLinkDisable=1 is configured; do not send the NVLink-inactive alarm in that case
try:
command = "cat /proc/driver/nvidia/params"
lines = os.popen(command).readlines()
for line in lines:
if line.find("NvLinkDisable") == -1:
continue
v = line.split(":")[1].strip()
if v == "1":
send_alarm = 0
except Exception as err:
pass
if send_alarm == 1:
alarmproxy_event = {"AlarmId":1134, "EventName":"gpu_nvlink_state_inactive", "FaultType": "Hardware", "FaultDesc":"gpu nvlink inactive" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['nvlink_state_check_time'] = now
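# Clock slowdown checks on Ampere and newer: HW slowdown alarms at most once per hour, SW slowdown at most every 5 minutes.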
if self.arch >= NVML_DEVICE_ARCH_AMPERE:
slow_down_hw,slow_down_sw = self.collector.check_gpu_clock_slowdown(i)
if gpu_clock_slowdown_time == 0 or now - gpu_clock_slowdown_time >= 60*60:
if slow_down_hw == 1:
alarmproxy_event = {"AlarmId":1128, "EventName":"gpu_clock_slowdown", "FaultType": "Hardware", "FaultDesc":"gpu clock slowdown HW" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_clock_slowdown_time'] = now
if gpu_clock_slowdown_sw_time == 0 or now - gpu_clock_slowdown_sw_time >= 5*60:
if slow_down_sw == 1:
alarmproxy_event = {"AlarmId":1135, "EventName":"gpu_clock_slowdown_sw", "FaultType": "Hardware", "FaultDesc":"gpu clock slowdown SW" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_clock_slowdown_sw_time'] = now
# Alarm when power usage is low and SW thermal slowdown has stayed active for the last 15 minutes (90 consecutive samples at the 10s collect interval = 900s)
if gpu_low_power_sw_thermal_slowdown_active_last_count >= 90:
alarmproxy_event = {"AlarmId":1137, "EventName":"gpu_low_power_sw_thermal_slowdown_active", "FaultType": "Hardware", "FaultDesc":"gpu low power and sw thermal slowdown active" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count'] = 0
if slow_down_sw == 1 and pow_usage < 20.0:
self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count'] += 1
else:
self.alarm_time[gpu_id]['gpu_low_power_sw_thermal_slowdown_active_last_count'] = 0
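# The infoROM is only queried during the first collection pass.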
self.inforomcheck = 1
# Check whether any GPU has been lost: present in lspci but missing from the devices enumerated above
bdf_init = [item.upper() for item in self.collector.get_lspci_bdf()]
bdf_now = [item['gpu_id'].upper() for item in self.device_info]
bdf_diff = list(set(bdf_init).difference(set(bdf_now)))
for bdf in bdf_diff :
if self.alarm_time.get(bdf, '') == '':
self.alarm_time[bdf] = {'ecc_page_pending_time':0,
'inforom_failed_time':0,
'ecc_error_standard_time':0,
'ecc_error_tencent_time':0,
'ecc_sram_error_time':0,
'ecc_row_remapping_failure_occurred_time':0,
'ecc_row_remapping_bank_low_time':0,
'gpu_pcie_link_width_time':0,
'nvlink_check_time':0,
'nvlink_state_check_time':0,
'gpu_clock_slowdown_time':0,
'gpu_clock_slowdown_sw_time':0,
'gpu_low_power_sw_thermal_slowdown_active_last_count':0,
'lost_time':0}
lost_time = self.alarm_time[bdf]['lost_time']
if now - lost_time >= 60*60 or lost_time == 0:
device = {}
for i in range(0,len(self.tmp_device_info)):
if bdf == self.tmp_device_info[i]['gpu_id'].upper():
device = self.tmp_device_info[i]
break
alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(now))
# Rebuild the dimensions here so the alarm also works when no GPU was enumerated above
alarm_dimensions = [{"Key" : "Uuid", "Value" :vm_uuid}]
alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey":"PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime":alarmtime, "Dimensions":alarm_dimensions,
"DeviceName":"GPU", "DeviceId": "", 'Slot':bdf, 'PN':device.get('gpu_part_number', ''), 'SN': device.get('serial', ''),
"DriverVersion": device.get('driver_ver', ''), "BaradVersion": barad_version }
alarmproxy_event = {"AlarmId":1112, "EventName":"gpu_init_fail","FaultType": "Hardware", "FaultDesc":"init fail" }
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
self.put_data(data_alarmproxy)
self.alarm_time[bdf]['lost_time'] = now
# send gpu monitor data
if (len( self.handler.get_metrics()) > 0) :
data = {'sender':'nws_sender', 'datas': self.handler.pop_metrics()}
self.put_data(data)
if (log_collect == 1):
globalvar.collect_gpu_log()
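# Sums per-process SM/mem/enc/dec utilization for a device; when the total SM or memory utilization
# exceeds 80% the top consumers are sorted (the per-process logging below is currently disabled).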
def get_gpu_process_utils(self, i):
p_smutil=p_memutil=p_encutil=p_decutil=0
try:
utils = self.collector.get_gpu_process_info(i)
count = len(utils)
for u in utils:
p_smutil += u.smutil
p_memutil += u.memutil
p_encutil += u.encutil
p_decutil += u.decutil
outputcount = 0
if p_smutil > 80:
sorted_utils = sorted(utils, key=attrgetter("smutil"), reverse=True)
outputcount = 10
elif p_memutil > 80:
sorted_utils = sorted(utils, key=attrgetter("memutil"), reverse=True)
outputcount = 10
if outputcount != 0:
if outputcount > count:
outputcount = count
#for i in range(outputcount):
#self.logger().info("Top%d process info, No.%d pid:%d timestamp:%d sm:%d mem:%d enc:%d dec:%d", outputcount, i, sorted_utils[i].pid, sorted_utils[i].timestamp, sorted_utils[i].smutil, sorted_utils[i].memutil, sorted_utils[i].encutil, sorted_utils[i].decutil)
except NVMLError as err:
pass
return (p_smutil, p_memutil, p_encutil, p_decutil)
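# The XID helpers below check records accumulated in globalvar.XIDLogInfo['xid_err'], which is
# presumably populated by barad's XID/kernel-log collection elsewhere.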
def check_xid_error_61_62(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "61", 1) or self.get_specific_xid_flag(gpu_id, "62", 1)
def check_xid_error_terminate_app(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "43", 1) or self.get_specific_xid_flag(gpu_id, "45", 1)
# kernel log: XID 95: This XID indicates an uncontained ECC error has occurred
# reset pending flag = yes
# UCE that cannot be recovered automatically; the VM must be rebooted or the GPU reset
# XID 48/92 are handled the same way, i.e. reboot the VM or reset the GPU (experience from TEG)
def check_xid_error_need_reset(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "95", 0) or self.get_specific_xid_flag(gpu_id, "48", 0) or self.get_specific_xid_flag(gpu_id, "92", 0)
# kernel log: XID 94: This XID indicates a contained ECC error has occurred
# Row-remapping pending flag = yes
# UCE the GPU recovers from by itself; NO need to reboot the VM or reset the GPU
def check_xid_error_no_need_reset_but_terminate_app(self, gpu_id):
return self.get_specific_xid_flag(gpu_id, "94", 1)
def get_specific_xid_flag(self, gpu_id, xid, remove):
try:
for loginfo in globalvar.XIDLogInfo['xid_err']:
if loginfo['xid'] == xid and self.bdf_eq(loginfo['bdf'], gpu_id) == 1:
if remove == 1:
globalvar.XIDLogInfo['xid_err'].remove(loginfo)
return 1
except Exception as e:
pass
return 0
def display_xid_globalvar(self):
for loginfo in globalvar.XIDLogInfo['xid_err']:
self.logger().info("xid:%s [%s] info:%s", loginfo['xid'], loginfo['bdf'], loginfo['errorinfo'])
# bdf in xid log: NVRM: Xid (0000:03:00)
# bdf in nvml: 0000:03:00.0, sometimes 00000000:03:00.0
def bdf_eq(self, xid_bdf, nvml_bdf):
try:
xid_bus = xid_bdf.split(':')[1].upper()
xid_device = xid_bdf.split(':')[2].upper()
nvml_bdf = nvml_bdf.split('.')[0]
nvml_bus = nvml_bdf.split(':')[1].upper()
nvml_device = nvml_bdf.split(':')[2].upper()
if xid_bus == nvml_bus and xid_device == nvml_device:
return 1
except Exception as e:
pass
return 0
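# Standalone entry point, likely for manual testing: run the collector in a 60-second loop.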
def main():
collector = GpuCollector()
collector.init()
while True:
collector.collect()
collector.dump_data()
time.sleep(60)
if __name__ == '__main__':
main()