File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/cvmlog.py
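# Barad GPU log collector: tails the kernel log for NVRM Xid errors and
# nvidia-nvswitch init failures, checks the nvidia-fabricmanager service and
# its log on NVSwitch machines, and forwards alarms/metrics to the senders.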
import sys
import os
import time

sys.path.append(os.getcwd() + '/../../../comm/')
import constant
import json
import globalvar
from plugin_base import VmBaseCollector
from utils.metric_handler import MetricHandler
from operator import itemgetter, attrgetter
from cutils import incrementFileReadMng

class VmLogCollector(VmBaseCollector):
    def init(self):
        self.set_frequency(300)
        self.handler = MetricHandler()
        self.log_path = self.getLogPath()
        self.frag_size = 4096 * 1024  # 4M read chunk
        self.message_ref = incrementFileReadMng(self.frag_size)
        self.message_ref.get_last_fn = self.message_ref.get_offload_last
        self.run_once = False
        self.handler.namespace = 'qce/cvm'
        self.handler.dimensions = ['vm_uuid', 'vmip']
        self._last_report_nws_time = 0
        self.is_gpu_vm = 0xff      # 0xff: not probed yet
        self.has_nvswitch = 0xff   # 0xff: not probed yet
        self.nvswitch_alarm = 0
        self.fabricmgr_log_last_initial_pos = -1
        self.fabricmgr_log_last_scan_pos = -1
        self._log_inode = None
        self._log_size = -1
    def gpu_vm(self):
        # 10de is NVIDIA's PCI vendor ID, so any lspci match means a GPU is present.
        is_gpu_vm = 0
        try:
            lines = os.popen('lspci -d 10de:').readlines()
            is_gpu_vm = 1 if lines else 0
        except Exception:
            pass
        return is_gpu_vm
    def nvswitch(self):
        # An NVSwitch shows up as an NVIDIA PCI bridge device.
        has_nvsw = 0
        try:
            lines = os.popen('lspci -d 10de: | grep -i bridge').readlines()
            has_nvsw = 1 if lines else 0
        except Exception:
            pass
        return has_nvsw
    # bdf in xid log: NVRM: Xid (0000:03:00)
    # bdf in nvml: 0000:03:00.0, sometimes: 00000000:03:00.0
    def bdf_eq(self, xid_bdf, nvml_bdf):
        try:
            xid_bus = xid_bdf.split(':')[1].upper()
            xid_device = xid_bdf.split(':')[2].upper()
            nvml_bdf = nvml_bdf.split('.')[0]  # drop the PCI function suffix
            nvml_bus = nvml_bdf.split(':')[1].upper()
            nvml_device = nvml_bdf.split(':')[2].upper()
            if xid_bus == nvml_bus and xid_device == nvml_device:
                return 1
        except Exception:
            pass
        return 0
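    # Example: bdf_eq('0000:03:00', '00000000:03:00.0') returns 1, because only
    # the bus ('03') and device ('00') fields are compared; the domain width
    # differs between the kernel Xid log and NVML.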
    def get_hard_info(self, bdf):
        hard_info = {"sn": "", "pn": ""}
        try:
            for gpu in globalvar.GPUList['gpu']:
                if self.bdf_eq(bdf, gpu['bdf']) == 1:
                    hard_info['sn'] = gpu['sn']
                    hard_info['pn'] = gpu['pn']
                    break
        except Exception:
            pass
        return hard_info
    def getLogPath(self):
        log_path = '/var/log/messages'
        release_file = '/etc/os-release'
        if not os.path.exists(release_file):
            return log_path
        with open(release_file, "r") as fd:
            data = fd.read()
        if data:
            for line in data.split('\n'):
                if line.find("NAME=") == -1:
                    continue
                if line.find("Ubuntu") != -1:
                    log_path = '/var/log/syslog'
                else:
                    log_path = '/var/log/messages'
                break
        return log_path
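    # e.g. NAME="Ubuntu" in /etc/os-release selects /var/log/syslog; every
    # other distribution falls back to /var/log/messages.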
    # Returns 0 when the fabric manager service status is ok.
    def check_fabricmgr_service(self):
        try:
            # systemctl exits non-zero when the unit is inactive or failed.
            ret = os.system('systemctl status nvidia-fabricmanager > /dev/null')
            if ret == 0:
                return 0
        except Exception:
            pass
        return 1
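    # check_fabric_mgr_log scans /var/log/fabricmanager.log incrementally: it
    # locates the most recent "Fabric Manager Log initializing at" marker (the
    # start of the current fabricmanager session) and searches for fatal
    # NVSwitch errors after that point. Log rotation or truncation is detected
    # via an inode change or a shrinking size, which resets both positions.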
    def check_fabric_mgr_log(self):
        result = {"error": []}
        try:
            fabric_log = '/var/log/fabricmanager.log'
            if not os.path.exists(fabric_log):
                return result
            stat = os.stat(fabric_log)
            current_inode = stat.st_ino
            current_size = stat.st_size
            self.logger().info("Fabricmgr log current size: %d", current_size)
            # A new inode or a shrinking file means the log was rotated or
            # truncated, so restart the scan from scratch.
            if (current_inode != self._log_inode) or (current_size < self._log_size):
                self.fabricmgr_log_last_initial_pos = -1
                self.fabricmgr_log_last_scan_pos = -1
                self._log_inode = current_inode
                self._log_size = current_size
                self.logger().info("Fabricmgr log current_inode: %d", current_inode)
                self.logger().info("Fabricmgr log current_size: %d", current_size)
            if self.fabricmgr_log_last_initial_pos == -1:
                # Find the offset of the most recent session start marker.
                with open(fabric_log, 'rb') as f:
                    last_init_pos = -1
                    while True:
                        pos = f.tell()
                        line = f.readline()
                        if not line:
                            break
                        if b'Fabric Manager Log initializing at' in line:
                            last_init_pos = pos
                            self.logger().info("Fabricmgr log initializing, find one %d", last_init_pos)
                    self.fabricmgr_log_last_initial_pos = last_init_pos
                    self.fabricmgr_log_last_scan_pos = last_init_pos
                    self.logger().info("Fabricmgr log initializing last_init_pos: %d", last_init_pos)
            if self.fabricmgr_log_last_initial_pos == -1:
                self.logger().info("Fabricmgr log initializing no last_init_pos")
                return result
            # Resume from wherever the previous scan stopped.
            scan_pos = self.fabricmgr_log_last_initial_pos
            if self.fabricmgr_log_last_scan_pos > self.fabricmgr_log_last_initial_pos:
                scan_pos = self.fabricmgr_log_last_scan_pos
            if scan_pos >= current_size:
                self.logger().info("Fabricmgr log no new content to scan: %d %d", scan_pos, current_size)
                return result
            self.logger().info("Fabricmgr log current scan_pos: %d", scan_pos)
            found_error = False
            with open(fabric_log, 'rb') as f:
                try:
                    f.seek(scan_pos)
                    while True:
                        line = f.readline()
                        if not line:
                            break
                        if b'a fatal error occurred on NVSwitch' in line:
                            found_error = True
                            break
                    self.fabricmgr_log_last_scan_pos = f.tell()
                    self._log_size = f.tell()
                except IOError:
                    pass
            if found_error:
                result['error'].append({'fatal_error': 1})
        except Exception:
            pass
        return result
    def checkVmGpuStatus(self):
        # If we can't get the log file, report this vm by returning None.
        if not os.path.exists(self.log_path):
            return None
        vmLogInfo = {"xid_err": [], "nv_switch": []}
        if self.message_ref.log_offset == -1 and self.run_once:
            self.message_ref.log_offset = 0
        if not self.message_ref.inc_open_file(self.log_path, self.log_path):
            return vmLogInfo
        try:
            while True:
                data = self.message_ref.inc_seek_read_lines(self.frag_size)
                if not data:
                    break
                # self.logger.info(data)
                lines = data.split('\n')
                # Report the Xid and BDF when an error occurs.
                for index in range(0, len(lines) - 1):
                    # centos & ubuntu: Oct 08 18:02:47 vm-x-x-centos kernel: NVRM: Xid (PCI:0000:e1:00): 31, pid=xx, Ch xx, intr xx. xx Fault: ...
                    if lines[index].find("NVRM") != -1 and lines[index].find("Xid") != -1:
                        # Pull the Xid code and the PCI BDF out of the log line.
                        xidset = lines[index].split(',')[0]
                        xid = xidset.split('):')[1].strip()
                        bdfs = xidset.split('):')[0]
                        bdf = bdfs.split('PCI:')[1]
                        tempInfo = {}
                        tempInfo['bdf'] = bdf.lower()
                        hard_info = self.get_hard_info(bdf)
                        tempInfo['sn'] = hard_info['sn']
                        tempInfo['pn'] = hard_info['pn']
                        tempInfo['xid'] = xid
                        if tempInfo not in vmLogInfo['xid_err']:
                            vmLogInfo['xid_err'].append(tempInfo)
                        if xid in ("95", "94", "48", "92", "43", "45", "61", "62"):
                            if tempInfo not in globalvar.XIDLogInfo['xid_err']:
                                globalvar.XIDLogInfo['xid_err'].append(tempInfo)
                    if lines[index].find("nvidia-nvswitch") != -1 and lines[index].find("Failed to initialize device") != -1:
                        tmp = {'init_fail': 1}
                        if tmp not in vmLogInfo['nv_switch']:
                            vmLogInfo['nv_switch'].append(tmp)
        except Exception as e:
            self.logger().error("nv gpu collect exception: %s", e)
            self.logger().exception(e)
        finally:
            self.message_ref.inc_close_file()
        return vmLogInfo
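    # Example: the log line "... kernel: NVRM: Xid (PCI:0000:e1:00): 31, pid=..."
    # is parsed into {'bdf': '0000:e1:00', 'sn': ..., 'pn': ..., 'xid': '31'}
    # and appended to vmLogInfo['xid_err'].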
    def do_collect(self):
        if self.is_gpu_vm == 0xff:
            # Probe the hardware once and cache the result.
            self.is_gpu_vm = self.gpu_vm()
            self.has_nvswitch = self.nvswitch()
        if not self.is_gpu_vm:
            return
        vmLogInfo = self.checkVmGpuStatus()
        self.run_once = True
        log_collect = 0
        vm_uuid = self.get_vm_uuid()
        vmip = self.get_vmip()
        barad_version = self.get_barad_version()
        driver_version = globalvar.GPUDriverVersion
        dimensions = {'vm_uuid': vm_uuid, 'vmip': vmip}
        dim_alarm = [{"Key": "Uuid", "Value": vm_uuid}]
        now = int(time.time())
        batch_metric = []
        if vmLogInfo is not None:
            batch_metric.append({"name": "xid_err", "value": 1 if len(vmLogInfo['xid_err']) > 0 else 0})
            alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime(now))
            for loginfo in vmLogInfo['xid_err']:
                alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey": "PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime": alarmtime,
                                     "Slot": loginfo['bdf'], "SN": loginfo['sn'], "PN": loginfo['pn'], "Dimensions": dim_alarm,
                                     "DriverVersion": driver_version, "BaradVersion": barad_version}
                ExtendMsg = {"Xid": loginfo['xid']}
                # Map the Xid code to an alarm event; Xids 81, 79 and 110 also
                # trigger a local GPU log collection below.
                if loginfo['xid'] == "81":
                    alarmproxy_event = {"AlarmId": 1105, "EventName": "gpu_vga_subsystem_error", "FaultType": "Hardware", "FaultDesc": "gpu vga subsystem error, Xid_error:81",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                    log_collect = 1
                elif loginfo['xid'] in ("122", "123", "124"):
                    alarmproxy_event = {"AlarmId": 1115, "EventName": "gpu_spi_op_fail", "FaultType": "Hardware", "FaultDesc": "GPU SPI PMU RPC Read/Write/Erase Failure",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                elif loginfo['xid'] == "79":
                    alarmproxy_event = {"AlarmId": 1116, "EventName": "gpu_fall_off_bus", "FaultType": "Hardware", "FaultDesc": "GPU has fallen off the bus",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                    log_collect = 1
                elif loginfo['xid'] == "110":
                    alarmproxy_event = {"AlarmId": 1120, "EventName": "gpu_security_fault_error", "FaultType": "Hardware", "FaultDesc": "GPU Security Fault Error",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                    log_collect = 1
                elif loginfo['xid'] in ("95", "48", "94"):
                    alarmproxy_event = {"AlarmId": 1119, "EventName": "gpu_ecc_terminate_app", "FaultType": "Hardware", "FaultDesc": "gpu ecc terminate app",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                elif loginfo['xid'] in ("61", "62"):
                    alarmproxy_event = {"AlarmId": 1124, "EventName": "gpu_driver_bug", "FaultType": "Hardware", "FaultDesc": "gpu driver bug",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                elif loginfo['xid'] == "74":
                    alarmproxy_event = {"AlarmId": 1132, "EventName": "gpu_nvlink_err_xid74", "FaultType": "Hardware", "FaultDesc": "GPU nvlink error xid74",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                elif loginfo['xid'] == "92":
                    alarmproxy_event = {"AlarmId": 1136, "EventName": "gpu_high_sb_ecc_error_rate", "FaultType": "Hardware", "FaultDesc": "GPU High single-bit ECC error rate",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                else:
                    alarmproxy_event = {"AlarmId": 1106, "EventName": "gpu_xid_error_unspecified", "FaultType": "Hardware", "FaultDesc": "gpu xid error unspecified",
                                        "DeviceName": "GPU", "DeviceId": "", "ExtendMsg": ExtendMsg}
                data_alarmproxy = {'sender': 'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                self.put_data(data_alarmproxy)
            # For testing the fabricmgr log scan:
            # fabriclog = self.check_fabric_mgr_log()
            if self.has_nvswitch == 1:
                fabricmgr_srv_status = self.check_fabricmgr_service()
                batch_metric.append({'name': 'fabricmanager_service', 'value': fabricmgr_srv_status})
                if self.nvswitch_alarm == 0:
                    try:
                        for loginfo in vmLogInfo['nv_switch']:
                            alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey": "PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime": alarmtime,
                                                 "Slot": "", "SN": "", "PN": "", "Dimensions": dim_alarm, "DriverVersion": driver_version, "BaradVersion": barad_version}
                            ExtendMsg = {}
                            if loginfo['init_fail'] == 1:
                                alarmproxy_event = {"AlarmId": 1127, "EventName": "gpu_nvswitch_init_fail", "FaultType": "Hardware", "FaultDesc": "gpu nvswitch init fail",
                                                    "DeviceName": "NVSwitch", "DeviceId": "", "ExtendMsg": ExtendMsg}
                                log_collect = 1
                                self.nvswitch_alarm = 1
                                data_alarmproxy = {'sender': 'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                                self.put_data(data_alarmproxy)
                    except Exception:
                        pass
                # Only scan the fabricmanager log when the service itself is healthy.
                if self.nvswitch_alarm == 0 and fabricmgr_srv_status == 0:
                    try:
                        fabriclog = self.check_fabric_mgr_log()
                        for loginfo in fabriclog['error']:
                            alarmproxy_metric = {"CallerName": "barad_agent", "CallerKey": "PSbhht7wQLEbH6OjDXTayQ==", "AlarmTime": alarmtime,
                                                 "Slot": "", "SN": "", "PN": "", "Dimensions": dim_alarm, "DriverVersion": driver_version, "BaradVersion": barad_version}
                            ExtendMsg = {}
                            if loginfo['fatal_error'] == 1:
                                alarmproxy_event = {"AlarmId": 1133, "EventName": "gpu_fabric_mgr_fatal_error", "FaultType": "Hardware", "FaultDesc": "nvswitch fabricmanager fatal error occurred",
                                                    "DeviceName": "NVSwitch", "DeviceId": "", "ExtendMsg": ExtendMsg}
                                log_collect = 1
                                self.nvswitch_alarm = 1
                                data_alarmproxy = {'sender': 'alarmproxy_sender', 'datas': dict(alarmproxy_metric, **alarmproxy_event)}
                                self.put_data(data_alarmproxy)
                    except Exception:
                        pass
        self.handler.add_batch_metric(batch=batch_metric, dimensions=dimensions, timestamp=now)
        if len(self.handler.get_metrics()) > 0:
            data = {'sender': 'nws_sender', 'datas': self.handler.pop_metrics()}
            self.put_data(data)
        if log_collect == 1:
            # Collect a full GPU log bundle for the severe faults flagged above.
            globalvar.collect_gpu_log()

def main():
    collector = VmLogCollector()
    collector.init()
    while True:
        collector.collect()
        collector.dump_data()
        time.sleep(10)

if __name__ == '__main__':
    main()