File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/nvme.py
# -*- coding: UTF-8 -*-
from multiprocessing.dummy import current_process
import os
from platform import release
import stat
import sys
import time
import fcntl
import datetime
import traceback
import glob
import re
CURRENT_FILE_PATH=os.path.dirname(os.path.realpath(__file__))
sys.path.append(CURRENT_FILE_PATH + '/../../../comm/')
LOG_PATH=CURRENT_FILE_PATH+'/../../../log/'
TOOL_PATH=CURRENT_FILE_PATH+'/../../../tools/'
import constant
from plugin_base import VmBaseCollector
from utils.metric_handler import MetricHandler
from cutils import CommUtils
from logging import getLogger
"""
VM nvme device status:
1:error
0:normal
"""
def get_file_content(file):
content = ""
if not os.path.exists(file):
return content
with open(file) as f:
content = f.readline().strip()
return content
class VsVmNvmeQuality:
def __init__(self, logger):
self.logger = logger
self.log_offset = None
self.is_nvme_vm = 0
self.is_nvme_vm_valid = 0
self.prepare_init = 0
self.log_path = '/var/log/messages'
self.nvme_tool = TOOL_PATH+'/nvme'
self.nvme_hard_info={} #记录nvme BDF,SN,PN,避免故障时再查询导致。查询周期为半小时一次。
self.nvme_hard_info_update_period = 30*60
self.nvme_list_info={} #存储nvme list的执行结果
self.nvme_list_info_update_period = 30*60
self.nvme_smart_log_info={} #以nvme盘维度,存储该盘下各类smart log查询的指标数值。1小时查询一次。
self.nvme_smart_log_info_update_period=60*60
self.last_collect_nvme_info_time = 0 #记录上次查询nvme硬件信息的时间,每过1小时清除一次,重新查询
self.collect_nvme_info_time_update_period = 60*60
self.log_nvme_error_info={"nvme_disk_bad":set()} #系统日志中查询到的各故障类别的nvme bdf
self.re_bdf = re.compile(r"[0-9a-fA-F]{4,}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]{1}")
self.re_nvme_name = re.compile(r'nvme[0-9]+', re.IGNORECASE)
self.nvme_block_bad_keywords = ["timeout, disable controller","timeout, reset controller","Removing after probe failure status",
"IO queues not created","failed to mark controller live","frozen state error detected, reset controller",
"failure state error detected, request disconnect","Device not ready; aborting","Minimum device page size",
"Reading VS failed","Reading CAP failed","Identify Controller failed"]
def get_nvme_hard_info(self):
return self.nvme_hard_info
def get_nvme_smart_log_info(self):
return self.nvme_smart_log_info
def get_log_nvme_error_info(self):
return self.log_nvme_error_info
def dump_data(self):
'''for debug'''
print("nvme_hard_info:", self.get_nvme_hard_info())
print("nvme_samrt_log_info:", self.get_nvme_smart_log_info())
print("log_nvme_error_info:", self.get_log_nvme_error_info())
def get_bdf_by_name(self, nvme_name):
for bdf, nvme_info in self.nvme_hard_info.items():
if nvme_name == nvme_info.get('nvme_name', ''):
return bdf
return None
def sys_prepare(self):
if self.prepare_init == 1:
return 0
file_list = ['/etc/os-release','/etc/redhat-release','/etc/tlinux-release','/etc/centos-release','/etc/system-release']
release_file=""
for file in file_list:
if os.path.exists(file):
release_file = file
break
if release_file == "":
self.logger().error("get release file failed")
return 0
with open(release_file) as f:
for line in f:
if "NAME=" in line:
if "Ubuntu" in line or "Debian" in line:
self.log_path = '/var/log/syslog'
else:
self.log_path = '/var/log/messages'
break
self.prepare_init = 1
return 0
def get_nvme_hard_info_by_nvmelist(self, nvme_name):
nvme_dname = "/dev/%sn1" % nvme_name
info = self.nvme_list_info.get(nvme_dname, dict())
now = int(time.time())
if info and now - info.get('update_time',0) < self.nvme_list_info_update_period:
return info
os.chmod(self.nvme_tool,stat.S_IRWXU)
cmd = "%s list " % (self.nvme_tool)
lines = CommUtils.ExecuteTimeoutCommand(cmd,3).split('\n')
for line in lines:
line = line.strip()
if not line.startswith("/dev/"):
continue
items = re.split(r'\s{2,}', line)
if len(items)>2:
nvme_dname = items[0]
nvme_serial = items[1]
nvme_model = items[2]
self.nvme_list_info[nvme_dname] = {'nvme_model':nvme_model,'nvme_serial':nvme_serial, 'update_time':now}
return self.nvme_list_info.get(nvme_dname, dict())
def get_nvme_hard_info_by_nvmefile(self, nvme_dir):
nvme_info={}
nvme_serial_path = nvme_dir + "/serial"
nvme_serial = get_file_content(nvme_serial_path)
if nvme_serial == "":
return nvme_info
nvme_info['nvme_serial'] = nvme_serial
nvme_model_path = nvme_dir + "/model"
nvme_info['nvme_model'] = get_file_content(nvme_model_path)
nvme_info['update_time'] = int(time.time())
return nvme_info
def collect_nvme_hard_info(self, nvme_bdf):
now = int(time.time())
if now - self.nvme_hard_info.get(nvme_bdf, dict()).get("updata_time", 0) < self.nvme_hard_info_update_period:
return
pci_addr = nvme_bdf
try:
nvme_file = glob.glob('/sys/bus/pci/devices/%s/nvme/nvme*' % pci_addr)
if not nvme_file:
nvme_file = glob.glob('/sys/devices/pci%s/%s/nvme/nvme*' % (pci_addr[0:7],pci_addr))
if not nvme_file:
nvme_file = glob.glob('/sys/devices/pci%s/%s/misc/nvme*' % (pci_addr[0:7],pci_addr))
if nvme_file:
nvme_name = os.path.basename(nvme_file[0])
nvme_info = self.get_nvme_hard_info_by_nvmefile(nvme_file[0])
if not nvme_info:
nvme_info = self.get_nvme_hard_info_by_nvmelist(nvme_name)
if not nvme_info:
self.logger().error("get nvme hardware info failed")
return
nvme_info['bdf'] = nvme_bdf
nvme_info['nvme_name'] = nvme_name
self.nvme_hard_info[nvme_bdf] = nvme_info
else:
self.logger().error("get nvme hardware info failed")
except Exception:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
# collect all vm nvme device status
def collect_nvme_info(self):
# pdb.set_trace()
try:
command = 'lspci -D | grep "Non-Volatile memory controller" '
lines = os.popen(command).readlines()
for line in lines:
nvme_bdf = line.split()[0] #get nvme disk BDF
self.collect_nvme_hard_info(nvme_bdf)
self.check_nvme_smart_log_info(nvme_bdf)
self.parse_nvme_log()
except Exception:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
def parse_nvme_log(self):
# pdb.set_trace()
if self.prepare_init == 0:
return None
# if we can't get the log file,report this vm
if not os.path.exists(self.log_path):
self.logger().info("%s is not exist" % self.log_path)
return None
with open(self.log_path) as f:
if self.log_offset == None: #进程启动后首次读取,放弃旧历史log内容,直接seek到文件末尾
f.seek(0, 2)
self.log_offset = f.tell()
return
elif os.path.getsize(self.log_path) < self.log_offset: #文件大小小于上次读取的偏移量,认为文件发生转置,重置offset
self.log_offset = 0
f.seek(self.log_offset, 0) #从文件开始位置设置读取偏移量
nvme_name=None
for line in f:
if not "nvme" in line:
continue
nvme_bdf = None
nvme_name = None
for keyword in self.nvme_block_bad_keywords:
if keyword in line:
bdf_match = self.re_bdf.search(line)
if bdf_match:
nvme_bdf = bdf_match.group()
else:
nvme_name_match = self.re_nvme_name.search(line)
if nvme_name_match:
nvme_name = nvme_name_match.group()
nvme_bdf = self.get_bdf_by_name(nvme_name)
else:
self.logger().error("get nvme disk name failed")
if nvme_bdf:
self.log_nvme_error_info['nvme_disk_bad'].add(nvme_bdf)
self.log_offset = f.tell()
def parse_nvme_smart_log(self, lines):
nvme_data=dict()
for line in lines:
parts = line.split(':')
if len(parts) != 2:
continue
value_str = parts[1].strip()
if value_str.isdigit():
value = int(value_str)
elif value_str[0:-1].strip().isdigit():
value = int(value_str[0:-1].strip())
elif value_str.count('.') == 1 and value_str.replace('.','').isdigit():
value = float(value_str)
elif value_str[0:-1].strip().count('.') == 1 and value_str[0:-1].strip().replace('.','').isdigit():
value = float(value_str[0:-1].strip())
else:
value = value_str
nvme_data[parts[0].strip()]=value
return nvme_data
def check_nvme_smart_log_info(self,nvme_bdf):
if not os.path.exists(self.nvme_tool):
self.logger().error("%s is not exist" % self.nvme_tool)
return
now = int(time.time())
nvme_name = self.nvme_hard_info.get(nvme_bdf, dict()).get('nvme_name', '')
if not nvme_name:
self.logger().error("can't get %s nvme name, nvme info dict:%s" % (nvme_bdf, str(self.nvme_hard_info)))
return
if now - self.nvme_smart_log_info.get(nvme_bdf,dict()).get('update_time',0) < self.nvme_smart_log_info_update_period:
return
self.nvme_smart_log_info[nvme_bdf]={'update_time':now}
nvme_dname = "/dev/%sn1" % nvme_name
os.chmod(self.nvme_tool,stat.S_IRWXU)
cmd = "%s smart-log %s " % (self.nvme_tool, nvme_dname)
lines = CommUtils.ExecuteTimeoutCommand(cmd,3).split('\n')
nvme_data = self.parse_nvme_smart_log(lines)
smart_log_info = {}
if "percentage_used" in nvme_data:
smart_log_info["nvme_life"] = nvme_data["percentage_used"]
if "media_errors" in nvme_data:
smart_log_info["media_errors"] = nvme_data["media_errors"]
if "critical_warning" in nvme_data:
smart_log_info["critical_warning"] = nvme_data["critical_warning"]
if len(smart_log_info) > 0:
smart_log_info['update_time'] = now
self.nvme_smart_log_info[nvme_bdf] = smart_log_info
def check_nvme_drop_disk_info(self):
if not os.path.exists(self.nvme_tool):
self.logger().error("%s is not exist" % self.nvme_tool)
return
# now = int(time.time())
# if now - self.nvme_smart_log_info.get(nvme_bdf,dict()).get('update_time',0) < self.nvme_smart_log_info_update_period:
# return
os.chmod(self.nvme_tool,stat.S_IRWXU)
cmd = "%s list " % (self.nvme_tool)
cmd_out = CommUtils.ExecuteTimeoutCommand(cmd,3).split('\n')
nvme_lines = 0
for line in cmd_out:
if '/dev/nvme' in line:
nvme_lines += 1
cmd = 'lspci -D | grep "Non-Volatile memory controller" '
# pci_lines = os.popen(cmd).readlines()
cmd_out = CommUtils.ExecuteTimeoutCommand(cmd,3).split('\n')
pci_lines = 0
for line in cmd_out:
if 'Non-Volatile memory controller' in line:
pci_lines += 1
if nvme_lines != pci_lines:
return 1
else:
return 0
def isNvmeVM(self):
if self.is_nvme_vm_valid == 1:
return None
command = 'lspci -nn | grep "Non-Volatile memory controller"'
lines = os.popen(command).readlines()
if len(lines) == 0:
self.is_nvme_vm = 0
else:
self.is_nvme_vm = 1
self.is_nvme_vm_valid = 1
return None
def doCollect(self):
self.sys_prepare()
self.clear_data()
self.collect_nvme_info()
def clear_data(self):
self.log_nvme_error_info['nvme_disk_bad'].clear()
now = int(time.time())
if now - self.last_collect_nvme_info_time > self.collect_nvme_info_time_update_period:
self.last_collect_nvme_info_time = now
#每隔一小时 读取nvme info先清空之前的,防止有原来的数据导致误告警
self.nvme_hard_info.clear()
self.nvme_smart_log_info.clear()
class VsVmNvmeQualityCollector(VmBaseCollector):
def init(self):
self.set_frequency(60)
self._collector = VsVmNvmeQuality(self.logger)
self._handle = MetricHandler()
self._handle.namespace = 'qce/cvm'
self._last_report_nws_time = 0
self._last_report_alarm_time = 0
self.alarm_control_info={
'nvme_through_sold_life_end':{"alarm_period":24*60*60, "alarm_id":1111, "alarm_desc":"nvme life end"},
'nvme_disk_media_errors':{"alarm_period":24*60*60, "alarm_id":1110, "alarm_desc":"nvme disk media errors count"},
'nvme_disk_critical_warning':{"alarm_period":24*60*60, "alarm_id":1109, "alarm_desc":"nvme disk critical warning count"},
'nvme_disk_bad':{"alarm_period":60*60, "alarm_id":1101, "alarm_desc":"nvme disk bad"}
}
self.alarm_time_record={}
self.uuid=""
def do_alarm_proxy(self, event_name, nvme_info):
alarmtime = time.strftime("%Y-%m-%d %H:%M:%S+0800", time.localtime())
alarmproxy_metric = {"CallerName": "barad_agent",
"CallerKey":"PSbhht7wQLEbH6OjDXTayQ==",
"AlarmTime":alarmtime,
"Slot":nvme_info.get('bdf',''),
"SN": nvme_info.get('nvme_serial',''),
"PN": nvme_info.get('nvme_model',''),
"Dimensions":[
{"Key" : "Uuid",
"Value" :self.uuid}
],
"AlarmId":self.alarm_control_info.get(event_name,dict()).get("alarm_id",0),
"EventName":event_name,
"FaultType": "Hardware",
"FaultDesc":self.alarm_control_info.get(event_name,dict()).get("alarm_desc",""),
"DeviceName":"NVMe",
"DeviceId": ""}
data_alarmproxy = {'sender':'alarmproxy_sender', 'datas': alarmproxy_metric}
self.put_data(data_alarmproxy)
def do_data_event(self, event_name, nvme_name):
stime = int(time.time())
event_metric = {"version":1,
"Action": "SendEventAlarm",
"caller":"cvm",
"callee":"QCMonitor",
"productName":"cvm",
"timestamp":stime,
"occurTime":stime,
"dimensions":[{"key" : "uuid", "value" :self.uuid}],
"eventName":event_name,
"status":1,
"additionalMsg":[{"key" : "name", "value" :nvme_name}]
}
data_event = {'sender':'event_sender', 'datas': event_metric}
self.put_data(data_event)
def do_data_tss(self, event_name, nvme_name):
ftime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
stime = int(time.time())
tss = {"version":1,
"Action": "SendTssMessage",
"caller":"cvm",
"callee":"QCMonitor",
"productName":"cvm",
"timestamp":stime,
"occurTime":ftime,
"dimensions":[{"key" : "uuidList", "value" :[self.uuid]}],
"eventName":event_name,
"status":1,
"additionalMsg":{"disk_info":nvme_name}}
data_tss = {'sender':'event_sender', 'datas': tss}
self.put_data(data_tss)
def control_alarm(self,event_name,nvme_info):
bdf = nvme_info.get('bdf',None)
if not bdf:
return
last_alarm_time = self.alarm_time_record.get(event_name, dict()).get(bdf, 0)
now = int(time.time())
if now - last_alarm_time > self.alarm_control_info.get(event_name, dict()).get('alarm_period', 60*60):
self.do_alarm_proxy(event_name, nvme_info)
event_record = self.alarm_time_record.get(event_name,dict())
if event_record:
self.alarm_time_record[event_name][bdf] = now
else:
self.alarm_time_record[event_name]={bdf:now}
def do_collect(self):
# pdb.set_trace()
now = int(time.time())
try:
self._collector.isNvmeVM()
if self._collector.is_nvme_vm:
self._collector.doCollect()
self.uuid=self.get_vm_uuid()
else:
return
except Exception as e:
self.logger().error(traceback.format_exc().replace('\n', '\\n'))
self._handle.clean_metrics()
nvme_hard_info = self._collector.get_nvme_hard_info()
nvme_smart_log_info = self._collector.get_nvme_smart_log_info()
log_nvme_error_info = self._collector.get_log_nvme_error_info()
disk_drop_error_info = self._collector.check_nvme_drop_disk_info()
for bdf, info in nvme_smart_log_info.items():
nvme_name=nvme_hard_info.get(bdf, dict()).get('nvme_name','')
dimension = {'uuid':self.uuid,'address':bdf,'nvme_name':nvme_name}
if info.get('nvme_life', 0) >= 97:
self.control_alarm("nvme_through_sold_life_end", nvme_hard_info.get(bdf, dict()))
self.logger().error("nvme_smart_log_info nvme_life %s" % nvme_smart_log_info)
self._handle.add_metric(name='nvme_life',dimensions=dimension, timestamp=now, value=info.get('nvme_life', 0))
if info.get('media_errors', 0) > 0:
self.control_alarm("nvme_disk_media_errors", nvme_hard_info.get(bdf, dict()))
self.logger().error("nvme_smart_log_info media_errors %s" % nvme_smart_log_info)
self._handle.add_metric(name='media_errors',dimensions=dimension, timestamp=now, value=info.get('media_errors', 0))
if info.get('critical_warning', 0) > 0:
self.control_alarm("nvme_disk_critical_warning", nvme_hard_info.get(bdf, dict()))
self.logger().error("nvme_smart_log_info critical_warning %s" % nvme_smart_log_info)
self._handle.add_metric(name='critical_warning',dimensions=dimension, timestamp=now, value=info.get('critical_warning', 0))
for event_name, bdfs in log_nvme_error_info.items():
for bdf in bdfs:
self.control_alarm(event_name, nvme_hard_info.get(bdf, dict()))
if now - self._last_report_nws_time >= 60:
nvme_name=nvme_hard_info.get(bdf, dict()).get('nvme_name','')
dimension = {'uuid':self.uuid,'address':bdf,'nvme_name':nvme_name}
self._handle.add_metric(name='nvme_error',dimensions=dimension, timestamp=now,value=1)
self.do_data_event("nvme_error", nvme_name)
self.do_data_tss("nvme_error", nvme_name)
if now - self._last_report_nws_time >= 60:
dimension = {'uuid':self.uuid,'address':'','nvme_name':''}
self._handle.add_metric(name='nvme_disk_drop_error',dimensions=dimension, timestamp=now,value=disk_drop_error_info)
if len(self._handle.get_metrics()) > 0:
data = {'sender': 'nws_sender', 'datas':self._handle.pop_metrics()}
self.put_data(data)
self._last_report_nws_time = now
def main():
collector = VsVmNvmeQualityCollector()
while 1:
collector.do_collect()
collector._collector.dump_data()
#collector.dump_data()
time.sleep(15)
if __name__ == '__main__':
main()