File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/check.py
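"""Barad VM collector that reports the gpu_nvidia_hung metric.

On VMs with an NVIDIA GPU (detected via lspci), it looks for processes stuck
in uninterruptible sleep inside nvidia kernel functions and reports 1 when
such a hang is found, 0 otherwise.
"""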
import os
import sys
import time

# Make the shared comm/ modules (constant, plugin_base, utils, cutils) importable.
sys.path.append(os.getcwd() + '/../../../comm/')

import constant
from plugin_base import VmBaseCollector
from utils.metric_handler import MetricHandler
from cutils import CommUtils


class CheckCollector(VmBaseCollector):
    def init(self):
        # Collect once every 600 seconds.
        self.set_frequency(600)
        self.handler = MetricHandler()
        self.handler.namespace = 'qce/cvm'
        self.handler.dimensions = ['vm_uuid', 'vmip']
        # 0xff means "GPU presence not probed yet"; resolved to 0/1 on first collect.
        self.hasNvidiaGpu = 0xff

    def isNvidiaGpuVM(self):
        """Return 1 if an NVIDIA PCI device is present, 0 otherwise."""
        is_gpu_vm = 0
        try:
            # 10de is NVIDIA's PCI vendor ID; lspci -d filters by vendor.
            command = 'lspci -d 10de:'
            output = CommUtils.ExecuteTimeoutCommand(command, 3)
            if len(output.strip()) > 0:
                is_gpu_vm = 1
        except Exception:
            pass
        return is_gpu_vm

    def hasDProcessInNvidiaFunction(self):
        """Return 1 if a D-state process is stuck in an nvidia kernel function, 0 otherwise."""
        command = "ps -eo state,pid"
        try:
            output = CommUtils.ExecuteTimeoutCommand(command, 3)
            lines = output.strip().split('\n')
        except Exception:
            return 0
        # Skip the header line, then inspect only D-state (uninterruptible sleep) processes.
        for line in lines[1:]:
            try:
                state, pid = line.strip().split()
                if state != "D":
                    continue
                # A D-state process whose kernel stack contains nvidia frames is treated as a GPU hang.
                command2 = "cat /proc/%d/stack" % int(pid)
                output2 = CommUtils.ExecuteTimeoutCommand(command2, 3)
                if "nvidia" in output2:
                    return 1
            except Exception:
                pass
        return 0

    def do_collect(self):
        vm_uuid = self.get_vm_uuid()
        vmip = self.get_vmip()
        now = int(time.time())
        # Probe for an NVIDIA GPU only once; the result is cached across collection cycles.
        if self.hasNvidiaGpu == 0xff:
            self.hasNvidiaGpu = self.isNvidiaGpuVM()
        if self.hasNvidiaGpu == 1:
            # Report 1 when a hung process in nvidia kernel code is found, else 0.
            hasNvidiaHung = self.hasDProcessInNvidiaFunction()
            dimensions = {'vm_uuid': vm_uuid, 'vmip': vmip}
            batch_metric = [
                {'name': 'gpu_nvidia_hung', 'value': hasNvidiaHung},
            ]
            self.handler.add_batch_metric(batch=batch_metric, dimensions=dimensions, timestamp=now)
        if len(self.handler.get_metrics()) > 0:
            data = {'sender': 'nws_sender', 'datas': self.handler.pop_metrics()}
            self.put_data(data)

def main():
    collector = CheckCollector()
    collector.init()
    collector.collect()
    collector.dump_data()


if __name__ == '__main__':
    main()