HEX
Server: Apache/2.4.52 (Ubuntu)
System: Linux WebLive 5.15.0-79-generic #86-Ubuntu SMP Mon Jul 10 16:07:21 UTC 2023 x86_64
User: ubuntu (1000)
PHP: 7.4.33
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Upload Files
File: //usr/local/qcloud/monitor/barad/plugin/collector/vm/check.py
import sys
import os,time
sys.path.append(os.getcwd() + '/../../../comm/')
import constant
from plugin_base import VmBaseCollector
from utils.metric_handler import MetricHandler
from cutils import CommUtils

class CheckCollector(VmBaseCollector):
    """Collector that reports the ``gpu_nvidia_hung`` metric for NVIDIA GPU VMs.

    On each collection cycle, if ``lspci`` lists a device with PCI vendor ID
    ``10de`` (NVIDIA), the collector looks for processes in state ``D`` whose
    ``/proc/<pid>/stack`` mentions "nvidia" and reports 1 if any is found,
    otherwise 0.
    """

    # Value held by ``hasNvidiaGpu`` while GPU presence is still undetermined.
    _GPU_STATE_UNKNOWN = 0xff

    def init(self):
        """Set the collection frequency, the metric handler and the GPU state."""
        self.set_frequency(600)
        self.handler = MetricHandler()
        self.handler.namespace = 'qce/cvm'
        self.handler.dimensions = ['vm_uuid', 'vmip']
        self.hasNvidiaGpu = self._GPU_STATE_UNKNOWN

    def _probeNvidiaGpu(self):
        """Probe for an NVIDIA PCI device via ``lspci -d 10de:``.

        Returns:
            1 if the command produced any output, 0 if it produced none, or
            ``_GPU_STATE_UNKNOWN`` if running or inspecting the command raised,
            so that the caller can tell "no GPU" apart from "probe failed".
        """
        try:
            # NOTE(review): the second argument is presumably a timeout in
            # seconds -- confirm against CommUtils.ExecuteTimeoutCommand.
            lines = CommUtils.ExecuteTimeoutCommand('lspci -d 10de:', 3)
            return 1 if len(lines) > 0 else 0
        except Exception:
            # Best effort: a failed probe must not break collection.
            return self._GPU_STATE_UNKNOWN

    def isNvidiaGpuVM(self):
        """Return 1 if ``lspci`` lists an NVIDIA device, otherwise 0.

        A failed probe is also reported as 0, so callers always get 0 or 1.
        """
        return 1 if self._probeNvidiaGpu() == 1 else 0

    def hasDProcessInNvidiaFunction(self):
        """Return 1 if a ``D``-state process has "nvidia" in its stack, else 0.

        Processes are listed with ``ps -eo state,pid``; for each one whose
        state is ``D``, ``/proc/<pid>/stack`` is read and searched for the
        substring "nvidia". Returns 0 if the process list cannot be obtained.
        """
        try:
            output = CommUtils.ExecuteTimeoutCommand("ps -eo state,pid", 3)
            lines = output.strip().split('\n')
        except Exception:
            return 0

        # The first line of the ps output is the column header.
        for line in lines[1:]:
            try:
                state, pid = line.strip().split()
                if state != "D":
                    continue
                stack = CommUtils.ExecuteTimeoutCommand(
                    "cat /proc/%d/stack" % int(pid), 3)
                if "nvidia" in stack:
                    return 1
            except Exception:
                # Best effort: skip malformed lines and processes whose stack
                # cannot be read, and keep scanning the rest.
                continue

        return 0

    def do_collect(self):
        """Collect ``gpu_nvidia_hung`` (on NVIDIA GPU VMs) and queue pending data.

        GPU presence is probed until a definite answer (0 or 1) is obtained
        and is cached from then on. A failed probe leaves the state unknown so
        that it is retried on the next cycle, instead of being cached as
        "no GPU" for the lifetime of the collector.
        """
        vm_uuid = self.get_vm_uuid()
        vmip = self.get_vmip()
        now = int(time.time())

        if self.hasNvidiaGpu == self._GPU_STATE_UNKNOWN:
            self.hasNvidiaGpu = self._probeNvidiaGpu()

        if self.hasNvidiaGpu == 1:
            hasNvidiaHung = self.hasDProcessInNvidiaFunction()

            dimensions = {'vm_uuid': vm_uuid, 'vmip': vmip}
            batch_metric = [
                {'name': 'gpu_nvidia_hung', 'value': hasNvidiaHung},
            ]

            self.handler.add_batch_metric(
                batch=batch_metric, dimensions=dimensions, timestamp=now)

        if len(self.handler.get_metrics()) > 0:
            data = {'sender': 'nws_sender', 'datas': self.handler.pop_metrics()}
            self.put_data(data)

def main():
    """Create a CheckCollector, initialise it, collect once and dump the data."""
    check_collector = CheckCollector()
    check_collector.init()
    check_collector.collect()
    check_collector.dump_data()

# Run a single collection pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()