File: //usr/local/qcloud/monitor/barad/plugin/collector/utils/log_utils.py
"""
Description : utils for log monitoring
created by : hetiulin 2014/08/05
"""
# -*- coding: utf-8 -*-
__author__ = 'hetiulin'
import os
from os import path
import hashlib
import re
MAX_LEN_MD5 = 512L  # hash at most the first 512 bytes of each file
class LogUtils:
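    """Track log files across monitoring rounds.

    Each file is described by a dict with the keys: filename, ino, dev,
    size, mtime, processed_size, md5size and md5sum.
    """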
def make_logfile_list(self, filename):
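        """Return a file-info dict for every file in the directory part of
        `filename` whose name matches its regular-expression part."""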
filelist = []
        # Split filename into a directory and a file-name pattern (regular expression)
        dirname, file_pattern = path.split(filename)
        if not dirname or not file_pattern:
            raise Exception('filename must consist of a directory and a file pattern')
prog = re.compile(file_pattern)
        for name in os.listdir(dirname):
            full_path = path.join(dirname, name)
            try:
                if path.isfile(full_path) and prog.match(name):
                    fileobj = {}
                    fileobj['filename'] = full_path
                    filestat = os.stat(full_path)
                    fileobj['ino'] = filestat.st_ino
                    fileobj['dev'] = filestat.st_dev
                    fileobj['size'] = filestat.st_size
                    fileobj['mtime'] = long(filestat.st_mtime)
                    fileobj['processed_size'] = 0L
                    if MAX_LEN_MD5 > filestat.st_size:
                        fileobj['md5size'] = filestat.st_size
                    else:
                        fileobj['md5size'] = MAX_LEN_MD5
                    # Compute the MD5 digest of at most the first 512 bytes
                    with open(full_path) as f:
                        fileobj['md5sum'] = hashlib.md5(f.read(fileobj['md5size'])).hexdigest()
                    filelist.append(fileobj)
            except:
                # Ignore log files that cannot be read
                pass
return filelist
    # Read the old file list from a temporary file
def read_old_fileinfos(self, filepath):
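        """Parse one file-info dict per line of `filepath`; each line holds
        the eight space-separated fields written by write_new_fileinfo:
        filename ino dev size mtime processed_size md5size md5sum."""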
# print filepath
files_old = []
try:
if path.exists(filepath):
                with open(filepath) as f:
                    for line in f:
                        file_info = line.split()
if 8 == len(file_info):
fileobj = {}
fileobj['filename'] = file_info[0]
fileobj['ino'] = long(file_info[1])
fileobj['dev'] = int(file_info[2])
fileobj['size'] = long(file_info[3])
fileobj['mtime'] = long(file_info[4])
fileobj['processed_size'] = long(file_info[5])
fileobj['md5size'] = long(file_info[6])
fileobj['md5sum'] = file_info[7]
files_old.append(fileobj)
except:
            # Ignore old file info that cannot be read
pass
return files_old
    # Write the file info processed in this round to a temporary file
    def write_new_fileinfo(self, filepath, fileinfos):
        lines = []
        for fileinfo in fileinfos:
            s = "%s %d %d %d %d %d %d %s\n" % (fileinfo['filename'],
                fileinfo['ino'], fileinfo['dev'], fileinfo['size'], fileinfo['mtime'],
                fileinfo['processed_size'], fileinfo['md5size'], fileinfo['md5sum'])
            lines.append(s)
        try:
            with open(filepath, 'w') as f:
                f.writelines(lines)
        except:
            raise Exception("Failed: can't write to file %s" % filepath)
    # Find the index of the new file that matches old file i_old; return -1 if not found
def find_old2new(self, old2new, num_new, i_old):
p = i_old * num_new
for col in range(num_new):
if 1 == old2new[p+col]:
return col
return -1
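    # Build a num_old x num_new 0/1 matrix, stored row-major in a flat list,
    # where cell (row, col) == 1 means old[row] and new[col] are judged to be
    # the same file; duplicates are resolved so the mapping is one-to-one.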
def setup_old2new(self, old, num_old, new, num_new):
old2new = [0] * num_new * num_old
p = 0
        # Build the mapping matrix between old and new files
for row in range(num_old):
for col in range(num_new):
if self.__is_same_file(old[row], new[col]):
old2new[p+col] = 1
else:
old2new[p+col] = 0
p += num_new
# print old2new
if 1 < num_old or 1 < num_new:
p = 0
            # Remove duplicate mappings so old and new files map one-to-one
for row in range(num_old):
ones = 0
for col in range(num_new):
if 1 == old2new[p+col]:
ones += 1
if 2 == ones:
self.__resolve_old2new(old2new, num_old, num_new)
return old2new
p += num_new
for col in range(num_new):
p = col
ones = 0
for row in range(num_old):
if 1 == old2new[p]:
ones += 1
if 2 == ones:
self.__resolve_old2new(old2new, num_old, num_new)
return old2new
p += num_new
# print old2new
return old2new
    # Decide whether two file-info entries refer to the same file.
    # Args: old - old file info
    #       new - new file info
def __is_same_file(self, old, new):
# print old
# print new
if old['ino'] != new['ino'] or old['dev'] != new['dev']:
            # inode and device id must match
return False
if old['mtime'] > new['mtime']:
            # mtime can only increase
return False
if old['size'] > new['size']:
            # file size can only grow; a truncated or replaced file counts as new
return False
        # if old['size'] == new['size'] and old['mtime'] < new['mtime']:
        #     # mtime should not change while the file size stays the same
        #     return False
if -1 == old['md5size'] or -1 == new['md5size']:
            # treat the files as different when the md5sums cannot be compared
return False
if old['md5size'] > new['md5size']:
            # the block size used to compute the md5 must not shrink
return False
if old['md5size'] == new['md5size']:
if old['md5sum'] != new['md5sum']:
                # the md5sums must match
return False
        else:
            if 0 < old['md5size']:
                # old hashed fewer bytes: re-hash that prefix of the new file
                with open(new['filename']) as f:
                    tmp = hashlib.md5(f.read(old['md5size'])).hexdigest()
                    if tmp != old['md5sum']:
                        return False
return True
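    # Resolve ambiguous mappings in place so that every non-protected row and
    # column of old2new ends up with at most one '1'.
    # Args: old2new - mapping matrix (modified in place)
    #       num_old - number of rows
    #       num_new - number of columns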
def __resolve_old2new(self, old2new, num_old, num_new):
protected_rows = [0]*num_old
protected_cols = [0]*num_new
        for row in range(num_old):
            # Find every row/column pair that contains exactly one '1' each,
            # with that '1' at their intersection, and protect it
            c = self.__is_uniq_row(old2new, num_new, row)
            if -1 != c and -1 != self.__is_uniq_col(old2new, num_old, num_new, c):
                protected_rows[row] = 1
                protected_cols[c] = 1
        # De-duplicate the remaining rows and columns
        if num_old <= num_new:
            # start from the top-left corner
for row in range(num_old):
if 1 == protected_rows[row]:
continue
p = row * num_new
for col in range(num_new):
                # find the first '1' whose column is not protected
if 1 == old2new[p+col] and 1 != protected_cols[col]:
self.__cross_out(old2new, num_old, num_new, row, col, protected_rows, protected_cols)
break
else:
            # start from the bottom-right corner
for row in range(num_old-1, -1, -1):
if 1 == protected_rows[row]:
continue
p = row * num_new
for col in range(num_new-1, -1, -1):
                # find the first '1' whose column is not protected
if 1 == old2new[p+col] and 1 != protected_cols[col]:
self.__cross_out(old2new, num_old, num_new, row, col, protected_rows, protected_cols)
break
    # Zero every cell in the given row and column except their intersection.
    # Args: old2new  - mapping matrix
    #       num_rows - number of rows in the matrix
    #       num_cols - number of columns in the matrix
    #       row      - the row to clear
    #       col      - the column to clear
def __cross_out(self, old2new, num_rows, num_cols, row, col, p_rows, p_cols):
p = row * num_cols
for i in range(num_cols):
            # clear the rest of the row
if 1 != p_cols[i] and col != i:
old2new[p+i] = 0
p = col
for i in range(num_rows):
            # clear the rest of the column
if 1 != p_rows[i] and row != i:
old2new[p] = 0
p += num_cols
    # Check whether a given row contains exactly one '1'.
    # Args: old2new  - mapping matrix
    #       num_cols - number of columns in the matrix
    #       row      - the row to check
    # Returns: the column index of the '1', or -1 if the row contains zero
    #          or more than one '1'
def __is_uniq_row(self, old2new, num_cols, row):
p = row * num_cols
ones = 0
ret = -1
for i in range(num_cols):
if 1 == old2new[p+i]:
ones += 1
if 2 == ones:
ret = -1
break
ret = i
return ret
    # Check whether a given column contains exactly one '1'.
    # Args: old2new  - mapping matrix
    #       num_rows - number of rows in the matrix
    #       num_cols - number of columns in the matrix
    #       col      - the column to check
    # Returns: the row index of the '1', or -1 if the column contains zero
    #          or more than one '1'
def __is_uniq_col(self, old2new, num_rows, num_cols, col):
p = col
ones = 0
ret = -1
for i in range(num_rows):
if 1 == old2new[p]:
ones += 1
if 2 == ones:
ret = -1
break
ret = i
p += num_cols
return ret
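# Illustrative sketch (an assumption, not part of the original collector):
# how one monitoring round might combine the helpers above. STATE_FILE and
# the log-file pattern are hypothetical placeholders.
def example_round():
    log = LogUtils()
    STATE_FILE = '/tmp/log_utils_state'                # hypothetical state file
    new = log.make_logfile_list('/var/log/mylog.*')    # hypothetical log pattern
    old = log.read_old_fileinfos(STATE_FILE)
    old2new = log.setup_old2new(old, len(old), new, len(new))
    for i in range(len(old)):
        j = log.find_old2new(old2new, len(new), i)
        if -1 != j:
            # same file as last round: resume from the already-processed offset
            new[j]['processed_size'] = old[i]['processed_size']
    log.write_new_fileinfo(STATE_FILE, new)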
def test_make_logfile_list():
log = LogUtils()
    filelist = log.make_logfile_list(r'./.*\.py$')
print filelist
def test_readwrite_fileinfo():
log = LogUtils()
fileinfo = [{'filename':'a.log','ino':1L, 'dev':1L, 'mtime':1, 'size':1L, 'processed_size':1L,
'incomplete': 0, 'md5size':1, 'md5sum':'aa'},
{'filename':'b.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'processed_size':1,
'incomplete': 0,'md5size':1, 'md5sum':'aa'}]
log.write_new_fileinfo('TEST', fileinfo)
fileinfo2 = log.read_old_fileinfos('TEST')
print fileinfo2
def test_setup_old2new():
log = LogUtils()
# 1 1 1
# 1 1 1
# 1 1 1
old = [
{'filename':'a.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'b.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'c.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
new = [
{'filename':'1.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'2.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'3.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
old2new = log.setup_old2new(old, 3, new, 3)
p = 0
for i in range(3):
for j in range(3):
print old2new[p+j],
p += 3
print
# 1
old = [
{'filename':'a.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
new = [
{'filename':'1.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
old2new = log.setup_old2new(old, 1, new, 1)
print old2new
if __name__ == '__main__':
test_make_logfile_list()
# test_setup_old2new()
# test_readwrite_fileinfo()