File: //usr/local/qcloud/monitor/barad/plugin/collector/utils/log_utils.py
"""
Description : utils for log monitoring
created by : hetiulin 2014/08/05
"""
# -*- coding: utf-8 -*-
__author__ = 'hetiulin'
import os
from os import path
import hashlib
import re
MAX_LEN_MD5 = 512L  # hash at most the first 512 bytes of each file
class LogUtils:
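    """Track log files across monitoring rounds.

    Each file is described by a dict with the keys: filename, ino, dev,
    size, mtime, processed_size, md5size and md5sum.
    """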
def make_logfile_list(self, filename):
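        """Return a file-info dict for every file in the directory part of
        `filename` whose name matches its regular-expression part."""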
filelist = []
        # Split filename into a directory and a file-name pattern (regular expression)
        dirname, file_pattern = path.split(filename)
        if not dirname or not file_pattern:
            raise Exception('filename must consist of a directory and a file pattern')
prog = re.compile(file_pattern)
        for name in os.listdir(dirname):
            full_path = path.join(dirname, name)
            try:
                if path.isfile(full_path) and prog.match(name):
                    fileobj = {}
                    fileobj['filename'] = full_path
                    filestat = os.stat(full_path)
                    fileobj['ino'] = filestat.st_ino
                    fileobj['dev'] = filestat.st_dev
                    fileobj['size'] = filestat.st_size
                    fileobj['mtime'] = long(filestat.st_mtime)
                    fileobj['processed_size'] = 0L
                    if MAX_LEN_MD5 > filestat.st_size:
                        fileobj['md5size'] = filestat.st_size
                    else:
                        fileobj['md5size'] = MAX_LEN_MD5
                    # Compute the MD5 digest of at most the first 512 bytes
                    with open(full_path) as f:
                        fileobj['md5sum'] = hashlib.md5(f.read(fileobj['md5size'])).hexdigest()
                    filelist.append(fileobj)
            except:
                # Ignore log files that cannot be read
                pass
return filelist
    # Read the old file list from a temporary file
def read_old_fileinfos(self, filepath):
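        """Parse one file-info dict per line of `filepath`; each line holds
        the eight space-separated fields written by write_new_fileinfo:
        filename ino dev size mtime processed_size md5size md5sum."""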
# print filepath
files_old = []
try:
if path.exists(filepath):
                with open(filepath) as f:
                    for line in f:
                        file_info = line.split()
if 8 == len(file_info):
fileobj = {}
fileobj['filename'] = file_info[0]
fileobj['ino'] = long(file_info[1])
fileobj['dev'] = int(file_info[2])
fileobj['size'] = long(file_info[3])
fileobj['mtime'] = long(file_info[4])
fileobj['processed_size'] = long(file_info[5])
fileobj['md5size'] = long(file_info[6])
fileobj['md5sum'] = file_info[7]
files_old.append(fileobj)
except:
            # Ignore old file info that cannot be read
pass
return files_old
    # Write the file info processed in this round to a temporary file
    def write_new_fileinfo(self, filepath, fileinfos):
        lines = []
        for fileinfo in fileinfos:
            s = "%s %d %d %d %d %d %d %s\n" % (fileinfo['filename'],
                fileinfo['ino'], fileinfo['dev'], fileinfo['size'], fileinfo['mtime'],
                fileinfo['processed_size'], fileinfo['md5size'], fileinfo['md5sum'])
            lines.append(s)
        try:
            with open(filepath, 'w') as f:
                f.writelines(lines)
        except:
            raise Exception("Failed: can't write to file %s" % filepath)
    # Find the index of the new file that matches old file i_old; return -1 if not found
def find_old2new(self, old2new, num_new, i_old):
p = i_old * num_new
for col in range(num_new):
if 1 == old2new[p+col]:
return col
return -1
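    # Build a num_old x num_new 0/1 matrix, stored row-major in a flat list,
    # where cell (row, col) == 1 means old[row] and new[col] are judged to be
    # the same file; duplicates are resolved so the mapping is one-to-one.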
def setup_old2new(self, old, num_old, new, num_new):
old2new = [0] * num_new * num_old
p = 0
        # Build the mapping matrix between old and new files
for row in range(num_old):
for col in range(num_new):
if self.__is_same_file(old[row], new[col]):
old2new[p+col] = 1
else:
old2new[p+col] = 0
p += num_new
# print old2new
if 1 < num_old or 1 < num_new:
p = 0
            # Remove duplicate mappings so old and new files map one-to-one
for row in range(num_old):
ones = 0
for col in range(num_new):
if 1 == old2new[p+col]:
ones += 1
if 2 == ones:
self.__resolve_old2new(old2new, num_old, num_new)
return old2new
p += num_new
for col in range(num_new):
p = col
ones = 0
for row in range(num_old):
if 1 == old2new[p]:
ones += 1
if 2 == ones:
self.__resolve_old2new(old2new, num_old, num_new)
return old2new
p += num_new
# print old2new
return old2new
    # Decide whether two file-info entries refer to the same file.
    # Args: old - old file info
    #       new - new file info
def __is_same_file(self, old, new):
# print old
# print new
if old['ino'] != new['ino'] or old['dev'] != new['dev']:
            # inode and device id must match
return False
if old['mtime'] > new['mtime']:
            # mtime can only increase
return False
if old['size'] > new['size']:
            # file size can only grow; a truncated or replaced file counts as new
return False
        # if old['size'] == new['size'] and old['mtime'] < new['mtime']:
        #     # mtime should not change while the file size stays the same
        #     return False
if -1 == old['md5size'] or -1 == new['md5size']:
            # treat the files as different when the md5sums cannot be compared
return False
if old['md5size'] > new['md5size']:
            # the block size used to compute the md5 must not shrink
return False
if old['md5size'] == new['md5size']:
if old['md5sum'] != new['md5sum']:
                # the md5sums must match
return False
        else:
            if 0 < old['md5size']:
                # old hashed fewer bytes: re-hash that prefix of the new file
                with open(new['filename']) as f:
                    tmp = hashlib.md5(f.read(old['md5size'])).hexdigest()
                    if tmp != old['md5sum']:
                        return False
return True
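    # Resolve ambiguous mappings in place so that every non-protected row and
    # column of old2new ends up with at most one '1'.
    # Args: old2new - mapping matrix (modified in place)
    #       num_old - number of rows
    #       num_new - number of columns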
def __resolve_old2new(self, old2new, num_old, num_new):
protected_rows = [0]*num_old
protected_cols = [0]*num_new
        for row in range(num_old):
            # Find every row/column pair that contains exactly one '1' each,
            # with that '1' at their intersection, and protect it
            c = self.__is_uniq_row(old2new, num_new, row)
            if -1 != c and -1 != self.__is_uniq_col(old2new, num_old, num_new, c):
                protected_rows[row] = 1
                protected_cols[c] = 1
        # De-duplicate the remaining rows and columns
        if num_old <= num_new:
            # start from the top-left corner
for row in range(num_old):
if 1 == protected_rows[row]:
continue
p = row * num_new
for col in range(num_new):
                # find the first '1' whose column is not protected
if 1 == old2new[p+col] and 1 != protected_cols[col]:
self.__cross_out(old2new, num_old, num_new, row, col, protected_rows, protected_cols)
break
else:
            # start from the bottom-right corner
for row in range(num_old-1, -1, -1):
if 1 == protected_rows[row]:
continue
p = row * num_new
for col in range(num_new-1, -1, -1):
                # find the first '1' whose column is not protected
if 1 == old2new[p+col] and 1 != protected_cols[col]:
self.__cross_out(old2new, num_old, num_new, row, col, protected_rows, protected_cols)
break
    # Zero every cell in the given row and column except their intersection.
    # Args: old2new  - mapping matrix
    #       num_rows - number of rows in the matrix
    #       num_cols - number of columns in the matrix
    #       row      - the row to clear
    #       col      - the column to clear
def __cross_out(self, old2new, num_rows, num_cols, row, col, p_rows, p_cols):
p = row * num_cols
for i in range(num_cols):
            # clear the rest of the row
if 1 != p_cols[i] and col != i:
old2new[p+i] = 0
p = col
for i in range(num_rows):
            # clear the rest of the column
if 1 != p_rows[i] and row != i:
old2new[p] = 0
p += num_cols
    # Check whether a given row contains exactly one '1'.
    # Args: old2new  - mapping matrix
    #       num_cols - number of columns in the matrix
    #       row      - the row to check
    # Returns: the column index of the '1', or -1 if the row contains zero
    #          or more than one '1'
def __is_uniq_row(self, old2new, num_cols, row):
p = row * num_cols
ones = 0
ret = -1
for i in range(num_cols):
if 1 == old2new[p+i]:
ones += 1
if 2 == ones:
ret = -1
break
ret = i
return ret
    # Check whether a given column contains exactly one '1'.
    # Args: old2new  - mapping matrix
    #       num_rows - number of rows in the matrix
    #       num_cols - number of columns in the matrix
    #       col      - the column to check
    # Returns: the row index of the '1', or -1 if the column contains zero
    #          or more than one '1'
def __is_uniq_col(self, old2new, num_rows, num_cols, col):
p = col
ones = 0
ret = -1
for i in range(num_rows):
if 1 == old2new[p]:
ones += 1
if 2 == ones:
ret = -1
break
ret = i
p += num_cols
return ret
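# Illustrative sketch (an assumption, not part of the original collector):
# how one monitoring round might combine the helpers above. STATE_FILE and
# the log-file pattern are hypothetical placeholders.
def example_round():
    log = LogUtils()
    STATE_FILE = '/tmp/log_utils_state'                # hypothetical state file
    new = log.make_logfile_list('/var/log/mylog.*')    # hypothetical log pattern
    old = log.read_old_fileinfos(STATE_FILE)
    old2new = log.setup_old2new(old, len(old), new, len(new))
    for i in range(len(old)):
        j = log.find_old2new(old2new, len(new), i)
        if -1 != j:
            # same file as last round: resume from the already-processed offset
            new[j]['processed_size'] = old[i]['processed_size']
    log.write_new_fileinfo(STATE_FILE, new)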
def test_make_logfile_list():
log = LogUtils()
    filelist = log.make_logfile_list(r'./.*\.py$')
print filelist
def test_readwrite_fileinfo():
log = LogUtils()
fileinfo = [{'filename':'a.log','ino':1L, 'dev':1L, 'mtime':1, 'size':1L, 'processed_size':1L,
'incomplete': 0, 'md5size':1, 'md5sum':'aa'},
{'filename':'b.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'processed_size':1,
'incomplete': 0,'md5size':1, 'md5sum':'aa'}]
log.write_new_fileinfo('TEST', fileinfo)
fileinfo2 = log.read_old_fileinfos('TEST')
print fileinfo2
def test_setup_old2new():
log = LogUtils()
# 1 1 1
# 1 1 1
# 1 1 1
old = [
{'filename':'a.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'b.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'c.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
new = [
{'filename':'1.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'2.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'},
{'filename':'3.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
old2new = log.setup_old2new(old, 3, new, 3)
p = 0
for i in range(3):
for j in range(3):
print old2new[p+j],
p += 3
print
# 1
old = [
{'filename':'a.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
new = [
{'filename':'1.log','ino':1, 'dev':1, 'mtime':1, 'size':1, 'md5size':1, 'md5sum':'aa'}
]
old2new = log.setup_old2new(old, 1, new, 1)
print old2new
if __name__ == '__main__':
test_make_logfile_list()
# test_setup_old2new()
# test_readwrite_fileinfo()