过滤 Python 仅记录来自某些文件的行
Filter Python log for only lines from certain files
我有一个共享单个日志的 Python 项目。不幸的是,有时我想通读日志,但仅限于某些文件中的事件。我写了下面的代码来解决这个问题,但它很难看。本质上,它只是使用要忽略的文件黑名单。我可以通过给它一个 __init__
来改进它,其中黑名单或白名单可以作为变量传递,但我只是想在润色之前检查是否有解决这个问题的简单方法。
class FocusedLog:
    """Iterate over only the parts of the shared log we care about."""

    # Substrings that mark a record as coming from an ignored module.
    _BLACKLIST = ('hpack', 'selenium', 'urllib', 'statsapi')

    def __iter__(self):
        log_path = '/tmp/{}.shared.log'.format(os.path.basename(__file__))
        with open(log_path, 'r') as raw:
            skip = False
            for line in raw:
                # A timestamp prefix marks the start of a new record, so the
                # previous record's skip decision stops applying here.
                if line.startswith('2021-08'):
                    skip = False
                # Suppress this record (and its continuation lines) as soon
                # as it mentions any blacklisted module.
                if any(word in line for word in self._BLACKLIST):
                    skip = True
                if not skip:
                    yield line
使用基于 logdissect
模块的自定义解析器解析日志文件。
假设日志格式为
%(asctime)s - %(levelname)s - %(message)s
解析它的正则表达式可以是
^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (INFO|ERROR|WARNING|CRITICAL) - (.*)$
要解析的代码
'''
Log format
'%(asctime)s - %(levelname)s - %(message)s'
'''
from logdissect import parsers,filters
# File contents
'''
2021-08-21 18:06:22,458 - INFO - info message
2021-08-22 18:06:22,458 - WARNING - warn message selenium
2021-08-22 18:06:22,458 - ERROR - error message
2021-08-22 18:06:22,458 - CRITICAL - critical message hpack
2021-08-22 18:06:22,521 - WARNING - another warn message urllib statsapi
'''
# Parse the shared log with a custom logdissect parser, then grep for
# entries on the chosen date that mention one of the noisy modules.
fpath = '/home/lmc/tmp/test.txt'
dfilter = '2021-08-22'

# Parser for the '%(asctime)s - %(levelname)s - %(message)s' format.
ownparser = parsers.blank.ParseModule()
ownparser.name = 'custom parser'
# Raw string: the pattern is full of backslash escapes (\d, \S) that would
# otherwise be invalid string escapes (SyntaxWarning on modern Python).
ownparser.format_regex = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (INFO|ERROR|WARNING|CRITICAL) - (.*)$'
ownparser.fields = ['ts', 'level', 'message']
ownparser.datestamp_type = 'None'

grepFilter = filters.grep.FilterModule()
data = ownparser.parse_file(fpath)
# Keep only entries whose raw text carries the date AND one of the modules.
new_data = grepFilter.filter_data(
    data,
    values=[r'{}.*(?:hpack|selenium|urllib|statsapi)'.format(dfilter)],
)
for m in new_data['entries']:
    print(m['message'])
输出:
warn message selenium
critical message hpack
another warn message urllib statsapi
这是我最终得到的解决方案。不过,另一个答案很有帮助。
import re
class FocusedLog:
    """Iterate over only the parts of the log that come from files in
    the given whitelist.

    Lines that do not start a new record (multi-line messages, tracebacks)
    inherit the keep/skip decision of the record they belong to.
    """

    # One record header: 'timestamp LEVEL source rest-of-message'.
    # Raw string so \d and \S reach the regex engine intact; compiled once
    # instead of on every __iter__ call.
    _RECORD_RE = re.compile(
        r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) '
        r'(INFO|DEBUG|ERROR|WARNING|CRITICAL) (\S*) (.*)$'
    )

    def __init__(self, whitelist, fname):
        # whitelist: source-module names (record field 3) to keep.
        self.whitelist = whitelist
        # fname: path of the shared log file to read.
        self.fname = fname

    def __iter__(self):
        with open(self.fname, 'r') as raw:
            skip = True  # drop any preamble before the first record
            for line in raw:
                match = self._RECORD_RE.match(line)  # match once, reuse
                if match:
                    # New record: keep it only if its source is whitelisted.
                    skip = match.group(3) not in self.whitelist
                if not skip:
                    yield line
if __name__ == '__main__':
    # Bug fix: 'os' was used here but never imported anywhere in the
    # snippet; import it locally so the example is self-contained.
    import os

    fname = '/tmp/{}.shared.log'.format(os.path.basename(__file__))
    # Source modules whose records we want to read; extend as needed.
    whitelist = ['__main__', ...]
    for line in FocusedLog(whitelist, fname):
        print(line)
我有一个共享单个日志的 Python 项目。不幸的是,有时我想通读日志,但仅限于某些文件中的事件。我写了下面的代码来解决这个问题,但它很难看。本质上,它只是使用要忽略的文件黑名单。我可以通过给它一个 __init__
来改进它,其中黑名单或白名单可以作为变量传递,但我只是想在润色之前检查是否有解决这个问题的简单方法。
class FocusedLog:
    """Iterate over only the parts of the shared log we care about."""

    # Substrings that mark a record as coming from an ignored module.
    _BLACKLIST = ('hpack', 'selenium', 'urllib', 'statsapi')

    def __iter__(self):
        log_path = '/tmp/{}.shared.log'.format(os.path.basename(__file__))
        with open(log_path, 'r') as raw:
            skip = False
            for line in raw:
                # A timestamp prefix marks the start of a new record, so the
                # previous record's skip decision stops applying here.
                if line.startswith('2021-08'):
                    skip = False
                # Suppress this record (and its continuation lines) as soon
                # as it mentions any blacklisted module.
                if any(word in line for word in self._BLACKLIST):
                    skip = True
                if not skip:
                    yield line
使用基于 logdissect
模块的自定义解析器解析日志文件。
假设日志格式为
%(asctime)s - %(levelname)s - %(message)s
解析它的正则表达式可以是
^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (INFO|ERROR|WARNING|CRITICAL) - (.*)$
要解析的代码
'''
Log format
'%(asctime)s - %(levelname)s - %(message)s'
'''
from logdissect import parsers,filters
# File contents
'''
2021-08-21 18:06:22,458 - INFO - info message
2021-08-22 18:06:22,458 - WARNING - warn message selenium
2021-08-22 18:06:22,458 - ERROR - error message
2021-08-22 18:06:22,458 - CRITICAL - critical message hpack
2021-08-22 18:06:22,521 - WARNING - another warn message urllib statsapi
'''
# Parse the shared log with a custom logdissect parser, then grep for
# entries on the chosen date that mention one of the noisy modules.
fpath = '/home/lmc/tmp/test.txt'
dfilter = '2021-08-22'

# Parser for the '%(asctime)s - %(levelname)s - %(message)s' format.
ownparser = parsers.blank.ParseModule()
ownparser.name = 'custom parser'
# Raw string: the pattern is full of backslash escapes (\d, \S) that would
# otherwise be invalid string escapes (SyntaxWarning on modern Python).
ownparser.format_regex = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (INFO|ERROR|WARNING|CRITICAL) - (.*)$'
ownparser.fields = ['ts', 'level', 'message']
ownparser.datestamp_type = 'None'

grepFilter = filters.grep.FilterModule()
data = ownparser.parse_file(fpath)
# Keep only entries whose raw text carries the date AND one of the modules.
new_data = grepFilter.filter_data(
    data,
    values=[r'{}.*(?:hpack|selenium|urllib|statsapi)'.format(dfilter)],
)
for m in new_data['entries']:
    print(m['message'])
输出:
warn message selenium
critical message hpack
another warn message urllib statsapi
这是我最终得到的解决方案。不过,另一个答案很有帮助。
import re
class FocusedLog:
    """Iterate over only the parts of the log that come from files in
    the given whitelist.

    Lines that do not start a new record (multi-line messages, tracebacks)
    inherit the keep/skip decision of the record they belong to.
    """

    # One record header: 'timestamp LEVEL source rest-of-message'.
    # Raw string so \d and \S reach the regex engine intact; compiled once
    # instead of on every __iter__ call.
    _RECORD_RE = re.compile(
        r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) '
        r'(INFO|DEBUG|ERROR|WARNING|CRITICAL) (\S*) (.*)$'
    )

    def __init__(self, whitelist, fname):
        # whitelist: source-module names (record field 3) to keep.
        self.whitelist = whitelist
        # fname: path of the shared log file to read.
        self.fname = fname

    def __iter__(self):
        with open(self.fname, 'r') as raw:
            skip = True  # drop any preamble before the first record
            for line in raw:
                match = self._RECORD_RE.match(line)  # match once, reuse
                if match:
                    # New record: keep it only if its source is whitelisted.
                    skip = match.group(3) not in self.whitelist
                if not skip:
                    yield line
if __name__ == '__main__':
    # Bug fix: 'os' was used here but never imported anywhere in the
    # snippet; import it locally so the example is self-contained.
    import os

    fname = '/tmp/{}.shared.log'.format(os.path.basename(__file__))
    # Source modules whose records we want to read; extend as needed.
    whitelist = ['__main__', ...]
    for line in FocusedLog(whitelist, fname):
        print(line)