使用 Python 从日志文件中解析具有 JSON 条目的数据流
Parse stream of data having JSON entries from log file using Python
我需要解析一个近似 JSON 格式的数据流,其中每个条目都以日期时间作为前缀,如下所示。
2020-09-28 15:52:13.633+0000 INFO |RestAPI.ServiceManager | Request #399: {
"context": {
"httpContextKey": 18446744071313531680,
"verbId": 2,
"verb": "GET",
"originalVerb": "GET",
"uri": "/services/v2/installation/deployments",
"protocol": "https",
"headers": {
"X-OGG-Context": "services",
"X-OGG-Service": "ServiceManager",
"X-OGG-Version": "v2",
"X-OGG-Resource": "installation/deployments",
"Content-Length": "0",
"Accept": "application/json"
},
"host": "testing-db.com",
"securityEnabled": true,
"authorization": null,
"requestId": 399,
"uriTemplate": "/services/{version}/installation/deployments",
"catalogUriTemplate": "/services/{version}/metadata-catalog/deployments"
},
"isScaRequest": true,
"content": null,
"parameters": {
"uri": {
"version": "v2"
}
}
}
Response: {
"context": {
"httpContextKey": 18446744071313531680,
"requestId": 399,
"code": "200 OK",
"headers": {
"Content-Type": "application/json",
"Expires": "0",
"Pragma": "no-cache",
"Strict-Transport-Security": "max-age=31536000;includeSubDomains"
},
"Content-Type": "application/json",
"contentType": "application/json"
},
"isScaResponse": true,
"originScaRequest": {
"context": {
"httpContextKey": 18446744071313531680,
"verbId": 2,
"verb": "GET",
"originalVerb": "GET",
"uri": "/services/v2/installation/deployments",
"protocol": "https",
"headers": {
"X-OGG-Context": "services",
"X-OGG-Service": "ServiceManager",
"X-OGG-Version": "v2",
"X-OGG-Resource": "installation/deployments",
"Content-Length": "0",
"Accept": "application/json"
},
"securityEnabled": true,
"authorization": null,
"requestId": 399
},
"isScaRequest": true
},
"content": {
"$schema": "api:standardResponse",
"links": [
{
"rel": "describedby",
"href": "https://testing-db.com/services/v2/metadata-catalog/deployments",
"mediaType": "application/schema+json"
}
],
"messages": [],
"response": {
"$schema": "ogg:installationDeployments",
"xagEnabled": false,
"deployments": [
{
"deploymentId": "39398e93-7e53-484c-9e90-3bf2f820ee73",
"deploymentName": "FOR-TMDB",
"enabled": true,
"status": "running"
},
{
"deploymentId": "30233230-94a6-4ae7-9b5e-db66105d9046",
"deploymentName": "ServiceManager",
"enabled": true,
"status": "running"
}
]
}
}
}
当运行 tail -0f apiserver.log | python parse
时,它应该解析每个 JSON 条目,并能取出并显示其中的任意字段,例如 DATE
和 Request
的详细信息。注意日志中还有 JSON 格式的 Response,
需要将其排除。
我尝试用下面的代码通过生成器来读取,但问题是:如何把多行归为一组,再解析得到 JSON 条目?因为我必须退出循环才能拿到完整的条目,而一旦退出循环,流式处理就中断了。
import sys
def read_stdin(stream=None):
    """Yield lines one at a time from a text stream until EOF.

    Args:
        stream: Any object with a ``readline()`` method that returns ``""``
            at end of input. Defaults to ``sys.stdin``.

    Yields:
        Each line exactly as returned by ``readline()`` (the trailing
        newline, if any, is kept).
    """
    if stream is None:
        stream = sys.stdin
    # readline() returns "" only at EOF, so it doubles as the stop sentinel.
    yield from iter(stream.readline, "")
# Echo every line from stdin as it arrives.
for line in read_stdin():
    # Each line already ends with its own newline; suppress print()'s extra
    # one so the output is not double-spaced.
    print(line, end="")
你有几个选择(第二个更简单,可能也是最好的):
- 你可以使用
PyPI
仓库中的 regex
包,尝试识别 Request 的 JSON
字符串,如下所示:
import regex
import sys
import json
# Recursive pattern (third-party ``regex`` package) that matches one balanced
# ``{ ... }`` group, including nested groups. It only counts braces: it does
# not validate JSON, and braces inside string values are counted too.
json_rex = regex.compile(r"""
(?<json> #capturing group json
{ #open {
(?: #non-capturing group
[^{}]++ #anything but {} without backtracking
| #or
(?&json) #recursive substitute of group expr
)*
} #close }
)
""", flags=regex.VERBOSE)

# Header line that opens a request entry; "Response: {" lines do not match
# and are therefore skipped.
request_rex = regex.compile(r'Request #\d+: {')

while True:
    line = sys.stdin.readline()
    if not line:
        break
    if request_rex.search(line):
        # The header line already contained the opening brace.
        json_str = '{\n'
        while True:
            chunk = sys.stdin.readline()
            if not chunk:
                # EOF in the middle of an entry: stop accumulating instead of
                # looping forever on the empty string readline() returns.
                break
            json_str += chunk
            if json_rex.match(json_str):
                d = json.loads(json_str)  # we have our dictionary
                print(d)
                break
- 尝试将累积行解析为
JSON
直到没有错误:
import re
import sys
import json
# Header line that opens a request entry; "Response: {" lines do not match
# and are therefore skipped.
REQUEST_START = re.compile(r'Request #\d+: {')


def iter_requests(stream):
    """Yield the JSON payload of every ``Request #N`` entry in *stream*.

    After a header line matching ``Request #<digits>: {`` is seen, the
    following lines are accumulated and re-parsed after each one until
    ``json.loads`` accepts the text.

    Args:
        stream: Object with a ``readline()`` method returning ``""`` at EOF
            (for example ``sys.stdin`` or ``io.StringIO``).

    Yields:
        The decoded JSON value of each complete request entry.
    """
    while True:
        line = stream.readline()
        if not line:
            return
        if not REQUEST_START.search(line):
            continue
        # The header line already contained the opening brace.
        json_str = '{\n'
        while True:
            chunk = stream.readline()
            if not chunk:
                # EOF in the middle of an entry: give up on it rather than
                # looping forever on the empty string readline() returns.
                return
            json_str += chunk
            try:
                d = json.loads(json_str)
            except json.JSONDecodeError:
                # Entry is still incomplete; keep reading lines.
                continue
            yield d
            break


if __name__ == "__main__":
    for d in iter_requests(sys.stdin):
        print(d)
我需要解析一个近似 JSON 格式的数据流,其中每个条目都以日期时间作为前缀,如下所示。
2020-09-28 15:52:13.633+0000 INFO |RestAPI.ServiceManager | Request #399: {
"context": {
"httpContextKey": 18446744071313531680,
"verbId": 2,
"verb": "GET",
"originalVerb": "GET",
"uri": "/services/v2/installation/deployments",
"protocol": "https",
"headers": {
"X-OGG-Context": "services",
"X-OGG-Service": "ServiceManager",
"X-OGG-Version": "v2",
"X-OGG-Resource": "installation/deployments",
"Content-Length": "0",
"Accept": "application/json"
},
"host": "testing-db.com",
"securityEnabled": true,
"authorization": null,
"requestId": 399,
"uriTemplate": "/services/{version}/installation/deployments",
"catalogUriTemplate": "/services/{version}/metadata-catalog/deployments"
},
"isScaRequest": true,
"content": null,
"parameters": {
"uri": {
"version": "v2"
}
}
}
Response: {
"context": {
"httpContextKey": 18446744071313531680,
"requestId": 399,
"code": "200 OK",
"headers": {
"Content-Type": "application/json",
"Expires": "0",
"Pragma": "no-cache",
"Strict-Transport-Security": "max-age=31536000;includeSubDomains"
},
"Content-Type": "application/json",
"contentType": "application/json"
},
"isScaResponse": true,
"originScaRequest": {
"context": {
"httpContextKey": 18446744071313531680,
"verbId": 2,
"verb": "GET",
"originalVerb": "GET",
"uri": "/services/v2/installation/deployments",
"protocol": "https",
"headers": {
"X-OGG-Context": "services",
"X-OGG-Service": "ServiceManager",
"X-OGG-Version": "v2",
"X-OGG-Resource": "installation/deployments",
"Content-Length": "0",
"Accept": "application/json"
},
"securityEnabled": true,
"authorization": null,
"requestId": 399
},
"isScaRequest": true
},
"content": {
"$schema": "api:standardResponse",
"links": [
{
"rel": "describedby",
"href": "https://testing-db.com/services/v2/metadata-catalog/deployments",
"mediaType": "application/schema+json"
}
],
"messages": [],
"response": {
"$schema": "ogg:installationDeployments",
"xagEnabled": false,
"deployments": [
{
"deploymentId": "39398e93-7e53-484c-9e90-3bf2f820ee73",
"deploymentName": "FOR-TMDB",
"enabled": true,
"status": "running"
},
{
"deploymentId": "30233230-94a6-4ae7-9b5e-db66105d9046",
"deploymentName": "ServiceManager",
"enabled": true,
"status": "running"
}
]
}
}
}
当运行 tail -0f apiserver.log | python parse
时,它应该解析每个 JSON 条目,并能取出并显示其中的任意字段,例如 DATE
和 Request
的详细信息。注意日志中还有 JSON 格式的 Response,
需要将其排除。
我尝试用下面的代码通过生成器来读取,但问题是:如何把多行归为一组,再解析得到 JSON 条目?因为我必须退出循环才能拿到完整的条目,而一旦退出循环,流式处理就中断了。
import sys
def read_stdin(stream=None):
    """Yield lines one at a time from a text stream until EOF.

    Args:
        stream: Any object with a ``readline()`` method that returns ``""``
            at end of input. Defaults to ``sys.stdin``.

    Yields:
        Each line exactly as returned by ``readline()`` (the trailing
        newline, if any, is kept).
    """
    if stream is None:
        stream = sys.stdin
    # readline() returns "" only at EOF, so it doubles as the stop sentinel.
    yield from iter(stream.readline, "")
# Echo every line from stdin as it arrives.
for line in read_stdin():
    # Each line already ends with its own newline; suppress print()'s extra
    # one so the output is not double-spaced.
    print(line, end="")
你有几个选择(第二个更简单,可能也是最好的):
- 你可以使用
PyPI
仓库中的 regex
包,尝试识别 Request 的 JSON
字符串,如下所示:
import regex
import sys
import json
# Recursive pattern (third-party ``regex`` package) that matches one balanced
# ``{ ... }`` group, including nested groups. It only counts braces: it does
# not validate JSON, and braces inside string values are counted too.
json_rex = regex.compile(r"""
(?<json> #capturing group json
{ #open {
(?: #non-capturing group
[^{}]++ #anything but {} without backtracking
| #or
(?&json) #recursive substitute of group expr
)*
} #close }
)
""", flags=regex.VERBOSE)

# Header line that opens a request entry; "Response: {" lines do not match
# and are therefore skipped.
request_rex = regex.compile(r'Request #\d+: {')

while True:
    line = sys.stdin.readline()
    if not line:
        break
    if request_rex.search(line):
        # The header line already contained the opening brace.
        json_str = '{\n'
        while True:
            chunk = sys.stdin.readline()
            if not chunk:
                # EOF in the middle of an entry: stop accumulating instead of
                # looping forever on the empty string readline() returns.
                break
            json_str += chunk
            if json_rex.match(json_str):
                d = json.loads(json_str)  # we have our dictionary
                print(d)
                break
- 尝试将累积行解析为
JSON
直到没有错误:
import re
import sys
import json
# Header line that opens a request entry; "Response: {" lines do not match
# and are therefore skipped.
REQUEST_START = re.compile(r'Request #\d+: {')


def iter_requests(stream):
    """Yield the JSON payload of every ``Request #N`` entry in *stream*.

    After a header line matching ``Request #<digits>: {`` is seen, the
    following lines are accumulated and re-parsed after each one until
    ``json.loads`` accepts the text.

    Args:
        stream: Object with a ``readline()`` method returning ``""`` at EOF
            (for example ``sys.stdin`` or ``io.StringIO``).

    Yields:
        The decoded JSON value of each complete request entry.
    """
    while True:
        line = stream.readline()
        if not line:
            return
        if not REQUEST_START.search(line):
            continue
        # The header line already contained the opening brace.
        json_str = '{\n'
        while True:
            chunk = stream.readline()
            if not chunk:
                # EOF in the middle of an entry: give up on it rather than
                # looping forever on the empty string readline() returns.
                return
            json_str += chunk
            try:
                d = json.loads(json_str)
            except json.JSONDecodeError:
                # Entry is still incomplete; keep reading lines.
                continue
            yield d
            break


if __name__ == "__main__":
    for d in iter_requests(sys.stdin):
        print(d)