无法正确解析 YAML
Can't parse YAML correctly
我在 python 中解析了以下 YAML 数据:
>>> import yaml
>>> yaml.load("""
... ---
... categories: {1: Yes, 2: No}
... increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
... ...
... """)
并将其作为输出:
{'increasing': [0, 1, 2, 3, 4, 5, 6, 7, '08', '09', 10], 'categories': {1: True, 2: False}}
- 为什么 "Yes" 和 "No" 转换为 True 和 False?
- 为什么“08”和“09”被解析为字符串,而其他数字被解析为截断前导零的数字?
Yes
和 No
在 YAML 中有特殊含义。看看 Wikipedia article。为了避免这种情况,您可以更改您的 YAML 以包含引号并且看起来像这样
>>> yaml.load("""
... ---
... categories: {1: "Yes", 2: "No"}
... increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
... ...
... """)
关于 08 和 09 的前导零,我不太确定为什么会这样,但这似乎不是 python 问题
您关于 00
到 07
前导零被截断的推论是不正确的。这些都是八进制数,因为它们带有前导 0
,并被按八进制解释。
由于八进制数不能包含数字 8
或 9
,因此 08
和 09
只能是字符串,您的 YAML 解析器会这样加载它们。
这实际上是 YAML 1.1 的遗留行为(为了向后兼容);在 YAML 1.2 中,八进制数应该以 0o
开头。
Yes
和 No
分别加载为 True
和 False
,这同样是 YAML 1.1 的特有行为(YAML-1.1-ism)。1.2 规范不再提及这些替代写法。如果你给这些字符串加上引号,它们就不会被转换。
通过添加以下规则,您可以相对轻松地构建一个不把 Yes/No/On/Off 这些变体当作 True/False 的解析器(resolver):
# Register a bool rule whose pattern matches only the spellings
# true/True/TRUE/false/False/FALSE (no yes/no/on/off variants).
# The last argument lists the characters t/T/f/F -- presumably the leading
# characters a plain scalar must have to be tested; confirm against the
# resolver API.
MyResolver.add_implicit_resolver(
u'tag:yaml.org,2002:bool',
re.compile(u'''^(?:true|True|TRUE|false|False|FALSE)$''', re.X),
list(u'tTfF'))
或使用正常的 Resolver
并删除适当的起始符号条目:
import ruamel.yaml as yaml
from ruamel.yaml.resolver import Resolver
yaml_str = """\
categories: {1: Yes, 2: No}
"""

# Remove the stock resolver's entries for the first letters of
# yes/Yes/no/No/on/On/off/Off so those words are no longer resolved as bool.
# NOTE(review): each entry is deleted wholesale, so any other pattern indexed
# under the same letter (e.g. null/Null/NULL under 'n'/'N') presumably goes
# with it -- confirm before relying on null detection.
for ch in u'yYnNoO':
    del Resolver.yaml_implicit_resolvers[ch]

data = yaml.load(yaml_str, Loader=yaml.Loader)
print(data)
给你:
{'categories': {1: 'Yes', 2: 'No'}}
让所有以 0 开头的纯数字字符串都被识别为普通整数并不那么简单,因为如果您只修改 int
的隐式解析器,让以 0 开头的字符串通过匹配,就会遇到解析问题:08
会被按八进制转换。因此下面的代码同时替换了整数的构造函数 ¹:
import re
import ruamel.yaml as yaml
from ruamel.yaml.reader import Reader
from ruamel.yaml.resolver import BaseResolver, Resolver
from ruamel.yaml.scanner import RoundTripScanner
from ruamel.yaml.parser_ import Parser
from ruamel.yaml.composer import Composer
from ruamel.yaml.constructor import RoundTripConstructor
from ruamel.yaml import RoundTripLoader
from ruamel.yaml.compat import to_str
# Sample document: Yes/No values plus zero-padded numbers 00..10.
yaml_str = """\
categories: {1: Yes, 2: No}
increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
"""
class MyResolver(BaseResolver):
    """Resolver that carries only the implicit rules registered on it below."""


# Booleans: only the true/false spellings; yes/no/on/off are not registered.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:bool',
    re.compile(u'''^(?:true|True|TRUE|false|False|FALSE)$''', re.X),
    list(u'tTfF'))

# Floats.  Raw string: the pattern contains '\.', which is an invalid escape
# sequence in a normal string literal.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:float',
    re.compile(r'''^(?:
[-+]?(?:[0-9][0-9_]*)\.[0-9_]*(?:[eE][-+]?[0-9]+)?
|[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
|\.[0-9_]+(?:[eE][-+][0-9]+)?
|[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\.[0-9_]*
|[-+]?\.(?:inf|Inf|INF)
|\.(?:nan|NaN|NAN))$''', re.X),
    list(u'-+0123456789.'))

# Integers.  The second alternative, [-+]?[0-9]+, accepts any run of decimal
# digits, so zero-padded values such as 08 and 09 are tagged as int as well.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:int',
    re.compile(u'''^(?:[-+]?0b[0-1_]+
|[-+]?[0-9]+
|[-+]?0o?[0-7_]+
|[-+]?(?:0|[1-9][0-9_]*)
|[-+]?0x[0-9a-fA-F_]+
|[-+]?[1-9][0-9_]*(?::[0-5]?[0-9])+)$''', re.X),
    list(u'-+0123456789'))

# Merge key '<<'.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:merge',
    re.compile(u'^(?:<<)$'),
    [u'<'])

# Null: '~', null/Null/NULL, or the empty scalar.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:null',
    re.compile(u'''^(?: ~
|null|Null|NULL
| )$''', re.X),
    [u'~', u'n', u'N', u''])

# Timestamps.  Raw string for the same reason as the float pattern; '\t' is
# then interpreted by the regex engine and still matches a tab.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:timestamp',
    re.compile(r'''^(?:[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]
|[0-9][0-9][0-9][0-9] -[0-9][0-9]? -[0-9][0-9]?
(?:[Tt]|[ \t]+)[0-9][0-9]?
:[0-9][0-9] :[0-9][0-9] (?:\.[0-9]*)?
(?:[ \t]*(?:Z|[-+][0-9][0-9]?(?::[0-9][0-9])?))?)$''', re.X),
    list(u'0123456789'))

# Value key '='.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:value',
    re.compile(u'^(?:=)$'),
    [u'='])

# The following resolver is only for documentation purposes. It cannot work
# because plain scalars cannot start with '!', '&', or '*'.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:yaml',
    re.compile(r'^(?:!|&|\*)$'),
    list(u'!&*'))
class MyRoundTripConstructor(RoundTripConstructor):
    """Round-trip constructor that reads zero-padded integers as decimal."""

    def construct_yaml_int(self, node):
        """Convert an int-tagged scalar node to a Python int.

        Underscores are stripped and an optional leading sign is honoured.
        The prefixes ``0b``, ``0x`` and ``0o`` select base 2, 16 and 8; a
        value containing ``:`` is read as base-60 groups; anything else,
        including digits with a leading ``0`` such as ``08``, is parsed as
        decimal.
        """
        value = to_str(self.construct_scalar(node))
        value = value.replace('_', '')
        sign = -1 if value[0] == '-' else +1
        if value[0] in '+-':
            value = value[1:]
        if value == '0':
            return 0
        if value.startswith('0b'):
            return sign * int(value[2:], 2)
        if value.startswith('0x'):
            return sign * int(value[2:], 16)
        if value.startswith('0o'):
            return sign * int(value[2:], 8)
        # Deliberately no branch for a bare leading '0': values such as '07'
        # and '08' fall through to the decimal conversion at the end.
        if ':' in value:
            # Base-60 groups, most significant first.
            total = 0
            for part in value.split(':'):
                total = total * 60 + int(part)
            return sign * total
        return sign * int(value)


# Use the method above for every node tagged as int.
MyRoundTripConstructor.add_constructor(
    u'tag:yaml.org,2002:int',
    MyRoundTripConstructor.construct_yaml_int)
class MyRoundTripLoader(Reader, RoundTripScanner, Parser,
                        Composer, MyRoundTripConstructor, MyResolver):
    """Loader assembled from the round-trip scanner, the parser and composer,
    and the customised constructor and resolver defined in this script."""

    def __init__(self, stream):
        """Initialise each component in turn; only the reader takes *stream*."""
        Reader.__init__(self, stream)
        RoundTripScanner.__init__(self)
        Parser.__init__(self)
        Composer.__init__(self)
        MyRoundTripConstructor.__init__(self)
        MyResolver.__init__(self)
# NOTE(review): this loop edits the stock Resolver's table, whereas the loader
# used below is built on MyResolver -- it may be unnecessary here; verify.
for ch in u'yYnNoO':
    del Resolver.yaml_implicit_resolvers[ch]

data = yaml.load(yaml_str, Loader=MyRoundTripLoader)
print(data['increasing'])
然后打印:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
(它还会把 Yes/No 保留为字符串,因为识别它们的模式一开始就没有被插入内部查找表。)
¹ 这里我使用的是 ruamel.yaml,我是它的作者。ruamel.yaml 所基于的 PyYAML 应该也能支持类似的派生做法。
我在 python 中解析了以下 YAML 数据:
>>> import yaml
>>> yaml.load("""
... ---
... categories: {1: Yes, 2: No}
... increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
... ...
... """)
并将其作为输出:
{'increasing': [0, 1, 2, 3, 4, 5, 6, 7, '08', '09', 10], 'categories': {1: True, 2: False}}
- 为什么 "Yes" 和 "No" 转换为 True 和 False?
- 为什么“08”和“09”被解析为字符串,而其他数字被解析为截断前导零的数字?
Yes
和 No
在 YAML 中有特殊含义。看看 Wikipedia article。为了避免这种情况,您可以更改您的 YAML 以包含引号并且看起来像这样
>>> yaml.load("""
... ---
... categories: {1: "Yes", 2: "No"}
... increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
... ...
... """)
关于 08 和 09 的前导零,我不太确定为什么会这样,但这似乎不是 python 问题
您关于 00
到 07
前导零被截断的推论是不正确的。这些都是八进制数,因为它们带有前导 0
,并被按八进制解释。
由于八进制数不能包含数字 8
或 9
,因此 08
和 09
只能是字符串,您的 YAML 解析器会这样加载它们。
这实际上是 YAML 1.1 的遗留行为(为了向后兼容);在 YAML 1.2 中,八进制数应该以 0o 开头。
Yes
和 No
分别加载为 True
和 False
,这同样是 YAML 1.1 的特有行为(YAML-1.1-ism)。1.2 规范不再提及这些替代写法。如果你给这些字符串加上引号,它们就不会被转换。
通过添加以下规则,您可以相对轻松地构建一个不把 Yes/No/On/Off 这些变体当作 True/False 的解析器(resolver):
# Register a bool rule whose pattern matches only the spellings
# true/True/TRUE/false/False/FALSE (no yes/no/on/off variants).
# The last argument lists the characters t/T/f/F -- presumably the leading
# characters a plain scalar must have to be tested; confirm against the
# resolver API.
MyResolver.add_implicit_resolver(
u'tag:yaml.org,2002:bool',
re.compile(u'''^(?:true|True|TRUE|false|False|FALSE)$''', re.X),
list(u'tTfF'))
或使用正常的 Resolver
并删除适当的起始符号条目:
import ruamel.yaml as yaml
from ruamel.yaml.resolver import Resolver
yaml_str = """\
categories: {1: Yes, 2: No}
"""

# Remove the stock resolver's entries for the first letters of
# yes/Yes/no/No/on/On/off/Off so those words are no longer resolved as bool.
# NOTE(review): each entry is deleted wholesale, so any other pattern indexed
# under the same letter (e.g. null/Null/NULL under 'n'/'N') presumably goes
# with it -- confirm before relying on null detection.
for ch in u'yYnNoO':
    del Resolver.yaml_implicit_resolvers[ch]

data = yaml.load(yaml_str, Loader=yaml.Loader)
print(data)
给你:
{'categories': {1: 'Yes', 2: 'No'}}
让所有以 0 开头的纯数字字符串都被识别为普通整数并不那么简单,因为如果您只修改 int
的隐式解析器,让以 0 开头的字符串通过匹配,就会遇到解析问题:08
会被按八进制转换。因此下面的代码同时替换了整数的构造函数 ¹:
import re
import ruamel.yaml as yaml
from ruamel.yaml.reader import Reader
from ruamel.yaml.resolver import BaseResolver, Resolver
from ruamel.yaml.scanner import RoundTripScanner
from ruamel.yaml.parser_ import Parser
from ruamel.yaml.composer import Composer
from ruamel.yaml.constructor import RoundTripConstructor
from ruamel.yaml import RoundTripLoader
from ruamel.yaml.compat import to_str
# Sample document: Yes/No values plus zero-padded numbers 00..10.
yaml_str = """\
categories: {1: Yes, 2: No}
increasing: [00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10]
"""
class MyResolver(BaseResolver):
    """Resolver that carries only the implicit rules registered on it below."""


# Booleans: only the true/false spellings; yes/no/on/off are not registered.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:bool',
    re.compile(u'''^(?:true|True|TRUE|false|False|FALSE)$''', re.X),
    list(u'tTfF'))

# Floats.  Raw string: the pattern contains '\.', which is an invalid escape
# sequence in a normal string literal.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:float',
    re.compile(r'''^(?:
[-+]?(?:[0-9][0-9_]*)\.[0-9_]*(?:[eE][-+]?[0-9]+)?
|[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
|\.[0-9_]+(?:[eE][-+][0-9]+)?
|[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\.[0-9_]*
|[-+]?\.(?:inf|Inf|INF)
|\.(?:nan|NaN|NAN))$''', re.X),
    list(u'-+0123456789.'))

# Integers.  The second alternative, [-+]?[0-9]+, accepts any run of decimal
# digits, so zero-padded values such as 08 and 09 are tagged as int as well.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:int',
    re.compile(u'''^(?:[-+]?0b[0-1_]+
|[-+]?[0-9]+
|[-+]?0o?[0-7_]+
|[-+]?(?:0|[1-9][0-9_]*)
|[-+]?0x[0-9a-fA-F_]+
|[-+]?[1-9][0-9_]*(?::[0-5]?[0-9])+)$''', re.X),
    list(u'-+0123456789'))

# Merge key '<<'.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:merge',
    re.compile(u'^(?:<<)$'),
    [u'<'])

# Null: '~', null/Null/NULL, or the empty scalar.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:null',
    re.compile(u'''^(?: ~
|null|Null|NULL
| )$''', re.X),
    [u'~', u'n', u'N', u''])

# Timestamps.  Raw string for the same reason as the float pattern; '\t' is
# then interpreted by the regex engine and still matches a tab.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:timestamp',
    re.compile(r'''^(?:[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]
|[0-9][0-9][0-9][0-9] -[0-9][0-9]? -[0-9][0-9]?
(?:[Tt]|[ \t]+)[0-9][0-9]?
:[0-9][0-9] :[0-9][0-9] (?:\.[0-9]*)?
(?:[ \t]*(?:Z|[-+][0-9][0-9]?(?::[0-9][0-9])?))?)$''', re.X),
    list(u'0123456789'))

# Value key '='.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:value',
    re.compile(u'^(?:=)$'),
    [u'='])

# The following resolver is only for documentation purposes. It cannot work
# because plain scalars cannot start with '!', '&', or '*'.
MyResolver.add_implicit_resolver(
    u'tag:yaml.org,2002:yaml',
    re.compile(r'^(?:!|&|\*)$'),
    list(u'!&*'))
class MyRoundTripConstructor(RoundTripConstructor):
    """Round-trip constructor that reads zero-padded integers as decimal."""

    def construct_yaml_int(self, node):
        """Convert an int-tagged scalar node to a Python int.

        Underscores are stripped and an optional leading sign is honoured.
        The prefixes ``0b``, ``0x`` and ``0o`` select base 2, 16 and 8; a
        value containing ``:`` is read as base-60 groups; anything else,
        including digits with a leading ``0`` such as ``08``, is parsed as
        decimal.
        """
        value = to_str(self.construct_scalar(node))
        value = value.replace('_', '')
        sign = -1 if value[0] == '-' else +1
        if value[0] in '+-':
            value = value[1:]
        if value == '0':
            return 0
        if value.startswith('0b'):
            return sign * int(value[2:], 2)
        if value.startswith('0x'):
            return sign * int(value[2:], 16)
        if value.startswith('0o'):
            return sign * int(value[2:], 8)
        # Deliberately no branch for a bare leading '0': values such as '07'
        # and '08' fall through to the decimal conversion at the end.
        if ':' in value:
            # Base-60 groups, most significant first.
            total = 0
            for part in value.split(':'):
                total = total * 60 + int(part)
            return sign * total
        return sign * int(value)


# Use the method above for every node tagged as int.
MyRoundTripConstructor.add_constructor(
    u'tag:yaml.org,2002:int',
    MyRoundTripConstructor.construct_yaml_int)
class MyRoundTripLoader(Reader, RoundTripScanner, Parser,
                        Composer, MyRoundTripConstructor, MyResolver):
    """Loader assembled from the round-trip scanner, the parser and composer,
    and the customised constructor and resolver defined in this script."""

    def __init__(self, stream):
        """Initialise each component in turn; only the reader takes *stream*."""
        Reader.__init__(self, stream)
        RoundTripScanner.__init__(self)
        Parser.__init__(self)
        Composer.__init__(self)
        MyRoundTripConstructor.__init__(self)
        MyResolver.__init__(self)
# NOTE(review): this loop edits the stock Resolver's table, whereas the loader
# used below is built on MyResolver -- it may be unnecessary here; verify.
for ch in u'yYnNoO':
    del Resolver.yaml_implicit_resolvers[ch]

data = yaml.load(yaml_str, Loader=MyRoundTripLoader)
print(data['increasing'])
然后打印:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
(它还会把 Yes/No 保留为字符串,因为识别它们的模式一开始就没有被插入内部查找表。)
¹ 这里我使用的是 ruamel.yaml,我是它的作者。ruamel.yaml 所基于的 PyYAML 应该也能支持类似的派生做法。