OCaml:如何解码 unicode-escape 字符串?
OCaml: How to decode unicode-escape string?
给定一个 str 如下:
(* The text holds literal backslash-u escapes, so every backslash is doubled
   in the OCaml literal; a bare \u without braces is an illegal escape. *)
let str = "#include \\u003Cunordered_map\\u003E\\u000D\\u000A"
如何在 OCaml 中将 unicode-escape 字符串解码为 Unicode 字符串或 ASCII 字符串?
在 Python 中,我可以轻松做到:
str.decode("unicode-escape")
如果您的嵌入式转义序列总是要对 ASCII 字符进行编码,如您所说,您可以找到它们并将它们替换为解码后的等效字符:
(** [decode s] replaces every literal escape of the form [\uXXXX] (a
    backslash, the letter [u] and exactly four hexadecimal digits) in [s]
    with the single byte whose code is [0xXXXX]. All other text is copied
    unchanged.

    Intended for escapes that encode ASCII characters. Codes in the range
    0x80-0xFF produce one raw byte, not a UTF-8 sequence.

    @raise Invalid_argument if an escape encodes a value above 0xFF
    (raised by [Char.chr]). *)
let decode s =
  (* Four backslashes in the OCaml literal become [\\] in the regexp, which
     matches one literal backslash. A plain [\u] in the regexp would match
     only the letter [u]. *)
  let re = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
  let s1 n = String.make 1 (Char.chr n) in
  let subst = function
    (* [u] is the 6-character match; the hex digits start at offset 2. *)
    | Str.Delim u -> s1 (int_of_string ("0x" ^ String.sub u 2 4))
    | Str.Text t -> t
  in
  String.concat "" (List.map subst (Str.full_split re s))
这适用于您的示例:
val decode : string -> string = <fun>
# decode "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"
确实,Python 内置支持解码这些序列。
更新
要通过转换为 UTF-8 来支持所有四位十六进制转义序列(形如 \uXXXX),您可以使用此代码:
(** [utf8encode s] parses [s] as a hexadecimal code point (digits only, no
    [0x] prefix) and returns its UTF-8 encoding as a string of 1 to 4 bytes.

    Surrogate code points (D800-DFFF) are not rejected; they are encoded as
    three bytes like any other value in that range.

    @raise Failure if [s] is not a valid hexadecimal number (raised by
    [int_of_string]).
    @raise Invalid_argument if the value lies outside 0-0x10FFFF. *)
let utf8encode s =
  (* Lead-byte marker bits for sequences of 1, 2, 3 and 4 bytes. *)
  let prefs = [| 0x0; 0xc0; 0xe0; 0xf0 |] in
  let s1 n = String.make 1 (Char.chr n) in
  (* [k] continuation bytes are already in [sofar]; [resid] holds the bits
     that still have to be emitted. *)
  let rec ienc k sofar resid =
    (* Payload bits available in the lead byte: 7, 5, 4 or 3. *)
    let bct = if k = 0 then 7 else 6 - k in
    if resid < 1 lsl bct then
      s1 (prefs.(k) + resid) ^ sofar
    else
      (* Emit the low six bits as a continuation byte and keep going. *)
      ienc (k + 1) (s1 (0x80 + resid mod 64) ^ sofar) (resid / 64)
  in
  let code = int_of_string ("0x" ^ s) in
  if code < 0 || code > 0x10FFFF then
    invalid_arg "utf8encode: code point out of range"
  else
    ienc 0 "" code
(** [decode2 s] replaces every literal escape of the form [\uXXXX] (a
    backslash, the letter [u] and exactly four hexadecimal digits) in [s]
    with the UTF-8 encoding returned by [utf8encode] for those four digits.
    All other text is copied unchanged.

    Each escape is converted on its own, so two consecutive escapes forming
    a surrogate pair are not combined into one code point. *)
let decode2 s =
  (* Four backslashes in the OCaml literal become [\\] in the regexp, which
     matches one literal backslash. A plain [\u] in the regexp would match
     only the letter [u]. *)
  let re = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
  let subst = function
    (* [u] is the 6-character match; the hex digits start at offset 2. *)
    | Str.Delim u -> utf8encode (String.sub u 2 4)
    | Str.Text t -> t
  in
  String.concat "" (List.map subst (Str.full_split re s))
它也适用于您的示例和其他一些示例:
val utf8encode : string -> string = <fun>
val decode2 : string -> string = <fun>
# decode2 "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"
# print_endline (decode2 "\\u00A2");;
¢
- : unit = ()
# print_endline (decode2 "\\u20AC");;
€
- : unit = ()
给定一个 str 如下:
(* The text holds literal backslash-u escapes, so every backslash is doubled
   in the OCaml literal; a bare \u without braces is an illegal escape. *)
let str = "#include \\u003Cunordered_map\\u003E\\u000D\\u000A"
如何在 OCaml 中将 unicode-escape 字符串解码为 Unicode 字符串或 ASCII 字符串?
在 Python 中,我可以轻松做到:
str.decode("unicode-escape")
如果您的嵌入式转义序列总是要对 ASCII 字符进行编码,如您所说,您可以找到它们并将它们替换为解码后的等效字符:
(** [decode s] replaces every literal escape of the form [\uXXXX] (a
    backslash, the letter [u] and exactly four hexadecimal digits) in [s]
    with the single byte whose code is [0xXXXX]. All other text is copied
    unchanged.

    Intended for escapes that encode ASCII characters. Codes in the range
    0x80-0xFF produce one raw byte, not a UTF-8 sequence.

    @raise Invalid_argument if an escape encodes a value above 0xFF
    (raised by [Char.chr]). *)
let decode s =
  (* Four backslashes in the OCaml literal become [\\] in the regexp, which
     matches one literal backslash. A plain [\u] in the regexp would match
     only the letter [u]. *)
  let re = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
  let s1 n = String.make 1 (Char.chr n) in
  let subst = function
    (* [u] is the 6-character match; the hex digits start at offset 2. *)
    | Str.Delim u -> s1 (int_of_string ("0x" ^ String.sub u 2 4))
    | Str.Text t -> t
  in
  String.concat "" (List.map subst (Str.full_split re s))
这适用于您的示例:
val decode : string -> string = <fun>
# decode "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"
确实,Python 内置支持解码这些序列。
更新
要通过转换为 UTF-8 来支持所有四位十六进制转义序列(形如 \uXXXX),您可以使用此代码:
(** [utf8encode s] parses [s] as a hexadecimal code point (digits only, no
    [0x] prefix) and returns its UTF-8 encoding as a string of 1 to 4 bytes.

    Surrogate code points (D800-DFFF) are not rejected; they are encoded as
    three bytes like any other value in that range.

    @raise Failure if [s] is not a valid hexadecimal number (raised by
    [int_of_string]).
    @raise Invalid_argument if the value lies outside 0-0x10FFFF. *)
let utf8encode s =
  (* Lead-byte marker bits for sequences of 1, 2, 3 and 4 bytes. *)
  let prefs = [| 0x0; 0xc0; 0xe0; 0xf0 |] in
  let s1 n = String.make 1 (Char.chr n) in
  (* [k] continuation bytes are already in [sofar]; [resid] holds the bits
     that still have to be emitted. *)
  let rec ienc k sofar resid =
    (* Payload bits available in the lead byte: 7, 5, 4 or 3. *)
    let bct = if k = 0 then 7 else 6 - k in
    if resid < 1 lsl bct then
      s1 (prefs.(k) + resid) ^ sofar
    else
      (* Emit the low six bits as a continuation byte and keep going. *)
      ienc (k + 1) (s1 (0x80 + resid mod 64) ^ sofar) (resid / 64)
  in
  let code = int_of_string ("0x" ^ s) in
  if code < 0 || code > 0x10FFFF then
    invalid_arg "utf8encode: code point out of range"
  else
    ienc 0 "" code
(** [decode2 s] replaces every literal escape of the form [\uXXXX] (a
    backslash, the letter [u] and exactly four hexadecimal digits) in [s]
    with the UTF-8 encoding returned by [utf8encode] for those four digits.
    All other text is copied unchanged.

    Each escape is converted on its own, so two consecutive escapes forming
    a surrogate pair are not combined into one code point. *)
let decode2 s =
  (* Four backslashes in the OCaml literal become [\\] in the regexp, which
     matches one literal backslash. A plain [\u] in the regexp would match
     only the letter [u]. *)
  let re = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
  let subst = function
    (* [u] is the 6-character match; the hex digits start at offset 2. *)
    | Str.Delim u -> utf8encode (String.sub u 2 4)
    | Str.Text t -> t
  in
  String.concat "" (List.map subst (Str.full_split re s))
它也适用于您的示例和其他一些示例:
val utf8encode : string -> string = <fun>
val decode2 : string -> string = <fun>
# decode2 "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"
# print_endline (decode2 "\\u00A2");;
¢
- : unit = ()
# print_endline (decode2 "\\u20AC");;
€
- : unit = ()