十六进制转unicode

Convert hexadecimal to unicode

我正在尝试从 IFC 格式文件中读取一些文本字段,但它们似乎都是十六进制的。 (结果可能是一些俄语文本)。

我可以找到描述如何转换为 Ascii 的不同帖子,并且还找到了关于如何 convert from hexadecimal to string for unicode 的答案,这是给出的代码:

/// <summary>
/// Converts a string of hexadecimal digits into bytes (two digits per byte) and decodes
/// those bytes as UTF-16 little-endian text.
/// </summary>
/// <param name="hexString">Hexadecimal digits, two per byte, with no separators or prefix.</param>
/// <returns>The text obtained by decoding the bytes with <see cref="Encoding.Unicode"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="hexString"/> is null.</exception>
/// <exception cref="FormatException">
/// <paramref name="hexString"/> has an odd number of digits or contains a non-hexadecimal character.
/// </exception>
public static string FromHexString(string hexString)
{
    if (hexString == null)
    {
        throw new ArgumentNullException(nameof(hexString));
    }

    // An odd digit count cannot be split into whole bytes; reject it instead of
    // silently dropping the trailing digit.
    if (hexString.Length % 2 != 0)
    {
        throw new FormatException("Hex string must contain an even number of digits.");
    }

    var bytes = new byte[hexString.Length / 2];
    for (var i = 0; i < bytes.Length; i++)
    {
        bytes[i] = Convert.ToByte(hexString.Substring(i * 2, 2), 16);
    }

    // Encoding.Unicode is UTF-16 little-endian, so "480065006C006C006F00" yields "Hello".
    // Single-byte (ASCII) hex such as "48656C6C6F" or big-endian UTF-16 hex such as
    // "00480065" is NOT decoded correctly by this encoding.
    return Encoding.Unicode.GetString(bytes);
}

但它对我不起作用,使用这个函数,这是我得到的结果:

原始字符串看起来像 \X2\0420043504310440043E\X0\。我不知道 \X2\ 和 \X0\ 是什么意思,但我猜(也许猜错了?)它们是 IFC 格式特有的、用来标明编码的标记。

编码使用 Big Endian Unicode:

// Encoding.BigEndianUnicode is UTF-16 with the most significant byte first, so every
// group of four hex digits (for example "0420") is read as one UTF-16 code unit.
return Encoding.BigEndianUnicode.GetString(bytes);

已使用您的字符串以及 http://www.steptools.com/stds/step/IS_final_p21e3.html 中的测试用例进行了测试。

请注意,整个 IFC 格式非常复杂。要编写一个完整的解码器来支持各种 \S\(字符)、\P(字母)\、\X2\(十六进制)、\X4\(十六进制)、\X\(十六进制)转义(外加结束标记 \X0\)并不简单。文档中关于 \X4\ 的示例甚至有误(给出的是 7 个十六进制数字而不是 8 个),而且整个文件在转义序列之外似乎应该采用 UTF-8 编码。

啊啊啊完成了:

一些测试:

// The \P?\ directive selects an ISO 8859-2..9 code page. On .NET Core / .NET 5+ those
// encodings are only available after installing the NuGet package
// https://www.nuget.org/packages/System.Text.Encoding.CodePages/
// and running this line once at start-up:
//Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
// Nothing is needed with .NET Framework.

// String taken from the question: five Cyrillic letters encoded with \X2\.
string strExampleUnquoted = ItfStringDecoder.DecodeItfString(@"\X2\0420043504310440043E\X0\"); // "Ребро"

// Unquoted literals (content only, without the enclosing apostrophes).
string str1Unquoted = ItfStringDecoder.DecodeItfString(@"CAT");                            // CAT
string str2Unquoted = ItfStringDecoder.DecodeItfString(@"Don''t");                         // Don't
string str3Unquoted = ItfStringDecoder.DecodeItfString(@"''");                             // '
string str4Unquoted = ItfStringDecoder.DecodeItfString(@"");                               // (empty)
string str5Unquoted = ItfStringDecoder.DecodeItfString(@"\S\Drger");                       // Ärger
string str6Unquoted = ItfStringDecoder.DecodeItfString(@"h\S\ttel");                       // hôtel
string str7Unquoted = ItfStringDecoder.DecodeItfString(@"\PE\S\*\S\U\S\b");                // Њет (ISO 8859-5)
string str8Unquoted = ItfStringDecoder.DecodeItfString(@"\X2\03C0\X0\");                   // π
string str9Unquoted = ItfStringDecoder.DecodeItfString(@"\X2\03B103B203B3\X0\");           // αβγ
string str10Unquoted = ItfStringDecoder.DecodeItfString(@"\X4\0001F600\X0\");              // U+1F600
string str11Unquoted = ItfStringDecoder.DecodeItfString(@"\X4\0001F6000001F638\X0\");      // U+1F600 U+1F638
string str12Unquoted = ItfStringDecoder.DecodeItfString(@"see \X\A7 4.1");                 // see § 4.1
string str13Unquoted = ItfStringDecoder.DecodeItfString(@"line one\X\0Aline two");         // two lines separated by LF

// The same literals including the enclosing apostrophes (quoted: true).
string str1Quoted = ItfStringDecoder.DecodeItfString(@"'CAT'", true);
string str2Quoted = ItfStringDecoder.DecodeItfString(@"'Don''t'", true);
string str3Quoted = ItfStringDecoder.DecodeItfString(@"''''", true);
string str4Quoted = ItfStringDecoder.DecodeItfString(@"''", true);
string str5Quoted = ItfStringDecoder.DecodeItfString(@"'\S\Drger'", true);
string str6Quoted = ItfStringDecoder.DecodeItfString(@"'h\S\ttel'", true);
string str7Quoted = ItfStringDecoder.DecodeItfString(@"'\PE\S\*\S\U\S\b'", true);
string str8Quoted = ItfStringDecoder.DecodeItfString(@"'\X2\03C0\X0\'", true);
string str9Quoted = ItfStringDecoder.DecodeItfString(@"'\X2\03B103B203B3\X0\'", true);
string str10Quoted = ItfStringDecoder.DecodeItfString(@"'\X4\0001F600\X0\'", true);
string str11Quoted = ItfStringDecoder.DecodeItfString(@"'\X4\0001F6000001F638\X0\'", true);
string str12Quoted = ItfStringDecoder.DecodeItfString(@"'see \X\A7 4.1'", true);
string str13Quoted = ItfStringDecoder.DecodeItfString(@"'line one\X\0Aline two'", true);

解码器:

/// <summary>
/// Decodes string literals that use the ISO 10303-21 (STEP physical file) escape
/// conventions, as found in IFC files.
/// </summary>
/// <remarks>
/// Supported constructs:
/// <list type="bullet">
/// <item><description><c>''</c> - a literal apostrophe.</description></item>
/// <item><description><c>\\</c> - a literal backslash.</description></item>
/// <item><description><c>\S\c</c> - the character whose code is <c>c + 128</c> in the current ISO 8859 code page.</description></item>
/// <item><description><c>\PA\</c> .. <c>\PI\</c> - selects ISO 8859-1 .. ISO 8859-9 for subsequent <c>\S\</c> escapes.</description></item>
/// <item><description><c>\X\hh</c> - one character given as two hex digits (0..255).</description></item>
/// <item><description><c>\X2\hhhh...\X0\</c> - one or more UTF-16 code units, four hex digits each.</description></item>
/// <item><description><c>\X4\hhhhhhhh...\X0\</c> - one or more Unicode code points, eight hex digits each.</description></item>
/// </list>
/// Code pages other than ISO 8859-1 are obtained through <see cref="Encoding.GetEncoding(int)"/>;
/// whether they are available depends on the encoding providers registered in the process.
/// </remarks>
public class ItfStringDecoder
{
    /// <summary>
    /// Punctuation (plus space) accepted after <c>\S\</c>, in addition to ASCII letters and digits.
    /// </summary>
    private const string PageEscapePunctuation = " _!\"*$%&.#+,-()?/:;<=>@[]{|}^`~\\'";

    /// <summary>
    /// Decodes an encoded string supplied as UTF-8 bytes.
    /// </summary>
    /// <param name="bytes">UTF-8 bytes of the encoded string.</param>
    /// <param name="quoted">true = 'XYZ', false = XYZ</param>
    /// <returns>The decoded text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="FormatException">The string is malformed.</exception>
    public static string DecodeItfString(byte[] bytes, bool quoted = false)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }

        return DecodeItfString(Encoding.UTF8.GetString(bytes), quoted);
    }

    /// <summary>
    /// Decodes an encoded string.
    /// </summary>
    /// <param name="str">The encoded string.</param>
    /// <param name="quoted">
    /// true = 'XYZ' (the enclosing apostrophes are required and stripped), false = XYZ
    /// </param>
    /// <returns>The decoded text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="FormatException">
    /// The string is malformed: missing enclosing apostrophes, an unpaired apostrophe,
    /// an unknown or truncated escape, a non-hex digit, or an invalid code point.
    /// </exception>
    public static string DecodeItfString(string str, bool quoted = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException(nameof(str));
        }

        // null stands for ISO 8859-1, the initial code page, which needs no Encoding object.
        Encoding encoding = null;

        int start = 0;
        int end = str.Length - 1;

        if (quoted)
        {
            if (str.Length == 0 || str[0] != '\'')
            {
                throw new FormatException("Malformed string, non starting with \"'\"");
            }

            // Length < 2 means the single apostrophe would serve as both the opening
            // and the closing quote, which is not a valid literal.
            if (str.Length < 2 || str[str.Length - 1] != '\'')
            {
                throw new FormatException("Malformed string, non ending with \"'\"");
            }

            start = 1;
            end = str.Length - 2;
        }

        var sb = new StringBuilder(str.Length);

        for (int i = start; i <= end; i++)
        {
            char ch0 = str[i];

            if (ch0 == '\'')
            {
                // Inside the literal an apostrophe must be doubled.
                if (i + 1 > end || str[i + 1] != '\'')
                {
                    throw new FormatException($"Malformed string, \"'\" not followed by \"'\" at position {i}");
                }

                sb.Append('\'');
                i++;
            }
            else if (ch0 == '\\')
            {
                if (i + 1 > end)
                {
                    throw new FormatException($"Malformed string, \"\\\" not followed by legal character at position {i}");
                }

                // Each helper returns how many characters to skip in addition to the
                // loop's own i++, so that i ends on the last character of the escape.
                switch (str[i + 1])
                {
                    case '\\':
                        sb.Append('\\');
                        i++;
                        break;
                    case 'S':
                        i += DecodeItfStringPage(str, i, end, sb, encoding);
                        break;
                    case 'P':
                        i += DecodeItfStringAlphabet(str, i, end, out encoding);
                        break;
                    case 'X':
                        i += DecodeItfStringExtendedOrArbitary(str, i, end, sb);
                        break;
                    default:
                        throw new FormatException($"Malformed string, \"\\\" followed by illegal character at position {i}");
                }
            }
            else
            {
                sb.Append(ch0);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Handles <c>\S\c</c>: appends the character with code <c>c + 128</c> in the current code page.
    /// </summary>
    /// <param name="str">The encoded string.</param>
    /// <param name="i">Index of the backslash that starts the escape.</param>
    /// <param name="end">Index of the last character that belongs to the content.</param>
    /// <param name="sb">Receives the decoded character.</param>
    /// <param name="encoding">Current code page; null means ISO 8859-1.</param>
    /// <returns>Number of characters to skip after <paramref name="i"/> (always 3).</returns>
    private static int DecodeItfStringPage(string str, int i, int end, StringBuilder sb, Encoding encoding)
    {
        if (i + 3 > end || str[i + 2] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\S\" not followed by legal character at position {i}");
        }

        char ch3 = str[i + 3];

        bool legal =
            (ch3 >= '0' && ch3 <= '9') ||
            (ch3 >= 'a' && ch3 <= 'z') ||
            (ch3 >= 'A' && ch3 <= 'Z') ||
            PageEscapePunctuation.IndexOf(ch3) >= 0;

        if (!legal)
        {
            throw new FormatException($"Malformed string, \"\\S\" not followed by legal character at position {i}");
        }

        if (encoding == null)
        {
            // ISO 8859-1 maps 1:1 onto the first 256 Unicode code points.
            sb.Append((char)(ch3 + 128));
        }
        else
        {
            // Single byte on the stack, no array allocation.
            ReadOnlySpan<byte> bytes = stackalloc byte[] { (byte)(ch3 + 128) };
            sb.Append(encoding.GetString(bytes));
        }

        return 3;
    }

    /// <summary>
    /// Handles <c>\PA\</c> .. <c>\PI\</c>: selects the code page used by later <c>\S\</c> escapes.
    /// </summary>
    /// <param name="str">The encoded string.</param>
    /// <param name="i">Index of the backslash that starts the escape.</param>
    /// <param name="end">Index of the last character that belongs to the content.</param>
    /// <param name="encoding">Receives the selected code page; null for ISO 8859-1 (<c>\PA\</c>).</param>
    /// <returns>Number of characters to skip after <paramref name="i"/> (always 3).</returns>
    private static int DecodeItfStringAlphabet(string str, int i, int end, out Encoding encoding)
    {
        if (i + 3 > end || str[i + 3] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\P\" not followed by legal character at position {i}");
        }

        char ch2 = str[i + 2];

        if (ch2 < 'A' || ch2 > 'I')
        {
            throw new FormatException($"Malformed string, \"\\P\" not followed by legal character at position {i}");
        }

        int ix = ch2 - 'A';

        // Windows code page 28591 is ISO 8859-1, 28592 is ISO 8859-2, and so on.
        // ISO 8859-1 itself is represented by null (no Encoding object needed).
        encoding = ix == 0 ? null : Encoding.GetEncoding(28591 + ix);

        return 3;
    }

    /// <summary>
    /// Handles the <c>\X\hh</c>, <c>\X2\...\X0\</c> and <c>\X4\...\X0\</c> escapes.
    /// </summary>
    /// <param name="str">The encoded string.</param>
    /// <param name="i">Index of the backslash that starts the escape.</param>
    /// <param name="end">Index of the last character that belongs to the content.</param>
    /// <param name="sb">Receives the decoded characters.</param>
    /// <returns>
    /// Number of characters to skip after <paramref name="i"/> so that the caller ends on
    /// the last character of the escape.
    /// </returns>
    private static int DecodeItfStringExtendedOrArbitary(string str, int i, int end, StringBuilder sb)
    {
        // The shortest legal form, \X\hh, is five characters long.
        if (i + 4 > end)
        {
            throw new FormatException($"Malformed string, \"\\X\" not followed by legal character at position {i}");
        }

        switch (str[i + 2])
        {
            case '\\':
                return DecodeSingleByteEscape(str, i, sb);
            case '2':
                return DecodeUtf16Run(str, i, end, sb);
            case '4':
                return DecodeUtf32Run(str, i, end, sb);
            default:
                throw new FormatException($"Malformed string, \"\\X\" not followed by legal character at position {i}");
        }
    }

    /// <summary>Decodes <c>\X\hh</c> into the character with code 0xhh.</summary>
    private static int DecodeSingleByteEscape(string str, int i, StringBuilder sb)
    {
        if (!TryFromHex(str[i + 3], out byte high) || !TryFromHex(str[i + 4], out byte low))
        {
            throw new FormatException($"Malformed string, \"\\X\\\" not followed by legal character at position {i}");
        }

        sb.Append((char)(high * 16 + low));

        return 4;
    }

    /// <summary>Decodes <c>\X2\hhhh...\X0\</c>: each group of four hex digits is one UTF-16 code unit.</summary>
    private static int DecodeUtf16Run(string str, int i, int end, StringBuilder sb)
    {
        if (str[i + 3] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\X2\" not followed by legal character at position {i}");
        }

        int j = i + 4;

        while (true)
        {
            if (j + 3 > end)
            {
                throw new FormatException($"Malformed string, \"\\X2\" not followed by legal sequence of characters at position {j}");
            }

            if (!TryReadHex16(str, j, out int codeUnit))
            {
                throw new FormatException($"Malformed string, \"\\X2\\\" not followed by legal character at position {j}");
            }

            sb.Append((char)codeUnit);
            j += 4;

            // What follows must be either another group or the four-character terminator.
            if (j + 3 > end)
            {
                throw new FormatException($"Malformed string, \"\\X2\" not followed by legal sequence of characters at position {j}");
            }

            if (str[j] == '\\')
            {
                if (IsRunTerminator(str, j))
                {
                    return j + 3 - i;
                }

                throw new FormatException($"Malformed string, \"\\X2\" not followed by legal sequence of characters at position {j}");
            }
        }
    }

    /// <summary>Decodes <c>\X4\hhhhhhhh...\X0\</c>: each group of eight hex digits is one Unicode code point.</summary>
    private static int DecodeUtf32Run(string str, int i, int end, StringBuilder sb)
    {
        if (str[i + 3] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\X4\" not followed by legal character at position {i}");
        }

        int j = i + 4;

        while (true)
        {
            if (j + 7 > end)
            {
                throw new FormatException($"Malformed string, \"\\X4\" not followed by legal sequence of characters at position {j}");
            }

            if (!TryReadHex16(str, j, out int high))
            {
                throw new FormatException($"Malformed string, \"\\X4\\\" not followed by legal character at position {j}");
            }

            if (!TryReadHex16(str, j + 4, out int low))
            {
                throw new FormatException($"Malformed string, \"\\X4\\\" not followed by legal character at position {j + 4}");
            }

            // Validate before calling char.ConvertFromUtf32, which would otherwise throw
            // ArgumentOutOfRangeException for values above U+10FFFF or inside the
            // surrogate range. Checking 'high' first also avoids a negative int after
            // the shift.
            int codePoint = high <= 0x10 ? (high << 16) | low : -1;

            if (codePoint < 0 || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
            {
                throw new FormatException($"Malformed string, \"\\X4\\\" followed by invalid code point at position {j}");
            }

            sb.Append(char.ConvertFromUtf32(codePoint));
            j += 8;

            // What follows must be either another group or the four-character terminator.
            if (j + 3 > end)
            {
                throw new FormatException($"Malformed string, \"\\X4\" not followed by legal sequence of characters at position {j}");
            }

            if (str[j] == '\\')
            {
                if (IsRunTerminator(str, j))
                {
                    return j + 3 - i;
                }

                throw new FormatException($"Malformed string, \"\\X4\" not followed by legal sequence of characters at position {j}");
            }
        }
    }

    /// <summary>
    /// Returns true when the four characters starting at <paramref name="index"/> are <c>\X0\</c>.
    /// The caller guarantees that <c>index + 3</c> is inside the string.
    /// </summary>
    private static bool IsRunTerminator(string str, int index)
    {
        return str[index] == '\\' && str[index + 1] == 'X' && str[index + 2] == '0' && str[index + 3] == '\\';
    }

    /// <summary>
    /// Parses four hex digits starting at <paramref name="index"/> into a 16-bit value.
    /// The caller guarantees that <c>index + 3</c> is inside the string.
    /// </summary>
    /// <returns>false when any of the four characters is not a hex digit.</returns>
    private static bool TryReadHex16(string str, int index, out int value)
    {
        value = 0;

        for (int k = 0; k < 4; k++)
        {
            if (!TryFromHex(str[index + k], out byte digit))
            {
                value = 0;
                return false;
            }

            value = (value << 4) | digit;
        }

        return true;
    }

    /// <summary>
    /// Converts one hex digit (0-9, A-F, a-f) to its numeric value.
    /// </summary>
    /// <returns>false, with <paramref name="value"/> set to 0, when <paramref name="ch"/> is not a hex digit.</returns>
    private static bool TryFromHex(char ch, out byte value)
    {
        if (ch >= '0' && ch <= '9')
        {
            value = (byte)(ch - '0');
            return true;
        }

        if (ch >= 'A' && ch <= 'F')
        {
            value = (byte)(10 + ch - 'A');
            return true;
        }

        if (ch >= 'a' && ch <= 'f')
        {
            value = (byte)(10 + ch - 'a');
            return true;
        }

        value = 0;
        return false;
    }
}