在 C# 中从 CP1251 转换为 Unicode

Convert To Unicode from CP1251 in C#

我有大量包含俄语文本的数据,但它们显示成了乱码:

"Ìèíàñÿí Ðóäèê Ñàðêèñîâ"

我需要在 C# 中将其转换为 UNICODE

"Минасян Рудик Саркисов"

如何转换?

如果真的是1251,那么代码是:

// The garbled text: every character lies in U+00C0..U+00FF, i.e. each char is really one raw byte.
string str = "Ìèíàñÿí Ðóäèê Ñàðêèñîâ";

// Encoding with ISO-8859-1 turns each char back into the byte with the same numeric value.
byte[] bytes = Encoding.GetEncoding("iso-8859-1").GetBytes(str);

// Decoding those bytes as Windows-1251 yields the intended Cyrillic text.
// NOTE: on .NET Core / .NET 5+ code page 1251 is presumably only available after
// registering CodePagesEncodingProvider - confirm for your target runtime.
string str2 = Encoding.GetEncoding(1251).GetString(bytes);

Encoding.GetEncoding("iso-8859-1").GetBytes(str) 返回“原始”(未经处理的)byte[] 数组,然后我再用 CP1251 对其进行解码。

我再附上一个小程序来“解决”这类问题。注意,这是一个 .NET Framework 程序,而不是 .NET Core 程序,因为 .NET Core 上的 Encoding.GetEncodings 存在问题。该程序会搜索可用于修复文本编码错误的编码:它先列出能够对给定全文进行编码/解码的候选编码,然后尝试将它们相互匹配。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

/// <summary>
/// Console tool that looks for encoding pairs explaining a piece of wrongly decoded text.
/// The garbled text and the expected text are each encoded with every usable encoding;
/// whenever both produce the same byte sequence, the pair of encodings is printed as a
/// candidate ("encoding of the garbled text -> encoding of the expected text").
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        Console.OutputEncoding = System.Text.Encoding.UTF8;

        var encodings = Encoding.GetEncodings();

        Console.WriteLine($"Testing {encodings.Length} encodings");
        Console.WriteLine();

        string from = "Ìèíàñÿí Ðóäèê Ñàðêèñîâ"; // "Єюээ";
        string to = "Минасян Рудик Саркисов"; // "тонн";

        var encodingsFrom = EncodeWithAll(from, encodings);
        var encodingsTo = EncodeWithAll(to, encodings);

        Console.WriteLine("Candidates:");

        // Group the target encodings by their hex dump once, instead of rescanning the
        // whole list for every source encoding. Grouping keeps the original order.
        var encodingsToByHex = encodingsTo.ToLookup(x => x.Hex);

        foreach (var encodingFrom in encodingsFrom)
        {
            foreach (var encodingTo in encodingsToByHex[encodingFrom.Hex])
            {
                Console.WriteLine($"{encodingFrom.Encoding.HeaderName} -> {encodingTo.Encoding.HeaderName}");
            }
        }
    }

    /// <summary>
    /// Prints <paramref name="text"/> and then encodes it with every encoding in
    /// <paramref name="encodings"/>, printing one hex dump per usable encoding.
    /// </summary>
    /// <param name="text">The text to encode.</param>
    /// <param name="encodings">The encodings to try.</param>
    /// <returns>
    /// The hex dump and the encoding for every encoding that was not skipped or rejected,
    /// in the order of <paramref name="encodings"/>.
    /// </returns>
    private static List<(string Hex, Encoding Encoding)> EncodeWithAll(string text, EncodingInfo[] encodings)
    {
        var results = new List<(string Hex, Encoding Encoding)>();

        Console.WriteLine(text);

        foreach (var info in encodings)
        {
            var enc = info.GetEncoding();

            // Ordinal: encoding names are identifiers, not linguistic text.
            bool unicodeEncoding = enc.BodyName.StartsWith("utf-", StringComparison.Ordinal);

            // Only single-byte encodings and the utf-* family are examined.
            if (!enc.IsSingleByte && !unicodeEncoding)
            {
                Console.WriteLine($"Skipped {enc.BodyName}");
                continue;
            }

            // Clone before changing the fallback, so the encoding object that was
            // handed out by GetEncoding() is left untouched.
            enc = (Encoding)enc.Clone();

            // We replace unknown characters with easy-to-find code 0
            // Note that this is useless for encodings that map all the characters
            // and use the 0 for something else (like utf-16 and utf-32)
            enc.EncoderFallback = new EncoderReplacementFallback("\0");

            var bytes = enc.GetBytes(text);

            // A 0 byte means at least one character hit the fallback above,
            // so this encoding cannot represent the text.
            if (!unicodeEncoding && (bytes.Length == 0 || bytes.Contains((byte)0)))
            {
                continue;
            }

            // Write in hex format
            string encodedHex = string.Join(" ", bytes.Select(x => x.ToString("x2")));

            Console.WriteLine($"{encodedHex} {enc.HeaderName}");

            results.Add((encodedHex, enc));
        }

        Console.WriteLine(string.Empty);

        return results;
    }
}