有没有更有效的方法来比较两个列表的项目并找到 leftOuterItems、rightOuterItems 和 matchedItems?

Is there a more efficient way to compare the items of two lists and find leftOuterItems, rightOuterItems and matchedItems?

我们经常需要一种方法来比较两个列表的项目,找出哪些项目只存在于ListA(leftOuterItems),哪些项目仅存在于ListB(rightOuterItems)及其共同项目(matchedItems)...

我最终得到了两个解决方案,如下所示:

一种方法是对列表进行排序并逐个迭代(当集合由于排序而导致项目过多时会降低性能),另一种方法是使用字典和散列(比当集合有几个项目时的第一种方式 - 由于内存分配等)

*另外请记住,我想比较两个对象列表,例如两个 class 人(不仅仅是基元)列表。这就是我创建通用扩展方法的原因

那么,你有什么更好的建议吗?

在此先感谢您!

class Program
{
    static void Main(string[] args)
    {
        var fewItemsList1 = new[] { 1, 4, 2, 3, 7, 6, 9, 5 };
        var fewItemsList2 = new[] { 15, 5, 14, 6, 13, 7, 12, 8, 11, 9, 10 };
        Run(100_000, fewItemsList1, fewItemsList2);

        var manyItemsList1 = Enumerable.Range(0, 100_000).ToArray();
        var manyItemsList2 = Enumerable.Range(50000, 150_000).ToArray();
        Run(1000, manyItemsList1, manyItemsList2);

        Console.WriteLine("Hello World!");
        Console.Read();
    }

    private static void Run(int count, int[] l1, int[] l2)
    {
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < count; i++)
            l1.OrderedCompare(l2, x => x, x => x, out int[] leftOuterItems, out int[] rightOuterItems, out (int, int)[] matchedItems);
        sw.Stop();

        Console.WriteLine($"OrderedCompare for {count} iterations with L1 items:{l1.Count()} and L2 items:{l2.Count()} took {sw.Elapsed}");

        sw.Restart();
        for (int i = 0; i < count; i++)
            l1.HashedCompare(l2, x => x, x => x, out int[] leftOuterItems2, out int[] rightOuterItems2, out (int, int)[] matchedItems2);
        Console.WriteLine($"HashedCompare for {count} with L1 items:{l1.Count()} and L2 items:{l2.Count()} iterations took {sw.Elapsed}");
    }
}

public static class Extensions
{
    public static void OrderedCompare<T1, T2, TKey>(
        this IEnumerable<T1> source,
        IEnumerable<T2> target,
        Func<T1, TKey> sourceKeyGetter,
        Func<T2, TKey> targetKeyGetter,
        out T1[] leftOuterItems,
        out T2[] rightOuterItems,
        out (T1, T2)[] matchedItems) where TKey : IComparable<TKey>
    {
        var leftOuterItemsList = new List<T1>();
        var rightOuterItemsList = new List<T2>();
        var matchedItemsList = new List<(T1, T2)>();
        source = source.OrderBy(x => sourceKeyGetter(x)).ToArray();
        target = target.OrderBy(x => targetKeyGetter(x)).ToArray();

        bool reverseCompare = false;
        int i = 0, j = 0, sourcZeroBasedCount = source.Count() - 1, targetZeroBaseCount = target.Count() - 1;
        while (true)
        {
            var end = i == sourcZeroBasedCount && j == targetZeroBaseCount;
            var sourceItem = source.ElementAt(i);
            var targetItem = target.ElementAt(j);
            var sourceKey = sourceKeyGetter(sourceItem);
            var targetKey = targetKeyGetter(targetItem);

            int diff = reverseCompare ? targetKey.CompareTo(sourceKey) : sourceKey.CompareTo(targetKey);
            reverseCompare = i == sourcZeroBasedCount || j == targetZeroBaseCount;
            switch (diff)
            {
                case -1:
                    leftOuterItemsList.Add(sourceItem);
                    i = i < sourcZeroBasedCount ? i + 1 : i;
                    break;
                case 0:
                    matchedItemsList.Add((sourceItem, targetItem));
                    i = i < sourcZeroBasedCount ? i + 1 : i;
                    j = j < targetZeroBaseCount ? j + 1 : j;
                    break;
                case 1:
                    rightOuterItemsList.Add(targetItem);
                    j = j < targetZeroBaseCount ? j + 1 : j;
                    break;
            }

            if (end)
                break;
        }

        leftOuterItems = leftOuterItemsList.ToArray();
        rightOuterItems = rightOuterItemsList.ToArray();
        matchedItems = matchedItemsList.ToArray();
    }

    public static void HashedCompare<T1, T2, TKey>(
        this IEnumerable<T1> source,
        IEnumerable<T2> target,
        Func<T1, TKey> sourceKeyGetter,
        Func<T2, TKey> targetKeyGetter,
        out T1[] leftOuterItems,
        out T2[] rightOuterItems,
        out (T1, T2)[] matchedItems) where TKey : IComparable<TKey>
    {
        var sourceDic = source.ToDictionary(x => sourceKeyGetter(x));
        var targetDic = target.ToDictionary(x => targetKeyGetter(x));

        var leftOuterKeys = sourceDic.Keys.Except(targetDic.Keys).ToArray();
        var rightOuterKeys = targetDic.Keys.Except(sourceDic.Keys).ToArray();
        var matchedKeys = sourceDic.Keys.Concat(targetDic.Keys).Except(leftOuterKeys.Concat(rightOuterKeys)).ToArray();

        leftOuterItems = leftOuterKeys.Select(key => sourceDic[key]).ToArray();
        rightOuterItems = rightOuterKeys.Select(key => targetDic[key]).ToArray();
        matchedItems = matchedKeys.Select(key => (sourceDic[key], targetDic[key])).ToArray();
    }
}

您可以使用 ExceptIntersect,它们都可以在 Sets(轻量级哈希集)上工作,并且可以工作 O(n) 线性时间复杂度

var list1 = new[] { 1, 4, 2, 3, 7, 6, 9, 5 };
var List2 = new[] { 15, 5, 14, 6, 13, 7, 12, 8, 11, 9, 10 };
Console.WriteLine(string.Join(", ", list1.Except(List2)));
Console.WriteLine(string.Join(", ", List2.Except(list1)));
Console.WriteLine(string.Join(", ", List2.Intersect(list1)));

输出

1, 4, 2, 3
15, 14, 13, 12, 8, 11, 10
5, 6, 7, 9

至于它是快还是慢,你必须基准测试,但我的直觉是它们会更有效率和更快。


关于基准测试

使用经过可靠测试的框架,例如 BenchmarkDotNet,如果您自己滚动,您可能会以无限种方式得到错误的结果

HashedCompare() 中的大部分低效率归因于字典中不必要的枚举和查找。如果您以命令式风格编写算法,则可以避免所有这些,并且代码变得更容易理解:

我支持 @00110001 建议您应该使用适当的基准测试框架,因为不同实现之间的差异具有相同的复杂程度。

public static void HashedCompare<T1, T2, TKey>(
    this IEnumerable<T1> source,
    IEnumerable<T2> target,
    Func<T1, TKey> sourceKeyGetter,
    Func<T2, TKey> targetKeyGetter,
    out List<T1> leftOuterItems,
    out List<T2> rightOuterItems,
    out List<(T1, T2)> matchedItems) where TKey : IEquatable<TKey>
{
    var sourceItems = source.ToDictionary(x => sourceKeyGetter(x));
    var targetItems = target.ToDictionary(x => targetKeyGetter(x));

    matchedItems = new List<(T1, T2)>();
    leftOuterItems = new List<T1>();
    rightOuterItems = new List<T2>();
    foreach (var sourceItem in sourceItems)
    {
        if (targetItems.TryGetValue(sourceItem.Key, out var targetItem))
            matchedItems.Add((sourceItem.Value, targetItem));
        else
            leftOuterItems.Add(sourceItem.Value);
    }

    foreach (var targetItem in targetItems)
    {
        if (!sourceItems.ContainsKey(targetItem.Key))
            rightOuterItems.Add(targetItem.Value);
    }
}