Why does cv::parallel_for_ run faster than my own implementation?
I am implementing a nearest-neighbor resize for RGB images (unsigned char type). To compare speed against OpenCV on an Android ARMv8 platform, I found that OpenCV uses cv::parallel_for_ for multi-threaded acceleration.
So I dug into the source code behind OpenCV's cv::resize(), and copied and pasted the code that actually runs into my main.cpp. It contains a functor resizeNNInvoker, and the cv::parallel_for_ call that runs this functor across multiple threads.
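For context, the OpenCV path I originally compared against corresponds, as far as I can tell, to the standard nearest-neighbor resize call (this snippet is illustrative and not part of my copied code):

```cpp
// Hypothetical baseline call: OpenCV's own resize with nearest-neighbor
// interpolation, which internally dispatches work via cv::parallel_for_.
cv::Mat dst;
cv::resize(src, dst, cv::Size(src.cols / 3, src.rows / 3), 0, 0, cv::INTER_NEAREST);
```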
What confuses me is that the cv::parallel_for_ version runs faster than my_parallel_for_, even though my_parallel_for_ is kept identical to OpenCV's code.
To be more specific:
- Tested on an Android ARMv8 platform
- OpenCV is compiled with OpenMP multi-threading; the other parallel frameworks are turned off
- Traced into OpenCV's cv::parallel_for_; its source code is the same as my_parallel_for_ (listed below)
- cv::setNumThreads(4) is used so that 4 threads run, and they are bound to 4 big CPU cores via the ncnn API (a sketch of this setup follows the list)
- All code is compiled in Release mode (via CMake)
- Test input image: width=7680, height=4320; destination image size: 7680/3 x 4320/3
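Roughly how the thread count and core binding are set up; the ncnn helper and the include path here are my assumptions about such a setup, not part of the copied OpenCV code:

```cpp
#include <opencv2/core.hpp>
#include "cpu.h"   // ncnn's CPU utility header; the include path may differ in your project

void setup_threads()
{
    cv::setNumThreads(4);         // let cv::parallel_for_ use 4 worker threads
    ncnn::set_cpu_powersave(2);   // ncnn helper: prefer/bind the big CPU cores only
}
```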
The time costs are as follows:

| Method | Time cost |
|---|---|
| cv::parallel_for_ | 3.24 ms |
| my_parallel_for_ | 7.67 ms |
| in-place OpenMP | 7.75 ms |
```cpp
// my own implementation of parallel_for_, copied from OpenCV source code
void my_parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body)
{
    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int i = range.start; i < range.end; ++i)
        body(cv::Range(i, i + 1));
}

// The functor that performs nearest neighbor resizing, copied from OpenCV source
class resizeNNInvoker : public cv::ParallelLoopBody
{
public:
    resizeNNInvoker(const cv::Mat& _src, cv::Mat& _dst, int* _x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

    virtual void operator() (const cv::Range& range) const CV_OVERRIDE
    {
        //printf("--- resizeNNInvoker get called\n");
        cv::Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }
                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const uchar* _tS = S + x_ofs[x];
                    for (int k = 0; k < pix_size; k++)
                        D[k] = _tS[k];
                }
            }
        }
    }

private:
    const cv::Mat& src;
    cv::Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

// The entry function that calls nearest neighbor resizing with OpenMP multi-threading
void resize_nearest(const uchar* src_buf, int src_height, int src_width, int src_linebytes,
                    uchar* dst_buf, int dst_height, int dst_width, int dst_linebytes,
                    const Option& opt)
{
    cv::Size src_size;
    src_size.height = src_height;
    src_size.width = src_width;
    cv::Mat src(src_size, CV_8UC3, const_cast<uchar*>(src_buf));

    cv::Size dst_size;
    dst_size.height = dst_height;
    dst_size.width = dst_width;
    cv::Mat dst(dst_size, CV_8UC3, dst_buf);

    cv::Size ssize = src.size(), dsize = dst.size();
    double inv_scale_x = (double)dsize.width/ssize.width;
    double inv_scale_y = (double)dsize.height/ssize.height;
    double fx = inv_scale_x;
    double fy = inv_scale_y;

    cv::AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs.data();
    int pix_size = (int)src.elemSize();
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    cv::Range range(0, dsize.height);

    // !! define the instance of resizeNNInvoker functor.
    resizeNNInvoker invoker(src, dst, x_ofs, ify);

#if 0
    cv::parallel_for_(range, invoker);   //!! use OpenCV's, cost 3.24 ms
#elif 0
    my_parallel_for_(range, invoker);    //!! use own implementation, cost 7.67 ms
#else
    set_omp_dynamic(1);                  //!! use in-place implementation, cost 7.75 ms
    cv::Range stripeRange = range;
    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int i = stripeRange.start; i < stripeRange.end; ++i)
        invoker(cv::Range(i, i + 1));
#endif
}
```
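For completeness, a minimal timing harness around resize_nearest might look like the sketch below; it is not part of the original code, and it assumes Option is default-constructible and that the line strides are exactly width*3 bytes:

```cpp
#include <chrono>
#include <cstdio>
#include <vector>

void benchmark_resize_nearest()
{
    const int src_w = 7680, src_h = 4320;           // input size used in the test above
    const int dst_w = src_w / 3, dst_h = src_h / 3;

    std::vector<uchar> src_buf(src_h * src_w * 3, 128);  // dummy RGB data
    std::vector<uchar> dst_buf(dst_h * dst_w * 3);

    Option opt;  // assumption: Option has a usable default constructor

    auto t0 = std::chrono::steady_clock::now();
    resize_nearest(src_buf.data(), src_h, src_w, src_w * 3,
                   dst_buf.data(), dst_h, dst_w, dst_w * 3, opt);
    auto t1 = std::chrono::steady_clock::now();

    std::printf("resize_nearest: %.2f ms\n",
                std::chrono::duration<double, std::milli>(t1 - t0).count());
}
```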
Here is the code in OpenCV that selects the actual threading framework:
```cpp
#ifdef CV_PARALLEL_FRAMEWORK
#if defined HAVE_TBB
#if TBB_INTERFACE_VERSION >= 8000
    tbbArena.execute(pbody);
#else
    pbody();
#endif
#elif defined HAVE_HPX
    pbody();
#elif defined HAVE_OPENMP
    #pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
    for (int i = stripeRange.start; i < stripeRange.end; ++i)
        pbody(Range(i, i + 1));
#elif defined HAVE_GCD
    dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
    dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);
#elif defined WINRT
    Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
#elif defined HAVE_CONCURRENCY
    if(!pplScheduler || pplScheduler->Id() == Concurrency::CurrentScheduler::Id())
    {
        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
    }
    else
    {
        pplScheduler->Attach();
        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
        Concurrency::CurrentScheduler::Detach();
    }
#elif defined HAVE_PTHREADS_PF
    parallel_for_pthreads(pbody.stripeRange(), pbody, pbody.stripeRange().size());
#else
#error You have hacked and compiling with unsupported parallel framework
#endif

    ctx.finalize();  // propagate exceptions if exists
    return;
#endif // CV_PARALLEL_FRAMEWORK
```
So this is the priority order:
- TBB (task arena)
- TBB
- HPX
- OpenMP
- Apple GCD
- WinRT Concurrency
- Windows Concurrency (PPL)
- pthreads
Maybe your OpenCV parallel_for_ uses TBB while your own code uses OpenMP?

Not sure if it is feasible, but you could try to explicitly use OpenMP from OpenCV, e.g. cv::parallel::openmp::parallel_for in C++.
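One way to confirm which backend your OpenCV build actually selected is to print the build information string, whose output contains a "Parallel framework" entry, for example:

```cpp
#include <iostream>
#include <opencv2/core.hpp>

int main()
{
    // Look for the "Parallel framework:" line (TBB, OpenMP, pthreads, ...) in this dump.
    std::cout << cv::getBuildInformation() << std::endl;

    // Number of threads OpenCV will use for cv::parallel_for_.
    std::cout << "cv::getNumThreads() = " << cv::getNumThreads() << std::endl;
    return 0;
}
```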
I finally figured it out. The OpenMP configuration in my CMakeLists.txt caused the performance mismatch.
Specifically, I build an executable test that depends on a static library libplain.a, and libplain.a was configured with the following (the previous, wrong version):
```cmake
find_package(OpenMP)
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PRIVATE ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        target_compile_options(plain PRIVATE -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        target_link_libraries(plain PRIVATE "${OpenMP_CXX_FLAGS}")
    endif()
endif()
```
Now, after changing all the PRIVATE visibility to PUBLIC, the OpenMP compile and link flags propagate correctly to the executable target test:
```cmake
target_link_libraries(plain PUBLIC ${OpenCV_LIBS})

find_package(OpenMP)
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PUBLIC ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        target_compile_options(plain PUBLIC -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        target_link_libraries(plain PUBLIC "${OpenMP_CXX_FLAGS}")
    endif()
endif()
```
With this updated CMake configuration, after rebuilding the program, my_parallel_for_ reaches almost the same speed as cv::parallel_for_.
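To double-check that the flags really reach the executable, a small runtime probe (standard OpenMP API, not part of my original code) can be compiled into test:

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

// If the OpenMP flags are not propagated to this translation unit, _OPENMP stays
// undefined and the pragma below is ignored, so the region runs on a single thread.
void check_openmp()
{
#ifdef _OPENMP
    std::printf("_OPENMP defined, omp_get_max_threads() = %d\n", omp_get_max_threads());
    #pragma omp parallel num_threads(4)
    {
        #pragma omp single
        std::printf("threads actually spawned: %d\n", omp_get_num_threads());
    }
#else
    std::printf("compiled WITHOUT OpenMP support\n");
#endif
}
```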