为什么 cv::parallel_for_ 运行得比我自己的实现更快?

Why does cv::parallel_for_ run faster than my own implementation?

我正在实现针对 RGB 图像(unsigned char 类型)的最近邻缩放(resize)算法。在 Android ARMv8 平台上与 OpenCV 做速度对比时,我发现 OpenCV 使用 cv::parallel_for_ 进行多线程加速。

因此,我深入研究了 OpenCV 中 cv::resize() 对应的源代码,把实际运行到的那部分代码复制粘贴到我的 main.cpp 中。它包含一个仿函数 resizeNNInvoker,以及在这个仿函数上执行多线程计算的 cv::parallel_for_。

让我感到困惑的是:cv::parallel_for_ 版本运行得比 my_parallel_for_ 版本更快,而后者的代码与 OpenCV 的代码是一致的。

为了更清楚:

时间成本如下:

方法 | 耗时
cv::parallel_for_ | 3.24 毫秒
my_parallel_for_ | 7.67 毫秒
就地(inplace)OpenMP 实现 | 7.75 毫秒
// Hand-written stand-in for cv::parallel_for_, modelled on OpenCV's OpenMP code path:
// every index of `range` is passed to `body` as its own one-element sub-range, and the
// iterations are scheduled dynamically on a team for which 4 threads are requested.
void my_parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body)
{
    const int first = range.start;
    const int last = range.end;

    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int idx = first; idx < last; ++idx)
    {
        const cv::Range single(idx, idx + 1);
        body(single);
    }
}

// Nearest-neighbour resize kernel (adapted from the OpenCV sources), packaged as a
// cv::ParallelLoopBody so that a parallel-for driver can hand it ranges of destination rows.
class resizeNNInvoker : public cv::ParallelLoopBody
{
public:
    // _src   : source image (held by reference)
    // _dst   : destination image (held by reference)
    // _x_ofs : for each destination column, the byte offset of the source pixel inside a source row
    // _ify   : multiplier that turns a destination row index into a source row index
    resizeNNInvoker(const cv::Mat& _src, cv::Mat &_dst, int *_x_ofs, double _ify)
        : ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), ify(_ify)
    {
    }

    // Fills the destination rows [range.start, range.end).
    virtual void operator() (const cv::Range& range) const CV_OVERRIDE
    {
        const int outWidth = dst.size().width;
        const int maxSrcRow = src.size().height - 1;
        const int bytesPerPixel = (int)src.elemSize();

        for (int row = range.start; row < range.end; ++row)
        {
            uchar* out = dst.data + dst.step*row;
            // Source row for this destination row, clamped to the last valid row.
            const uchar* in = src.ptr(std::min(cvFloor(row*ify), maxSrcRow));
            int col = 0;

            // One specialised copy loop per pixel size; the default branch copies byte by byte.
            switch (bytesPerPixel)
            {
            case 1:
                // Two pixels per iteration, then at most one leftover pixel.
                for (; col <= outWidth - 2; col += 2)
                {
                    const uchar first = in[x_ofs[col]];
                    const uchar second = in[x_ofs[col + 1]];
                    out[col] = first;
                    out[col + 1] = second;
                }
                for (; col < outWidth; ++col)
                {
                    out[col] = in[x_ofs[col]];
                }
                break;

            case 2:
            {
                ushort* out16 = reinterpret_cast<ushort*>(out);
                for (; col < outWidth; ++col)
                {
                    out16[col] = *reinterpret_cast<const ushort*>(in + x_ofs[col]);
                }
                break;
            }

            case 3:
                for (; col < outWidth; ++col, out += 3)
                {
                    const uchar* px = in + x_ofs[col];
                    out[0] = px[0];
                    out[1] = px[1];
                    out[2] = px[2];
                }
                break;

            case 4:
            {
                int* out32 = reinterpret_cast<int*>(out);
                for (; col < outWidth; ++col)
                {
                    out32[col] = *reinterpret_cast<const int*>(in + x_ofs[col]);
                }
                break;
            }

            case 6:
                for (; col < outWidth; ++col, out += 6)
                {
                    const ushort* from = reinterpret_cast<const ushort*>(in + x_ofs[col]);
                    ushort* to = reinterpret_cast<ushort*>(out);
                    to[0] = from[0];
                    to[1] = from[1];
                    to[2] = from[2];
                }
                break;

            case 8:
                for (; col < outWidth; ++col, out += 8)
                {
                    const int* from = reinterpret_cast<const int*>(in + x_ofs[col]);
                    int* to = reinterpret_cast<int*>(out);
                    to[0] = from[0];
                    to[1] = from[1];
                }
                break;

            case 12:
                for (; col < outWidth; ++col, out += 12)
                {
                    const int* from = reinterpret_cast<const int*>(in + x_ofs[col]);
                    int* to = reinterpret_cast<int*>(out);
                    to[0] = from[0];
                    to[1] = from[1];
                    to[2] = from[2];
                }
                break;

            default:
                for (; col < outWidth; ++col, out += bytesPerPixel)
                {
                    const uchar* px = in + x_ofs[col];
                    for (int b = 0; b < bytesPerPixel; ++b)
                    {
                        out[b] = px[b];
                    }
                }
                break;
            }
        }
    }

private:
    const cv::Mat& src;   // source image
    cv::Mat& dst;         // destination image, written row by row
    int* x_ofs;           // per-destination-column byte offsets into a source row
    double ify;           // destination-row -> source-row multiplier

    // Copy operations are declared private (no definition here) so the functor is not copied.
    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

// The entry function that calls nearest neighbor resizing with openmp multi-thread
void resize_nearest(const uchar* src_buf, int src_height, int src_width, int src_linebytes, uchar* dst_buf, int dst_height, int dst_width, int dst_linebytes, const Option& opt)
{
    cv::Size src_size;
    src_size.height = src_height;
    src_size.width = src_width;
    cv::Mat src(src_size, CV_8UC3, const_cast<uchar*>(src_buf));

    cv::Size dst_size;
    dst_size.height = dst_height;
    dst_size.width = dst_width;
    cv::Mat dst(dst_size, CV_8UC3, dst_buf);

    cv::Size ssize = src.size(), dsize = dst.size();

    double inv_scale_x = (double)dsize.width/ssize.width;
    double inv_scale_y = (double)dsize.height/ssize.height;
    double fx = inv_scale_x;
    double fy = inv_scale_y;

    cv::AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs.data();
    int pix_size = (int)src.elemSize();
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    cv::Range range(0, dsize.height);

    // !! define the instance of resizeNNInvoker functor.
    resizeNNInvoker invoker(src, dst, x_ofs, ify);

#if 0
    cv::parallel_for_(range, invoker);   //!! use opencv's, cost 3.24 ms
#elif 0
    my_parallel_for_(range, invoker);    //!! use own implementation, cost 7.67 ms
#else
    set_omp_dynamic(1);    //!! use inplace-implementation, cost 7.75 ms
    cv::Range stripeRange = range;
    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int i = stripeRange.start; i < stripeRange.end; ++i)
        invoker(cv::Range(i, i + 1));
#endif
}

下面是 OpenCV 中用于选择实际线程框架的代码:

// Backend selection (excerpt from inside an OpenCV function): the preprocessor keeps
// exactly one branch of the #if/#elif chain below — the first one whose macro is defined.
#ifdef CV_PARALLEL_FRAMEWORK
#if defined HAVE_TBB

#if TBB_INTERFACE_VERSION >= 8000
        // TBB interface version >= 8000: run the body through tbbArena.
        tbbArena.execute(pbody);
#else
        // Older TBB: invoke the body directly.
        pbody();
#endif

#elif defined HAVE_HPX
        // HPX: invoke the body directly.
        pbody();

#elif defined HAVE_OPENMP

        // OpenMP: one single-index Range per iteration, dynamic schedule; the thread
        // count is numThreads when it is positive, otherwise numThreadsMax.
        #pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
        for (int i = stripeRange.start; i < stripeRange.end; ++i)
            pbody(Range(i, i + 1));

#elif defined HAVE_GCD

        // Grand Central Dispatch: apply block_function once per stripe index on the
        // default-priority global queue.
        dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
        dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);

#elif defined WINRT

        // WinRT: Concurrency runtime parallel_for over the stripe indices.
        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);

#elif defined HAVE_CONCURRENCY

        // Concurrency runtime: if pplScheduler is set and is not already the current
        // scheduler, attach it around the parallel_for call and detach afterwards.
        if(!pplScheduler || pplScheduler->Id() == Concurrency::CurrentScheduler::Id())
        {
            Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
        }
        else
        {
            pplScheduler->Attach();
            Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
            Concurrency::CurrentScheduler::Detach();
        }

#elif defined HAVE_PTHREADS_PF

        // pthreads-based parallel-for implementation.
        parallel_for_pthreads(pbody.stripeRange(), pbody, pbody.stripeRange().size());

#else

#error You have hacked and compiling with unsupported parallel framework

#endif

        ctx.finalize();  // propagate exceptions if exists
        return;
#endif // CV_PARALLEL_FRAMEWORK

所以这是优先顺序:

  1. TBB 任务竞技场(task arena)
  2. TBB
  3. HPX
  4. OpenMP
  5. Apple GCD
  6. WinRT 并发(Concurrency)
  7. Windows 并发(Concurrency)
  8. PThreads

也许您的 opencv parallel_for 使用 TBB 而您的代码使用 OpenMP?

不确定是否可行,但您可以尝试显式地使用 opencv 中的 openmp,例如 C++ 中的 cv::parallel::openmp::parallel_for

我终于明白了。我在 CMakeLists.txt 中的 OpenMP 配置导致性能不匹配。

具体来说,我构建的可执行文件 test 依赖静态库 libplain.a,而 libplain.a 原先使用的是下面这份配置(这份旧配置是错误的):

# Old (problematic) configuration: OpenMP flags are attached to `plain` as PRIVATE in
# several places, so they are not propagated as compile options to targets that link
# against `plain`.
find_package(OpenMP)
# No imported target OpenMP::OpenMP_CXX, but OpenMP was found: add the raw compiler
# flags by hand — PRIVATE, i.e. only for plain's own sources.
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PRIVATE ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        # Android NDK major version above 20: pass -fopenmp explicitly (compile option is
        # PRIVATE here) and add -fopenmp -static-openmp to the link line.
        target_compile_options(plain PRIVATE -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        # Imported target available: link against it.
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        # Fallback: hand the raw OpenMP flags to the linker (PRIVATE here).
        target_link_libraries(plain PRIVATE "${OpenMP_CXX_FLAGS}")
    endif()
endif()

现在将所有 PRIVATE 可见性改为 PUBLIC,这样 OpenMP 的编译和链接标志就能正确传播到可执行目标 test:

# Fixed configuration: every OpenMP-related option on `plain` is PUBLIC, so targets that
# link against `plain` inherit the same compile and link flags.
target_link_libraries(plain PUBLIC ${OpenCV_LIBS})

find_package(OpenMP)
# No imported target OpenMP::OpenMP_CXX, but OpenMP was found: add the raw compiler
# flags by hand, visible to consumers of plain as well.
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PUBLIC ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        # Android NDK major version above 20: pass -fopenmp explicitly and add
        # -fopenmp -static-openmp to the link line.
        target_compile_options(plain PUBLIC -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        # Imported target available: link against it.
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        # Fallback: hand the raw OpenMP flags to the linker.
        target_link_libraries(plain PUBLIC "${OpenMP_CXX_FLAGS}")
    endif()
endif()

使用更新后的 CMake 配置重新构建程序后,my_parallel_for_ 的速度与 cv::parallel_for_ 几乎相同。