Why does cv::parallel_for_ run faster than my own implementation?
I am implementing a nearest-neighbor resize for RGB images (unsigned char type). To compare speed against OpenCV on an Android ARMv8 platform, I found that OpenCV uses cv::parallel_for_ for multi-threaded acceleration.
So I dug into the source code behind OpenCV's cv::resize(), and copied and pasted the code that actually runs into my main.cpp. It contains a functor resizeNNInvoker, and the cv::parallel_for_ call that runs this functor across multiple threads.
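For context, the OpenCV path I originally compared against corresponds, as far as I can tell, to the standard nearest-neighbor resize call (this snippet is illustrative and not part of my copied code):

```cpp
// Hypothetical baseline call: OpenCV's own resize with nearest-neighbor
// interpolation, which internally dispatches work via cv::parallel_for_.
cv::Mat dst;
cv::resize(src, dst, cv::Size(src.cols / 3, src.rows / 3), 0, 0, cv::INTER_NEAREST);
```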
What confuses me is that the cv::parallel_for_ version runs faster than my_parallel_for_, even though my_parallel_for_ is kept identical to OpenCV's code.
To be more specific:
- Tested on an Android ARMv8 platform
- OpenCV is compiled with OpenMP multi-threading; the other parallel frameworks are turned off
- Traced into OpenCV's cv::parallel_for_; its source code is the same as my_parallel_for_ (listed below)
- cv::setNumThreads(4) is used so that 4 threads run, and they are bound to 4 big CPU cores via the ncnn API (a sketch of this setup follows the list)
- All code is compiled in Release mode (via CMake)
- Test input image: width=7680, height=4320; destination image size: 7680/3 x 4320/3
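Roughly how the thread count and core binding are set up; the ncnn helper and the include path here are my assumptions about such a setup, not part of the copied OpenCV code:

```cpp
#include <opencv2/core.hpp>
#include "cpu.h"   // ncnn's CPU utility header; the include path may differ in your project

void setup_threads()
{
    cv::setNumThreads(4);         // let cv::parallel_for_ use 4 worker threads
    ncnn::set_cpu_powersave(2);   // ncnn helper: prefer/bind the big CPU cores only
}
```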
The time costs are as follows:

| Method | Time cost |
|---|---|
| cv::parallel_for_ | 3.24 ms |
| my_parallel_for_ | 7.67 ms |
| in-place OpenMP | 7.75 ms |
```cpp
// my own implementation of parallel_for_, copied from OpenCV source code
void my_parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body)
{
    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int i = range.start; i < range.end; ++i)
        body(cv::Range(i, i + 1));
}

// The functor that performs nearest neighbor resizing, copied from OpenCV source
class resizeNNInvoker : public cv::ParallelLoopBody
{
public:
    resizeNNInvoker(const cv::Mat& _src, cv::Mat& _dst, int* _x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

    virtual void operator() (const cv::Range& range) const CV_OVERRIDE
    {
        //printf("--- resizeNNInvoker get called\n");
        cv::Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }
                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const uchar* _tS = S + x_ofs[x];
                    for (int k = 0; k < pix_size; k++)
                        D[k] = _tS[k];
                }
            }
        }
    }

private:
    const cv::Mat& src;
    cv::Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

// The entry function that calls nearest neighbor resizing with OpenMP multi-threading
void resize_nearest(const uchar* src_buf, int src_height, int src_width, int src_linebytes,
                    uchar* dst_buf, int dst_height, int dst_width, int dst_linebytes,
                    const Option& opt)
{
    cv::Size src_size;
    src_size.height = src_height;
    src_size.width = src_width;
    cv::Mat src(src_size, CV_8UC3, const_cast<uchar*>(src_buf));

    cv::Size dst_size;
    dst_size.height = dst_height;
    dst_size.width = dst_width;
    cv::Mat dst(dst_size, CV_8UC3, dst_buf);

    cv::Size ssize = src.size(), dsize = dst.size();
    double inv_scale_x = (double)dsize.width/ssize.width;
    double inv_scale_y = (double)dsize.height/ssize.height;
    double fx = inv_scale_x;
    double fy = inv_scale_y;

    cv::AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs.data();
    int pix_size = (int)src.elemSize();
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    cv::Range range(0, dsize.height);

    // !! define the instance of resizeNNInvoker functor.
    resizeNNInvoker invoker(src, dst, x_ofs, ify);

#if 0
    cv::parallel_for_(range, invoker);   //!! use OpenCV's, cost 3.24 ms
#elif 0
    my_parallel_for_(range, invoker);    //!! use own implementation, cost 7.67 ms
#else
    set_omp_dynamic(1);                  //!! use in-place implementation, cost 7.75 ms
    cv::Range stripeRange = range;
    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int i = stripeRange.start; i < stripeRange.end; ++i)
        invoker(cv::Range(i, i + 1));
#endif
}
```
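For completeness, a minimal timing harness around resize_nearest might look like the sketch below; it is not part of the original code, and it assumes Option is default-constructible and that the line strides are exactly width*3 bytes:

```cpp
#include <chrono>
#include <cstdio>
#include <vector>

void benchmark_resize_nearest()
{
    const int src_w = 7680, src_h = 4320;           // input size used in the test above
    const int dst_w = src_w / 3, dst_h = src_h / 3;

    std::vector<uchar> src_buf(src_h * src_w * 3, 128);  // dummy RGB data
    std::vector<uchar> dst_buf(dst_h * dst_w * 3);

    Option opt;  // assumption: Option has a usable default constructor

    auto t0 = std::chrono::steady_clock::now();
    resize_nearest(src_buf.data(), src_h, src_w, src_w * 3,
                   dst_buf.data(), dst_h, dst_w, dst_w * 3, opt);
    auto t1 = std::chrono::steady_clock::now();

    std::printf("resize_nearest: %.2f ms\n",
                std::chrono::duration<double, std::milli>(t1 - t0).count());
}
```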
Here is the code in OpenCV that selects the actual threading framework:
```cpp
#ifdef CV_PARALLEL_FRAMEWORK
#if defined HAVE_TBB
#if TBB_INTERFACE_VERSION >= 8000
    tbbArena.execute(pbody);
#else
    pbody();
#endif
#elif defined HAVE_HPX
    pbody();
#elif defined HAVE_OPENMP
    #pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
    for (int i = stripeRange.start; i < stripeRange.end; ++i)
        pbody(Range(i, i + 1));
#elif defined HAVE_GCD
    dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
    dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);
#elif defined WINRT
    Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
#elif defined HAVE_CONCURRENCY
    if(!pplScheduler || pplScheduler->Id() == Concurrency::CurrentScheduler::Id())
    {
        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
    }
    else
    {
        pplScheduler->Attach();
        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
        Concurrency::CurrentScheduler::Detach();
    }
#elif defined HAVE_PTHREADS_PF
    parallel_for_pthreads(pbody.stripeRange(), pbody, pbody.stripeRange().size());
#else
#error You have hacked and compiling with unsupported parallel framework
#endif

    ctx.finalize();  // propagate exceptions if exists
    return;
#endif // CV_PARALLEL_FRAMEWORK
```
So this is the priority order:
- TBB (task arena)
- TBB
- HPX
- OpenMP
- Apple GCD
- WinRT Concurrency
- Windows Concurrency (PPL)
- pthreads
Maybe your OpenCV parallel_for_ uses TBB while your own code uses OpenMP?

Not sure if it is feasible, but you could try to explicitly use OpenMP from OpenCV, e.g. cv::parallel::openmp::parallel_for in C++.
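One way to confirm which backend your OpenCV build actually selected is to print the build information string, whose output contains a "Parallel framework" entry, for example:

```cpp
#include <iostream>
#include <opencv2/core.hpp>

int main()
{
    // Look for the "Parallel framework:" line (TBB, OpenMP, pthreads, ...) in this dump.
    std::cout << cv::getBuildInformation() << std::endl;

    // Number of threads OpenCV will use for cv::parallel_for_.
    std::cout << "cv::getNumThreads() = " << cv::getNumThreads() << std::endl;
    return 0;
}
```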
I finally figured it out. The OpenMP configuration in my CMakeLists.txt caused the performance mismatch.
Specifically, I build an executable test that depends on a static library libplain.a, and libplain.a was configured with the following (the previous, wrong version):
```cmake
find_package(OpenMP)
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PRIVATE ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        target_compile_options(plain PRIVATE -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        target_link_libraries(plain PRIVATE "${OpenMP_CXX_FLAGS}")
    endif()
endif()
```
Now, after changing all the PRIVATE visibility to PUBLIC, the OpenMP compile and link flags propagate correctly to the executable target test:
```cmake
target_link_libraries(plain PUBLIC ${OpenCV_LIBS})

find_package(OpenMP)
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PUBLIC ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        target_compile_options(plain PUBLIC -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        target_link_libraries(plain PUBLIC "${OpenMP_CXX_FLAGS}")
    endif()
endif()
```
With this updated CMake configuration, after rebuilding the program, my_parallel_for_ reaches almost the same speed as cv::parallel_for_.
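To double-check that the flags really reach the executable, a small runtime probe (standard OpenMP API, not part of my original code) can be compiled into test:

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

// If the OpenMP flags are not propagated to this translation unit, _OPENMP stays
// undefined and the pragma below is ignored, so the region runs on a single thread.
void check_openmp()
{
#ifdef _OPENMP
    std::printf("_OPENMP defined, omp_get_max_threads() = %d\n", omp_get_max_threads());
    #pragma omp parallel num_threads(4)
    {
        #pragma omp single
        std::printf("threads actually spawned: %d\n", omp_get_num_threads());
    }
#else
    std::printf("compiled WITHOUT OpenMP support\n");
#endif
}
```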