MPI + OpenMP 分段错误和不可预测的行为

Question

经过几个小时尝试解决类似问题但没有成功之后，我决定发帖求助。我正在编写一个 C++ MPI+OpenMP 程序，其中一个 MPI 节点（服务器）将 double 数组发送到其他节点。服务器会生成多个线程，以便同时向多个客户端发送数据。串行版本（仅使用 MPI）运行良好，单线程版本也是如此。多线程版本（OpenMP）在运行随机次数的迭代后总会抛出分段错误。printf("%d: cur_idx:%d, opt_k.k:%d, idx:%d, N:%d \n", tid, cur_idx,opt_k.k,idx,N) 这一行会打印出每次迭代的值。不可预测之处在于出错前的迭代次数（有一次代码成功运行完毕，但我紧接着再次运行时就抛出了段错误）。然而，当 num_threads=1 时它总能正常结束。getData 返回一个结构体向量，结构体定义为 (int,int,double *)。

这是代码

// Server-side dispatch loop (fragment: the enclosing function's signature is not shown).
// OpenMP threads claim ranges of d under a named critical section, wait for a
// header message from any rank, and answer TAG_HEADER requests with a header
// followed by the matching data array.
double *tStatistics=new double[8], tmp_time; // wall clock time
double SY, Sto;
int a_tasks=0, file_p=0;
vector<myDataType *> d = getData();

// idx is the shared cursor into d; opt_k.k is the number of items handed out per request.
int idx=0; opt_k.k=1; opt_k.proc_files=0; opt_k.p=this->node_sz;
opt_k.proc_files=0; SY=0; Sto=0;
std::fill(header,header+SZ_HEADER,-1);

omp_set_num_threads(5);// for now
// parallel region
// NOTE(review): MPI calls are issued from several threads at once below; this presumes
// MPI was initialized with MPI_THREAD_MULTIPLE support — confirm.

#pragma omp parallel default(none) shared(d,idx,SY,Sto) private(a_tasks)
{
    // Per-thread header buffer, zero-filled.
    // NOTE(review): no matching delete[] for myHeader is visible in this fragment.
    double *myHeader=new double[SZ_HEADER];
    std::fill(myHeader,myHeader+SZ_HEADER,0);
    int tid = omp_get_thread_num(), cur_idx, cur_k; int N;
    //#pragma omp atomic
        N=d.size();
    // NOTE(review): idx is read here outside the critical section that updates it.
    while (idx<N) {
        // Assign tasks and fetch results where available
        cur_idx=N; // sentinel: stays N when no work is claimed below
        #pragma omp critical(update__idx)
        {
            // Claim cur_k consecutive indices starting at idx.
            if (idx<N) {
                cur_idx=idx; cur_k=opt_k.k; idx+=cur_k;
            }
        }
        if (cur_idx<N) {
            // NOTE(review): idx and opt_k.k are read for printing without synchronization.
            printf("%d: cur_idx:%d, opt_k.k:%d, idx:%d, N:%d \n", tid, cur_idx,opt_k.k,idx,N);
            // Wait for a header from any rank with any tag.
            // NOTE(review): every thread passes the same this->Stat object, so the tag
            // tested below may have been written by another thread's receive — confirm.
            MPI_Recv(myHeader,SZ_HEADER,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MY_COMM_GRP,this->Stat);
            if(this->Stat->MPI_TAG == TAG_HEADER){ // serve tasks
                while (cur_k && cur_idx<N) {
                    // Slots 1-3 carry rows, cols and index of the item; slot 9 carries the number of items still to follow.
                    myHeader[1]=d[cur_idx]->nRows; myHeader[2]=d[cur_idx]->nCols; myHeader[3]=cur_idx; myHeader[9]=--cur_k;
                    // The destination rank is taken from slot 4 of the received header.
                    MPI_Send(myHeader,SZ_HEADER,MPI_DOUBLE,(int)myHeader[4],TAG_DATA,MY_COMM_GRP);
                    MPI_Send(d[cur_idx]->data,d[cur_idx]->nRows*d[cur_idx]->nCols,MPI_DOUBLE,(int)myHeader[4],TAG_DATA,MY_COMM_GRP);
                    // The array is freed after sending; the pointer stored in d is left unchanged.
                    delete[] d[cur_idx]->data;  ++cur_idx;
                }
            }else if(this->Stat->MPI_TAG == TAG_RESULT){ // collect results
                // NOTE(review): the range claimed above is not sent on this path — confirm this is intended.
                printf("%d - 4\n", tid);
            }

        } //end if(loopmain)
    } // end while(loopmain)

} // end parallel section

message("terminate slaves");
// For each non-master rank, receive one message and reply to its sender with TAG_TERMINATE.
for(int i=1;i<node_sz;++i){ // terminate
  MPI_Recv(header,SZ_HEADER,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MY_COMM_GRP,this->Stat);
  MPI_Send(header,SZ_HEADER,MPI_DOUBLE,(int)header[4],TAG_TERMINATE,MY_COMM_GRP);
}
return 0;

与之配对的另一个函数是

// Worker loop: repeatedly announces this rank to MASTER, then receives either a
// terminate message or a sequence of (header, data array) pairs.
void CMpifun::slave2()
{
    double *Data; vector<myDataType> dataQ; vector<hist_type> resQ;
    char out_opt='b'; // irrelevant
    // NOTE(review): out_im is allocated here and no delete is visible in this fragment.
    myDataType *out_im = new myDataType;    hist_type *out_hist;    CLdp ldp;
    int file_cnt=0; double tmp_t; //local variables

    while (true) { // main while loop
        // Request work: slot 4 of the header tells the master which rank to answer.
        header[4]=myRank;   MPI_Send(header,SZ_HEADER,MPI_DOUBLE,MASTER,TAG_HEADER,MY_COMM_GRP);
        MPI_Recv(header,SZ_HEADER,MPI_DOUBLE,MASTER,MPI_ANY_TAG,MY_COMM_GRP,this->Stat);
        if(this->Stat->MPI_TAG == TAG_TERMINATE) {
            break;
        }
        //receive data
        while(true) {
            // The buffer size is rows*cols as announced in header slots 1 and 2.
            Data=new double[(int)(header[1]*header[2])];
            MPI_Recv(Data,(int)(header[1]*header[2]),MPI_DOUBLE,MASTER,TAG_DATA,MY_COMM_GRP,this->Stat);
            myDataType d; d.data=Data; d.nRows=(int)header[1]; d.nCols=(int)header[2];
            //dataQ.push_back(d);
            // With the push_back above disabled, the received array is discarded right away.
            delete[] Data;
            file_cnt++;
            // A non-zero slot 9 means another header (and data array) follows.
            if ((int)header[9]) {
                MPI_Recv(header,SZ_HEADER,MPI_DOUBLE,MASTER,TAG_DATA,MY_COMM_GRP,this->Stat);
            } else break;
        }
    } // end main while loop
    message("terminating");

我已经尝试了解决类似问题的所有建议。这是我的环境设置

# OpenMP runtime settings exported before launching the program.
# NOTE(review): the C++ code calls omp_set_num_threads() itself, which presumably
# takes precedence over OMP_NUM_THREADS — confirm.
export OMP_WAIT_POLICY="active" 
export OMP_NUM_THREADS=4  
export OMP_DYNAMIC=true # "true","false"  
export OMP_STACKSIZE=200M # 
# KMP_STACKSIZE mirrors OMP_STACKSIZE (presumably for the Intel OpenMP runtime — confirm).
export KMP_STACKSIZE=$OMP_STACKSIZE  
ulimit -s unlimited

非常感谢所有参与的人。我越来越相信这在某种程度上与内存分配有关，但也不明白为什么。我现在有以下代码：

// Server-side dispatch: OpenMP threads claim ranges of d under a named critical
// section, wait for a header message from any rank, and answer TAG_HEADER
// requests with a header followed by the matching data array. After the
// parallel region, each non-master rank is sent TAG_TERMINATE. Returns 0.
double CMpifun::sendData2()
{
double *tStatistics=new double[8], tmp_time; // wall clock time
double SY, Sto; int a_tasks=0, file_p=0;
vector<myDataType *> d = getData();

// idx is the shared cursor into d; opt_k.k is the number of items handed out per request.
int idx=0; opt_k.k=1; opt_k.proc_files=0; opt_k.p=this->node_sz;
opt_k.proc_files=0; SY=0; Sto=0;
std::fill(header,header+SZ_HEADER,-1);

omp_set_num_threads(224);// for now
// parallel region
// NOTE(review): MPI calls are issued from several threads at once below; this presumes
// MPI was initialized with MPI_THREAD_MULTIPLE support — confirm.

#pragma omp parallel default(none) shared(idx,SY,Sto,d) private(a_tasks)
{
    // Per-thread header buffer, zero-filled.
    // NOTE(review): no matching delete[] for myHeader is visible in this function.
    double *myHeader=new double[SZ_HEADER];
    std::fill(myHeader,myHeader+SZ_HEADER,0);
    int tid = omp_get_thread_num(), cur_idx, cur_k; int N;

    // The pragma below is commented out, so the braces are a plain scope.
    //#pragma omp critical(update__idx)
    {
        N=d.size();
    }
    // NOTE(review): idx is read here outside the critical section that updates it.
    while (idx<N) {
        // Assign tasks and fetch results where available
        cur_idx=N; // sentinel: stays N when no work is claimed below
        #pragma omp critical(update__idx)
        {
            // Claim cur_k consecutive indices starting at idx.
            if (idx<N) {
                cur_idx=idx; cur_k=opt_k.k; idx+=cur_k;
            }
        }
        if (cur_idx<N) {
            //printf("%d: cur_idx:%d, opt_k.k:%d, idx:%d, N:%d \n", tid, cur_idx,opt_k.k,idx,N);
            printf("%d: cur_idx:%d, N:%d \n", tid, cur_idx,N);
            // Wait for a header from any rank with any tag (the critical pragma is disabled).
            // NOTE(review): every thread passes the same this->Stat object, so the tag
            // tested below may have been written by another thread's receive — confirm.
            //#pragma omp critical(update__idx)
            {
                MPI_Recv(myHeader,SZ_HEADER,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MY_COMM_GRP,this->Stat);
            }
            if(this->Stat->MPI_TAG == TAG_HEADER){ // serve tasks
                while (cur_k && cur_idx<N) {
                    //#pragma omp critical(update__idx)
                    {
                        // Slots 1-3 carry rows, cols and index of the item; slot 9 carries the number of items still to follow.
                        myHeader[1]=d[cur_idx]->nRows; myHeader[2]=d[cur_idx]->nCols;   myHeader[3]=cur_idx;
                        myHeader[9]=--cur_k;
                        // The destination rank is taken from slot 4 of the received header.
                        MPI_Send(myHeader,SZ_HEADER,MPI_DOUBLE,(int)myHeader[4],TAG_DATA,MY_COMM_GRP);
                        MPI_Send(d[cur_idx]->data,d[cur_idx]->nRows*d[cur_idx]->nCols,MPI_DOUBLE,(int)myHeader[4],TAG_DATA,MY_COMM_GRP);
                        // The array is freed after sending; the pointer stored in d is left unchanged.
                        delete[] d[cur_idx]->data;
                    }
                    ++cur_idx;
                }
            }else if(this->Stat->MPI_TAG == TAG_RESULT){ // collect results
                // NOTE(review): the range claimed above is not sent on this path — confirm this is intended.
                printf("%d - 4\n", tid);
            }

        } //end if(loopmain)
    } // end while(loopmain)

} // end parallel section

message("terminate slaves");
// For each non-master rank, receive one message and reply to its sender with TAG_TERMINATE.
for(int i=1;i<node_sz;++i){ // terminate
  MPI_Recv(header,SZ_HEADER,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MY_COMM_GRP,this->Stat);
  MPI_Send(header,SZ_HEADER,MPI_DOUBLE,(int)header[4],TAG_TERMINATE,MY_COMM_GRP);
}
return 0;

与之配对的函数是

// Worker loop: repeatedly announces this rank to MASTER, then receives either a
// terminate message or a sequence of (header, data array) pairs that are queued
// in dataQ. The processing stage that would consume dataQ is commented out.
void CMpifun::slave2()
{
double *Data; vector<myDataType> dataQ; vector<hist_type> resQ;
char out_opt='b'; // irrelevant
// NOTE(review): out_im is allocated here and no delete is visible in this fragment.
myDataType *out_im = new myDataType;    hist_type *out_hist;    CLdp ldp;
int file_cnt=0; double tmp_t; //local variables

while (true) { // main while loop
// Request work: slot 4 of the header tells the master which rank to answer.
header[4]=myRank;   MPI_Send(header,SZ_HEADER,MPI_DOUBLE,MASTER,TAG_HEADER,MY_COMM_GRP);
MPI_Recv(header,SZ_HEADER,MPI_DOUBLE,MASTER,MPI_ANY_TAG,MY_COMM_GRP,this->Stat);
if(this->Stat->MPI_TAG == TAG_TERMINATE) {
    break;
}
//receive data
while(true) {
    // The buffer size is rows*cols as announced in header slots 1 and 2.
    Data=new double[(int)(header[1]*header[2])];
    MPI_Recv(Data,(int)(header[1]*header[2]),MPI_DOUBLE,MASTER,TAG_DATA,MY_COMM_GRP,this->Stat);
    // NOTE(review): d is allocated with new on every pass and never deleted in the visible code.
    myDataType *d=new myDataType; d->data=Data; d->nRows=(int)header[1]; d->nCols=(int)header[2];
    dataQ.push_back(*d);
    // NOTE(review): Data is freed here although the element just pushed to dataQ
    // presumably still holds the same pointer (member-wise copy of myDataType) —
    // the disabled loop below would then read and delete[] freed memory. Confirm.
    delete[] Data;
    file_cnt++;
    // A non-zero slot 9 means another header (and data array) follows.
    if ((int)header[9]) {
        MPI_Recv(header,SZ_HEADER,MPI_DOUBLE,MASTER,TAG_DATA,MY_COMM_GRP,this->Stat);
    } else break;
}

// Error section: Uncommenting next line causes seg fault
/*while (dataQ.size()) { // process data
    out_hist = new hist_type();
    myDataType d = dataQ.back(); dataQ.pop_back(); // critical section
    ldp.process(d.data, d.nRows,d.nCols,out_opt,out_im, out_hist);
    resQ.push_back(*out_hist); out_hist=0;
    delete[] d.data; delete[] out_im->data;
}*/

//time_arr[1] /= file_cnt; time_arr[2] /= file_cnt;
//header[6]=time_arr[0]; header[7]=time_arr[1]; header[8]=time_arr[2];
//header[4]=myRank; header[9]=resQ.size();

} // end main while loop

最新情况是：如果我取消注释 slave2() 函数中的 while 循环，程序就无法运行完成。我不明白的是，这个函数（slave2）没有使用任何 OpenMP/线程，但它似乎会产生影响。此外，它不与多线程函数共享任何变量。如果我把有问题的部分注释掉，那么无论我设置多少线程数（4、18、300），代码都能运行。我的 OpenMP 环境变量保持不变。ulimit -a 的输出如下：

core file size          (blocks, -c) 0
data seg size           (kbytes, -d) unlimited
scheduling priority             (-e) 0
file size               (blocks, -f) unlimited
pending signals                 (-i) 30473
max locked memory       (kbytes, -l) 64
max memory size         (kbytes, -m) unlimited
open files                      (-n) 1024
pipe size            (512 bytes, -p) 8
POSIX message queues     (bytes, -q) 819200
real-time priority              (-r) 0
stack size              (kbytes, -s) 37355
cpu time               (seconds, -t) unlimited
max user processes              (-u) 30473
virtual memory          (kbytes, -v) unlimited
file locks                      (-x) unlimited

我的构造函数也调用了 MPI_Init_thread。为了回答 @Tim 的问题：我使用动态内存（使用 new）是为了避免栈内存膨胀，这是遵循了类似问题的解决方案中的建议。感谢您的帮助。

Answer 1

我看到的最大问题是您的代码中存在许多竞争条件。您看到的不稳定行为无疑是由此引起的。请记住，无论何时访问 OpenMP 中的共享变量（通过 shared 关键字声明，或因位于全局作用域而共享），您所访问的内存都可能被线程组中的任何其他线程读取或写入，并且不保证先后顺序。例如，

N = d.size();

是竞争条件，因为 std::vector 不是线程安全的。因为您在类的内部使用 OpenMP，所以任何成员变量也都相当于“全局”变量，因此默认情况下不是线程安全的。

如@tim18 所述，因为您是从 OpenMP 并行区域内调用 MPI 例程，所以您应该使用 MPI_Init_thread 函数将 MPI 运行时初始化为线程安全的。

顺便说一句，您的 C++ 代码还需要改进。永远不要在用户级代码中使用 new 或 delete。请使用 RAII 管理对象生命周期，并把大型数据结构封装在替您管理生命周期的轻量对象中。例如，这一行

delete[] d[cur_idx]->data;

告诉我，您的代码中潜伏着恶魔，等待着被释放到毫无戒心的用户（可能就是您自己！）身上。顺便说一句，这也是一个竞争条件。恶魔可真不少！

MPI + OpenMP 分段错误和不可预测的行为

MPI + OpenMP segmentation fault and unpredictable behavior

c++

mpi

openmp

segmentation-fault