MPI blocks and scatterv

I have a 2D matrix; suppose it is the following 4x6 matrix:

 1  2  3  4  5  6
 7  8  9 10 11 12
13 14 15 16 17 18
19 20 21 22 23 24

I would like each of 4 processes to get a 2x3 submatrix and place it in a 4x5 buffer.

Something like this:

0  0  0  0  0
0  1  2  3  0
0  7  8  9  0
0  0  0  0  0

0  0  0  0  0
0  4  5  6  0
0 10 11 12  0
0  0  0  0  0

0  0  0  0  0
0 13 14 15  0
0 19 20 21  0
0  0  0  0  0

0  0  0  0  0
0 16 17 18  0
0 22 23 24  0
0  0  0  0  0    

The extra rows are not a problem, since I can call MPI_Scatterv and point at the second row of each process's buffer, but the extra columns complicate things. Is this possible using only MPI datatypes and a single MPI_Scatterv call? If so, please give me some guidance.
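To make that concrete: if I only needed the extra rows (a (blockRows+2) x blockColumns local buffer, with no extra columns), I could simply point the receive at the second row of the buffer, roughly like this (a rough sketch reusing the names from my attempt below):

MPI_Scatterv(global, counts, displs, resizedtype,
             &local[blockColumns],            /* local is rows-only padded here: skip the first row */
             blockRows * blockColumns, MPI_FLOAT,
             MASTER, commCart);

But with padding on the columns as well, each local row is no longer contiguous. Here is my attempt so far: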

MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
dim[0] = dim[1] = sqrt(numtasks);
periods[0] = periods[1] = 0;
MPI_Cart_create(MPI_COMM_WORLD, 2, dim, periods, 1, &commCart);
MPI_Comm_rank(commCart, &taskid);

NPROWS = dim[0];
NPCOLS = dim[1];
blockRows = ROWS / NPROWS;
blockColumns = COLS / NPCOLS;

if (taskid == MASTER) {
    for(i=0;i<ROWS*COLS;i++){
        global[i]=i;
    }
}

float* local;
local = malloc(blockRows * (blockColumns+2) * sizeof (float));

for (i = 0; i < blockRows * (blockColumns+2); i++) {
    local[0][i] = 0;
    local[1][i] = 0;
}
MPI_Datatype type, resizedtype,column;
int sizes[2]    = {ROWS,COLS};  
int subsizes[2] = {blockRows,blockColumns}; 
int starts[2]   = {0,0};  

MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &type);  
MPI_Type_create_resized(type, 0, blockColumns*sizeof(float), &resizedtype);
MPI_Type_commit(&resizedtype);

int *counts = malloc(numworkers*sizeof(int));
int *displs = malloc(numworkers*sizeof(int));
for(i=0;i<numworkers;i++){
    counts[i] = 1;
}

int disp = 0;
for(i=0;i<NPROWS;i++){
    for(j=0;j<NPCOLS;j++){
        displs[i*dim[0] + j] = disp;
        disp++;
    }
    disp += (blockColumns-1)*dim[0];
}

MPI_Scatterv(global, counts, displs, resizedtype,      
        &local[0][blockColumns], blockRows*blockColumns, MPI_FLOAT,  
        MASTER, commCart);

I've had to make some guesses about how you actually want the code to work (for example, the way local is currently defined is internally inconsistent between a 1D and a 2D array). The main points are:

  1. You need two datatypes: one to pick out the right elements of the global array on the send side, and one to store into the right elements of the local array on the receive side.
  2. The simplest approach is to resize the send type so that its extent is a single MPI_FLOAT, so a block can start anywhere; the displacements are then counted in floats.
  3. On the receive side you just receive a single subarray, so there is no need to resize that type.

I haven't checked how general the code is, but it seems to work for the particular case you describe, i.e. a 4x6 matrix across a 2x2 decomposition.
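In outline, the two types look like this (these lines are excerpted from the full program below): the send type describes one blockRows x blockColumns tile of the ROWS x COLS global array and is resized so that its extent is a single float, while the receive type describes the interior of the padded (blockRows+2) x (blockColumns+2) local array.

/* send side: one tile of the global array, extent shrunk to one float */
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &type);
MPI_Type_create_resized(type, 0, sizeof(float), &resizedtype);
MPI_Type_commit(&resizedtype);

/* receive side: the interior of the padded local array */
MPI_Type_create_subarray(2, localsizes, localsubsizes, localstarts, MPI_ORDER_C, MPI_FLOAT, &localtype);
MPI_Type_commit(&localtype);

With ROWS = 4, COLS = 6 and a 2x2 grid, blockRows = 2 and blockColumns = 3, so the tile for process row i, column j starts i*blockRows*COLS + j*blockColumns floats into global, giving displacements {0, 3, 12, 15}. The full program: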

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <mpi.h>

#define MASTER 0

#define ROWS 4
#define COLS 6

int main(void)
{
  int dim[2], periods[2], NPROWS, NPCOLS, blockRows, blockColumns;
  int numtasks, taskid, i, j;

  MPI_Comm commCart;

  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
  dim[0] = dim[1] = sqrt(numtasks);
  periods[0] = periods[1] = 0;
  MPI_Cart_create(MPI_COMM_WORLD, 2, dim, periods, 1, &commCart);
  MPI_Comm_rank(commCart, &taskid);

  NPROWS = dim[0];
  NPCOLS = dim[1];
  blockRows = ROWS / NPROWS;
  blockColumns = COLS / NPCOLS;

  /* the full matrix is only allocated and initialised on the root process */
  float* global = NULL;
  if (taskid == MASTER) {
    global = malloc(ROWS * COLS * sizeof (float));
    for(i=0;i<ROWS*COLS;i++){
      global[i]=i+1;
    }
  }

  /* local buffer is padded with one extra row and column on each side */
  float* local;
  local = malloc((blockRows+2) * (blockColumns+2) * sizeof (float));

  for (i = 0; i < (blockRows+2) * (blockColumns+2); i++) {
    local[i] = 0;
  }


  MPI_Datatype type, resizedtype;
  int sizes[2]    = {ROWS,COLS};
  int subsizes[2] = {blockRows,blockColumns};
  int starts[2]   = {0,0};

  /* send type: one blockRows x blockColumns tile of the global array,
     resized so its extent is a single float */
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &type);
  MPI_Type_create_resized(type, 0, sizeof(float), &resizedtype);
  MPI_Type_commit(&resizedtype);

  int *counts = malloc(numtasks*sizeof(int));
  int *displs = malloc(numtasks*sizeof(int));
  for(i=0;i<numtasks;i++){
    counts[i] = 1;
  }

  /* displacement of each tile in the global array, counted in floats
     (the extent of resizedtype) */
  for(i=0;i<NPROWS;i++){
    for(j=0;j<NPCOLS;j++){
      displs[i*NPCOLS + j] = i*blockRows*COLS + j*blockColumns;
    }
  }

  /* receive type: the blockRows x blockColumns interior of the padded local array */
  MPI_Datatype localtype;
  int localsizes[2]    = {blockRows+2,blockColumns+2};
  int localsubsizes[2] = {blockRows,blockColumns};
  int localstarts[2]   = {1,1};

  MPI_Type_create_subarray(2, localsizes, localsubsizes, localstarts, MPI_ORDER_C, MPI_FLOAT, &localtype);
  MPI_Type_commit(&localtype);


  /* each rank receives one tile straight into the interior of its padded buffer */
  MPI_Scatterv(global, counts, displs, resizedtype,
               &local[0], 1, localtype,
               MASTER, commCart);


  for (i=0; i < (blockRows+2)*(blockColumns+2); i++)
    {
      printf("rank %d: local[%d] = %f\n", taskid, i, local[i]);
    }

  MPI_Finalize();

  return 0;
}

Here is the output, which looks like what you want:

mpiexec -n 4 ./scatterv
rank 0: local[0] = 0.000000
rank 0: local[1] = 0.000000
rank 0: local[2] = 0.000000
rank 0: local[3] = 0.000000
rank 0: local[4] = 0.000000
rank 0: local[5] = 0.000000
rank 0: local[6] = 1.000000
rank 0: local[7] = 2.000000
rank 0: local[8] = 3.000000
rank 0: local[9] = 0.000000
rank 0: local[10] = 0.000000
rank 0: local[11] = 7.000000
rank 0: local[12] = 8.000000
rank 0: local[13] = 9.000000
rank 0: local[14] = 0.000000
rank 0: local[15] = 0.000000
rank 0: local[16] = 0.000000
rank 0: local[17] = 0.000000
rank 0: local[18] = 0.000000
rank 0: local[19] = 0.000000
rank 1: local[0] = 0.000000
rank 1: local[1] = 0.000000
rank 1: local[2] = 0.000000
rank 1: local[3] = 0.000000
rank 1: local[4] = 0.000000
rank 1: local[5] = 0.000000
rank 1: local[6] = 4.000000
rank 1: local[7] = 5.000000
rank 1: local[8] = 6.000000
rank 1: local[9] = 0.000000
rank 1: local[10] = 0.000000
rank 1: local[11] = 10.000000
rank 1: local[12] = 11.000000
rank 1: local[13] = 12.000000
rank 1: local[14] = 0.000000
rank 1: local[15] = 0.000000
rank 1: local[16] = 0.000000
rank 1: local[17] = 0.000000
rank 1: local[18] = 0.000000
rank 1: local[19] = 0.000000
rank 2: local[0] = 0.000000
rank 2: local[1] = 0.000000
rank 2: local[2] = 0.000000
rank 2: local[3] = 0.000000
rank 2: local[4] = 0.000000
rank 2: local[5] = 0.000000
rank 2: local[6] = 13.000000
rank 2: local[7] = 14.000000
rank 2: local[8] = 15.000000
rank 2: local[9] = 0.000000
rank 2: local[10] = 0.000000
rank 2: local[11] = 19.000000
rank 2: local[12] = 20.000000
rank 2: local[13] = 21.000000
rank 2: local[14] = 0.000000
rank 2: local[15] = 0.000000
rank 2: local[16] = 0.000000
rank 2: local[17] = 0.000000
rank 2: local[18] = 0.000000
rank 2: local[19] = 0.000000
rank 3: local[0] = 0.000000
rank 3: local[1] = 0.000000
rank 3: local[2] = 0.000000
rank 3: local[3] = 0.000000
rank 3: local[4] = 0.000000
rank 3: local[5] = 0.000000
rank 3: local[6] = 16.000000
rank 3: local[7] = 17.000000
rank 3: local[8] = 18.000000
rank 3: local[9] = 0.000000
rank 3: local[10] = 0.000000
rank 3: local[11] = 22.000000
rank 3: local[12] = 23.000000
rank 3: local[13] = 24.000000
rank 3: local[14] = 0.000000
rank 3: local[15] = 0.000000
rank 3: local[16] = 0.000000
rank 3: local[17] = 0.000000
rank 3: local[18] = 0.000000
rank 3: local[19] = 0.000000
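If you later need to reassemble the global matrix on the root, the same two types should work in the reverse direction; an untested sketch, using the same counts, displs and types as above:

/* untested: gather the interiors back into global on the root */
MPI_Gatherv(&local[0], 1, localtype,
            global, counts, displs, resizedtype,
            MASTER, commCart);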