Gathering segments of 2D arrays with MPI in FORTRAN

I have a code that follows particles and outputs them at N time slices, so what I end up with is an array TEMP(1:M,0:N), where M is the total number of particles. The problem parallelizes easily by splitting up the particles: each worker processor works on an array TEMP(MYSTART:MYEND,0:N), while the master works with TEMP(1:M,0:N). Now I need to recombine the array (there are actually several arrays like this, but let's focus on one for now). I do this with a SEND/RECV loop I wrote:

  IF (myid == master) THEN
     ALLOCATE(B_lines(1:nlines,0:nsteps),...
  ELSE
     ALLOCATE(B_lines(mystart:myend,0:nsteps),...
  END IF

  CALL MPI_BARRIER(MPI_COMM_FIELDLINES,ierr_mpi)
  IF (ierr_mpi /=0) CALL handle_err(MPI_BARRIER_ERR,'fieldlines_init',ierr_mpi)
  IF (myid == master) THEN
     ALLOCATE(buffer_mast(4,0:nsteps))
     DO i = myend+1, nlines
        CALL MPI_RECV(buffer_mast,4*(nsteps+1),MPI_DOUBLE_PRECISION,&
                      MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_FIELDLINES,status,ierr_mpi)
        IF (ierr_mpi /=0) CALL handle_err(MPI_RECV_ERR,'fieldlines_init_mgrid',ierr_mpi)
        sender = status(MPI_SOURCE)
        j      = status(MPI_TAG)
        R_lines(j,:)   = buffer_mast(1,:) 
        Z_lines(j,:)   = buffer_mast(2,:)
        PHI_lines(j,:) = buffer_mast(3,:)
        B_lines(j,:)   = buffer_mast(4,:)
     END DO
     DEALLOCATE(buffer_mast)
  ELSE
     IF (mystart <= nlines) THEN
        ALLOCATE(buffer_slav(4,0:nsteps))
        DO j = mystart, myend
           buffer_slav(1,:) = R_lines(j,:)
           buffer_slav(2,:) = Z_lines(j,:)
           buffer_slav(3,:) = PHI_lines(j,:)
           buffer_slav(4,:) = B_lines(j,:)
           CALL MPI_SEND(buffer_slav,4*(nsteps+1),MPI_DOUBLE_PRECISION,master,j,MPI_COMM_FIELDLINES,ierr_mpi)
           IF (ierr_mpi /=0) CALL handle_err(MPI_SEND_ERR,'fieldlines_init_mgrid',ierr_mpi)
        END DO
        DEALLOCATE(buffer_slav)
     END IF
  END IF

Now this works fine, but it scales poorly: even with only 64 cores the code spends a large amount of time just sending data back and forth. I would like to take advantage of GATHERV instead, so I wrote a subroutine which I call with:

CALL FIELDLINES_TRANSMIT_2DDBL(mystart,myend,0,nsteps,B_lines(mystart:myend,0:nsteps),&
                          numprocs,mnum,moffsets,myid,master,MPI_COMM_FIELDLINES,ier)

and which looks like:

SUBROUTINE FIELDLINES_TRANSMIT_2DDBL(n1,n2,m1,m2,data_in,nproc,mnum,moffsets,id,root,COMM_local,ier)
USE stel_kinds, ONLY: rprec
IMPLICIT NONE
INCLUDE 'mpif.h' ! MPI
INTEGER, INTENT(in)           :: n1,n2,m1,m2,nproc,id,root,COMM_local
INTEGER, INTENT(in)           :: mnum(nproc), moffsets(nproc)
REAL(rprec), INTENT(inout)    :: data_in(n1:n2,m1:m2)
INTEGER, INTENT(inout)        :: ier
INTEGER, PARAMETER            :: ndims=2
INTEGER, PARAMETER            :: sstart(2) = (/0,0/) ! Starting offsets
INTEGER                       :: dbl_size, localsize, ARRAY_SEND_TYPE, RESIZED_ARRAY_SEND_TYPE
INTEGER                       :: asize(ndims), ssize(ndims), mrec(nproc)
INTEGER(KIND=MPI_ADDRESS_KIND):: low_bound,extent
DOUBLE PRECISION, ALLOCATABLE :: buffer_temp(:,:)

IF (ier <0) RETURN
mrec = 1
ssize(1) = n2-n1+1
ssize(2) = m2-m1+1
localsize = mnum(id+1) 
ALLOCATE(buffer_temp(ssize(1),ssize(2)))
buffer_temp(1:ssize(1),1:ssize(2)) = data_in(n1:n2,m1:m2)
asize    = ssize
CALL MPI_BCAST(asize, 2, MPI_INTEGER, root, COMM_local, ier)
CALL MPI_TYPE_CREATE_SUBARRAY(ndims,asize,ssize,sstart,MPI_ORDER_FORTRAN,&
                              MPI_DOUBLE_PRECISION,ARRAY_SEND_TYPE,ier)
CALL MPI_TYPE_COMMIT(ARRAY_SEND_TYPE,ier)
CALL MPI_TYPE_SIZE(MPI_DOUBLE_PRECISION, dbl_size,ier)
low_bound = 0
extent = dbl_size
CALL MPI_TYPE_CREATE_RESIZED(ARRAY_SEND_TYPE,low_bound,extent,RESIZED_ARRAY_SEND_TYPE,ier)
CALL MPI_TYPE_COMMIT(RESIZED_ARRAY_SEND_TYPE,ier)
IF (id == root) THEN
   localsize = PRODUCT(ssize)
   mrec(1)   = localsize
   CALL MPI_GATHERV(MPI_IN_PLACE,1,MPI_DOUBLE_PRECISION,&
                    data_in,mrec, moffsets,RESIZED_ARRAY_SEND_TYPE,&
                    root,COMM_local,ier)
ELSE
   CALL MPI_GATHERV(buffer_temp,localsize,MPI_DOUBLE_PRECISION,&
                    buffer_temp,mrec, moffsets,RESIZED_ARRAY_SEND_TYPE,&
                    root,COMM_local,ier)
END IF
CALL MPI_TYPE_FREE(ARRAY_SEND_TYPE,ier); IF (ier <0) RETURN
CALL MPI_TYPE_FREE(RESIZED_ARRAY_SEND_TYPE,ier); IF (ier <0) RETURN
DEALLOCATE(buffer_temp)
ier = 0
CALL MPI_BARRIER(COMM_local, ier)
RETURN
END SUBROUTINE FIELDLINES_TRANSMIT_2DDBL

Now I have another version of the code with the indices reversed, (0:nsteps,1:nlines), and that one seems to work fine, but when I try to pass an array indexed (1:nlines,0:nsteps) the resulting array comes out scrambled. Any suggestions would be greatly appreciated.

I'm not completely clear on all the details of your setup, but here is a simple solution that illustrates how to do this under the restriction that the work divides exactly across processes, i.e. nlines is a multiple of size.

In this case, the pattern at the master is a simple vector (i.e. a 2D subarray). The only trick is to resize that vector appropriately so that the contributions line up correctly at the receiver. You don't even need Gatherv: each process sends a contiguous bunch of doubles, and the master receives exactly one vector from each of them.

The solution for the general case depends on how you split the data when it does not divide exactly. If every process has the same number of lines except the last one, which has fewer, you can still use this approach: pad the data on the last process so that it sends and receives the same amount as everyone else (including some dummy values), as sketched below.
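
A minimal sketch of that padding, written against the gather2d program further down (mylines_pad, nlines_pad, sendbuf and B_pad are illustrative names that do not appear in the real code):

  ! Sketch only: pad so that every rank contributes the same number of lines.
  integer :: mylines_pad, nlines_pad
  double precision, allocatable :: sendbuf(:,:), B_pad(:,:)

  mylines_pad = (nlines + size - 1)/size      ! lines per rank, rounded up
  nlines_pad  = mylines_pad*size              ! padded global number of lines

  mystart = mylines_pad*rank + 1
  myend   = min(mylines_pad*(rank + 1), nlines)

  ! B_lines is still allocated as (mystart:myend, 0:nsteps) on the workers
  allocate(sendbuf(mylines_pad, 0:nsteps))
  sendbuf = 0.0d0                             ! dummy values fill the padding
  if (mystart <= nlines) then
     sendbuf(1:myend-mystart+1, 0:nsteps) = B_lines(mystart:myend, 0:nsteps)
  end if
  if (rank == 0) allocate(B_pad(nlines_pad, 0:nsteps))

  ! Build the vector type exactly as in the program below, but with
  ! mylines_pad as the block length and nlines_pad as the stride; gather
  ! sendbuf into B_pad and keep only B_pad(1:nlines, 0:nsteps) at the end.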

I initialize the data so that, for example, at step 3 the values of the lines distributed across the processes are 301, 302, 303, ..., 300+n for line n. If I run on 4 processes with 12 lines and 4 steps, the output looks correct:

laptop:~$ mpiexec -n 4 ./gather2d

 rank, mystart, myend            2           7           9
 rank, mystart, myend            1           4           6
 rank, mystart, myend            0           1           3
 rank, mystart, myend            3          10          12
           1         101         201         301         401
           2         102         202         302         402
           3         103         203         303         403
           4         104         204         304         404
           5         105         205         305         405
           6         106         206         306         406
           7         107         207         307         407
           8         108         208         308         408
           9         109         209         309         409
          10         110         210         310         410
          11         111         211         311         411
          12         112         212         312         412

Here is the code:

program gather2d

  use mpi

  implicit none

  integer :: nlines, nsteps
  integer :: comm, rank, size, ierr, arraytype, arraytyperesized
  integer :: mystart, myend
  integer :: i, j
  integer :: dblesize

  double precision, allocatable, dimension(:,:) :: B_lines

  integer(kind=mpi_address_kind) lb, extent

  comm = MPI_COMM_WORLD

  nlines = 12
  nsteps = 4

  call MPI_Init(ierr)

  call MPI_Comm_size(comm, size, ierr)
  call MPI_Comm_rank(comm, rank, ierr)

  mystart = nlines/size * rank + 1
  myend   = nlines/size * (rank+1)

  write(*,*) 'rank, mystart, myend ', rank, mystart, myend

  if (rank == 0) then

    allocate(B_lines(1:nlines, 0:nsteps))

  else

    allocate(B_lines(mystart:myend, 0:nsteps))

  end if

  do i = mystart, myend
     do j = 0, nsteps

        B_lines(i,j) = i+100*j

     end do
  end do

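  ! The master's receive pattern for each rank is nsteps+1 blocks of
  ! myend-mystart+1 contiguous doubles, separated by a stride of nlines.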
  call MPI_Type_vector(nsteps+1, myend-mystart+1, nlines, &
                       MPI_DOUBLE_PRECISION, arraytype, ierr)

  call MPI_Type_size(MPI_DOUBLE_PRECISION, dblesize, ierr)

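  ! Resize the type so that successive ranks' contributions start one
  ! block (myend-mystart+1 doubles) apart in the master's receive buffer.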
  lb = 0
  extent = (myend-mystart+1)*dblesize

  call MPI_Type_Create_resized(arraytype, lb, extent, arraytyperesized, ierr)

  call MPI_Type_commit(arraytyperesized, ierr)

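  ! Non-root ranks send (nsteps+1)*(myend-mystart+1) plain doubles; the
  ! master receives one resized vector per rank (taking its own
  ! contribution in place), which lands each block in the right rows.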
  if (rank == 0) then

     call MPI_Gather(MPI_IN_PLACE, (nsteps+1)*(myend-mystart+1), &
                     MPI_DOUBLE_PRECISION, &
                     B_lines, 1, arraytyperesized, 0, comm, ierr)

  else

     call MPI_Gather(B_lines, (nsteps+1)*(myend-mystart+1), &
                     MPI_DOUBLE_PRECISION, &
                     B_lines, 1, arraytyperesized, 0, comm, ierr)

  end if

  if (rank == 0) then

     do i = 1, nlines
        write(*,*) (int(B_lines(i,j)), j = 0, nsteps)
     end do

  end if

  call MPI_Finalize(ierr)

end program gather2d

OK, so I found a memory-hungry way to do this: basically I put everything into globally-sized arrays and use MPI_REDUCE. This is much faster than my old queue system, but it comes at the cost of memory. Here is the code:

SUBROUTINE TRANSMIT_2DDBL(n1,n2,m1,m2,data_in,n1_gbl,n2_gbl,id,root,COMM_local,ier)
IMPLICIT NONE
INCLUDE 'mpif.h' ! MPI
INTEGER, INTENT(in)           :: n1,n2,m1,m2,id,root,COMM_local
INTEGER, INTENT(in)           :: n1_gbl, n2_gbl
DOUBLE PRECISION, INTENT(inout)    :: data_in(n1:n2,m1:m2)
INTEGER, INTENT(inout)        :: ier
INTEGER                       :: nt_gbl
DOUBLE PRECISION, ALLOCATABLE :: buffer_temp(:,:)

IF (ier <0) RETURN
nt_gbl=(n2_gbl-n1_gbl+1)*(m2-m1+1)
ALLOCATE(buffer_temp(n1_gbl:n2_gbl,m1:m2))
buffer_temp = 0
buffer_temp(n1:n2,m1:m2) = data_in(n1:n2,m1:m2)
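! Every rank holds a zero-padded, globally-sized copy of the array, so
! summing the copies with MPI_REDUCE reassembles the full array on root.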
IF (id == root) THEN
   CALL MPI_REDUCE(MPI_IN_PLACE,buffer_temp,nt_gbl,MPI_DOUBLE_PRECISION,MPI_SUM,&
                   root,COMM_local,ier)
   data_in = buffer_temp
ELSE
     CALL MPI_REDUCE(buffer_temp,buffer_temp,nt_gbl,MPI_DOUBLE_PRECISION,MPI_SUM,&
                  root,COMM_local,ier)
END IF
DEALLOCATE(buffer_temp)
ier = 0
CALL MPI_BARRIER(COMM_local, ier)
RETURN
END SUBROUTINE TRANSMIT_2DDBL

Of course, the root process needs to pass n1_gbl and n2_gbl as its n1 and n2 when calling this routine (see the call sketched below), but it does seem to have improved my performance.
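
For reference, with the arrays from the question above, the call looks something like this (a sketch reusing the same variable names):

ier = 0
IF (myid == master) THEN
   ! The root holds the full array, so n1:n2 spans the global range 1:nlines
   CALL TRANSMIT_2DDBL(1,nlines,0,nsteps,B_lines,1,nlines,&
                       myid,master,MPI_COMM_FIELDLINES,ier)
ELSE
   CALL TRANSMIT_2DDBL(mystart,myend,0,nsteps,B_lines,1,nlines,&
                       myid,master,MPI_COMM_FIELDLINES,ier)
END IF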