Gathering segments of 2D arrays with MPI in FORTRAN
I have a code that follows particles and outputs the particles at N time slices, so I end up with an array TEMP(1:M,0:N), where M is the total number of particles. This problem parallelizes easily by splitting up the particles: each processor works on an array TEMP(MYSTART:MYEND,0:N), while the master works with TEMP(1:M,0:N). Now I need to recombine the arrays (there are actually several such arrays, but let's focus on one for now). Using a SEND-RECV loop I wrote:
IF (myid == master) THEN
ALLOCATE(B_lines(1:nlines,0:nsteps),...
ELSE
ALLOCATE(B_lines(mystart:myend,0:nsteps),...
END IF
CALL MPI_BARRIER(MPI_COMM_FIELDLINES,ierr_mpi)
IF (ierr_mpi /=0) CALL handle_err(MPI_BARRIER_ERR,'fieldlines_init',ierr_mpi)
IF (myid == master) THEN
ALLOCATE(buffer_mast(4,0:nsteps))
DO i = myend+1, nlines
CALL MPI_RECV(buffer_mast,4*(nsteps+1),MPI_DOUBLE_PRECISION,&
MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_FIELDLINES,status,ierr_mpi)
IF (ierr_mpi /=0) CALL handle_err(MPI_RECV_ERR,'fieldlines_init_mgrid',ierr_mpi)
sender = status(MPI_SOURCE)
j = status(MPI_TAG)
R_lines(j,:) = buffer_mast(1,:)
Z_lines(j,:) = buffer_mast(2,:)
PHI_lines(j,:) = buffer_mast(3,:)
B_lines(j,:) = buffer_mast(4,:)
END DO
DEALLOCATE(buffer_mast)
ELSE
IF (mystart <= nlines) THEN
ALLOCATE(buffer_slav(4,0:nsteps))
DO j = mystart, myend
buffer_slav(1,:) = R_lines(j,:)
buffer_slav(2,:) = Z_lines(j,:)
buffer_slav(3,:) = PHI_lines(j,:)
buffer_slav(4,:) = B_lines(j,:)
CALL MPI_SEND(buffer_slav,4*(nsteps+1),MPI_DOUBLE_PRECISION,master,j,MPI_COMM_FIELDLINES,ierr_mpi)
IF (ierr_mpi /=0) CALL handle_err(MPI_SEND_ERR,'fieldlines_init_mgrid',ierr_mpi)
END DO
DEALLOCATE(buffer_slav)
END IF
END IF
Now this works fine, but it scales poorly. Even at just 64 cores the code spends a significant amount of time sending data back and forth. So now I would like to take advantage of GATHERV. I created a subroutine which I call like this:
CALL FIELDLINES_TRANSMIT_2DDBL(mystart,myend,0,nsteps,B_lines(mystart:myend,0:nsteps),&
numprocs,mnum,moffsets,myid,master,MPI_COMM_FIELDLINES,ier)
and which looks like:
SUBROUTINE FIELDLINES_TRANSMIT_2DDBL(n1,n2,m1,m2,data_in,nproc,mnum,moffsets,id,root,COMM_local,ier)
USE stel_kinds, ONLY: rprec
IMPLICIT NONE
INCLUDE 'mpif.h' ! MPI
INTEGER, INTENT(in) :: n1,n2,m1,m2,nproc,id,root,COMM_local
INTEGER, INTENT(in) :: mnum(nproc), moffsets(nproc)
REAL(rprec), INTENT(inout) :: data_in(n1:n2,m1:m2)
INTEGER, INTENT(inout) :: ier
INTEGER, PARAMETER :: ndims=2
INTEGER, PARAMETER :: sstart(2) = (/0,0/) ! Starting offsets
INTEGER :: dbl_size, localsize, ARRAY_SEND_TYPE, RESIZED_ARRAY_SEND_TYPE
INTEGER :: asize(ndims), ssize(ndims), mrec(nproc)
INTEGER(KIND=MPI_ADDRESS_KIND):: low_bound,extent
DOUBLE PRECISION, ALLOCATABLE :: buffer_temp(:,:)
IF (ier <0) RETURN
mrec = 1
ssize(1) = n2-n1+1
ssize(2) = m2-m1+1
localsize = mnum(id+1)
ALLOCATE(buffer_temp(ssize(1),ssize(2)))
buffer_temp(1:ssize(1),1:ssize(2)) = data_in(n1:n2,m1:m2)
asize = ssize
CALL MPI_BCAST(asize, 2, MPI_INTEGER, root, COMM_local, ier)
CALL MPI_TYPE_CREATE_SUBARRAY(ndims,asize,ssize,sstart,MPI_ORDER_FORTRAN,&
MPI_DOUBLE_PRECISION,ARRAY_SEND_TYPE,ier)
CALL MPI_TYPE_COMMIT(ARRAY_SEND_TYPE,ier)
CALL MPI_TYPE_SIZE(MPI_DOUBLE_PRECISION, dbl_size,ier)
low_bound = 0
extent = dbl_size
CALL MPI_TYPE_CREATE_RESIZED(ARRAY_SEND_TYPE,low_bound,extent,RESIZED_ARRAY_SEND_TYPE,ier)
CALL MPI_TYPE_COMMIT(RESIZED_ARRAY_SEND_TYPE,ier)
IF (id == root) THEN
localsize = PRODUCT(ssize)
mrec(1) = localsize
CALL MPI_GATHERV(MPI_IN_PLACE,1,MPI_DOUBLE_PRECISION,&
data_in,mrec, moffsets,RESIZED_ARRAY_SEND_TYPE,&
root,COMM_local,ier)
ELSE
CALL MPI_GATHERV(buffer_temp,localsize,MPI_DOUBLE_PRECISION,&
buffer_temp,mrec, moffsets,RESIZED_ARRAY_SEND_TYPE,&
root,COMM_local,ier)
END IF
CALL MPI_TYPE_FREE(ARRAY_SEND_TYPE,ier); IF (ier <0) RETURN
CALL MPI_TYPE_FREE(RESIZED_ARRAY_SEND_TYPE,ier); IF (ier <0) RETURN
DEALLOCATE(buffer_temp)
ier = 0
CALL MPI_BARRIER(COMM_local, ier)
RETURN
END SUBROUTINE FIELDLINES_TRANSMIT_2DDBL
Now I have another code with the indices reversed, (0:nsteps,1:nlines), and that one seems to work fine, but when I try to pass an array indexed (1:nlines,0:nsteps) the resulting array comes out a mess. Any advice would be appreciated.
I'm not totally clear on all the details of your setup, but here is a simple solution that illustrates how to do it, under the restriction that the work divides exactly across processes, i.e. nlines is a multiple of size.
In this case the pattern at the master is a simple vector (i.e. a 2D subarray). The only trick is to resize the vector appropriately so that the contributions line up correctly at the receiver. You don't even need Gatherv: each process sends a bunch of doubles, and the master receives a single vector from each of them.
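The two type-construction calls that implement this trick, pulled out of the full program further down (arraytype and arraytyperesized are the derived types, dblesize is the size of one double), are:

! One contribution as seen at the root: nsteps+1 blocks of myend-mystart+1
! doubles, separated by a stride of nlines doubles (column-major storage)
call MPI_Type_vector(nsteps+1, myend-mystart+1, nlines, &
                     MPI_DOUBLE_PRECISION, arraytype, ierr)
! Shrink the extent to one chunk width so that consecutive contributions in
! the gather are placed one chunk apart, not one full vector apart
call MPI_Type_size(MPI_DOUBLE_PRECISION, dblesize, ierr)
lb = 0
extent = (myend-mystart+1)*dblesize
call MPI_Type_create_resized(arraytype, lb, extent, arraytyperesized, ierr)
call MPI_Type_commit(arraytyperesized, ierr)

With this extent, rank k's vector starts k chunks into B_lines at the root, which is exactly where its rows mystart:myend belong.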
The solution in the general case depends on how you divide the data when it does not split exactly. You could still use this approach if every process has the same amount of data except for the last one, which has less: pad the data on the last process so that it sends and receives the same amount as everyone else (including some dummy data).
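As a minimal sketch of such a padded decomposition (a hypothetical standalone example, not taken from the original code): round the chunk size up, and let the last rank own a few dummy lines that the root simply discards after the gather.

program pad_decomp
  use mpi
  implicit none
  integer :: comm, rank, nprocs, ierr
  integer :: nlines, nchunk, mystart, myend
  comm = MPI_COMM_WORLD
  nlines = 14                              ! deliberately not a multiple of the process count
  call MPI_Init(ierr)
  call MPI_Comm_size(comm, nprocs, ierr)
  call MPI_Comm_rank(comm, rank, ierr)
  nchunk  = (nlines + nprocs - 1)/nprocs   ! ceiling division: equal chunk on every rank
  mystart = nchunk*rank + 1
  myend   = nchunk*(rank + 1)              ! last rank carries nchunk*nprocs - nlines dummy lines
  write(*,*) 'rank, mystart, myend (padded) ', rank, mystart, myend
  call MPI_Finalize(ierr)
end program pad_decomp

The arrays are then allocated with the padded bounds (the root with 1:nchunk*nprocs rows), the gather proceeds exactly as in the equal-split program below, and the root keeps only rows 1:nlines.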
I initialise the data so that, for example, at step 3 the values of the lines distributed across the processes are 301, 302, 303, ..., 300+nlines. If I run with 4 processes, 12 lines and 4 steps, then the output looks correct:
laptop:~$ mpiexec -n 4 ./gather2d
rank, mystart, myend 2 7 9
rank, mystart, myend 1 4 6
rank, mystart, myend 0 1 3
rank, mystart, myend 3 10 12
1 101 201 301 401
2 102 202 302 402
3 103 203 303 403
4 104 204 304 404
5 105 205 305 405
6 106 206 306 406
7 107 207 307 407
8 108 208 308 408
9 109 209 309 409
10 110 210 310 410
11 111 211 311 411
12 112 212 312 412
Here is the code:
program gather2d
use mpi
implicit none
integer :: nlines, nsteps
integer :: comm, rank, size, ierr, arraytype, arraytyperesized
integer :: mystart, myend
integer :: i, j
integer :: dblesize
double precision, allocatable, dimension(:,:) :: B_lines
integer(kind=mpi_address_kind) lb, extent
comm = MPI_COMM_WORLD
nlines = 12
nsteps = 4
call MPI_Init(ierr)
call MPI_Comm_size(comm, size, ierr)
call MPI_Comm_rank(comm, rank, ierr)
mystart = nlines/size * rank + 1
myend = nlines/size * (rank+1)
write(*,*) 'rank, mystart, myend ', rank, mystart, myend
if (rank == 0) then
allocate(B_lines(1:nlines, 0:nsteps))
else
allocate(B_lines(mystart:myend, 0:nsteps))
end if
do i = mystart, myend
do j = 0, nsteps
B_lines(i,j) = i+100*j
end do
end do
call MPI_Type_vector(nsteps+1, myend-mystart+1, nlines, &
MPI_DOUBLE_PRECISION, arraytype, ierr)
call MPI_Type_size(MPI_DOUBLE_PRECISION, dblesize, ierr)
lb = 0
extent = (myend-mystart+1)*dblesize
call MPI_Type_Create_resized(arraytype, lb, extent, arraytyperesized, ierr)
call MPI_Type_commit(arraytyperesized, ierr)
if (rank == 0) then
call MPI_Gather(MPI_IN_PLACE, (nsteps+1)*(myend-mystart+1), &
MPI_DOUBLE_PRECISION, &
B_lines, 1, arraytyperesized, 0, comm, ierr)
else
call MPI_Gather(B_lines, (nsteps+1)*(myend-mystart+1), &
MPI_DOUBLE_PRECISION, &
B_lines, 1, arraytyperesized, 0, comm, ierr)
end if
if (rank == 0) then
do i = 1, nlines
write(*,*) (int(B_lines(i,j)), j = 0, nsteps)
end do
end if
call MPI_Finalize(ierr)
end program gather2d
OK, so I found a memory-hungry way to do this. Basically I put everything into globally-sized arrays and use MPI_REDUCE. It is much faster than my old queuing system, but at the cost of memory. Here is the code:
SUBROUTINE TRANSMIT_2DDBL(n1,n2,m1,m2,data_in,n1_gbl,n2_gbl,id,root,COMM_local,ier)
IMPLICIT NONE
INCLUDE 'mpif.h' ! MPI
INTEGER, INTENT(in) :: n1,n2,m1,m2,id,root,COMM_local
INTEGER, INTENT(in) :: n1_gbl, n2_gbl
DOUBLE PRECISION, INTENT(inout) :: data_in(n1:n2,m1:m2)
INTEGER, INTENT(inout) :: ier
INTEGER :: nt_gbl
DOUBLE PRECISION, ALLOCATABLE :: buffer_temp(:,:)
IF (ier <0) RETURN
nt_gbl=(n2_gbl-n1_gbl+1)*(m2-m1+1)
ALLOCATE(buffer_temp(n1_gbl:n2_gbl,m1:m2))
buffer_temp = 0
buffer_temp(n1:n2,m1:m2) = data_in(n1:n2,m1:m2)
IF (id == root) THEN
CALL MPI_REDUCE(MPI_IN_PLACE,buffer_temp,nt_gbl,MPI_DOUBLE_PRECISION,MPI_SUM,&
root,COMM_local,ier)
data_in = buffer_temp
ELSE
CALL MPI_REDUCE(buffer_temp,buffer_temp,nt_gbl,MPI_DOUBLE_PRECISION,MPI_SUM,&
root,COMM_local,ier)
END IF
DEALLOCATE(buffer_temp)
ier = 0
CALL MPI_BARRIER(COMM_local, ier)
RETURN
END SUBROUTINE TRANSMIT_2DDBL
Of course the root process needs to pass n1_gbl and n2_gbl as n1 and n2, respectively, on entry. But it does seem to have improved my performance.
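A call site would look something like the following sketch (assuming the names nlines, nsteps, mystart, myend, myid, master and MPI_COMM_FIELDLINES from the question; the only difference between the root and the workers is the local bounds passed in the first two arguments, and ier must be non-negative on entry):

IF (myid == master) THEN
   ! Root passes the global line bounds as its local bounds and supplies the full array
   CALL TRANSMIT_2DDBL(1, nlines, 0, nsteps, B_lines, 1, nlines, &
                       myid, master, MPI_COMM_FIELDLINES, ier)
ELSE
   ! Workers pass their own slice bounds; the global bounds are the same on every rank
   CALL TRANSMIT_2DDBL(mystart, myend, 0, nsteps, B_lines, 1, nlines, &
                       myid, master, MPI_COMM_FIELDLINES, ier)
END IF

Every rank has to supply the same global bounds, since the reduction runs over the full (n1_gbl:n2_gbl, m1:m2) buffer.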