Parallelizing do loop with MPI?
I want to convert the following program into an MPI program:
program pi
  implicit none
  integer, parameter :: DARTS = 50000, ROUNDS = 10, MASTER = 0
  double precision :: pi_est
  double precision :: homepi, avepi, pirecv, pisum
  integer :: rank
  integer :: i, n
  integer, allocatable :: seed(:)

  ! we set it to zero in the sequential run
  rank = 0

  ! initialize the random number generator
  ! we make sure the seed is different for each task
  call random_seed()
  call random_seed(size = n)
  allocate(seed(n))
  seed = 12 + rank*11
  call random_seed(put=seed(1:n))
  deallocate(seed)

  avepi = 0
  do i = 0, ROUNDS-1
    pi_est = dboard(DARTS)
    ! calculate the average value of pi over all iterations
    avepi = ((avepi*i) + pi_est)/(i + 1)
  end do

  print *, "Pi is ", avepi

contains

  double precision function dboard(darts)
    integer, intent(in) :: darts
    double precision :: x_coord, y_coord
    integer :: score, n

    score = 0
    do n = 1, darts
      call random_number(x_coord)
      call random_number(y_coord)
      if ((x_coord**2 + y_coord**2) <= 1.0d0) then
        score = score + 1
      end if
    end do
    dboard = 4.0d0*score/darts
  end function

end program
I think what I need to do is split the do loop into n parts, where n is the number of processors, store the results in a vector, and then compute the average of that vector. I'm not sure whether this is the right approach, nor how to implement the change.
Here is what I have so far.
A module, mpi_params.f90:
module mpi_params
  use MPI
  implicit none
  integer :: ierr, numprocs, proc_num, &
             points_per_proc, istart, iend
  integer, allocatable, dimension(:) :: displs, recvcounts
  double precision, allocatable, dimension(:) :: proc_contrib

contains

  subroutine init_mpi_params(nn)
    integer, intent(in) :: nn
    integer :: i

    ! Determine how many points to handle with each proc
    if ( mod(nn,numprocs)==0 ) then
      points_per_proc = nn/numprocs
    else
      points_per_proc = (nn-mod(nn,numprocs))/numprocs
      if (numprocs-1 == proc_num) points_per_proc = nn - points_per_proc*(numprocs-1)
    end if

    ! Determine start and end index for this proc's points
    istart = proc_num * points_per_proc + 1
    if (numprocs-1 == proc_num) istart = proc_num*(nn-mod(nn,numprocs))/numprocs + 1
    iend = istart + points_per_proc - 1
    if (numprocs-1 == proc_num) iend = nn

    allocate(proc_contrib(points_per_proc))
    !print *, 'about to allocate displs'
    allocate(displs(numprocs), source=(/(i*(nn-mod(nn,numprocs))/numprocs, i=0,numprocs-1)/))
    !print *, 'about to allocate recvcounts'
    allocate(recvcounts(numprocs), source=(nn-mod(nn,numprocs))/numprocs)
    recvcounts(numprocs) = nn - points_per_proc*(numprocs-1)
    if (numprocs-1 == proc_num) recvcounts(numprocs) = iend-istart+1
  end subroutine init_mpi_params

end module mpi_params
and the program piMPI.f90:
program pi
  use mpi_params
  implicit none
  integer, parameter :: DARTS = 50000, ROUNDS = 10, MASTER = 0
  double precision :: pi_est
  double precision :: homepi, avepi, pirecv, pisum
  integer :: rank
  integer :: i, n
  integer, allocatable :: seed(:)
  double precision :: y(ROUNDS)

  call mpi_init(ierr)
  call mpi_comm_size(MPI_COMM_WORLD, numprocs, ierr)
  call mpi_comm_rank(MPI_COMM_WORLD, proc_num, ierr)
  call init_mpi_params(ROUNDS)

  ! we set it to zero in the sequential run
  rank = 0

  ! initialize the random number generator
  ! we make sure the seed is different for each task
  call random_seed()
  call random_seed(size = n)
  allocate(seed(n))
  seed = 12 + rank*11
  call random_seed(put=seed(1:n))
  deallocate(seed)

  avepi = 0
  do i = istart, iend
    proc_contrib(i) = dboard(DARTS)
  end do

  !!! MPI Reduce?
  call MPI_ALLGATHER(proc_contrib, points_per_proc, MPI_DOUBLE_PRECISION, &
                     y, points_per_proc, MPI_DOUBLE_PRECISION, &
                     MPI_COMM_WORLD, ierr)
  avepi = sum(y)/ROUNDS

  if (proc_num .eq. 0) then
    print *, "Pi is ", avepi
  end if

  call mpi_finalize(ierr)

contains

  double precision function dboard(darts)
    integer, intent(in) :: darts
    double precision :: x_coord, y_coord
    integer :: score, n

    score = 0
    do n = 1, darts
      call random_number(x_coord)
      call random_number(y_coord)
      if ((x_coord**2 + y_coord**2) <= 1.0d0) then
        score = score + 1
      end if
    end do
    dboard = 4.0d0*score/darts
  end function

end program
I can compile this code with:
$ mpif90 mpi_params.f90 piMPI.f90
and run it with 1 or 2 processes:
$ mpiexec -n 1 ./a.out
Pi is 3.1369359999999999
$ mpiexec -n 2 ./a.out
Pi is 1.5679600000000000
But the result for n=2 looks wrong. Moreover, if I try to run it with 3 or more processes, I get the following error:
$ mpiexec -n 3 ./a.out
Fatal error in PMPI_Allgather: Message truncated, error stack:
PMPI_Allgather(992)...............: MPI_Allgather(sbuf=0x213e9f0, scount=3, MPI_DOUBLE_PRECISION, rbuf=0x7ffc2638df80, rcount=3, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) failed
MPIR_Allgather_impl(838)..........:
MPIR_Allgather(797)...............:
MPIR_Allgather_intra(555).........:
MPIDI_CH3U_Receive_data_found(131): Message from rank 2 and tag 7 truncated; 32 bytes received but buffer size is 24
Fatal error in PMPI_Allgather: Message truncated, error stack:
PMPI_Allgather(992)...............: MPI_Allgather(sbuf=0x24189f0, scount=3, MPI_DOUBLE_PRECISION, rbuf=0x7fff89575790, rcount=3, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) failed
MPIR_Allgather_impl(838)..........:
MPIR_Allgather(797)...............:
MPIR_Allgather_intra(532).........:
MPIDI_CH3U_Receive_data_found(131): Message from rank 2 and tag 7 truncated; 32 bytes received but buffer size is 24
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 5990 RUNNING AT UltraPro
= EXIT CODE: 1
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
What am I doing wrong?
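For reference, the "Message truncated" failure comes from MPI_Allgather requiring every rank to contribute the same count: with ROUNDS = 10 on 3 ranks, the last rank sends 4 doubles (32 bytes) while the receivers expect only 3 (24 bytes), which matches the byte counts in the error stack. One way to keep the gather idea would be the variable-count form MPI_Allgatherv, reusing the recvcounts and displs arrays that init_mpi_params already builds, and indexing proc_contrib locally rather than with the global loop index. A rough, untested sketch of that part of piMPI.f90:

  do i = istart, iend
    ! proc_contrib has only points_per_proc elements, so use a local index
    proc_contrib(i - istart + 1) = dboard(DARTS)
  end do

  ! gather a possibly different number of values from each rank
  call MPI_ALLGATHERV(proc_contrib, points_per_proc, MPI_DOUBLE_PRECISION, &
                      y, recvcounts, displs, MPI_DOUBLE_PRECISION, &
                      MPI_COMM_WORLD, ierr)
  avepi = sum(y)/ROUNDS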
If I've understood your code correctly (and it's possible I haven't), it is a simple Monte Carlo calculation of the value of pi, with the nice property, for the beginning parallel programmer, that it is embarrassingly parallel: computing more (random) numbers improves the accuracy of the overall estimate. To make M calculations you can have one process compute all of them, or P processes compute M/P of them each and then average the results, for the same accuracy. With this approach, no message passing is needed until the final reduction of the local values into a global value at the end of the program.
So first work out how many iterations each process should run; let every process compute this for itself from the program parameters, after finding out num_procs and so on through the usual MPI calls.
I think the outline of your code should be something like this:
program main

  ! all processes make same declarations, including variables to be used
  ! to calculate pi, and parameters

  call mpi_init(...)
  ...

  ! calculate pi independently on each process, no MPI calls necessary
  ! each process uses program parameters to calculate own contribution

  call mpi_reduce(local_pi, master_pi, 1, mpi_double_precision, mpi_sum, 0, &
                  mpi_comm_world, ierr)

  if (proc_num==0) write(*,*) 'pi = ', master_pi/num_procs

  call mpi_finalize
That's about all there is to it.
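Filling in that outline, a minimal self-contained version might look like the sketch below (assuming ROUNDS is divisible by the number of processes; the program name pi_outline is illustrative, and the per-rank seeding scheme is borrowed from the question):

program pi_outline
  use mpi
  implicit none
  integer, parameter :: DARTS = 50000, ROUNDS = 10
  integer :: ierr, num_procs, proc_num, i, n
  integer, allocatable :: seed(:)
  double precision :: local_pi, master_pi

  call mpi_init(ierr)
  call mpi_comm_size(mpi_comm_world, num_procs, ierr)
  call mpi_comm_rank(mpi_comm_world, proc_num, ierr)

  ! give each rank its own seed (same scheme as in the question)
  call random_seed(size = n)
  allocate(seed(n))
  seed = 12 + proc_num*11
  call random_seed(put = seed)
  deallocate(seed)

  ! each rank averages its own share of the rounds; no communication here
  local_pi = 0.0d0
  do i = 1, ROUNDS/num_procs
    local_pi = local_pi + dboard(DARTS)
  end do
  local_pi = local_pi/(ROUNDS/num_procs)

  ! one reduction at the very end
  call mpi_reduce(local_pi, master_pi, 1, mpi_double_precision, mpi_sum, 0, &
                  mpi_comm_world, ierr)
  if (proc_num == 0) write(*,*) 'pi = ', master_pi/num_procs

  call mpi_finalize(ierr)

contains

  double precision function dboard(darts)
    integer, intent(in) :: darts
    double precision :: x_coord, y_coord
    integer :: score, m
    score = 0
    do m = 1, darts
      call random_number(x_coord)
      call random_number(y_coord)
      if (x_coord**2 + y_coord**2 <= 1.0d0) score = score + 1
    end do
    dboard = 4.0d0*score/darts
  end function dboard

end program pi_outline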
In case anyone is looking for code that actually compiles, here is my working solution:
program pi
  use mpi_params
  implicit none
  integer, parameter :: DARTS = 500000, ROUNDS = 100, MASTER = 0
  double precision :: pi_est
  double precision :: homepi, avepi, pirecv, pisum
  integer :: rank
  integer :: i, n
  integer, allocatable :: seed(:)
  double precision :: y
  double precision :: sumpi

  call mpi_init(ierr)
  call mpi_comm_size(MPI_COMM_WORLD, numprocs, ierr)
  call mpi_comm_rank(MPI_COMM_WORLD, proc_num, ierr)
  call init_mpi_params(ROUNDS)
  ! use the MPI rank so that the seed below differs per task
  rank = proc_num
  ! initialize the random number generator
  ! we make sure the seed is different for each task
  call random_seed()
  call random_seed(size = n)
  allocate(seed(n))
  seed = 12 + rank*11
  call random_seed(put=seed(1:n))
  deallocate(seed)

  y = 0.0d0
  do i = istart, iend
    y = y + dboard(DARTS)
  end do

  call mpi_reduce(y, sumpi, 1, mpi_double_precision, mpi_sum, 0, &
                  mpi_comm_world, ierr)

  if (proc_num==0) write(*,*) 'pi = ', sumpi/ROUNDS

  call mpi_finalize(ierr)

contains

  double precision function dboard(darts)
    integer, intent(in) :: darts
    double precision :: x_coord, y_coord
    integer :: score, n

    score = 0
    do n = 1, darts
      call random_number(x_coord)
      call random_number(y_coord)
      if ((x_coord**2 + y_coord**2) <= 1.0d0) then
        score = score + 1
      end if
    end do
    dboard = 4.0d0*score/darts
  end function

end program
and the extra module:
module mpi_params
  use MPI
  implicit none
  integer :: ierr, numprocs, proc_num, &
             points_per_proc, istart, iend
  double precision, allocatable, dimension(:) :: proc_contrib

contains

  subroutine init_mpi_params(nn)
    integer, intent(in) :: nn
    integer :: i

    ! Determine how many points to handle with each proc
    if ( mod(nn,numprocs)==0 ) then
      points_per_proc = nn/numprocs
    else
      points_per_proc = (nn-mod(nn,numprocs))/numprocs
      if (numprocs-1 == proc_num) points_per_proc = nn - points_per_proc*(numprocs-1)
    end if

    ! Determine start and end index for this proc's points
    istart = proc_num * points_per_proc + 1
    if (numprocs-1 == proc_num) istart = proc_num*(nn-mod(nn,numprocs))/numprocs + 1
    iend = istart + points_per_proc - 1
    if (numprocs-1 == proc_num) iend = nn

    allocate(proc_contrib(points_per_proc))
  end subroutine init_mpi_params

end module mpi_params
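As a worked example of how init_mpi_params splits the work here: with ROUNDS = 100 and 3 processes, mod(100,3) = 1, so ranks 0 and 1 each get points_per_proc = 33 (iteration ranges 1-33 and 34-66), while the last rank picks up the remainder with 34 rounds (67-100); the reduction then sums all 100 per-round estimates before dividing by ROUNDS.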
This code can be compiled with
mpif90 mpi_params.f90 piMPI.f90
and run with
time mpiexec -n 10 ./a.out
It is more complicated than the solution proposed by @HighPerformanceMark, but I wanted to keep the idea of splitting the do loop (which is useful for some other code I'm working on).