# Demonstrate how to work with Python GPU arrays using CUDA-aware MPI.
# We choose the CuPy library for simplicity, but any CUDA array which
# has the __cuda_array_interface__ attribute defined will work.
#
# Run this script using the following command:
# mpiexec -n 2 python use_cupy.py

from mpi4py import MPI
import cupy


comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

# Allreduce: every rank contributes [0..9]; the element-wise sum lands
# on every rank, so the result is sendbuf * size.
sendbuf = cupy.arange(10, dtype='i')
recvbuf = cupy.empty_like(sendbuf)
# Always make sure the GPU buffer is ready before any MPI operation:
# CuPy kernels launch asynchronously, and MPI reads the device memory
# directly, so outstanding writes must be flushed first.
cupy.cuda.get_current_stream().synchronize()
comm.Allreduce(sendbuf, recvbuf)
assert cupy.allclose(recvbuf, sendbuf*size)

# Bcast: rank 0 fills the buffer, everyone else receives a copy.
if rank == 0:
    buf = cupy.arange(100, dtype=cupy.complex64)
else:
    buf = cupy.empty(100, dtype=cupy.complex64)
cupy.cuda.get_current_stream().synchronize()
comm.Bcast(buf)
assert cupy.allclose(buf, cupy.arange(100, dtype=cupy.complex64))

# Send-Recv: point-to-point transfer from rank 0 to rank 1 only.
# NOTE: the receive is guarded with `rank == 1` (not a bare `else`) so
# that running with more than 2 processes does not leave ranks >= 2
# blocked forever in a Recv that no one will ever match.
if rank == 0:
    buf = cupy.arange(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    comm.Send(buf, dest=1, tag=88)
elif rank == 1:
    buf = cupy.empty(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    comm.Recv(buf, source=0, tag=88)
    assert cupy.allclose(buf, cupy.arange(20, dtype=cupy.float64))