1# Demonstrate how to work with Python GPU arrays using CUDA-aware MPI.
2# We choose the CuPy library for simplicity, but any CUDA array which
3# has the __cuda_array_interface__ attribute defined will work.
4#
5# Run this script using the following command:
6# mpiexec -n 2 python use_cupy.py
7
8from mpi4py import MPI
9import cupy
10
11
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

# --- Allreduce ---
# Every rank contributes the same 10-element int32 array; the element-wise
# sum (the default reduction op) lands in recvbuf on every rank.
sendbuf = cupy.arange(10, dtype='i')
recvbuf = cupy.empty_like(sendbuf)
# Always make sure the GPU buffer is ready before any MPI operation:
# MPI knows nothing about CuPy's stream, so any kernels still filling the
# buffer asynchronously must complete before communication starts.
cupy.cuda.get_current_stream().synchronize()
comm.Allreduce(sendbuf, recvbuf)
assert cupy.allclose(recvbuf, sendbuf*size)

# --- Bcast ---
# Root (rank 0) fills the buffer; all other ranks receive into an
# uninitialized buffer of matching shape/dtype. Default root is 0.
if rank == 0:
    buf = cupy.arange(100, dtype=cupy.complex64)
else:
    buf = cupy.empty(100, dtype=cupy.complex64)
cupy.cuda.get_current_stream().synchronize()
comm.Bcast(buf)
assert cupy.allclose(buf, cupy.arange(100, dtype=cupy.complex64))

# --- Send-Recv ---
# Point-to-point exchange between ranks 0 and 1 only. Using `elif rank == 1`
# (rather than a bare `else`) means ranks >= 2 skip this section instead of
# blocking forever in Recv when the script is run with more than 2 processes
# (rank 0 sends to dest=1 only, so no message would ever arrive for them).
if rank == 0:
    buf = cupy.arange(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    comm.Send(buf, dest=1, tag=88)
elif rank == 1:
    buf = cupy.empty(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    comm.Recv(buf, source=0, tag=88)
    assert cupy.allclose(buf, cupy.arange(20, dtype=cupy.float64))
43