#include "cado.h" // IWYU pragma: keep
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "macros.h" // for ok_NOK()
#include "select_mpi.h"

long failed_offset = 0;

int check_if_large_allgather_is_ok(MPI_Comm comm0)
{
    /* This check is here to provoke early failure under openmpi, for
     * some combinations of parameters.
     *
     * When the psm2 mtl is used, there seems to be a limit on the
     * maximum size of messages, which strikes me as totally backwards.
     * Short of a way to check ahead of time what the limit is, we'll
     * simply try and see if this is indeed the case.
     *
     * Here is an excerpt of the error message:
     Message size 5326413824 bigger than supported by PSM2 API. Max = 4294967296
     [node-1:04815] *** An error occurred in MPI_Allgather
     [node-1:04815] *** reported by process [3170238465,0]
     [node-1:04815] *** on communicator MPI_COMM_WORLD
     [node-1:04815] *** MPI_ERR_OTHER: known error not in list
     [node-1:04815] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
     [node-1:04815] ***    and potentially your MPI job)
     [node-1.nancy.grid5000.fr:04793] 1 more process has sent help message help-mtl-psm2.txt / message too big

     * Are we safe with the OFI mtl? Not really. As it happens, this
     * same check fails as well, with the rather unpleasant specifics
     * that the MPI_Allgather call returns fine -- only the copied data
     * doesn't quite meet the expectations... :-((((
     *
     * It seems that to avoid this error, one has to use --mca pml ob1.
     * At least on OmniPath. I haven't checked on InfiniBand.
     */
    /*
     * This function returns:
     *  0 on success;
     *  a non-zero MPI error code if MPI_Allgather returned one;
     *  -1 if no MPI error code was returned, but the result of the
     *  Allgather was wrong;
     *  -2 if memory allocation failed.
     *
     * (Note that the MPI standard guarantees that MPI error codes are
     * positive integers.)
     */

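    /* Sizing rationale: with MPI_IN_PLACE, each rank contributes
     * items_at_a_time * s = (2^16+1) * 2^16 = 2^32 + 2^16 bytes, i.e.
     * just above the 2^32-byte limit that PSM2 reports in the excerpt
     * above. The gathered buffer is comm_size times that (about 8 GiB
     * per rank for the 2-node test). */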
    size_t s = 1 << 16;
    unsigned int items_at_a_time = (1 << 16) + 1;
    MPI_Datatype mpi_ft;
    MPI_Type_contiguous(s, MPI_BYTE, &mpi_ft);
    MPI_Type_commit(&mpi_ft);
    int comm_size0, comm_rank0;
    MPI_Comm_size(comm0, &comm_size0);
    MPI_Comm_rank(comm0, &comm_rank0);
    MPI_Comm comm;

    /* The test makes more sense for larger communicators, but as a
     * matter of fact it already triggers with only 2 jobs, and
     * restricting to 2 gives us more control over the allocated size.
     */
    const int number_of_nodes_for_test = 2;

    MPI_Comm_split(comm0, comm_rank0 < number_of_nodes_for_test, comm_rank0, &comm);
    int comm_size, comm_rank;
    MPI_Comm_size(comm, &comm_size);
    MPI_Comm_rank(comm, &comm_rank);

    int rc = 0;

    if (comm_rank0 < number_of_nodes_for_test) {
        MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN);
        void * data = malloc(items_at_a_time * comm_size * s);
        int alloc_ok = data != NULL;
        /* Only touch the buffer if the allocation succeeded. */
        if (alloc_ok)
            memset(data, 0, items_at_a_time * comm_size * s);
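        /* All ranks must agree on whether to enter the collective:
         * otherwise a rank whose allocation failed would skip the
         * MPI_Allgather and the others would deadlock. */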
        MPI_Allreduce(MPI_IN_PLACE, &alloc_ok, 1, MPI_INT, MPI_MIN, comm);
        if (alloc_ok) {
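            /* Each rank paints its own slot with 0x42 on top of an
             * all-zero buffer. With MPI_IN_PLACE, each rank's
             * contribution is taken from its slot of the receive
             * buffer, so after a correct Allgather no zero byte should
             * remain anywhere. */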
            memset(((char*)data) + items_at_a_time * s * comm_rank, 0x42, items_at_a_time * s);
            rc = MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                    data, items_at_a_time,
                    mpi_ft, comm);
            if (rc == 0) {
                void * p = memchr(data, 0, items_at_a_time * comm_size * s);
                if (p != NULL) {
                    /* We found a zero; we shouldn't have! */
                    rc = -1;
                    failed_offset = ((char*)p)-(char*)data;
                }
            }
            MPI_Barrier(comm);
        } else {
            rc = -2;
        }
        free(data);
    }

    /* The datatype and the communicator were created by all ranks of
     * comm0 (MPI_Comm_split is collective), so all ranks must free
     * them. */
    MPI_Type_free(&mpi_ft);
    MPI_Comm_free(&comm);

    MPI_Barrier(comm0);
    return rc;
}

/* Theoretically ucx on OmniPath should work, but it does not seem to in
 * practice...
 *
 * At least this issue was purportedly fixed:
    https://github.com/openucx/ucx/issues/750
*/
const char * error_explanation_mpi[] = {
    "As a workaround, you might want to use another (mtl-type) layer.",
    "With OpenMPI, here is what can be done (it does not seem useful to compile a",
    "ucx-enabled version):",
    /*
       "Compile ucx-1.5.1 (I guess v1.5 can do):",
       "  sudo apt install libfabric-dev pkg-config librdmacm-dev",
       "  git clone https://github.com/openucx/ucx.git",
       "  cd ucx ; git checkout v1.5.1",
       "  ./autogen.sh",
       "  ./configure --prefix=/opt/ucx-1.5.1 && make -j64 && make install",
       "  cd ../",
       */
    "Compile openmpi-4.0.1 (I only tested this version):",
    "  tar xf openmpi-4.0.1.tar.bz2",
    "  cd openmpi-4.0.1",
    "  ./configure --prefix=/opt"
        // " --with-ucx=/opt/ucx-1.5.1"
    " \\",
    "    --disable-mpi-fortran --without-cuda --disable-opencl \\",
    "    && make -j64 && make install",
    "Compile using the software above (set all PATHs correctly first).",
    "Run as follows, depending on the hardware:",
    "  On Mellanox InfiniBand ConnectX-3:",
    "    mpiexec -n 4 --prefix /opt/openmpi-4.0.1/ \\",
    "      --mca btl_openib_allow_ib true \\",
    "      --mca btl openib \\",
    "      --mca mtl ^ofi \\",
    "      --mca pml ob1 \\",
    "      ./a.out",
    "  On Intel OmniPath:",
    "    mpiexec -n 4 --prefix /opt/openmpi-4.0.1/ \\",
    "      --mca btl_openib_allow_ib true \\",
    "      --mca btl openib \\",
    "      --mca mtl ^psm2 \\",
    "      --mca pml ob1 \\",
    "      ./a.out",
};


void check_large_allgather_or_abort(const char * prefix)
{
    int rank, size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (!rank) printf("%schecking if large MPI_Allgather (>4GB) works: ...\n", prefix);
    int err = check_if_large_allgather_is_ok(MPI_COMM_WORLD);
    if (!rank) {
        printf("%schecking if large MPI_Allgather (>4GB) works: %s\n", prefix,
                ok_NOK(err == 0));
        if (err != 0)
            fprintf(stderr, "%schecking if large MPI_Allgather (>4GB) works: %s\n", prefix,
                    ok_NOK(err == 0));
    }
    if (err == 0) return;

    if (err == -2) {
        fprintf(stderr, "%sCould not allocate memory buffer."
                " Proceeding anyway, since this is just a test,"
                " but the program will probably abort sooner"
                " or later anyway.\n", prefix);
        return;
    }

    int someone_has_minusone = (err == -1);
    MPI_Allreduce(MPI_IN_PLACE, &someone_has_minusone, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
    if (someone_has_minusone) {
        long * offsets = malloc(size * sizeof(long));
        offsets[rank] = failed_offset;
        MPI_Gather(&failed_offset, 1, MPI_LONG,
                offsets, 1, MPI_LONG, 0, MPI_COMM_WORLD);
        if (!rank) {
            for(int i = 0 ; i < size ; i++) {
                fprintf(stderr, "node %d failed_offset = 0x%lx\n", i, offsets[i]);
            }
        }
        free(offsets);
    }

    if (!rank) {
        if (err > 0) { /* report the MPI error if we've got one */
            /* MPI_ERR_OTHER... mostly useless */
            char error[1024];
            int errorlen = sizeof(error);
            MPI_Error_string(err, error, &errorlen);
            fprintf(stderr, "%sMPI error returned:\n%s%s\n",
                    prefix, prefix, error);
        }
        size_t s = sizeof(error_explanation_mpi)/sizeof(error_explanation_mpi[0]);
        fprintf(stderr, "%s%s\n%s%s\n",
                prefix,
                "A basic test of an MPI_Allgather with large messages (>4GB) failed.",
                prefix,
                "This could be due to the PSM2 layer having an API limit at 4GB per message."
               );
        for(size_t i = 0 ; i < s ; ++i) {
            fprintf(stderr, "%s%s\n", prefix, error_explanation_mpi[i]);
        }
    }
    abort();
}

int check_if_large_mpi_send_is_ok(MPI_Comm comm0)
{
    /* The test makes more sense for larger communicators, but as a
     * matter of fact it already triggers with only 2 jobs, and
     * restricting to 2 gives us more control over the allocated size.
     */
    const int number_of_nodes_for_test = 2;

    int comm_rank0;
    MPI_Comm_rank(comm0, &comm_rank0);

    MPI_Comm comm;
    MPI_Comm_split(comm0, comm_rank0 < number_of_nodes_for_test, comm_rank0, &comm);

    int err = 0;

    if (comm_rank0 < number_of_nodes_for_test) {
        int comm_rank;
        MPI_Comm_rank(comm, &comm_rank);
        /* As in the allgather check, make MPI errors visible as return
         * codes rather than fatal aborts. */
        MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN);

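        /* 3 * 2^29 bytes = 1.5 GiB: comfortably above the 1 GiB mark
         * that this check advertises, while still fitting in the int
         * count argument of MPI_Send/MPI_Recv (INT_MAX bytes is about
         * 2 GiB). */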
        size_t chunk = 3<<29;

        void * data = malloc(chunk);

        int alloc_ok = data != NULL;
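        /* As in the allgather check: all ranks must agree on the
         * allocation status before entering the point-to-point
         * exchange, lest one side wait forever. */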
        MPI_Allreduce(MPI_IN_PLACE, &alloc_ok, 1, MPI_INT, MPI_MIN, comm);
        if (alloc_ok) {
            memset(data, 0x42, chunk);
            if (comm_rank == 1) {
                err = MPI_Send(data, chunk, MPI_BYTE, 0, 0xbeef, comm);
            } else if (comm_rank == 0) {
                err = MPI_Recv(data, chunk, MPI_BYTE, 1, 0xbeef, comm, MPI_STATUS_IGNORE);
            }
            free(data);
        } else {
            err = -2;
        }
        MPI_Barrier(comm);
    }
    /* comm was created by all ranks of comm0, so all ranks free it */
    MPI_Comm_free(&comm);
    MPI_Barrier(comm0);

    return err;
}

void check_large_mpi_send_or_abort(const char * prefix)
{
    int rank, size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (!rank) printf("%schecking if large MPI_Send (>1GB) works: ...\n", prefix);
    int err = check_if_large_mpi_send_is_ok(MPI_COMM_WORLD);
    if (!rank) {
        printf("%schecking if large MPI_Send (>1GB) works: %s\n", prefix,
                ok_NOK(err == 0));
        if (err != 0)
            fprintf(stderr, "%schecking if large MPI_Send (>1GB) works: %s\n", prefix,
                    ok_NOK(err == 0));
    }
    if (err == 0) return;

    if (err == -2) {
        fprintf(stderr, "%sCould not allocate memory buffer."
                " Proceeding anyway, since this is just a test,"
                " but the program will probably abort sooner"
                " or later anyway.\n", prefix);
        return;
    }

    if (!rank) {
        /* MPI_ERR_OTHER... mostly useless */
        char error[1024];
        int errorlen = sizeof(error);
        MPI_Error_string(err, error, &errorlen);
        fprintf(stderr, "%sMPI error returned:\n%s%s\n",
                prefix, prefix, error);
        fprintf(stderr, "%s%s\n", prefix,
                "A basic test of an MPI_Send with large messages (>1GB) failed."
               );
        size_t s = sizeof(error_explanation_mpi)/sizeof(error_explanation_mpi[0]);
        for(size_t i = 0 ; i < s ; ++i) {
            fprintf(stderr, "%s%s\n", prefix, error_explanation_mpi[i]);
        }
    }
    abort();
}

void check_for_mpi_problems(void)
{
    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size < 2) {
        printf("# Skipping MPI checks (only %d node)\n", size);
    } else {
        check_large_mpi_send_or_abort("# ");
        check_large_allgather_or_abort("# ");
    }
}

#ifdef WANT_MAIN
int main(int argc, char * argv[])
{
    MPI_Init(&argc, &argv);
    check_for_mpi_problems();
    MPI_Finalize();
}
#endif
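
/* A possible standalone build of this check, as a rough sketch (the
 * file name and include paths are assumptions; adjust to the actual
 * source tree and MPI installation, and add -I flags so that cado.h,
 * macros.h and select_mpi.h are found):
 *   mpicc -DWANT_MAIN -I<path-to-headers> check_mpi_problems.c \
 *       -o check_mpi_problems
 *   mpiexec -n 2 ./check_mpi_problems
 */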