#include "cado.h" // IWYU pragma: keep
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "select_mpi.h"

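/* Byte offset, within the allgather receive buffer, of the first wrong
 * (zero) byte found by check_if_large_allgather_is_ok() on this rank;
 * 0 if the check passed. */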
long failed_offset = 0;

int check_if_large_allgather_is_ok(MPI_Comm comm0)
{
    /* This check is here to provoke early failure under openmpi, with
     * some collection of parameters.
     *
     * When the psm2 mtl is used, there seems to be a limit on the
     * maximum size of messages, which seems totally backwards to me.
     * Short of a way to check ahead of time what the limit is, we'll
     * simply try and see if this is indeed the case.
     *
     * Here is an excerpt of the error message:
     Message size 5326413824 bigger than supported by PSM2 API. Max = 4294967296
     [node-1:04815] *** An error occurred in MPI_Allgather
     [node-1:04815] *** reported by process [3170238465,0]
     [node-1:04815] *** on communicator MPI_COMM_WORLD
     [node-1:04815] *** MPI_ERR_OTHER: known error not in list
     [node-1:04815] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
     [node-1:04815] *** and potentially your MPI job)
     [node-1.nancy.grid5000.fr:04793] 1 more process has sent help message help-mtl-psm2.txt / message too big
     *
     * Are we safe with the OFI mtl? Not really. As it happens, this
     * same check fails as well, with the rather unpleasant specifics
     * that the MPI_Allgather call returns fine -- only the copied data
     * doesn't quite meet the expectations... :-((((
     *
     * It seems that to avoid this error, one has to use --mca pml ob1.
     * At least on omnipath. I haven't checked on infiniband.
     */
    /*
     * This function returns:
     *  0 on success.
     *  a non-zero MPI error code if MPI_Allgather returned one.
     *  -1 if no MPI error code was returned, but the result of the
     *     Allgather was wrong.
     *  -2 if memory allocation failed.
     *
     * (note that the MPI standard guarantees that MPI error codes are
     * positive integers)
     */

    size_t s = 1 << 16;
    unsigned int items_at_a_time = (1 << 16) + 1;
    MPI_Datatype mpi_ft;
    MPI_Type_contiguous(s, MPI_BYTE, &mpi_ft);
    MPI_Type_commit(&mpi_ft);
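    /* Each datatype element is a contiguous block of s = 64 KiB, and
     * each rank contributes items_at_a_time = 2^16 + 1 such elements,
     * i.e. (2^16 + 1) * 2^16 = 2^32 + 2^16 bytes: just over the 4 GiB
     * PSM2 limit quoted above, while the count argument passed to
     * MPI_Allgather stays well within the range of an int. With 2 ranks
     * in the test communicator, the receive buffer is about 8 GiB. */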
    int comm_size0, comm_rank0;
    MPI_Comm_size(comm0, &comm_size0);
    MPI_Comm_rank(comm0, &comm_rank0);
    MPI_Comm comm;

    /* The test makes more sense for larger communicators, but in fact
     * it already works with only 2 jobs, and that gives us more control
     * over the allocated size.
     */
    const int number_of_nodes_for_test = 2;

    MPI_Comm_split(comm0, comm_rank0 < number_of_nodes_for_test, comm_rank0, &comm);
    int comm_size, comm_rank;
    MPI_Comm_size(comm, &comm_size);
    MPI_Comm_rank(comm, &comm_rank);

    int rc = 0;

    if (comm_rank0 < number_of_nodes_for_test) {
        MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN);
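        /* Let MPI_Allgather report failures back to us instead of
         * aborting the job, so that we can print diagnostics. */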
        void * data = malloc(items_at_a_time * comm_size * s);
        int alloc_ok = data != NULL;
        MPI_Allreduce(MPI_IN_PLACE, &alloc_ok, 1, MPI_INT, MPI_MIN, comm);
        if (alloc_ok) {
            /* Zero the whole receive buffer, then fill our own slice
             * with 0x42. After the allgather, no zero byte should be
             * left anywhere in the buffer. */
            memset(data, 0, items_at_a_time * comm_size * s);
            memset(((char*)data) + items_at_a_time * s * comm_rank, 0x42, items_at_a_time * s);
            rc = MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                    data, items_at_a_time,
                    mpi_ft, comm);
            if (rc == 0) {
                void * p = memchr(data, 0, items_at_a_time * comm_size * s);
                if (p != NULL) {
                    /* We found a zero, we shouldn't! */
                    rc = -1;
                    failed_offset = ((char*)p) - (char*)data;
                }
            }
            MPI_Barrier(comm);
        } else {
            rc = -2;
        }
        if (data) free(data);
        MPI_Type_free(&mpi_ft);
        MPI_Comm_free(&comm);
    }

    MPI_Barrier(comm0);
    return rc;
}

/* Theoretically ucx on omnipath should work, but it doesn't seem
 * obvious...
 *
 * At least this issue was purportedly fixed:
 * https://github.com/openucx/ucx/issues/750
 */
const char * error_explanation_mpi[] = {
    "As a workaround, you might want to use another (mtl-type) layer.",
    "With OpenMPI, here is what can be done (it does not seem useful to compile a",
    "ucx-enabled version):",
    /*
    "Compile ucx-1.5.1 (I guess v1.5 can do):",
    "  sudo apt install libfabric-dev pkg-config librdmacm-dev",
    "  git clone https://github.com/openucx/ucx.git",
    "  cd ucx ; git checkout v1.5.1",
    "  ./autogen.sh",
    "  ./configure --prefix=/opt/ucx-1.5.1 && make -j64 && make install",
    "  cd ../",
    */
    "Compile openmpi-4.0.1 (I only tested this version):",
    "  tar xf openmpi-4.0.1.tar.bz2",
    "  cd openmpi-4.0.1",
    "  ./configure --prefix=/opt",
    // "    --with-ucx=/opt/ucx-1.5.1",
    "    \\",
    "    --disable-mpi-fortran --without-cuda --disable-opencl \\",
    "    && make -j64 && make install",
    "Compile using the software above (set all PATHs correctly first).",
    "Run as follows, depending on the hardware:",
    "  On Mellanox infiniband ConnectX-3:",
    "    mpiexec -n 4 --prefix /opt/openmpi-4.0.1/ \\",
    "        --mca btl_openib_allow_ib true \\",
    "        --mca btl openib \\",
    "        --mca mtl ^ofi \\",
    "        --mca pml ob1 \\",
    "        ./a.out",
    "  On Intel OmniPath:",
    "    mpiexec -n 4 --prefix /opt/openmpi-4.0.1/ \\",
    "        --mca btl_openib_allow_ib true \\",
    "        --mca btl openib \\",
    "        --mca mtl ^psm2 \\",
    "        --mca pml ob1 \\",
    "        ./a.out",
};


void check_large_allgather_or_abort(const char * prefix)
{
    int rank, size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (!rank) printf("%schecking if large MPI_Allgather (>4GB) works: ...\n", prefix);
    int err = check_if_large_allgather_is_ok(MPI_COMM_WORLD);
    if (!rank) {
        printf("%schecking if large MPI_Allgather (>4GB) works: %s\n", prefix,
                ok_NOK(err == 0));
        if (err != 0)
            fprintf(stderr, "%schecking if large MPI_Allgather (>4GB) works: %s\n", prefix,
                    ok_NOK(err == 0));
    }
    if (err == 0) return;

    if (err == -2) {
        fprintf(stderr, "%sCould not allocate memory buffer."
                " Proceeding anyway, since this is just a test,"
                " but the program will probably abort sooner"
                " or later anyway.\n", prefix);
        return;
    }

    int someone_has_minusone = (err == -1);
    MPI_Allreduce(MPI_IN_PLACE, &someone_has_minusone, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
    if (someone_has_minusone) {
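        /* Gather each rank's failed_offset on rank 0 and print them, to
         * help see where the copied data started to go wrong. */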
        long * offsets = malloc(size * sizeof(long));
        offsets[rank] = failed_offset;
        MPI_Gather(&failed_offset, 1, MPI_LONG,
                offsets, 1, MPI_LONG, 0, MPI_COMM_WORLD);
        if (!rank) {
            for(int i = 0 ; i < size ; i++) {
                fprintf(stderr, "node %d failed_offset = 0x%lx\n", i, offsets[i]);
            }
        }
        free(offsets);
    }

    if (!rank) {
        if (err > 0) { /* report the MPI error string if we've got one. */
            /* MPI_ERR_OTHER... mostly useless */
            char error[1024];
            int errorlen = sizeof(error);
            MPI_Error_string(err, error, &errorlen);
            fprintf(stderr, "%sMPI error returned:\n%s%s\n",
                    prefix, prefix, error);
        }
        size_t s = sizeof(error_explanation_mpi)/sizeof(error_explanation_mpi[0]);
        fprintf(stderr, "%s%s\n%s%s\n",
                prefix,
                "A basic test of an MPI_Allgather with large messages (>4GB) failed.",
                prefix,
                "This could be due to the PSM2 layer having an API limit at 4GB per message."
                );
        for(size_t i = 0 ; i < s ; ++i) {
            fprintf(stderr, "%s%s\n", prefix, error_explanation_mpi[i]);
        }
    }
    abort();
}

int check_if_large_mpi_send_is_ok(MPI_Comm comm0)
{
    /* The test makes more sense for larger communicators, but in fact
     * it already works with only 2 jobs, and that gives us more control
     * over the allocated size.
     */
    const int number_of_nodes_for_test = 2;

    int comm_rank0;
    MPI_Comm_rank(comm0, &comm_rank0);

    MPI_Comm comm;
    MPI_Comm_split(comm0, comm_rank0 < number_of_nodes_for_test, comm_rank0, &comm);

    int err = 0;

    if (comm_rank0 < number_of_nodes_for_test) {
        int comm_rank;
        MPI_Comm_rank(comm, &comm_rank);

        size_t chunk = 3<<29;
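        /* 3 * 2^29 bytes = 1.5 GiB: comfortably above the 1 GiB mark
         * advertised by the caller, while still fitting in the int
         * count argument passed to MPI_Send/MPI_Recv below. */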

        void * data = malloc(chunk);

        int alloc_ok = data != NULL;
        MPI_Allreduce(MPI_IN_PLACE, &alloc_ok, 1, MPI_INT, MPI_MIN, comm);
        if (alloc_ok) {
            memset(data, 0x42, chunk);
            if (comm_rank == 1) {
                err = MPI_Send(data, chunk, MPI_BYTE, 0, 0xbeef, comm);
            } else if (comm_rank == 0) {
                err = MPI_Recv(data, chunk, MPI_BYTE, 1, 0xbeef, comm, MPI_STATUS_IGNORE);
            }
            free(data);
        } else {
            err = -2;
        }
        MPI_Barrier(comm);
    }
    MPI_Barrier(comm0);

    return err;
}

void check_large_mpi_send_or_abort(const char * prefix)
{
    int rank, size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (!rank) printf("%schecking if large MPI_Send (>1GB) works: ...\n", prefix);
    int err = check_if_large_mpi_send_is_ok(MPI_COMM_WORLD);
    if (!rank) {
        printf("%schecking if large MPI_Send (>1GB) works: %s\n", prefix,
                ok_NOK(err == 0));
        if (err != 0)
            fprintf(stderr, "%schecking if large MPI_Send (>1GB) works: %s\n", prefix,
                    ok_NOK(err == 0));
    }
    if (err == 0) return;

    if (err == -2) {
        fprintf(stderr, "%sCould not allocate memory buffer."
                " Proceeding anyway, since this is just a test,"
                " but the program will probably abort sooner"
                " or later anyway.\n", prefix);
        return;
    }

    if (!rank) {
        /* MPI_ERR_OTHER... mostly useless */
        char error[1024];
        int errorlen = sizeof(error);
        MPI_Error_string(err, error, &errorlen);
        fprintf(stderr, "%sMPI error returned:\n%s%s\n",
                prefix, prefix, error);
        fprintf(stderr, "%s%s\n", prefix,
                "A basic test of an MPI_Send with large messages (>1GB) failed."
                );
        size_t s = sizeof(error_explanation_mpi)/sizeof(error_explanation_mpi[0]);
        for(size_t i = 0 ; i < s ; ++i) {
            fprintf(stderr, "%s%s\n", prefix, error_explanation_mpi[i]);
        }
    }
    abort();
}

void check_for_mpi_problems()
{
    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size < 2) {
        printf("# Skipping MPI checks (only %d node)\n", size);
    } else {
        check_large_mpi_send_or_abort("# ");
        // check_large_allgather_or_abort("# ");
        check_large_allgather_or_abort("# ");
    }
}

#ifdef WANT_MAIN
int main(int argc, char * argv[])
{
    MPI_Init(&argc, &argv);
    check_for_mpi_problems();
    MPI_Finalize();
}
#endif
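
/* Example standalone use (a sketch only: the file is normally built by the
 * cado-nfs build system, and the file name and include paths below are
 * placeholders, not the actual build line):
 *   mpicc -DWANT_MAIN -I<paths for cado.h and select_mpi.h> \
 *         check_mpi_problems.c -o check_mpi_problems
 *   mpiexec -n 2 ./check_mpi_problems
 */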