1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2005 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
14 * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
15 * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights
16 * reserved.
17 * Copyright (c) 2006-2007 Voltaire All rights reserved.
18 * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
19 * Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved.
20 * Copyright (c) 2014-2016 Research Organization for Information Science
21 * and Technology (RIST). All rights reserved.
22 * Copyright (c) 2014 Intel, Inc. All rights reserved.
23 * $COPYRIGHT$
24 *
25 * Additional copyrights may follow
26 *
27 * $HEADER$
28 */
29
30 #include "opal_config.h"
31
32 #include <string.h>
33 #include "opal/util/bit_ops.h"
34 #include "opal/mca/common/verbs/common_verbs.h"
35 #include "opal/mca/installdirs/installdirs.h"
36 #include "opal/util/os_dirpath.h"
37 #include "opal/util/output.h"
38 #include "opal/util/show_help.h"
39 #include "opal/util/proc.h"
40
41 #include "btl_openib.h"
42 #include "btl_openib_mca.h"
43 #include "btl_openib_ini.h"
44 #include "connect/base.h"
45
/* Normalize HAVE_IBV_FORK_INIT (which may be undefined) into a macro that
 * is always defined as 0 or 1, so it can be used as an initializer. */
#if defined(HAVE_IBV_FORK_INIT)
#  define OPAL_HAVE_IBV_FORK_INIT 1
#else
#  define OPAL_HAVE_IBV_FORK_INIT 0
#endif
51
/*
 * Local validation flags for the integer registration helpers
 * (reg_int / reg_uint).  The flags are independent bits and may be OR'ed.
 */
enum {
    /* a value of exactly -1 is accepted before any other check (reg_int) */
    REGINT_NEG_ONE_OK = 1 << 0,
    /* reject values < 0 (reg_int) */
    REGINT_GE_ZERO    = 1 << 1,
    /* reject values < 1 */
    REGINT_GE_ONE     = 1 << 2,
    /* reject the value 0 */
    REGINT_NONZERO    = 1 << 3,

    /* not referenced anywhere in this file */
    REGINT_MAX        = 0x88
};
63
64
/*
 * Local flags for the string registration helper (reg_string).
 */
enum {
    /* examined by reg_string() when validating NULL / empty values */
    REGSTR_EMPTY_OK = 1 << 0,

    /* not referenced anywhere in this file */
    REGSTR_MAX      = 0x88
};
70
71 static mca_base_var_enum_value_t ib_mtu_values[] = {
72 {IBV_MTU_256, "256B"},
73 {IBV_MTU_512, "512B"},
74 {IBV_MTU_1024, "1k"},
75 {IBV_MTU_2048, "2k"},
76 {IBV_MTU_4096, "4k"},
77 {0, NULL}
78 };
79
80 static mca_base_var_enum_value_t device_type_values[] = {
81 {BTL_OPENIB_DT_IB, "infiniband"},
82 {BTL_OPENIB_DT_IB, "ib"},
83 {BTL_OPENIB_DT_IWARP, "iwarp"},
84 {BTL_OPENIB_DT_IWARP, "iw"},
85 {BTL_OPENIB_DT_ALL, "all"},
86 {0, NULL}
87 };
88
89 static int btl_openib_cq_size;
90 static bool btl_openib_have_fork_support = OPAL_HAVE_IBV_FORK_INIT;
91
92 /*
93 * utility routine for string parameter registration
94 */
reg_string(const char * param_name,const char * deprecated_param_name,const char * param_desc,const char * default_value,char ** storage,int flags)95 static int reg_string(const char* param_name,
96 const char* deprecated_param_name,
97 const char* param_desc,
98 const char* default_value, char **storage,
99 int flags)
100 {
101 int index;
102
103 assert (NULL != storage);
104
105 /* The MCA variable system will not change this pointer */
106 *storage = (char *) default_value;
107 index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
108 param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
109 NULL, 0, 0, OPAL_INFO_LVL_9,
110 MCA_BASE_VAR_SCOPE_READONLY, storage);
111 if (NULL != deprecated_param_name) {
112 (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
113 deprecated_param_name,
114 MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
115 }
116
117 if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) {
118 opal_output(0, "Bad parameter value for parameter \"%s\"",
119 param_name);
120 return OPAL_ERR_BAD_PARAM;
121 }
122
123 return OPAL_SUCCESS;
124 }
125
126
127 /*
128 * utility routine for integer parameter registration
129 */
reg_int(const char * param_name,const char * deprecated_param_name,const char * param_desc,int default_value,int * storage,int flags)130 static int reg_int(const char* param_name,
131 const char* deprecated_param_name,
132 const char* param_desc,
133 int default_value, int *storage, int flags)
134 {
135 int index;
136
137 *storage = default_value;
138 index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
139 param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
140 NULL, 0, 0, OPAL_INFO_LVL_9,
141 MCA_BASE_VAR_SCOPE_READONLY, storage);
142 if (NULL != deprecated_param_name) {
143 (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
144 deprecated_param_name,
145 MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
146 }
147
148 if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) {
149 return OPAL_SUCCESS;
150 }
151
152 if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) ||
153 (0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
154 (0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
155 opal_output(0, "Bad parameter value for parameter \"%s\"",
156 param_name);
157 return OPAL_ERR_BAD_PARAM;
158 }
159
160 return OPAL_SUCCESS;
161 }
162
163 /*
164 * utility routine for integer parameter registration
165 */
reg_uint(const char * param_name,const char * deprecated_param_name,const char * param_desc,unsigned int default_value,unsigned int * storage,int flags)166 static int reg_uint(const char* param_name,
167 const char* deprecated_param_name,
168 const char* param_desc,
169 unsigned int default_value, unsigned int *storage,
170 int flags)
171 {
172 int index;
173
174 *storage = default_value;
175 index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
176 param_name, param_desc, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
177 NULL, 0, 0, OPAL_INFO_LVL_9,
178 MCA_BASE_VAR_SCOPE_READONLY, storage);
179 if (NULL != deprecated_param_name) {
180 (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
181 deprecated_param_name,
182 MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
183 }
184
185 if ((0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
186 (0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
187 opal_output(0, "Bad parameter value for parameter \"%s\"",
188 param_name);
189 return OPAL_ERR_BAD_PARAM;
190 }
191
192 return OPAL_SUCCESS;
193 }
194
195 /*
196 * utility routine for integer parameter registration
197 */
reg_bool(const char * param_name,const char * deprecated_param_name,const char * param_desc,bool default_value,bool * storage)198 static int reg_bool(const char* param_name,
199 const char* deprecated_param_name,
200 const char* param_desc,
201 bool default_value, bool *storage)
202 {
203 int index;
204
205 *storage = default_value;
206 index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
207 param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL,
208 NULL, 0, 0, OPAL_INFO_LVL_9,
209 MCA_BASE_VAR_SCOPE_READONLY, storage);
210 if (NULL != deprecated_param_name) {
211 (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
212 deprecated_param_name,
213 MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
214 }
215
216 return OPAL_SUCCESS;
217 }
218
219 /*
220 * Register and check all MCA parameters
221 */
btl_openib_register_mca_params(void)222 int btl_openib_register_mca_params(void)
223 {
224 mca_base_var_enum_t *new_enum;
225 char *default_qps;
226 uint32_t mid_qp_size;
227 char *msg, *str;
228 int ret, tmp;
229
230 ret = OPAL_SUCCESS;
231 #define CHECK(expr) do {\
232 tmp = (expr); \
233 if (OPAL_SUCCESS != tmp) ret = tmp; \
234 } while (0)
235
236 /* register openib component parameters */
237 CHECK(reg_bool("verbose", NULL,
238 "Output some verbose OpenIB BTL information "
239 "(0 = no output, nonzero = output)", false,
240 &mca_btl_openib_component.verbose));
241
242 CHECK(reg_bool("warn_no_device_params_found",
243 "warn_no_hca_params_found",
244 "Warn when no device-specific parameters are found in the INI file specified by the btl_openib_device_param_files MCA parameter "
245 "(0 = do not warn; any other value = warn)",
246 true, &mca_btl_openib_component.warn_no_device_params_found));
247
248 CHECK(reg_bool("warn_default_gid_prefix", NULL,
249 "Warn when there is more than one active ports and at least one of them connected to the network with only default GID prefix configured "
250 "(0 = do not warn; any other value = warn)",
251 true, &mca_btl_openib_component.warn_default_gid_prefix));
252
253 CHECK(reg_bool("warn_nonexistent_if", NULL,
254 "Warn if non-existent devices and/or ports are specified in the btl_openib_if_[in|ex]clude MCA parameters "
255 "(0 = do not warn; any other value = warn)",
256 true, &mca_btl_openib_component.warn_nonexistent_if));
257
258 /* If we print a warning about not having enough registered memory
259 available, do we want to abort? */
260 CHECK(reg_bool("abort_not_enough_reg_mem", NULL,
261 "If there is not enough registered memory available on the system for Open MPI to function properly, Open MPI will issue a warning. If this MCA parameter is set to true, then Open MPI will also abort all MPI jobs "
262 "(0 = warn, but do not abort; any other value = warn and abort)",
263 false, &mca_btl_openib_component.abort_not_enough_reg_mem));
264
265 CHECK(reg_uint("poll_cq_batch", NULL,
266 "Retrieve up to poll_cq_batch completions from CQ",
267 MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT, &mca_btl_openib_component.cq_poll_batch,
268 REGINT_GE_ONE));
269
270 asprintf(&str, "%s/mca-btl-openib-device-params.ini",
271 opal_install_dirs.opaldatadir);
272 if (NULL == str) {
273 return OPAL_ERR_OUT_OF_RESOURCE;
274 }
275 CHECK(reg_string("device_param_files", "hca_param_files",
276 "Colon-delimited list of INI-style files that contain device vendor/part-specific parameters (use semicolon for Windows)",
277 str, &mca_btl_openib_component.device_params_file_names,
278 0));
279 free(str);
280
281 (void)mca_base_var_enum_create("btl_openib_device_types", device_type_values, &new_enum);
282 mca_btl_openib_component.device_type = BTL_OPENIB_DT_ALL;
283 tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
284 "device_type", "Specify to only use IB or iWARP "
285 "network adapters (infiniband = only use InfiniBand "
286 "HCAs; iwarp = only use iWARP NICs; all = use any "
287 "available adapters)", MCA_BASE_VAR_TYPE_INT, new_enum,
288 0, 0, OPAL_INFO_LVL_9,
289 MCA_BASE_VAR_SCOPE_READONLY,
290 &mca_btl_openib_component.device_type);
291 if (0 > tmp) ret = tmp;
292 OBJ_RELEASE(new_enum);
293
294 /*
295 * Provide way for using to override policy of ignoring IB HCAs
296 */
297
298 mca_btl_openib_component.allow_ib = false;
299 tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
300 "allow_ib",
301 "Override policy since Open MPI 4.0 of ignoring IB HCAs for openib BTL",
302 MCA_BASE_VAR_TYPE_BOOL, NULL,
303 0, 0, OPAL_INFO_LVL_5,
304 MCA_BASE_VAR_SCOPE_READONLY,
305 &mca_btl_openib_component.allow_ib);
306
307 CHECK(reg_int("max_btls", NULL,
308 "Maximum number of device ports to use "
309 "(-1 = use all available, otherwise must be >= 1)",
310 -1, &mca_btl_openib_component.ib_max_btls,
311 REGINT_NEG_ONE_OK | REGINT_GE_ONE));
312 CHECK(reg_int("free_list_num", NULL,
313 "Initial size of free lists "
314 "(must be >= 1)",
315 8, &mca_btl_openib_component.ib_free_list_num,
316 REGINT_GE_ONE));
317 CHECK(reg_int("free_list_max", NULL,
318 "Maximum size of free lists "
319 "(-1 = infinite, otherwise must be >= 0)",
320 -1, &mca_btl_openib_component.ib_free_list_max,
321 REGINT_NEG_ONE_OK | REGINT_GE_ONE));
322 CHECK(reg_int("free_list_inc", NULL,
323 "Increment size of free lists "
324 "(must be >= 1)",
325 32, &mca_btl_openib_component.ib_free_list_inc,
326 REGINT_GE_ONE));
327 CHECK(reg_string("mpool_hints", NULL, "hints for selecting a memory pool (default: none)",
328 NULL, &mca_btl_openib_component.ib_mpool_hints,
329 0));
330 CHECK(reg_string("rcache", NULL,
331 "Name of the registration cache to be used (it is unlikely that you will ever want to change this)",
332 "grdma", &mca_btl_openib_component.ib_rcache_name,
333 0));
334 CHECK(reg_int("reg_mru_len", NULL,
335 "Length of the registration cache most recently used list "
336 "(must be >= 1)",
337 16, (int*) &mca_btl_openib_component.reg_mru_len,
338 REGINT_GE_ONE));
339
340 CHECK(reg_int("cq_size", "ib_cq_size",
341 "Minimum size of the OpenFabrics completion queue "
342 "(CQs are automatically sized based on the number "
343 "of peer MPI processes; this value determines the "
344 "*minimum* size of all CQs)",
345 8192, &btl_openib_cq_size, REGINT_GE_ONE));
346 mca_btl_openib_component.ib_cq_size[BTL_OPENIB_LP_CQ] =
347 mca_btl_openib_component.ib_cq_size[BTL_OPENIB_HP_CQ] = (uint32_t) btl_openib_cq_size;
348
349 CHECK(reg_int("max_inline_data", "ib_max_inline_data",
350 "Maximum size of inline data segment "
351 "(-1 = run-time probe to discover max value, otherwise must be >= 0). "
352 "If not explicitly set, use max_inline_data from "
353 "the INI file containing device-specific parameters",
354 -1, &mca_btl_openib_component.ib_max_inline_data,
355 REGINT_NEG_ONE_OK | REGINT_GE_ZERO));
356
357 CHECK(reg_uint("pkey", "ib_pkey_val",
358 "OpenFabrics partition key (pkey) value. "
359 "Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB partition key value (0x7fff)",
360 0, &mca_btl_openib_component.ib_pkey_val, 0));
361
362 CHECK(reg_uint("psn", "ib_psn",
363 "OpenFabrics packet sequence starting number "
364 "(must be >= 0)",
365 0, &mca_btl_openib_component.ib_psn, 0));
366
367 CHECK(reg_uint("ib_qp_ous_rd_atom", NULL,
368 "InfiniBand outstanding atomic reads "
369 "(must be >= 0)",
370 4, &mca_btl_openib_component.ib_qp_ous_rd_atom, 0));
371
372 asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files). Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes",
373 IBV_MTU_256,
374 IBV_MTU_512,
375 IBV_MTU_1024,
376 IBV_MTU_2048,
377 IBV_MTU_4096);
378 if (NULL == msg) {
379 /* Don't try to recover from this */
380 return OPAL_ERR_OUT_OF_RESOURCE;
381 }
382 mca_btl_openib_component.ib_mtu = 0;
383 (void) mca_base_var_enum_create("btl_openib_mtus", ib_mtu_values, &new_enum);
384 tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
385 "mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum,
386 0, 0, OPAL_INFO_LVL_9,
387 MCA_BASE_VAR_SCOPE_READONLY,
388 &mca_btl_openib_component.ib_mtu);
389 if (0 <= tmp) {
390 (void) mca_base_var_register_synonym(tmp, "ompi", "btl", "openib", "ib_mtu",
391 MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
392 } else {
393 ret = tmp;
394 }
395
396 OBJ_RELEASE(new_enum);
397 free(msg);
398
399 CHECK(reg_uint("ib_min_rnr_timer", NULL, "InfiniBand minimum "
400 "\"receiver not ready\" timer, in seconds "
401 "(must be >= 0 and <= 31)",
402 25, &mca_btl_openib_component.ib_min_rnr_timer, 0));
403
404 CHECK(reg_uint("ib_timeout", NULL,
405 "InfiniBand transmit timeout, plugged into formula: 4.096 microseconds * (2^btl_openib_ib_timeout) "
406 "(must be >= 0 and <= 31)",
407 20, &mca_btl_openib_component.ib_timeout, 0));
408
409 CHECK(reg_uint("ib_retry_count", NULL,
410 "InfiniBand transmit retry count "
411 "(must be >= 0 and <= 7)",
412 7, &mca_btl_openib_component.ib_retry_count, 0));
413
414 CHECK(reg_uint("ib_rnr_retry", NULL,
415 "InfiniBand \"receiver not ready\" "
416 "retry count; applies *only* to SRQ/XRC queues. PP queues "
417 "use RNR retry values of 0 because Open MPI performs "
418 "software flow control to guarantee that RNRs never occur "
419 "(must be >= 0 and <= 7; 7 = \"infinite\")",
420 7, &mca_btl_openib_component.ib_rnr_retry, 0));
421
422 CHECK(reg_uint("ib_max_rdma_dst_ops", NULL, "InfiniBand maximum pending RDMA "
423 "destination operations "
424 "(must be >= 0)",
425 4, &mca_btl_openib_component.ib_max_rdma_dst_ops, 0));
426
427 CHECK(reg_uint("ib_service_level", NULL, "InfiniBand service level "
428 "(must be >= 0 and <= 15)",
429 0, &mca_btl_openib_component.ib_service_level, 0));
430
431 #if (ENABLE_DYNAMIC_SL)
432 CHECK(reg_uint("ib_path_record_service_level", NULL,
433 "Enable getting InfiniBand service level from PathRecord "
434 "(must be >= 0, 0 = disabled, positive = try to get the "
435 "service level from PathRecord)",
436 0, &mca_btl_openib_component.ib_path_record_service_level, 0));
437 #endif
438
439 CHECK(reg_int("use_eager_rdma", NULL, "Use RDMA for eager messages "
440 "(-1 = use device default, 0 = do not use eager RDMA, "
441 "1 = use eager RDMA)",
442 -1, &mca_btl_openib_component.use_eager_rdma, 0));
443
444 CHECK(reg_int("eager_rdma_threshold", NULL,
445 "Use RDMA for short messages after this number of "
446 "messages are received from a given peer "
447 "(must be >= 1)",
448 16, &mca_btl_openib_component.eager_rdma_threshold, REGINT_GE_ONE));
449
450 CHECK(reg_int("max_eager_rdma", NULL, "Maximum number of peers allowed to use "
451 "RDMA for short messages (RDMA is used for all long "
452 "messages, except if explicitly disabled, such as "
453 "with the \"dr\" pml) "
454 "(must be >= 0)",
455 16, &mca_btl_openib_component.max_eager_rdma, REGINT_GE_ZERO));
456
457 CHECK(reg_int("eager_rdma_num", NULL, "Number of RDMA buffers to allocate "
458 "for small messages "
459 "(must be >= 1)",
460 16, &mca_btl_openib_component.eager_rdma_num, REGINT_GE_ONE));
461 mca_btl_openib_component.eager_rdma_num++;
462
463 CHECK(reg_uint("btls_per_lid", NULL, "Number of BTLs to create for each "
464 "InfiniBand LID "
465 "(must be >= 1)",
466 1, &mca_btl_openib_component.btls_per_lid, REGINT_GE_ONE));
467
468 CHECK(reg_uint("max_lmc", NULL, "Maximum number of LIDs to use for each device port "
469 "(must be >= 0, where 0 = use all available)",
470 1, &mca_btl_openib_component.max_lmc, 0));
471
472 CHECK(reg_int("enable_apm_over_lmc", NULL, "Maximum number of alternative paths for each device port "
473 "(must be >= -1, where 0 = disable apm, -1 = all available alternative paths )",
474 0, &mca_btl_openib_component.apm_lmc, REGINT_NEG_ONE_OK|REGINT_GE_ZERO));
475
476 CHECK(reg_int("enable_apm_over_ports", NULL, "Enable alternative path migration (APM) over different ports of the same device "
477 "(must be >= 0, where 0 = disable APM over ports, 1 = enable APM over ports of the same device)",
478 0, &mca_btl_openib_component.apm_ports, REGINT_GE_ZERO));
479
480 CHECK(reg_bool("use_async_event_thread", NULL,
481 "If nonzero, use the thread that will handle InfiniBand asynchronous events",
482 true, &mca_btl_openib_component.use_async_event_thread));
483
484 CHECK(reg_bool("enable_srq_resize", NULL,
485 "Enable/Disable on demand SRQ resize. "
486 "(0 = without resizing, nonzero = with resizing)", 1,
487 &mca_btl_openib_component.enable_srq_resize));
488
489 #if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
490 CHECK(reg_bool("rroce_enable", NULL,
491 "Enable/Disable routing between different subnets"
492 "(0 = disable, nonzero = enable)", false,
493 &mca_btl_openib_component.rroce_enable));
494 #endif
495
496 CHECK(reg_uint("buffer_alignment", NULL,
497 "Preferred communication buffer alignment, in bytes "
498 "(must be > 0 and power of two)",
499 64, &mca_btl_openib_component.buffer_alignment, 0));
500
501 CHECK(reg_bool("use_message_coalescing", NULL,
502 "If nonzero, use message coalescing", false,
503 &mca_btl_openib_component.use_message_coalescing));
504
505 CHECK(reg_uint("cq_poll_ratio", NULL,
506 "How often to poll high priority CQ versus low priority CQ",
507 100, &mca_btl_openib_component.cq_poll_ratio, REGINT_GE_ONE));
508 CHECK(reg_uint("eager_rdma_poll_ratio", NULL,
509 "How often to poll eager RDMA channel versus CQ",
510 100, &mca_btl_openib_component.eager_rdma_poll_ratio, REGINT_GE_ONE));
511 CHECK(reg_uint("hp_cq_poll_per_progress", NULL,
512 "Max number of completion events to process for each call "
513 "of BTL progress engine",
514 10, &mca_btl_openib_component.cq_poll_progress, REGINT_GE_ONE));
515
516 CHECK(reg_uint("max_hw_msg_size", NULL,
517 "Maximum size (in bytes) of a single fragment of a long message when using the RDMA protocols (must be > 0 and <= hw capabilities).",
518 0, &mca_btl_openib_component.max_hw_msg_size, 0));
519
520 CHECK(reg_bool("allow_max_memory_registration", NULL,
521 "Allow maximum possible memory to register with HCA",
522 1, &mca_btl_openib_component.allow_max_memory_registration));
523
524 /* Help debug memory registration issues */
525 CHECK(reg_int("memory_registration_verbose", NULL,
526 "Output some verbose memory registration information "
527 "(0 = no output, nonzero = output)", 0,
528 &mca_btl_openib_component.memory_registration_verbose_level, 0));
529
530 CHECK(reg_int("ignore_locality", NULL,
531 "Ignore any locality information and use all devices "
532 "(0 = use locality informaiton and use only close devices, nonzero = ignore locality information)", 0,
533 &mca_btl_openib_component.ignore_locality, REGINT_GE_ZERO));
534
535 /* Info only */
536 tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
537 "have_fork_support",
538 "Whether the OpenFabrics stack supports applications that invoke the \"fork()\" system call or not (0 = no, 1 = yes). "
539 "Note that this value does NOT indicate whether the system being run on supports \"fork()\" with OpenFabrics applications or not.",
540 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
541 MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
542 OPAL_INFO_LVL_9,
543 MCA_BASE_VAR_SCOPE_CONSTANT,
544 &btl_openib_have_fork_support);
545
546 mca_btl_openib_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
547
548 mca_btl_openib_module.super.btl_eager_limit = 12 * 1024;
549 mca_btl_openib_module.super.btl_rndv_eager_limit = 12 * 1024;
550 mca_btl_openib_module.super.btl_max_send_size = 64 * 1024;
551 mca_btl_openib_module.super.btl_rdma_pipeline_send_length = 1024 * 1024;
552 mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
553 mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
554 mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
555 MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA |
556 MCA_BTL_FLAGS_SEND;
557 #if HAVE_DECL_IBV_ATOMIC_HCA
558 mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
559 mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
560 #endif
561
562 /* Default to bandwidth auto-detection */
563 mca_btl_openib_module.super.btl_bandwidth = 0;
564 mca_btl_openib_module.super.btl_latency = 4;
565 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
566 /* Default is enabling CUDA asynchronous send copies */
567 CHECK(reg_bool("cuda_async_send", NULL,
568 "Enable or disable CUDA async send copies "
569 "(true = async; false = sync)",
570 true, &mca_btl_openib_component.cuda_async_send));
571
572 /* Default is enabling CUDA asynchronous receive copies */
573 CHECK(reg_bool("cuda_async_recv", NULL,
574 "Enable or disable CUDA async recv copies "
575 "(true = async; false = sync)",
576 false, &mca_btl_openib_component.cuda_async_recv));
577 /* Also make the max send size larger for better GPU buffer performance */
578 mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
579 /* Turn of message coalescing - not sure if it works with GPU buffers */
580 mca_btl_openib_component.use_message_coalescing = 0;
581
582 /* Indicates if library was built with GPU Direct RDMA support. Not changeable. */
583 mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_GDR_SUPPORT);
584 (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_cuda_gdr",
585 "Whether CUDA GPU Direct RDMA support is built into library or not",
586 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
587 MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
588 OPAL_INFO_LVL_5,
589 MCA_BASE_VAR_SCOPE_CONSTANT,
590 &mca_btl_openib_component.cuda_have_gdr);
591
592 /* Indicates if driver has GPU Direct RDMA support. Not changeable. */
593 if (OPAL_SUCCESS == opal_os_dirpath_access("/sys/kernel/mm/memory_peers/nv_mem/version", S_IRUSR)) {
594 mca_btl_openib_component.driver_have_gdr = 1;
595 } else {
596 mca_btl_openib_component.driver_have_gdr = 0;
597 }
598 (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_driver_gdr",
599 "Whether Infiniband driver has GPU Direct RDMA support",
600 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
601 MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
602 OPAL_INFO_LVL_5,
603 MCA_BASE_VAR_SCOPE_CONSTANT,
604 &mca_btl_openib_component.driver_have_gdr);
605
606 /* Default for GPU Direct RDMA is off for now */
607 CHECK(reg_bool("want_cuda_gdr", NULL,
608 "Enable or disable CUDA GPU Direct RDMA support "
609 "(true = enabled; false = disabled)",
610 false, &mca_btl_openib_component.cuda_want_gdr));
611
612 if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {
613 opal_show_help("help-mpi-btl-openib.txt",
614 "CUDA_no_gdr_support", true,
615 opal_process_info.nodename);
616 return OPAL_ERROR;
617 }
618 if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.driver_have_gdr) {
619 opal_show_help("help-mpi-btl-openib.txt",
620 "driver_no_gdr_support", true,
621 opal_process_info.nodename);
622 return OPAL_ERROR;
623 }
624 #if OPAL_CUDA_GDR_SUPPORT
625 if (mca_btl_openib_component.cuda_want_gdr) {
626 mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
627 mca_btl_openib_module.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number - indicates set it to minimum */
628 mca_btl_openib_module.super.btl_cuda_rdma_limit = 30000; /* default switchover is 30,000 to pipeline */
629 } else {
630 mca_btl_openib_module.super.btl_cuda_eager_limit = 0; /* Turns off any of the GPU Direct RDMA code */
631 mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */
632 }
633 #endif /* OPAL_CUDA_GDR_SUPPORT */
634 #endif /* OPAL_CUDA_SUPPORT */
635 CHECK(mca_btl_base_param_register(
636 &mca_btl_openib_component.super.btl_version,
637 &mca_btl_openib_module.super));
638
639 /* setup all the qp stuff */
640 /* round mid_qp_size to smallest power of two */
641 mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;
642
643 /* mid_qp_size = MAX (mid_qp_size, 1024); ?! */
644 if(mid_qp_size <= 128) {
645 mid_qp_size = 1024;
646 }
647
648 asprintf(&default_qps,
649 "S,128,256,192,128:S,%u,1024,1008,64:S,%u,1024,1008,64:S,%u,1024,1008,64",
650 mid_qp_size,
651 (uint32_t)mca_btl_openib_module.super.btl_eager_limit,
652 (uint32_t)mca_btl_openib_module.super.btl_max_send_size);
653 if (NULL == default_qps) {
654 /* Don't try to recover from this */
655 return OPAL_ERR_OUT_OF_RESOURCE;
656 }
657 if (NULL != mca_btl_openib_component.default_recv_qps) {
658 free(mca_btl_openib_component.default_recv_qps);
659 }
660 mca_btl_openib_component.default_recv_qps = default_qps;
661 CHECK(reg_string("receive_queues", NULL,
662 "Colon-delimited, comma-delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
663 default_qps, &mca_btl_openib_component.receive_queues,
664 0
665 ));
666
667 CHECK(reg_string("if_include", NULL,
668 "Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.",
669 NULL, &mca_btl_openib_component.if_include,
670 0));
671
672 CHECK(reg_string("if_exclude", NULL,
673 "Comma-delimited list of device/ports to be excluded (empty value means to not exclude any ports). Mutually exclusive with btl_openib_if_include.",
674 NULL, &mca_btl_openib_component.if_exclude,
675 0));
676
677 CHECK(reg_string("ipaddr_include", NULL,
678 "Comma-delimited list of IP Addresses to be used (e.g. \"192.168.1.0/24\"). Mutually exclusive with btl_openib_ipaddr_exclude.",
679 NULL, &mca_btl_openib_component.ipaddr_include,
680 0));
681
682 CHECK(reg_string("ipaddr_exclude", NULL,
683 "Comma-delimited list of IP Addresses to be excluded (e.g. \"192.168.1.0/24\"). Mutually exclusive with btl_openib_ipaddr_include.",
684 NULL, &mca_btl_openib_component.ipaddr_exclude,
685 0));
686
687 CHECK(reg_int("gid_index", NULL,
688 "GID index to use on verbs device ports",
689 0, &mca_btl_openib_component.gid_index,
690 REGINT_GE_ZERO));
691
692 CHECK(reg_bool("allow_different_subnets", NULL,
693 "Allow connecting processes from different IB subnets."
694 "(0 = do not allow; 1 = allow)",
695 false, &mca_btl_openib_component.allow_different_subnets));
696
697 /* Register any MCA params for the connect pseudo-components */
698 if (OPAL_SUCCESS == ret) {
699 ret = opal_btl_openib_connect_base_register();
700 }
701
702 return btl_openib_verify_mca_params();
703 }
704
btl_openib_verify_mca_params(void)705 int btl_openib_verify_mca_params (void)
706 {
707 if (mca_btl_openib_component.cq_poll_batch > MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT) {
708 mca_btl_openib_component.cq_poll_batch = MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT;
709 }
710
711 #if !HAVE_IBV_FORK_INIT
712 if (1 == mca_btl_openib_component.want_fork_support) {
713 opal_show_help("help-mpi-btl-openib.txt",
714 "ibv_fork requested but not supported", true,
715 opal_process_info.nodename);
716 return OPAL_ERR_BAD_PARAM;
717 }
718 #endif
719
720 mca_btl_openib_component.ib_pkey_val &= MCA_BTL_IB_PKEY_MASK;
721
722 if (mca_btl_openib_component.ib_min_rnr_timer > 31) {
723 opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
724 true, "btl_openib_ib_min_rnr_timer > 31",
725 "btl_openib_ib_min_rnr_timer reset to 31");
726 mca_btl_openib_component.ib_min_rnr_timer = 31;
727 }
728
729 if (mca_btl_openib_component.ib_timeout > 31) {
730 opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
731 true, "btl_openib_ib_timeout > 31",
732 "btl_openib_ib_timeout reset to 31");
733 mca_btl_openib_component.ib_timeout = 31;
734 }
735
736 if (mca_btl_openib_component.ib_retry_count > 7) {
737 opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
738 true, "btl_openib_ib_retry_count > 7",
739 "btl_openib_ib_retry_count reset to 7");
740 mca_btl_openib_component.ib_retry_count = 7;
741 }
742
743 if (mca_btl_openib_component.ib_rnr_retry > 7) {
744 opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
745 true, "btl_openib_ib_rnr_retry > 7",
746 "btl_openib_ib_rnr_retry reset to 7");
747 mca_btl_openib_component.ib_rnr_retry = 7;
748 }
749
750 if (mca_btl_openib_component.ib_service_level > 15) {
751 opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
752 true, "btl_openib_ib_service_level > 15",
753 "btl_openib_ib_service_level reset to 15");
754 mca_btl_openib_component.ib_service_level = 15;
755 }
756
757 if(mca_btl_openib_component.buffer_alignment <= 1 ||
758 (mca_btl_openib_component.buffer_alignment & (mca_btl_openib_component.buffer_alignment - 1))) {
759 opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
760 true, mca_btl_openib_component.buffer_alignment, opal_process_info.nodename, 64);
761 mca_btl_openib_component.buffer_alignment = 64;
762 }
763
764 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
765 if (mca_btl_openib_component.cuda_async_send) {
766 mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
767 } else {
768 mca_btl_openib_module.super.btl_flags &= ~MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
769 }
770
771 if (mca_btl_openib_component.cuda_async_recv) {
772 mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
773 } else {
774 mca_btl_openib_module.super.btl_flags &= ~MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
775 }
776 #if 0 /* Disable this check for now while fork support code is worked out. */
777 /* Cannot have fork support and GDR on at the same time. If the user asks for both,
778 * then print a message and return error. If the user does not explicitly ask for
779 * fork support, then turn it off in the presence of GDR. */
780 if (mca_btl_openib_component.cuda_want_gdr && mca_btl_openib_component.cuda_have_gdr &&
781 mca_btl_openib_component.driver_have_gdr) {
782 if (1 == opal_common_verbs_want_fork_support) {
783 opal_show_help("help-mpi-btl-openib.txt", "no_fork_with_gdr",
784 true, opal_process_info.nodename);
785 return OPAL_ERR_BAD_PARAM;
786 }
787 }
788 #endif /* Workaround */
789 if (0 != mca_btl_openib_module.super.btl_cuda_max_send_size) {
790 opal_show_help("help-mpi-btl-openib.txt", "do_not_set_openib_value",
791 true, opal_process_info.nodename);
792 mca_btl_openib_module.super.btl_cuda_max_send_size = 0;
793 }
794 #endif
795
796 return OPAL_SUCCESS;
797 }
798