1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2005 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2015 Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
15  * Copyright (c) 2006-2018 Los Alamos National Security, LLC.  All rights
16  *                         reserved.
17  * Copyright (c) 2006-2007 Voltaire All rights reserved.
18  * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
19  * Copyright (c) 2013-2015 NVIDIA Corporation.  All rights reserved.
20  * Copyright (c) 2014-2016 Research Organization for Information Science
21  *                         and Technology (RIST). All rights reserved.
22  * Copyright (c) 2014      Intel, Inc. All rights reserved.
23  * $COPYRIGHT$
24  *
25  * Additional copyrights may follow
26  *
27  * $HEADER$
28  */
29 
30 #include "opal_config.h"
31 
32 #include <string.h>
33 #include "opal/util/bit_ops.h"
34 #include "opal/mca/common/verbs/common_verbs.h"
35 #include "opal/mca/installdirs/installdirs.h"
36 #include "opal/util/os_dirpath.h"
37 #include "opal/util/output.h"
38 #include "opal/util/show_help.h"
39 #include "opal/util/proc.h"
40 
41 #include "btl_openib.h"
42 #include "btl_openib_mca.h"
43 #include "btl_openib_ini.h"
44 #include "connect/base.h"
45 
/*
 * Turn the "is HAVE_IBV_FORK_INIT defined?" question into a macro that is
 * always defined: 1 when the symbol is defined, 0 when it is not.  This
 * lets the result be used as an ordinary value (e.g. as an initializer).
 */
#if defined(HAVE_IBV_FORK_INIT)
#  define OPAL_HAVE_IBV_FORK_INIT 1
#else
#  define OPAL_HAVE_IBV_FORK_INIT 0
#endif
51 
/*
 * Local validation flags for the integer registration helpers
 * (reg_int / reg_uint).  Each flag occupies its own bit so that several
 * of them can be OR'ed together in one "flags" argument.
 */
enum {
    REGINT_NEG_ONE_OK = 1 << 0,   /* reg_int: a value of -1 is accepted as-is */
    REGINT_GE_ZERO    = 1 << 1,   /* reg_int: values below 0 are rejected */
    REGINT_GE_ONE     = 1 << 2,   /* values below 1 are rejected */
    REGINT_NONZERO    = 1 << 3,   /* a value of 0 is rejected */

    /* Upper marker; not referenced by the helpers visible in this file */
    REGINT_MAX        = 0x88
};
63 
64 
/*
 * Local flags for the string registration helper (reg_string), passed in
 * its "flags" argument.
 */
enum {
    REGSTR_EMPTY_OK = 1 << 0,   /* tested by reg_string when it inspects NULL/empty values */

    /* Upper marker; not referenced by the helpers visible in this file */
    REGSTR_MAX      = 0x88
};
70 
71 static mca_base_var_enum_value_t ib_mtu_values[] = {
72     {IBV_MTU_256, "256B"},
73     {IBV_MTU_512, "512B"},
74     {IBV_MTU_1024, "1k"},
75     {IBV_MTU_2048, "2k"},
76     {IBV_MTU_4096, "4k"},
77     {0, NULL}
78 };
79 
80 static mca_base_var_enum_value_t device_type_values[] = {
81     {BTL_OPENIB_DT_IB,    "infiniband"},
82     {BTL_OPENIB_DT_IB,    "ib"},
83     {BTL_OPENIB_DT_IWARP, "iwarp"},
84     {BTL_OPENIB_DT_IWARP, "iw"},
85     {BTL_OPENIB_DT_ALL,   "all"},
86     {0, NULL}
87 };
88 
89 static int btl_openib_cq_size;
90 static bool btl_openib_have_fork_support = OPAL_HAVE_IBV_FORK_INIT;
91 
92 /*
93  * utility routine for string parameter registration
94  */
reg_string(const char * param_name,const char * deprecated_param_name,const char * param_desc,const char * default_value,char ** storage,int flags)95 static int reg_string(const char* param_name,
96                       const char* deprecated_param_name,
97                       const char* param_desc,
98                       const char* default_value, char **storage,
99                       int flags)
100 {
101     int index;
102 
103     assert (NULL != storage);
104 
105     /* The MCA variable system will not change this pointer */
106     *storage = (char *) default_value;
107     index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
108                                             param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
109                                             NULL, 0, 0, OPAL_INFO_LVL_9,
110                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
111     if (NULL != deprecated_param_name) {
112         (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
113                                              deprecated_param_name,
114                                              MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
115     }
116 
117     if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) {
118         opal_output(0, "Bad parameter value for parameter \"%s\"",
119                 param_name);
120         return OPAL_ERR_BAD_PARAM;
121     }
122 
123     return OPAL_SUCCESS;
124 }
125 
126 
127 /*
128  * utility routine for integer parameter registration
129  */
reg_int(const char * param_name,const char * deprecated_param_name,const char * param_desc,int default_value,int * storage,int flags)130 static int reg_int(const char* param_name,
131                    const char* deprecated_param_name,
132                    const char* param_desc,
133                    int default_value, int *storage, int flags)
134 {
135     int index;
136 
137     *storage = default_value;
138     index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
139                                             param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
140                                             NULL, 0, 0, OPAL_INFO_LVL_9,
141                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
142     if (NULL != deprecated_param_name) {
143         (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
144                                              deprecated_param_name,
145                                              MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
146     }
147 
148     if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) {
149         return OPAL_SUCCESS;
150     }
151 
152     if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) ||
153         (0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
154         (0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
155         opal_output(0, "Bad parameter value for parameter \"%s\"",
156                 param_name);
157         return OPAL_ERR_BAD_PARAM;
158     }
159 
160     return OPAL_SUCCESS;
161 }
162 
163 /*
164  * utility routine for integer parameter registration
165  */
reg_uint(const char * param_name,const char * deprecated_param_name,const char * param_desc,unsigned int default_value,unsigned int * storage,int flags)166 static int reg_uint(const char* param_name,
167                     const char* deprecated_param_name,
168                     const char* param_desc,
169                     unsigned int default_value, unsigned int *storage,
170                     int flags)
171 {
172     int index;
173 
174     *storage = default_value;
175     index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
176                                             param_name, param_desc, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
177                                             NULL, 0, 0, OPAL_INFO_LVL_9,
178                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
179     if (NULL != deprecated_param_name) {
180         (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
181                                              deprecated_param_name,
182                                              MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
183     }
184 
185     if ((0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
186         (0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
187         opal_output(0, "Bad parameter value for parameter \"%s\"",
188                 param_name);
189         return OPAL_ERR_BAD_PARAM;
190     }
191 
192     return OPAL_SUCCESS;
193 }
194 
195 /*
196  * utility routine for integer parameter registration
197  */
reg_bool(const char * param_name,const char * deprecated_param_name,const char * param_desc,bool default_value,bool * storage)198 static int reg_bool(const char* param_name,
199                     const char* deprecated_param_name,
200                     const char* param_desc,
201                     bool default_value, bool *storage)
202 {
203     int index;
204 
205     *storage = default_value;
206     index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
207                                             param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL,
208                                             NULL, 0, 0, OPAL_INFO_LVL_9,
209                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
210     if (NULL != deprecated_param_name) {
211         (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib",
212                                              deprecated_param_name,
213                                              MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
214     }
215 
216     return OPAL_SUCCESS;
217 }
218 
219 /*
220  * Register and check all MCA parameters
221  */
btl_openib_register_mca_params(void)222 int btl_openib_register_mca_params(void)
223 {
224     mca_base_var_enum_t *new_enum;
225     char *default_qps;
226     uint32_t mid_qp_size;
227     char *msg, *str;
228     int ret, tmp;
229 
230     ret = OPAL_SUCCESS;
231 #define CHECK(expr) do {\
232         tmp = (expr); \
233         if (OPAL_SUCCESS != tmp) ret = tmp; \
234      } while (0)
235 
236     /* register openib component parameters */
237     CHECK(reg_bool("verbose", NULL,
238                   "Output some verbose OpenIB BTL information "
239                   "(0 = no output, nonzero = output)", false,
240                    &mca_btl_openib_component.verbose));
241 
242     CHECK(reg_bool("warn_no_device_params_found",
243                   "warn_no_hca_params_found",
244                   "Warn when no device-specific parameters are found in the INI file specified by the btl_openib_device_param_files MCA parameter "
245                   "(0 = do not warn; any other value = warn)",
246                   true, &mca_btl_openib_component.warn_no_device_params_found));
247 
248     CHECK(reg_bool("warn_default_gid_prefix", NULL,
249                   "Warn when there is more than one active ports and at least one of them connected to the network with only default GID prefix configured "
250                   "(0 = do not warn; any other value = warn)",
251                   true, &mca_btl_openib_component.warn_default_gid_prefix));
252 
253     CHECK(reg_bool("warn_nonexistent_if", NULL,
254                   "Warn if non-existent devices and/or ports are specified in the btl_openib_if_[in|ex]clude MCA parameters "
255                   "(0 = do not warn; any other value = warn)",
256                   true, &mca_btl_openib_component.warn_nonexistent_if));
257 
258     /* If we print a warning about not having enough registered memory
259        available, do we want to abort? */
260     CHECK(reg_bool("abort_not_enough_reg_mem", NULL,
261                   "If there is not enough registered memory available on the system for Open MPI to function properly, Open MPI will issue a warning.  If this MCA parameter is set to true, then Open MPI will also abort all MPI jobs "
262                   "(0 = warn, but do not abort; any other value = warn and abort)",
263                   false, &mca_btl_openib_component.abort_not_enough_reg_mem));
264 
265     CHECK(reg_uint("poll_cq_batch", NULL,
266                    "Retrieve up to poll_cq_batch completions from CQ",
267                    MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT, &mca_btl_openib_component.cq_poll_batch,
268                    REGINT_GE_ONE));
269 
270     asprintf(&str, "%s/mca-btl-openib-device-params.ini",
271              opal_install_dirs.opaldatadir);
272     if (NULL == str) {
273         return OPAL_ERR_OUT_OF_RESOURCE;
274     }
275     CHECK(reg_string("device_param_files", "hca_param_files",
276                      "Colon-delimited list of INI-style files that contain device vendor/part-specific parameters (use semicolon for Windows)",
277                      str, &mca_btl_openib_component.device_params_file_names,
278                      0));
279     free(str);
280 
281     (void)mca_base_var_enum_create("btl_openib_device_types", device_type_values, &new_enum);
282     mca_btl_openib_component.device_type = BTL_OPENIB_DT_ALL;
283     tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
284                                           "device_type", "Specify to only use IB or iWARP "
285                                           "network adapters (infiniband = only use InfiniBand "
286                                           "HCAs; iwarp = only use iWARP NICs; all = use any "
287                                           "available adapters)", MCA_BASE_VAR_TYPE_INT, new_enum,
288                                           0, 0, OPAL_INFO_LVL_9,
289                                           MCA_BASE_VAR_SCOPE_READONLY,
290                                           &mca_btl_openib_component.device_type);
291     if (0 > tmp) ret = tmp;
292     OBJ_RELEASE(new_enum);
293 
294     /*
295      * Provide way for using to override policy of ignoring IB HCAs
296      */
297 
298     mca_btl_openib_component.allow_ib = false;
299     tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
300                                           "allow_ib",
301                                           "Override policy since Open MPI 4.0 of ignoring IB HCAs for openib BTL",
302                                           MCA_BASE_VAR_TYPE_BOOL, NULL,
303                                           0, 0, OPAL_INFO_LVL_5,
304                                           MCA_BASE_VAR_SCOPE_READONLY,
305                                           &mca_btl_openib_component.allow_ib);
306 
307     CHECK(reg_int("max_btls", NULL,
308                   "Maximum number of device ports to use "
309                   "(-1 = use all available, otherwise must be >= 1)",
310                   -1, &mca_btl_openib_component.ib_max_btls,
311                   REGINT_NEG_ONE_OK | REGINT_GE_ONE));
312     CHECK(reg_int("free_list_num", NULL,
313                   "Initial size of free lists "
314                   "(must be >= 1)",
315                   8, &mca_btl_openib_component.ib_free_list_num,
316                   REGINT_GE_ONE));
317     CHECK(reg_int("free_list_max", NULL,
318                   "Maximum size of free lists "
319                   "(-1 = infinite, otherwise must be >= 0)",
320                   -1, &mca_btl_openib_component.ib_free_list_max,
321                   REGINT_NEG_ONE_OK | REGINT_GE_ONE));
322     CHECK(reg_int("free_list_inc", NULL,
323                   "Increment size of free lists "
324                   "(must be >= 1)",
325                   32, &mca_btl_openib_component.ib_free_list_inc,
326                   REGINT_GE_ONE));
327     CHECK(reg_string("mpool_hints", NULL, "hints for selecting a memory pool (default: none)",
328                      NULL, &mca_btl_openib_component.ib_mpool_hints,
329                      0));
330     CHECK(reg_string("rcache", NULL,
331                      "Name of the registration cache to be used (it is unlikely that you will ever want to change this)",
332                      "grdma", &mca_btl_openib_component.ib_rcache_name,
333                      0));
334     CHECK(reg_int("reg_mru_len", NULL,
335                   "Length of the registration cache most recently used list "
336                   "(must be >= 1)",
337                   16, (int*) &mca_btl_openib_component.reg_mru_len,
338                   REGINT_GE_ONE));
339 
340     CHECK(reg_int("cq_size", "ib_cq_size",
341                   "Minimum size of the OpenFabrics completion queue "
342                   "(CQs are automatically sized based on the number "
343                   "of peer MPI processes; this value determines the "
344                   "*minimum* size of all CQs)",
345                   8192, &btl_openib_cq_size, REGINT_GE_ONE));
346     mca_btl_openib_component.ib_cq_size[BTL_OPENIB_LP_CQ] =
347         mca_btl_openib_component.ib_cq_size[BTL_OPENIB_HP_CQ] = (uint32_t) btl_openib_cq_size;
348 
349     CHECK(reg_int("max_inline_data", "ib_max_inline_data",
350                   "Maximum size of inline data segment "
351                   "(-1 = run-time probe to discover max value, otherwise must be >= 0). "
352                   "If not explicitly set, use max_inline_data from "
353                   "the INI file containing device-specific parameters",
354                   -1, &mca_btl_openib_component.ib_max_inline_data,
355                   REGINT_NEG_ONE_OK | REGINT_GE_ZERO));
356 
357     CHECK(reg_uint("pkey", "ib_pkey_val",
358                    "OpenFabrics partition key (pkey) value. "
359                    "Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB partition key value (0x7fff)",
360                    0, &mca_btl_openib_component.ib_pkey_val, 0));
361 
362     CHECK(reg_uint("psn", "ib_psn",
363                   "OpenFabrics packet sequence starting number "
364                   "(must be >= 0)",
365                   0, &mca_btl_openib_component.ib_psn, 0));
366 
367     CHECK(reg_uint("ib_qp_ous_rd_atom", NULL,
368                    "InfiniBand outstanding atomic reads "
369                    "(must be >= 0)",
370                    4, &mca_btl_openib_component.ib_qp_ous_rd_atom, 0));
371 
372     asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files).  Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes",
373              IBV_MTU_256,
374              IBV_MTU_512,
375              IBV_MTU_1024,
376              IBV_MTU_2048,
377              IBV_MTU_4096);
378     if (NULL == msg) {
379         /* Don't try to recover from this */
380         return OPAL_ERR_OUT_OF_RESOURCE;
381     }
382     mca_btl_openib_component.ib_mtu = 0;
383     (void) mca_base_var_enum_create("btl_openib_mtus", ib_mtu_values, &new_enum);
384     tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
385                                           "mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum,
386                                           0, 0, OPAL_INFO_LVL_9,
387                                           MCA_BASE_VAR_SCOPE_READONLY,
388                                           &mca_btl_openib_component.ib_mtu);
389     if (0 <= tmp) {
390         (void) mca_base_var_register_synonym(tmp, "ompi", "btl", "openib", "ib_mtu",
391                                              MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
392     } else {
393         ret = tmp;
394     }
395 
396     OBJ_RELEASE(new_enum);
397     free(msg);
398 
399     CHECK(reg_uint("ib_min_rnr_timer", NULL, "InfiniBand minimum "
400                    "\"receiver not ready\" timer, in seconds "
401                    "(must be >= 0 and <= 31)",
402                    25, &mca_btl_openib_component.ib_min_rnr_timer, 0));
403 
404     CHECK(reg_uint("ib_timeout", NULL,
405                   "InfiniBand transmit timeout, plugged into formula: 4.096 microseconds * (2^btl_openib_ib_timeout) "
406                   "(must be >= 0 and <= 31)",
407                   20, &mca_btl_openib_component.ib_timeout, 0));
408 
409     CHECK(reg_uint("ib_retry_count", NULL,
410                   "InfiniBand transmit retry count "
411                   "(must be >= 0 and <= 7)",
412                   7, &mca_btl_openib_component.ib_retry_count, 0));
413 
414     CHECK(reg_uint("ib_rnr_retry", NULL,
415                    "InfiniBand \"receiver not ready\" "
416                    "retry count; applies *only* to SRQ/XRC queues.  PP queues "
417                    "use RNR retry values of 0 because Open MPI performs "
418                    "software flow control to guarantee that RNRs never occur "
419                    "(must be >= 0 and <= 7; 7 = \"infinite\")",
420                    7, &mca_btl_openib_component.ib_rnr_retry, 0));
421 
422     CHECK(reg_uint("ib_max_rdma_dst_ops", NULL, "InfiniBand maximum pending RDMA "
423                   "destination operations "
424                   "(must be >= 0)",
425                   4, &mca_btl_openib_component.ib_max_rdma_dst_ops, 0));
426 
427     CHECK(reg_uint("ib_service_level", NULL, "InfiniBand service level "
428                    "(must be >= 0 and <= 15)",
429                    0, &mca_btl_openib_component.ib_service_level, 0));
430 
431 #if (ENABLE_DYNAMIC_SL)
432     CHECK(reg_uint("ib_path_record_service_level", NULL,
433                    "Enable getting InfiniBand service level from PathRecord "
434                    "(must be >= 0, 0 = disabled, positive = try to get the "
435                    "service level from PathRecord)",
436                    0, &mca_btl_openib_component.ib_path_record_service_level, 0));
437 #endif
438 
439     CHECK(reg_int("use_eager_rdma", NULL, "Use RDMA for eager messages "
440                   "(-1 = use device default, 0 = do not use eager RDMA, "
441                   "1 = use eager RDMA)",
442                   -1, &mca_btl_openib_component.use_eager_rdma, 0));
443 
444     CHECK(reg_int("eager_rdma_threshold", NULL,
445                   "Use RDMA for short messages after this number of "
446                   "messages are received from a given peer "
447                   "(must be >= 1)",
448                   16, &mca_btl_openib_component.eager_rdma_threshold, REGINT_GE_ONE));
449 
450     CHECK(reg_int("max_eager_rdma", NULL, "Maximum number of peers allowed to use "
451                   "RDMA for short messages (RDMA is used for all long "
452                   "messages, except if explicitly disabled, such as "
453                   "with the \"dr\" pml) "
454                   "(must be >= 0)",
455                   16, &mca_btl_openib_component.max_eager_rdma, REGINT_GE_ZERO));
456 
457     CHECK(reg_int("eager_rdma_num", NULL, "Number of RDMA buffers to allocate "
458                   "for small messages "
459                   "(must be >= 1)",
460                   16, &mca_btl_openib_component.eager_rdma_num, REGINT_GE_ONE));
461     mca_btl_openib_component.eager_rdma_num++;
462 
463     CHECK(reg_uint("btls_per_lid", NULL, "Number of BTLs to create for each "
464                   "InfiniBand LID "
465                   "(must be >= 1)",
466                   1, &mca_btl_openib_component.btls_per_lid, REGINT_GE_ONE));
467 
468     CHECK(reg_uint("max_lmc", NULL, "Maximum number of LIDs to use for each device port "
469                    "(must be >= 0, where 0 = use all available)",
470                    1, &mca_btl_openib_component.max_lmc, 0));
471 
472     CHECK(reg_int("enable_apm_over_lmc", NULL, "Maximum number of alternative paths for each device port "
473                   "(must be >= -1, where 0 = disable apm, -1 = all available alternative paths )",
474                   0, &mca_btl_openib_component.apm_lmc, REGINT_NEG_ONE_OK|REGINT_GE_ZERO));
475 
476     CHECK(reg_int("enable_apm_over_ports", NULL, "Enable alternative path migration (APM) over different ports of the same device "
477                   "(must be >= 0, where 0 = disable APM over ports, 1 = enable APM over ports of the same device)",
478                   0, &mca_btl_openib_component.apm_ports, REGINT_GE_ZERO));
479 
480     CHECK(reg_bool("use_async_event_thread", NULL,
481                    "If nonzero, use the thread that will handle InfiniBand asynchronous events",
482                    true, &mca_btl_openib_component.use_async_event_thread));
483 
484     CHECK(reg_bool("enable_srq_resize", NULL,
485                    "Enable/Disable on demand SRQ resize. "
486                    "(0 = without resizing, nonzero = with resizing)", 1,
487                    &mca_btl_openib_component.enable_srq_resize));
488 
489 #if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
490     CHECK(reg_bool("rroce_enable", NULL,
491                    "Enable/Disable routing between different subnets"
492                    "(0 = disable, nonzero = enable)", false,
493                    &mca_btl_openib_component.rroce_enable));
494 #endif
495 
496     CHECK(reg_uint("buffer_alignment", NULL,
497                    "Preferred communication buffer alignment, in bytes "
498                    "(must be > 0 and power of two)",
499                    64, &mca_btl_openib_component.buffer_alignment, 0));
500 
501     CHECK(reg_bool("use_message_coalescing", NULL,
502                    "If nonzero, use message coalescing", false,
503                    &mca_btl_openib_component.use_message_coalescing));
504 
505     CHECK(reg_uint("cq_poll_ratio", NULL,
506                    "How often to poll high priority CQ versus low priority CQ",
507                    100, &mca_btl_openib_component.cq_poll_ratio, REGINT_GE_ONE));
508     CHECK(reg_uint("eager_rdma_poll_ratio", NULL,
509                    "How often to poll eager RDMA channel versus CQ",
510                    100, &mca_btl_openib_component.eager_rdma_poll_ratio, REGINT_GE_ONE));
511     CHECK(reg_uint("hp_cq_poll_per_progress", NULL,
512                   "Max number of completion events to process for each call "
513                   "of BTL progress engine",
514                   10, &mca_btl_openib_component.cq_poll_progress, REGINT_GE_ONE));
515 
516     CHECK(reg_uint("max_hw_msg_size", NULL,
517                    "Maximum size (in bytes) of a single fragment of a long message when using the RDMA protocols (must be > 0 and <= hw capabilities).",
518                    0, &mca_btl_openib_component.max_hw_msg_size, 0));
519 
520     CHECK(reg_bool("allow_max_memory_registration", NULL,
521                   "Allow maximum possible memory to register with HCA",
522                    1, &mca_btl_openib_component.allow_max_memory_registration));
523 
524     /* Help debug memory registration issues */
525     CHECK(reg_int("memory_registration_verbose", NULL,
526                   "Output some verbose memory registration information "
527                   "(0 = no output, nonzero = output)", 0,
528 		  &mca_btl_openib_component.memory_registration_verbose_level, 0));
529 
530     CHECK(reg_int("ignore_locality", NULL,
531                   "Ignore any locality information and use all devices "
532                   "(0 = use locality informaiton and use only close devices, nonzero = ignore locality information)", 0,
533                   &mca_btl_openib_component.ignore_locality, REGINT_GE_ZERO));
534 
535     /* Info only */
536     tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
537                                           "have_fork_support",
538                                           "Whether the OpenFabrics stack supports applications that invoke the \"fork()\" system call or not (0 = no, 1 = yes). "
539                                           "Note that this value does NOT indicate whether the system being run on supports \"fork()\" with OpenFabrics applications or not.",
540                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
541                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
542                                           OPAL_INFO_LVL_9,
543                                           MCA_BASE_VAR_SCOPE_CONSTANT,
544                                           &btl_openib_have_fork_support);
545 
546     mca_btl_openib_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
547 
548     mca_btl_openib_module.super.btl_eager_limit = 12 * 1024;
549     mca_btl_openib_module.super.btl_rndv_eager_limit = 12 * 1024;
550     mca_btl_openib_module.super.btl_max_send_size = 64 * 1024;
551     mca_btl_openib_module.super.btl_rdma_pipeline_send_length = 1024 * 1024;
552     mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
553     mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
554     mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
555 	MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA |
556         MCA_BTL_FLAGS_SEND;
557 #if HAVE_DECL_IBV_ATOMIC_HCA
558     mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
559     mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
560 #endif
561 
562     /* Default to bandwidth auto-detection */
563     mca_btl_openib_module.super.btl_bandwidth = 0;
564     mca_btl_openib_module.super.btl_latency = 4;
565 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
566     /* Default is enabling CUDA asynchronous send copies */
567     CHECK(reg_bool("cuda_async_send", NULL,
568                    "Enable or disable CUDA async send copies "
569                    "(true = async; false = sync)",
570                    true, &mca_btl_openib_component.cuda_async_send));
571 
572     /* Default is enabling CUDA asynchronous receive copies */
573     CHECK(reg_bool("cuda_async_recv", NULL,
574                    "Enable or disable CUDA async recv copies "
575                    "(true = async; false = sync)",
576                    false, &mca_btl_openib_component.cuda_async_recv));
577     /* Also make the max send size larger for better GPU buffer performance */
578     mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
579     /* Turn of message coalescing - not sure if it works with GPU buffers */
580     mca_btl_openib_component.use_message_coalescing = 0;
581 
582     /* Indicates if library was built with GPU Direct RDMA support.  Not changeable.  */
583     mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_GDR_SUPPORT);
584     (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_cuda_gdr",
585                                            "Whether CUDA GPU Direct RDMA support is built into library or not",
586                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
587                                            MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
588                                            OPAL_INFO_LVL_5,
589                                            MCA_BASE_VAR_SCOPE_CONSTANT,
590                                            &mca_btl_openib_component.cuda_have_gdr);
591 
592     /* Indicates if driver has GPU Direct RDMA support.  Not changeable.  */
593     if (OPAL_SUCCESS == opal_os_dirpath_access("/sys/kernel/mm/memory_peers/nv_mem/version", S_IRUSR)) {
594         mca_btl_openib_component.driver_have_gdr = 1;
595     } else {
596         mca_btl_openib_component.driver_have_gdr = 0;
597     }
598     (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_driver_gdr",
599                                            "Whether Infiniband driver has GPU Direct RDMA support",
600                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
601                                            MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
602                                            OPAL_INFO_LVL_5,
603                                            MCA_BASE_VAR_SCOPE_CONSTANT,
604                                            &mca_btl_openib_component.driver_have_gdr);
605 
606     /* Default for GPU Direct RDMA is off for now */
607     CHECK(reg_bool("want_cuda_gdr", NULL,
608                    "Enable or disable CUDA GPU Direct RDMA support "
609                    "(true = enabled; false = disabled)",
610                    false, &mca_btl_openib_component.cuda_want_gdr));
611 
612     if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {
613         opal_show_help("help-mpi-btl-openib.txt",
614                        "CUDA_no_gdr_support", true,
615                        opal_process_info.nodename);
616         return OPAL_ERROR;
617     }
618     if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.driver_have_gdr) {
619         opal_show_help("help-mpi-btl-openib.txt",
620                        "driver_no_gdr_support", true,
621                        opal_process_info.nodename);
622         return OPAL_ERROR;
623     }
624 #if OPAL_CUDA_GDR_SUPPORT
625     if (mca_btl_openib_component.cuda_want_gdr) {
626         mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
627         mca_btl_openib_module.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number - indicates set it to minimum */
628         mca_btl_openib_module.super.btl_cuda_rdma_limit = 30000;  /* default switchover is 30,000 to pipeline */
629     } else {
630         mca_btl_openib_module.super.btl_cuda_eager_limit = 0; /* Turns off any of the GPU Direct RDMA code */
631         mca_btl_openib_module.super.btl_cuda_rdma_limit = 0;  /* Unused */
632     }
633 #endif /* OPAL_CUDA_GDR_SUPPORT */
634 #endif /* OPAL_CUDA_SUPPORT */
635     CHECK(mca_btl_base_param_register(
636             &mca_btl_openib_component.super.btl_version,
637             &mca_btl_openib_module.super));
638 
639     /* setup all the qp stuff */
640     /* round mid_qp_size to smallest power of two */
641     mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;
642 
643     /* mid_qp_size = MAX (mid_qp_size, 1024); ?! */
644     if(mid_qp_size <= 128) {
645         mid_qp_size = 1024;
646     }
647 
648     asprintf(&default_qps,
649             "S,128,256,192,128:S,%u,1024,1008,64:S,%u,1024,1008,64:S,%u,1024,1008,64",
650             mid_qp_size,
651             (uint32_t)mca_btl_openib_module.super.btl_eager_limit,
652             (uint32_t)mca_btl_openib_module.super.btl_max_send_size);
653     if (NULL == default_qps) {
654         /* Don't try to recover from this */
655         return OPAL_ERR_OUT_OF_RESOURCE;
656     }
657     if (NULL != mca_btl_openib_component.default_recv_qps) {
658         free(mca_btl_openib_component.default_recv_qps);
659     }
660     mca_btl_openib_component.default_recv_qps = default_qps;
661     CHECK(reg_string("receive_queues", NULL,
662                      "Colon-delimited, comma-delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
663                      default_qps, &mca_btl_openib_component.receive_queues,
664                      0
665                 ));
666 
667     CHECK(reg_string("if_include", NULL,
668                      "Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found).  Mutually exclusive with btl_openib_if_exclude.",
669                      NULL, &mca_btl_openib_component.if_include,
670                      0));
671 
672     CHECK(reg_string("if_exclude", NULL,
673                      "Comma-delimited list of device/ports to be excluded (empty value means to not exclude any ports).  Mutually exclusive with btl_openib_if_include.",
674                      NULL, &mca_btl_openib_component.if_exclude,
675                      0));
676 
677     CHECK(reg_string("ipaddr_include", NULL,
678                      "Comma-delimited list of IP Addresses to be used (e.g. \"192.168.1.0/24\").  Mutually exclusive with btl_openib_ipaddr_exclude.",
679                      NULL, &mca_btl_openib_component.ipaddr_include,
680                      0));
681 
682     CHECK(reg_string("ipaddr_exclude", NULL,
683                      "Comma-delimited list of IP Addresses to be excluded (e.g. \"192.168.1.0/24\").  Mutually exclusive with btl_openib_ipaddr_include.",
684                      NULL, &mca_btl_openib_component.ipaddr_exclude,
685                      0));
686 
687     CHECK(reg_int("gid_index", NULL,
688                   "GID index to use on verbs device ports",
689                   0, &mca_btl_openib_component.gid_index,
690                   REGINT_GE_ZERO));
691 
692     CHECK(reg_bool("allow_different_subnets", NULL,
693                    "Allow connecting processes from different IB subnets."
694                    "(0 = do not allow; 1 = allow)",
695                    false, &mca_btl_openib_component.allow_different_subnets));
696 
697     /* Register any MCA params for the connect pseudo-components */
698     if (OPAL_SUCCESS == ret) {
699         ret = opal_btl_openib_connect_base_register();
700     }
701 
702     return btl_openib_verify_mca_params();
703 }
704 
btl_openib_verify_mca_params(void)705 int btl_openib_verify_mca_params (void)
706 {
707     if (mca_btl_openib_component.cq_poll_batch > MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT) {
708         mca_btl_openib_component.cq_poll_batch = MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT;
709     }
710 
711 #if !HAVE_IBV_FORK_INIT
712     if (1 == mca_btl_openib_component.want_fork_support) {
713         opal_show_help("help-mpi-btl-openib.txt",
714                        "ibv_fork requested but not supported", true,
715                        opal_process_info.nodename);
716         return OPAL_ERR_BAD_PARAM;
717     }
718 #endif
719 
720     mca_btl_openib_component.ib_pkey_val &= MCA_BTL_IB_PKEY_MASK;
721 
722     if (mca_btl_openib_component.ib_min_rnr_timer > 31) {
723         opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
724                        true, "btl_openib_ib_min_rnr_timer > 31",
725                        "btl_openib_ib_min_rnr_timer reset to 31");
726         mca_btl_openib_component.ib_min_rnr_timer = 31;
727     }
728 
729     if (mca_btl_openib_component.ib_timeout > 31) {
730         opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
731                        true, "btl_openib_ib_timeout > 31",
732                        "btl_openib_ib_timeout reset to 31");
733         mca_btl_openib_component.ib_timeout = 31;
734     }
735 
736     if (mca_btl_openib_component.ib_retry_count > 7) {
737         opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
738                        true, "btl_openib_ib_retry_count > 7",
739                        "btl_openib_ib_retry_count reset to 7");
740         mca_btl_openib_component.ib_retry_count = 7;
741     }
742 
743     if (mca_btl_openib_component.ib_rnr_retry > 7) {
744         opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
745                        true, "btl_openib_ib_rnr_retry > 7",
746                        "btl_openib_ib_rnr_retry reset to 7");
747         mca_btl_openib_component.ib_rnr_retry = 7;
748     }
749 
750     if (mca_btl_openib_component.ib_service_level > 15) {
751         opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
752                        true, "btl_openib_ib_service_level > 15",
753                        "btl_openib_ib_service_level reset to 15");
754         mca_btl_openib_component.ib_service_level = 15;
755     }
756 
757     if(mca_btl_openib_component.buffer_alignment <= 1 ||
758        (mca_btl_openib_component.buffer_alignment & (mca_btl_openib_component.buffer_alignment - 1))) {
759         opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
760                 true, mca_btl_openib_component.buffer_alignment, opal_process_info.nodename, 64);
761         mca_btl_openib_component.buffer_alignment = 64;
762     }
763 
764 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
765     if (mca_btl_openib_component.cuda_async_send) {
766         mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
767     } else {
768         mca_btl_openib_module.super.btl_flags &= ~MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
769     }
770 
771     if (mca_btl_openib_component.cuda_async_recv) {
772         mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
773     } else {
774         mca_btl_openib_module.super.btl_flags &= ~MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
775     }
776 #if 0 /* Disable this check for now while fork support code is worked out. */
777     /* Cannot have fork support and GDR on at the same time.  If the user asks for both,
778      * then print a message and return error.  If the user does not explicitly ask for
779      * fork support, then turn it off in the presence of GDR.  */
780     if (mca_btl_openib_component.cuda_want_gdr && mca_btl_openib_component.cuda_have_gdr &&
781         mca_btl_openib_component.driver_have_gdr) {
782         if (1 == opal_common_verbs_want_fork_support) {
783               opal_show_help("help-mpi-btl-openib.txt", "no_fork_with_gdr",
784                              true, opal_process_info.nodename);
785               return OPAL_ERR_BAD_PARAM;
786         }
787     }
788 #endif /* Workaround */
789     if (0 != mca_btl_openib_module.super.btl_cuda_max_send_size) {
790         opal_show_help("help-mpi-btl-openib.txt", "do_not_set_openib_value",
791                        true, opal_process_info.nodename);
792         mca_btl_openib_module.super.btl_cuda_max_send_size = 0;
793     }
794 #endif
795 
796     return OPAL_SUCCESS;
797 }
798