/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2019.  ALL RIGHTS RESERVED.
 * Copyright (C) The University of Tennessee and The University
 *               of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "ib_md.h"
#include "ib_device.h"

#include <ucs/arch/atomic.h>
#include <ucs/profile/profile.h>
#include <ucs/sys/math.h>
#include <ucs/sys/module.h>
#include <ucs/sys/string.h>
#include <ucs/time/time.h>
#include <ucm/api/ucm.h>
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#include <sys/resource.h>
#include <float.h>


#define UCT_IB_MD_RCACHE_DEFAULT_ALIGN 16

typedef struct uct_ib_md_pci_info {
    double      bw;       /* bandwidth */
    uint16_t    payload;  /* payload used for data transfer */
    uint16_t    overhead; /* PHY + data link layer + header + *CRC* */
    uint16_t    nack;     /* number of TLPs sent before an ACK */
    uint16_t    ctrl;     /* length of control TLP */
    uint16_t    encoding; /* number of bits in symbol encoded, 8 - gen 1/2, 128 - gen 3 */
    uint16_t    decoding; /* number of bits in symbol decoded, 10 - gen 1/2, 130 - gen 3 */
    const char *name;     /* name of PCI generation */
} uct_ib_md_pci_info_t;

static UCS_CONFIG_DEFINE_ARRAY(pci_bw,
                               sizeof(ucs_config_bw_spec_t),
                               UCS_CONFIG_TYPE_BW_SPEC);

static const char *uct_ib_devx_objs[] = {
    [UCT_IB_DEVX_OBJ_RCQP]  = "rcqp",
    [UCT_IB_DEVX_OBJ_RCSRQ] = "rcsrq",
    [UCT_IB_DEVX_OBJ_DCT]   = "dct",
    [UCT_IB_DEVX_OBJ_DCSRQ] = "dcsrq",
    NULL
};

static ucs_config_field_t uct_ib_md_config_table[] = {
    {"", "", NULL,
     ucs_offsetof(uct_ib_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)},

    {"REG_METHODS", "rcache,odp,direct",
     "List of registration methods in order of preference. Supported methods are:\n"
     "  odp         - implicit on-demand paging\n"
     "  rcache      - userspace registration cache\n"
     "  direct      - direct registration\n",
     ucs_offsetof(uct_ib_md_config_t, reg_methods), UCS_CONFIG_TYPE_STRING_ARRAY},

    {"", "RCACHE_ADDR_ALIGN=" UCS_PP_MAKE_STRING(UCT_IB_MD_RCACHE_DEFAULT_ALIGN), NULL,
     ucs_offsetof(uct_ib_md_config_t, rcache),
     UCS_CONFIG_TYPE_TABLE(uct_md_config_rcache_table)},

    {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */
     ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME},

    {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
     ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.m), UCS_CONFIG_TYPE_TIME},

    {"FORK_INIT", "try",
     "Initialize a fork-safe IB library with ibv_fork_init().",
     ucs_offsetof(uct_ib_md_config_t, fork_init), UCS_CONFIG_TYPE_TERNARY},

    {"ASYNC_EVENTS", "n",
     "Enable listening for async events on the device",
     ucs_offsetof(uct_ib_md_config_t, async_events), UCS_CONFIG_TYPE_BOOL},

    {"ETH_PAUSE_ON", "y",
     "Whether or not 'Pause Frame' is enabled on an Ethernet network.\n"
     "Pause frame is a mechanism for temporarily stopping the transmission of data to\n"
     "ensure zero loss under congestion on Ethernet family computer networks.\n"
     "This parameter, if set to 'no', will disqualify IB transports that may not perform\n"
     "well on a lossy fabric when working with RoCE.",
     ucs_offsetof(uct_ib_md_config_t, ext.eth_pause), UCS_CONFIG_TYPE_BOOL},

93     {"ODP_NUMA_POLICY", "preferred",
94      "Override NUMA policy for ODP regions, to avoid extra page migrations.\n"
95      " - default: Do no change existing policy.\n"
96      " - preferred/bind:\n"
97      "     Unless the memory policy of the current thread is MPOL_BIND, set the\n"
98      "     policy of ODP regions to MPOL_PREFERRED/MPOL_BIND, respectively.\n"
99      "     If the numa node mask of the current thread is not defined, use the numa\n"
100      "     nodes which correspond to its cpu affinity mask.",
101      ucs_offsetof(uct_ib_md_config_t, ext.odp.numa_policy),
102      UCS_CONFIG_TYPE_ENUM(ucs_numa_policy_names)},

    {"ODP_PREFETCH", "n",
     "Force prefetch of memory regions created with ODP.\n",
     ucs_offsetof(uct_ib_md_config_t, ext.odp.prefetch), UCS_CONFIG_TYPE_BOOL},

    {"ODP_MAX_SIZE", "auto",
     "Maximal memory region size to enable ODP for. 0 - disable.\n",
     ucs_offsetof(uct_ib_md_config_t, ext.odp.max_size), UCS_CONFIG_TYPE_MEMUNITS},

112     {"DEVICE_SPECS", "",
113      "Array of custom device specification. Each element is a string of the following format:\n"
114      "  <vendor-id>:<device-id>[:name[:<flags>[:<priority>]]]\n"
115      "where:\n"
116      "  <vendor-id> - (mandatory) pci vendor id, integer or hexadecimal.\n"
117      "  <device-id> - (mandatory) pci device id, integer or hexadecimal.\n"
118      "  <name>      - (optional) device name.\n"
119      "  <flags>     - (optional) empty, or a combination of:\n"
120      "                             '4' - mlx4 device\n"
121      "                             '5' - mlx5 device\n"
122      "                             'd' - DC version 1 (Connect-IB, ConnectX-4)\n"
123      "                             'D' - DC version 2 (ConnectX-5 and above)\n"
124      "                             'a' - Compact address vector support\n"
125      "  <priority>  - (optional) device priority, integer.\n"
126      "\n"
127      "Example: The value '0x02c9:4115:ConnectX4:5d' would specify a device named ConnectX-4\n"
128      "to match vendor id 0x2c9, device id 4115, with DC version 1 support.",
129      ucs_offsetof(uct_ib_md_config_t, custom_devices), UCS_CONFIG_TYPE_STRING_ARRAY},

    {"PREFER_NEAREST_DEVICE", "y",
     "Prefer nearest device to cpu when selecting a device from NET_DEVICES list.\n",
     ucs_offsetof(uct_ib_md_config_t, ext.prefer_nearest_device), UCS_CONFIG_TYPE_BOOL},

    {"INDIRECT_ATOMIC", "y",
     "Use indirect atomic\n",
     ucs_offsetof(uct_ib_md_config_t, ext.enable_indirect_atomic), UCS_CONFIG_TYPE_BOOL},

    {"GID_INDEX", "auto",
     "Port GID index to use.",
     ucs_offsetof(uct_ib_md_config_t, ext.gid_index), UCS_CONFIG_TYPE_ULUNITS},

143     {"SUBNET_PREFIX", "",
144      "Infiniband subnet prefix to filter ports by, empty means no filter. "
145      "Relevant for IB link layer only\n"
146      "For example a filter for the default subnet prefix can be specified as: fe80:0:0:0",
147      ucs_offsetof(uct_ib_md_config_t, subnet_prefix), UCS_CONFIG_TYPE_STRING},

    {"GPU_DIRECT_RDMA", "try",
     "Use GPU Direct RDMA for HCA to access GPU pages directly\n",
     ucs_offsetof(uct_ib_md_config_t, ext.enable_gpudirect_rdma), UCS_CONFIG_TYPE_TERNARY},

#ifdef HAVE_EXP_UMR
    {"MAX_INLINE_KLM_LIST", "inf",
     "When posting a UMR, KLM lists shorter or equal to this value will be posted as inline.\n"
     "The actual maximal length is also limited by device capabilities.",
     ucs_offsetof(uct_ib_md_config_t, ext.max_inline_klm_list), UCS_CONFIG_TYPE_UINT},
#endif

    {"PCI_BW", "",
     "Maximum effective data transfer rate of PCI bus connected to HCA\n",
     ucs_offsetof(uct_ib_md_config_t, pci_bw), UCS_CONFIG_TYPE_ARRAY(pci_bw)},

    {"MLX5_DEVX", "try",
     "DEVX support\n",
     ucs_offsetof(uct_ib_md_config_t, devx), UCS_CONFIG_TYPE_TERNARY},

    {"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq",
     "Objects to be created by DevX\n",
     ucs_offsetof(uct_ib_md_config_t, devx_objs),
     UCS_CONFIG_TYPE_BITMAP(uct_ib_devx_objs)},

173     {"REG_MT_THRESH", "4G",
174      "Minimal MR size to be register using multiple parallel threads.\n"
175      "Number of threads used will be determined by number of CPUs which "
176      "registering thread is bound to by hard affinity.",
177      ucs_offsetof(uct_ib_md_config_t, ext.min_mt_reg), UCS_CONFIG_TYPE_MEMUNITS},

    {"REG_MT_CHUNK", "2G",
     "Size of single chunk used in multithreaded registration.\n"
     "Must be power of 2.",
     ucs_offsetof(uct_ib_md_config_t, ext.mt_reg_chunk), UCS_CONFIG_TYPE_MEMUNITS},

    {"REG_MT_BIND", "n",
     "Enable setting CPU affinity of memory registration threads.",
     ucs_offsetof(uct_ib_md_config_t, ext.mt_reg_bind), UCS_CONFIG_TYPE_BOOL},

    {"PCI_RELAXED_ORDERING", "auto",
     "Enable relaxed ordering for PCIe transactions to improve performance on some systems.",
     ucs_offsetof(uct_ib_md_config_t, mr_relaxed_order), UCS_CONFIG_TYPE_ON_OFF_AUTO},

    {NULL}
};
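
/*
 * Note (illustrative): each entry in the table above is read from the
 * environment by the UCS config machinery. Assuming the standard "UCX_"
 * prefix and the "IB_" prefix under which this table is registered, a
 * hypothetical override would look like:
 *
 *   UCX_IB_REG_METHODS=rcache,direct ./app
 *   UCX_IB_REG_MT_THRESH=1G UCX_IB_REG_MT_CHUNK=256M ./app
 *
 * The exact prefix is set where the component registers this table.
 */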

#ifdef ENABLE_STATS
static ucs_stats_class_t uct_ib_md_stats_class = {
    .name           = "",
    .num_counters   = UCT_IB_MD_STAT_LAST,
    .counter_names  = {
        [UCT_IB_MD_STAT_MEM_ALLOC]   = "mem_alloc",
        [UCT_IB_MD_STAT_MEM_REG]     = "mem_reg"
    }
};
#endif

static const uct_ib_md_pci_info_t uct_ib_md_pci_info[] = {
    { /* GEN 1 */
        .bw       = 2.5 * UCS_GBYTE / 8,
        .payload  = 512,
        .overhead = 28,
        .nack     = 5,
        .ctrl     = 256,
        .encoding = 8,
        .decoding = 10,
        .name     = "gen1"
    },
    { /* GEN 2 */
        .bw       = 5.0 * UCS_GBYTE / 8,
        .payload  = 512,
        .overhead = 28,
        .nack     = 5,
        .ctrl     = 256,
        .encoding = 8,
        .decoding = 10,
        .name     = "gen2"
    },
    { /* GEN 3 */
        .bw       = 8.0 * UCS_GBYTE / 8,
        .payload  = 512,
        .overhead = 30,
        .nack     = 5,
        .ctrl     = 256,
        .encoding = 128,
        .decoding = 130,
        .name     = "gen3"
    },
};

UCS_LIST_HEAD(uct_ib_md_ops_list);

typedef struct uct_ib_verbs_mem {
    uct_ib_mem_t        super;
    uct_ib_mr_t         mrs[];
} uct_ib_verbs_mem_t;

typedef struct {
    pthread_t     thread;
    void          *addr;
    size_t        len;
    size_t        chunk;
    uint64_t      access;
    struct ibv_pd *pd;
    struct ibv_mr **mr;
} uct_ib_md_mem_reg_thread_t;

static void uct_ib_check_gpudirect_driver(uct_ib_md_t *md, uct_md_attr_t *md_attr,
                                          const char *file,
                                          ucs_memory_type_t mem_type)
{
    if (!access(file, F_OK)) {
        md_attr->cap.reg_mem_types |= UCS_BIT(mem_type);
    }

    ucs_debug("%s: %s GPUDirect RDMA is %s",
              uct_ib_device_name(&md->dev), ucs_memory_type_names[mem_type],
              md_attr->cap.reg_mem_types & UCS_BIT(mem_type) ?
              "enabled" : "disabled");
}

static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    md_attr->cap.max_alloc = ULONG_MAX; /* TODO query device */
    md_attr->cap.max_reg   = ULONG_MAX; /* TODO query device */
    md_attr->cap.flags     = UCT_MD_FLAG_REG       |
                             UCT_MD_FLAG_NEED_MEMH |
                             UCT_MD_FLAG_NEED_RKEY |
                             UCT_MD_FLAG_ADVISE;
    md_attr->cap.reg_mem_types    = UCS_MEMORY_TYPES_CPU_ACCESSIBLE;
    md_attr->cap.access_mem_type  = UCS_MEMORY_TYPE_HOST;
    md_attr->cap.detect_mem_types = 0;

    if (md->config.enable_gpudirect_rdma != UCS_NO) {
        /* check if GDR driver is loaded */
        uct_ib_check_gpudirect_driver(md, md_attr,
                                      "/sys/kernel/mm/memory_peers/nv_mem/version",
                                      UCS_MEMORY_TYPE_CUDA);

        /* check if ROCM KFD driver is loaded */
        uct_ib_check_gpudirect_driver(md, md_attr, "/dev/kfd",
                                      UCS_MEMORY_TYPE_ROCM);

        if (!(md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) &&
            (md->config.enable_gpudirect_rdma == UCS_YES)) {
                ucs_error("%s: Couldn't enable GPUDirect RDMA. Please make sure"
                          " the nv_peer_mem or amdgpu plugin is installed correctly.",
                          uct_ib_device_name(&md->dev));
                return UCS_ERR_UNSUPPORTED;
        }
    }

    md_attr->rkey_packed_size = UCT_IB_MD_PACKED_RKEY_SIZE;
    md_attr->reg_cost         = md->reg_cost;
    ucs_sys_cpuset_copy(&md_attr->local_cpus, &md->dev.local_cpus);

    return UCS_OK;
}

static void uct_ib_md_print_mem_reg_err_msg(ucs_log_level_t level, void *address,
                                            size_t length, uint64_t access_flags)
{
    char msg[200] = {0};
    struct rlimit limit_info;

    ucs_snprintf_zero(msg, sizeof(msg),
                      "%s(address=%p, length=%zu, access=0x%lx) failed: %m",
                      ibv_reg_mr_func_name, address, length, access_flags);

    /* Check the value of the max locked memory which is set on the system
     * (ulimit -l) */
    if (!getrlimit(RLIMIT_MEMLOCK, &limit_info) &&
        (limit_info.rlim_cur != RLIM_INFINITY)) {
        ucs_snprintf_zero(msg + strlen(msg), sizeof(msg) - strlen(msg),
                          ". Please set max locked memory (ulimit -l) to 'unlimited' "
                          "(current: %llu kbytes)", limit_info.rlim_cur / UCS_KBYTE);
    }

    ucs_log(level, "%s", msg);
}

void *uct_ib_md_mem_handle_thread_func(void *arg)
{
    uct_ib_md_mem_reg_thread_t *ctx = arg;
    ucs_status_t status;
    int mr_idx = 0;
    size_t size = 0;
    ucs_time_t UCS_V_UNUSED t0 = ucs_get_time();

    while (ctx->len) {
        size = ucs_min(ctx->len, ctx->chunk);
        if (ctx->access != UCT_IB_MEM_DEREG) {
            ctx->mr[mr_idx] = UCS_PROFILE_NAMED_CALL(ibv_reg_mr_func_name,
                                                     ibv_reg_mr, ctx->pd,
                                                     ctx->addr, size,
                                                     ctx->access);
            if (ctx->mr[mr_idx] == NULL) {
                return UCS_STATUS_PTR(UCS_ERR_IO_ERROR);
            }
        } else {
            status = uct_ib_dereg_mr(ctx->mr[mr_idx]);
            if (status != UCS_OK) {
                return UCS_STATUS_PTR(status);
            }
        }
        ctx->addr  = UCS_PTR_BYTE_OFFSET(ctx->addr, size);
        ctx->len  -= size;
        mr_idx++;
    }

    ucs_trace("%s %p..%p took %f usec\n",
              (ctx->access == UCT_IB_MEM_DEREG) ? "dereg_mr" : "reg_mr",
              ctx->mr[0]->addr,
              UCS_PTR_BYTE_OFFSET(ctx->mr[mr_idx-1]->addr, size),
              ucs_time_to_usec(ucs_get_time() - t0));

    return UCS_STATUS_PTR(UCS_OK);
}

ucs_status_t
uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address,
                                       size_t length, uint64_t access_flags,
                                       size_t chunk, struct ibv_mr **mrs)
{
    int thread_num_mrs, thread_num, thread_idx, mr_idx = 0, cpu_id = 0;
    int mr_num = ucs_div_round_up(length, chunk);
    ucs_status_t status;
    void *thread_status;
    ucs_sys_cpuset_t parent_set, thread_set;
    uct_ib_md_mem_reg_thread_t *ctxs, *cur_ctx;
    pthread_attr_t attr;
    char UCS_V_UNUSED affinity_str[64];
    int ret;

    ret = pthread_getaffinity_np(pthread_self(), sizeof(ucs_sys_cpuset_t),
                                 &parent_set);
    if (ret != 0) {
        ucs_error("pthread_getaffinity_np() failed: %m");
        return UCS_ERR_INVALID_PARAM;
    }

    thread_num = ucs_min(CPU_COUNT(&parent_set), mr_num);

    ucs_trace("multithreaded handle %p..%p access %lx threads %d affinity %s\n",
              address, UCS_PTR_BYTE_OFFSET(address, length), access_flags, thread_num,
              ucs_make_affinity_str(&parent_set, affinity_str, sizeof(affinity_str)));

    if (thread_num == 1) {
        return UCS_ERR_UNSUPPORTED;
    }

    ctxs = ucs_calloc(thread_num, sizeof(*ctxs), "ib mr ctxs");
    if (ctxs == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    pthread_attr_init(&attr);

    status = UCS_OK;
    for (thread_idx = 0; thread_idx < thread_num; thread_idx++) {
        /* calculate number of mrs for each thread so each one will
         * get proportional amount */
        thread_num_mrs  = ucs_div_round_up(mr_num - mr_idx, thread_num - thread_idx);
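        /*
         * Worked example (illustrative): with mr_num = 7 chunks and
         * thread_num = 3 threads, the division above assigns
         * ceil(7/3) = 3 MRs to thread 0, ceil(4/2) = 2 to thread 1 and
         * ceil(2/1) = 2 to thread 2, so the split stays proportional
         * regardless of remainders.
         */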

        cur_ctx         = &ctxs[thread_idx];
        cur_ctx->pd     = md->pd;
        cur_ctx->addr   = UCS_PTR_BYTE_OFFSET(address, mr_idx * chunk);
        cur_ctx->len    = ucs_min(thread_num_mrs * chunk, length - (mr_idx * chunk));
        cur_ctx->access = access_flags;
        cur_ctx->mr     = &mrs[mr_idx];
        cur_ctx->chunk  = chunk;

        if (md->config.mt_reg_bind) {
            while (!CPU_ISSET(cpu_id, &parent_set)) {
                cpu_id++;
            }

            CPU_ZERO(&thread_set);
            CPU_SET(cpu_id, &thread_set);
            cpu_id++;
            pthread_attr_setaffinity_np(&attr, sizeof(ucs_sys_cpuset_t), &thread_set);
        }

        ret = pthread_create(&cur_ctx->thread, &attr,
                             uct_ib_md_mem_handle_thread_func, cur_ctx);
        if (ret) {
            ucs_error("pthread_create() failed: %m");
            status     = UCS_ERR_IO_ERROR;
            thread_num = thread_idx;
            break;
        }

        mr_idx += thread_num_mrs;
    }

    for (thread_idx = 0; thread_idx < thread_num; thread_idx++) {
        cur_ctx = &ctxs[thread_idx];
        pthread_join(cur_ctx->thread, &thread_status);
        if (UCS_PTR_IS_ERR(thread_status)) {
            status = UCS_PTR_STATUS(thread_status);
        }
    }

    ucs_free(ctxs);
    pthread_attr_destroy(&attr);

    if (status != UCS_OK) {
        for (mr_idx = 0; mr_idx < mr_num; mr_idx++) {
            /* coverity[check_return] */
            uct_ib_dereg_mr(mrs[mr_idx]);
        }
    }

    return status;
}

static ucs_status_t uct_ib_md_reg_mr(uct_ib_md_t *md, void *address,
                                     size_t length, uint64_t access_flags,
                                     int silent, uct_ib_mem_t *memh,
                                     uct_ib_mr_type_t mr_type)
{
    ucs_log_level_t level = silent ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR;
    ucs_status_t status;

    if (length >= md->config.min_mt_reg) {
        UCS_PROFILE_CODE("reg ksm") {
            status = md->ops->reg_multithreaded(md, address, length,
                                                access_flags, memh, mr_type);
        }

        if (status != UCS_ERR_UNSUPPORTED) {
            if (status == UCS_OK) {
                memh->flags |= UCT_IB_MEM_MULTITHREADED;
            } else {
                uct_ib_md_print_mem_reg_err_msg(level, address, length,
                                                access_flags);
            }

            return status;
        } /* if unsupported - fall back to regular registration */
    }

    status = md->ops->reg_key(md, address, length, access_flags, memh, mr_type);
    if (status != UCS_OK) {
        uct_ib_md_print_mem_reg_err_msg(level, address, length, access_flags);
        return status;
    }

    return UCS_OK;
}

ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
                           uint64_t access_flags, struct ibv_mr **mr_p)
{
    struct ibv_mr *mr;
#if HAVE_DECL_IBV_EXP_REG_MR
    struct ibv_exp_reg_mr_in in = {};

    in.pd         = pd;
    in.addr       = addr;
    in.length     = length;
    in.exp_access = access_flags;
    mr = UCS_PROFILE_CALL(ibv_exp_reg_mr, &in);
#else
    mr = UCS_PROFILE_CALL(ibv_reg_mr, pd, addr, length, access_flags);
#endif
    if (mr == NULL) {
        return UCS_ERR_IO_ERROR;
    }

    *mr_p = mr;
    return UCS_OK;
}

ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr)
{
    int ret;

    if (mr == NULL) {
        return UCS_OK;
    }

    ret = UCS_PROFILE_CALL(ibv_dereg_mr, mr);
    if (ret != 0) {
        ucs_error("ibv_dereg_mr() failed: %m");
        return UCS_ERR_IO_ERROR;
    }

    return UCS_OK;
}

ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num)
{
    ucs_status_t s, status = UCS_OK;
    int i;

    for (i = 0; i < mr_num; i++) {
        s = uct_ib_dereg_mr(mrs[i]);
        if (s != UCS_OK) {
            status = s;
        }
    }

    return status;
}

static ucs_status_t uct_ib_memh_dereg_key(uct_ib_md_t *md, uct_ib_mem_t *memh,
                                          uct_ib_mr_type_t mr_type)
{
    if (memh->flags & UCT_IB_MEM_MULTITHREADED) {
        return md->ops->dereg_multithreaded(md, memh, mr_type);
    } else {
        return md->ops->dereg_key(md, memh, mr_type);
    }
}

static ucs_status_t uct_ib_memh_dereg(uct_ib_md_t *md, uct_ib_mem_t *memh)
{
    ucs_status_t s, status = UCS_OK;

    if (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) {
        s = md->ops->dereg_atomic_key(md, memh);
        memh->flags &= ~UCT_IB_MEM_FLAG_ATOMIC_MR;
        if (s != UCS_OK) {
            status = s;
        }
    }

    if (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING) {
        s = uct_ib_memh_dereg_key(md, memh, UCT_IB_MR_STRICT_ORDER);
        memh->flags &= ~UCT_IB_MEM_FLAG_RELAXED_ORDERING;
        if (s != UCS_OK) {
            status = s;
        }
    }

    s = uct_ib_memh_dereg_key(md, memh, UCT_IB_MR_DEFAULT);
    if (s != UCS_OK) {
        status = s;
    }

    return status;
}

static void uct_ib_memh_free(uct_ib_mem_t *memh)
{
    ucs_free(memh);
}

static uct_ib_mem_t *uct_ib_memh_alloc(uct_ib_md_t *md)
{
    return ucs_calloc(1, md->memh_struct_size, "ib_memh");
}

static uint64_t uct_ib_md_access_flags(uct_ib_md_t *md, unsigned flags,
                                       size_t length)
{
    uint64_t access_flags = UCT_IB_MEM_ACCESS_FLAGS;

    if ((flags & UCT_MD_MEM_FLAG_NONBLOCK) && (length > 0) &&
        (length <= md->config.odp.max_size)) {
        access_flags |= IBV_ACCESS_ON_DEMAND;
    }

    if (md->relaxed_order) {
        access_flags |= IBV_ACCESS_RELAXED_ORDERING;
    }

    return access_flags;
}

#if HAVE_NUMA
static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, void *address,
                                               size_t length, uct_ib_mem_t *memh)
{
    int ret, old_policy, new_policy;
    struct bitmask *nodemask;
    uintptr_t start, end;
    ucs_status_t status;

    if (!(memh->flags & UCT_IB_MEM_FLAG_ODP) ||
        (md->config.odp.numa_policy == UCS_NUMA_POLICY_DEFAULT) ||
        (numa_available() < 0))
    {
        status = UCS_OK;
        goto out;
    }

    nodemask = numa_allocate_nodemask();
    if (nodemask == NULL) {
        ucs_warn("Failed to allocate numa node mask");
        status = UCS_ERR_NO_MEMORY;
        goto out;
    }

    ret = get_mempolicy(&old_policy, numa_nodemask_p(nodemask),
                        numa_nodemask_size(nodemask), NULL, 0);
    if (ret < 0) {
        ucs_warn("get_mempolicy(maxnode=%zu) failed: %m",
                 numa_nodemask_size(nodemask));
        status = UCS_ERR_INVALID_PARAM;
        goto out_free;
    }

    switch (old_policy) {
    case MPOL_DEFAULT:
        /* if no policy is defined, use the numa node of the current cpu */
        numa_get_thread_node_mask(&nodemask);
        break;
    case MPOL_BIND:
        /* if the current policy is BIND, keep it as-is */
        status = UCS_OK;
        goto out_free;
    default:
        break;
    }

    switch (md->config.odp.numa_policy) {
    case UCS_NUMA_POLICY_BIND:
        new_policy = MPOL_BIND;
        break;
    case UCS_NUMA_POLICY_PREFERRED:
        new_policy = MPOL_PREFERRED;
        break;
    default:
        ucs_error("unexpected numa policy %d", md->config.odp.numa_policy);
        status = UCS_ERR_INVALID_PARAM;
        goto out_free;
    }

    if (new_policy != old_policy) {
        start = ucs_align_down_pow2((uintptr_t)address, ucs_get_page_size());
        end   = ucs_align_up_pow2((uintptr_t)address + length,
                                  ucs_get_page_size());
        ucs_trace("0x%lx..0x%lx: changing numa policy from %d to %d, "
                  "nodemask[0]=0x%lx", start, end, old_policy, new_policy,
                  numa_nodemask_p(nodemask)[0]);

        ret = UCS_PROFILE_CALL(mbind, (void*)start, end - start, new_policy,
                               numa_nodemask_p(nodemask),
                               numa_nodemask_size(nodemask), 0);
        if (ret < 0) {
            ucs_warn("mbind(addr=0x%lx length=%ld policy=%d) failed: %m",
                     start, end - start, new_policy);
            status = UCS_ERR_IO_ERROR;
            goto out_free;
        }
    }

    status = UCS_OK;

out_free:
    numa_free_nodemask(nodemask);
out:
    return status;
}
#else
static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, void *address,
                                               size_t length, uct_ib_mem_t *memh)
{
    return UCS_OK;
}
#endif /* HAVE_NUMA */

static void uct_ib_mem_init(uct_ib_mem_t *memh, unsigned uct_flags,
                            uint64_t access_flags)
{
    memh->flags = 0;

    /* coverity[dead_error_condition] */
    if (access_flags & IBV_ACCESS_ON_DEMAND) {
        memh->flags |= UCT_IB_MEM_FLAG_ODP;
    }

    if (uct_flags & UCT_MD_MEM_ACCESS_REMOTE_ATOMIC) {
        memh->flags |= UCT_IB_MEM_ACCESS_REMOTE_ATOMIC;
    }
}

static ucs_status_t uct_ib_mem_reg_internal(uct_md_h uct_md, void *address,
                                            size_t length, unsigned flags,
                                            int silent, uct_ib_mem_t *memh)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    ucs_status_t status;
    uint64_t access_flags;

    access_flags = uct_ib_md_access_flags(md, flags, length);
    uct_ib_mem_init(memh, flags, access_flags);
    status = uct_ib_md_reg_mr(md, address, length, access_flags, silent, memh,
                              UCT_IB_MR_DEFAULT);
    if (status != UCS_OK) {
        return status;
    }

    if (md->relaxed_order) {
        status = uct_ib_md_reg_mr(md, address, length,
                                  access_flags & ~IBV_ACCESS_RELAXED_ORDERING,
                                  silent, memh, UCT_IB_MR_STRICT_ORDER);
        if (status != UCS_OK) {
            goto err;
        }

        memh->flags |= UCT_IB_MEM_FLAG_RELAXED_ORDERING;
    }

    ucs_debug("registered memory %p..%p on %s lkey 0x%x rkey 0x%x "
              "access 0x%lx flags 0x%x", address,
              UCS_PTR_BYTE_OFFSET(address, length),
              uct_ib_device_name(&md->dev), memh->lkey, memh->rkey,
              access_flags, flags);

    uct_ib_mem_set_numa_policy(md, address, length, memh);

    if (md->config.odp.prefetch) {
        md->ops->mem_prefetch(md, memh, address, length);
    }

    UCS_STATS_UPDATE_COUNTER(md->stats, UCT_IB_MD_STAT_MEM_REG, +1);
    return UCS_OK;

err:
    uct_ib_memh_dereg(md, memh);
    return status;
}

static ucs_status_t uct_ib_mem_reg(uct_md_h uct_md, void *address, size_t length,
                                   unsigned flags, uct_mem_h *memh_p)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    ucs_status_t status;
    uct_ib_mem_t *memh;

    memh = uct_ib_memh_alloc(md);
    if (memh == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    status = uct_ib_mem_reg_internal(uct_md, address, length, flags, 0, memh);
    if (status != UCS_OK) {
        uct_ib_memh_free(memh);
        return status;
    }
    *memh_p = memh;

    return UCS_OK;
}

static ucs_status_t uct_ib_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_mem_t *ib_memh = memh;
    ucs_status_t status;

    status = uct_ib_memh_dereg(md, ib_memh);
    uct_ib_memh_free(ib_memh);
    return status;
}

static ucs_status_t uct_ib_verbs_reg_key(uct_ib_md_t *md, void *address,
                                         size_t length, uint64_t access_flags,
                                         uct_ib_mem_t *ib_memh,
                                         uct_ib_mr_type_t mr_type)
{
    uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t);

    return uct_ib_reg_key_impl(md, address, length, access_flags,
                               ib_memh, &memh->mrs[mr_type], mr_type);
}

ucs_status_t uct_ib_reg_key_impl(uct_ib_md_t *md, void *address,
                                 size_t length, uint64_t access_flags,
                                 uct_ib_mem_t *memh, uct_ib_mr_t *mr,
                                 uct_ib_mr_type_t mr_type)
{
    ucs_status_t status;

    status = uct_ib_reg_mr(md->pd, address, length, access_flags, &mr->ib);
    if (status != UCS_OK) {
        return status;
    }

    if (mr_type == UCT_IB_MR_DEFAULT) {
        uct_ib_memh_init_keys(memh, mr->ib->lkey, mr->ib->rkey);
    }

    return UCS_OK;
}

static ucs_status_t uct_ib_verbs_dereg_key(uct_ib_md_t *md,
                                           uct_ib_mem_t *ib_memh,
                                           uct_ib_mr_type_t mr_type)
{
    uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t);

    return uct_ib_dereg_mr(memh->mrs[mr_type].ib);
}

static ucs_status_t uct_ib_verbs_reg_atomic_key(uct_ib_md_t *ibmd,
                                                uct_ib_mem_t *ib_memh)
{
    uct_ib_mr_type_t mr_type = uct_ib_memh_get_atomic_base_mr_type(ib_memh);
    uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t);

    if (mr_type != UCT_IB_MR_STRICT_ORDER) {
        return UCS_ERR_UNSUPPORTED;
    }

    memh->super.atomic_rkey = memh->mrs[mr_type].ib->rkey;
    return UCS_OK;
}

static ucs_status_t
uct_ib_mem_advise(uct_md_h uct_md, uct_mem_h memh, void *addr,
                  size_t length, unsigned advice)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    ucs_debug("memh %p advice %d", memh, advice);
    if ((advice == UCT_MADV_WILLNEED) && !md->config.odp.prefetch) {
        return md->ops->mem_prefetch(md, memh, addr, length);
    }

    return UCS_OK;
}

static ucs_status_t uct_ib_mkey_pack(uct_md_h uct_md, uct_mem_h uct_memh,
                                     void *rkey_buffer)
{
    uct_ib_md_t *md         = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_mem_t *memh      = uct_memh;
    uint32_t atomic_rkey;
    ucs_status_t status;

    /* create umr only if a user requested atomic access to the
     * memory region and the hardware supports it.
     */
    if (((memh->flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC) ||
         (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING)) &&
        !(memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) &&
        (memh != md->global_odp))
    {
        /* create UMR on-demand */
        UCS_PROFILE_CODE("reg atomic key") {
            status = md->ops->reg_atomic_key(md, memh);
        }
        if (status == UCS_OK) {
            memh->flags |= UCT_IB_MEM_FLAG_ATOMIC_MR;
            ucs_trace("created atomic key 0x%x for 0x%x", memh->atomic_rkey,
                      memh->lkey);
        } else if (status != UCS_ERR_UNSUPPORTED) {
            return status;
        }
    }
    if (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) {
        atomic_rkey = memh->atomic_rkey;
    } else {
        atomic_rkey = UCT_IB_INVALID_RKEY;
    }

    uct_ib_md_pack_rkey(memh->rkey, atomic_rkey, rkey_buffer);
    return UCS_OK;
}

static ucs_status_t uct_ib_rkey_unpack(uct_component_t *component,
                                       const void *rkey_buffer, uct_rkey_t *rkey_p,
                                       void **handle_p)
{
    uint64_t packed_rkey = *(const uint64_t*)rkey_buffer;

    *rkey_p   = packed_rkey;
    *handle_p = NULL;
    ucs_trace("unpacked rkey 0x%llx: direct 0x%x indirect 0x%x",
              (unsigned long long)packed_rkey,
              uct_ib_md_direct_rkey(*rkey_p), uct_ib_md_indirect_rkey(*rkey_p));
    return UCS_OK;
}
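
/*
 * Layout sketch (assuming uct_ib_md_pack_rkey() in ib_md.h packs both keys
 * into a single 64-bit word): the direct rkey occupies the low 32 bits and
 * the indirect (atomic) rkey the high 32 bits. For example, packing
 * rkey=0x1234 with atomic_rkey=0xabcd would yield 0x0000abcd00001234, and
 * uct_ib_md_direct_rkey()/uct_ib_md_indirect_rkey() recover each half.
 */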

static uct_md_ops_t uct_ib_md_ops = {
    .close              = uct_ib_md_close,
    .query              = uct_ib_md_query,
    .mem_reg            = uct_ib_mem_reg,
    .mem_dereg          = uct_ib_mem_dereg,
    .mem_advise         = uct_ib_mem_advise,
    .mkey_pack          = uct_ib_mkey_pack,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};

static inline uct_ib_rcache_region_t* uct_ib_rcache_region_from_memh(uct_mem_h memh)
{
    return ucs_container_of(memh, uct_ib_rcache_region_t, memh);
}

static ucs_status_t uct_ib_mem_rcache_reg(uct_md_h uct_md, void *address,
                                          size_t length, unsigned flags,
                                          uct_mem_h *memh_p)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    ucs_rcache_region_t *rregion;
    ucs_status_t status;
    uct_ib_mem_t *memh;

    status = ucs_rcache_get(md->rcache, address, length, PROT_READ|PROT_WRITE,
                            &flags, &rregion);
    if (status != UCS_OK) {
        return status;
    }

    ucs_assert(rregion->refcount > 0);
    memh = &ucs_derived_of(rregion, uct_ib_rcache_region_t)->memh;
    /* The original region was registered without atomic access
     * so update the access flags. Actual umr creation will happen
     * when uct_ib_mkey_pack() is called.
     */
    if (flags & UCT_MD_MEM_ACCESS_REMOTE_ATOMIC) {
        memh->flags |= UCT_IB_MEM_ACCESS_REMOTE_ATOMIC;
    }
    *memh_p = memh;
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_rcache_region_t *region = uct_ib_rcache_region_from_memh(memh);

    ucs_rcache_region_put(md->rcache, &region->super);
    return UCS_OK;
}

static uct_md_ops_t uct_ib_md_rcache_ops = {
    .close              = uct_ib_md_close,
    .query              = uct_ib_md_query,
    .mem_reg            = uct_ib_mem_rcache_reg,
    .mem_dereg          = uct_ib_mem_rcache_dereg,
    .mem_advise         = uct_ib_mem_advise,
    .mkey_pack          = uct_ib_mkey_pack,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};

static ucs_status_t uct_ib_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache,
                                             void *arg, ucs_rcache_region_t *rregion,
                                             uint16_t rcache_mem_reg_flags)
{
    uct_ib_rcache_region_t *region = ucs_derived_of(rregion, uct_ib_rcache_region_t);
    uct_ib_md_t *md = context;
    int *flags      = arg;
    int silent      = (rcache_mem_reg_flags & UCS_RCACHE_MEM_REG_HIDE_ERRORS) ||
                      (*flags & UCT_MD_MEM_FLAG_HIDE_ERRORS);
    ucs_status_t status;

    status = uct_ib_mem_reg_internal(&md->super, (void*)region->super.super.start,
                                     region->super.super.end - region->super.super.start,
                                     *flags, silent, &region->memh);
    if (status != UCS_OK) {
        return status;
    }

    return UCS_OK;
}

static void uct_ib_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache,
                                       ucs_rcache_region_t *rregion)
{
    uct_ib_rcache_region_t *region = ucs_derived_of(rregion, uct_ib_rcache_region_t);
    uct_ib_md_t *md = (uct_ib_md_t *)context;

    (void)uct_ib_memh_dereg(md, &region->memh);
}

static void uct_ib_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache,
                                         ucs_rcache_region_t *rregion, char *buf,
                                         size_t max)
{
    uct_ib_rcache_region_t *region = ucs_derived_of(rregion, uct_ib_rcache_region_t);
    uct_ib_mem_t *memh = &region->memh;

    snprintf(buf, max, "lkey 0x%x rkey 0x%x atomic_rkey 0x%x",
             memh->lkey, memh->rkey,
             (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) ? memh->atomic_rkey :
                             UCT_IB_INVALID_RKEY
             );
}

static ucs_rcache_ops_t uct_ib_rcache_ops = {
    .mem_reg     = uct_ib_rcache_mem_reg_cb,
    .mem_dereg   = uct_ib_rcache_mem_dereg_cb,
    .dump_region = uct_ib_rcache_dump_region_cb
};

static ucs_status_t uct_ib_md_odp_query(uct_md_h uct_md, uct_md_attr_t *md_attr)
{
    ucs_status_t status;

    status = uct_ib_md_query(uct_md, md_attr);
    if (status != UCS_OK) {
        return status;
    }

    /* ODP supports only host memory */
    md_attr->cap.reg_mem_types &= UCS_BIT(UCS_MEMORY_TYPE_HOST);
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_global_odp_reg(uct_md_h uct_md, void *address,
                                              size_t length, unsigned flags,
                                              uct_mem_h *memh_p)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_mem_t *memh = md->global_odp;

    ucs_assert(md->global_odp != NULL);
    if (flags & UCT_MD_MEM_FLAG_LOCK) {
        return uct_ib_mem_reg(uct_md, address, length, flags, memh_p);
    }

    if (md->config.odp.prefetch) {
        md->ops->mem_prefetch(md, memh, address, length);
    }

    /* cppcheck-suppress autoVariables */
    *memh_p = md->global_odp;
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_global_odp_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    if (memh == md->global_odp) {
        return UCS_OK;
    }

    return uct_ib_mem_dereg(uct_md, memh);
}

static uct_md_ops_t UCS_V_UNUSED uct_ib_md_global_odp_ops = {
    .close              = uct_ib_md_close,
    .query              = uct_ib_md_odp_query,
    .mem_reg            = uct_ib_mem_global_odp_reg,
    .mem_dereg          = uct_ib_mem_global_odp_dereg,
    .mem_advise         = uct_ib_mem_advise,
    .mkey_pack          = uct_ib_mkey_pack,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};

static ucs_status_t uct_ib_query_md_resources(uct_component_t *component,
                                              uct_md_resource_desc_t **resources_p,
                                              unsigned *num_resources_p)
{
    UCS_MODULE_FRAMEWORK_DECLARE(uct_ib);
    uct_md_resource_desc_t *resources;
    struct ibv_device **device_list;
    ucs_status_t status;
    int i, num_devices;

    UCS_MODULE_FRAMEWORK_LOAD(uct_ib, 0);

    /* Get device list from driver */
    device_list = ibv_get_device_list(&num_devices);
    if (device_list == NULL) {
        ucs_debug("Failed to get IB device list, assuming no devices are present");
        *resources_p     = NULL;
        *num_resources_p = 0;
        return UCS_OK;
    }

    resources = ucs_calloc(num_devices, sizeof(*resources), "ib resources");
    if (resources == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto out_free_device_list;
    }

    for (i = 0; i < num_devices; ++i) {
        ucs_snprintf_zero(resources[i].md_name, sizeof(resources[i].md_name),
                          "%s", ibv_get_device_name(device_list[i]));
    }

    *resources_p     = resources;
    *num_resources_p = num_devices;
    status = UCS_OK;

out_free_device_list:
    ibv_free_device_list(device_list);
    return status;
}

static void uct_ib_fork_warn()
{
    ucs_warn("IB: ibv_fork_init() was disabled or failed, yet a fork() has been issued.");
    ucs_warn("IB: data corruption might occur when using registered memory.");
}

static void uct_ib_fork_warn_enable()
{
    static volatile uint32_t enabled = 0;
    int ret;

    if (ucs_atomic_cswap32(&enabled, 0, 1) != 0) {
        return;
    }

    ret = pthread_atfork(uct_ib_fork_warn, NULL, NULL);
    if (ret) {
        ucs_warn("registering fork() warning failed: %m");
    }
}

static void uct_ib_md_release_device_config(uct_ib_md_t *md)
{
    unsigned i;

    for (i = 0; i < md->custom_devices.count; ++i) {
        free((char*)md->custom_devices.specs[i].name);
    }
    ucs_free(md->custom_devices.specs);
}

static ucs_status_t UCS_V_UNUSED
uct_ib_md_global_odp_init(uct_ib_md_t *md, uct_mem_h *memh_p)
{
    uct_ib_verbs_mem_t *global_odp;
    uct_ib_mr_t *mr;
    ucs_status_t status;

    global_odp = (uct_ib_verbs_mem_t *)uct_ib_memh_alloc(md);
    if (global_odp == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    mr = &global_odp->mrs[UCT_IB_MR_DEFAULT];
    status = uct_ib_reg_mr(md->pd, 0, UINT64_MAX,
                           UCT_IB_MEM_ACCESS_FLAGS | IBV_ACCESS_ON_DEMAND,
                           &mr->ib);
    if (status != UCS_OK) {
        ucs_debug("%s: failed to register global mr: %m",
                  uct_ib_device_name(&md->dev));
        goto err;
    }

    global_odp->super.flags = UCT_IB_MEM_FLAG_ODP;
    uct_ib_memh_init_keys(&global_odp->super, mr->ib->lkey, mr->ib->rkey);
    *memh_p = global_odp;
    return UCS_OK;

err:
    uct_ib_memh_free(&global_odp->super);
    return status;
}

static ucs_status_t
uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr,
                            const uct_ib_md_config_t *md_config)
{
    ucs_rcache_params_t rcache_params;
    ucs_status_t status;
    int i;

    for (i = 0; i < md_config->reg_methods.count; ++i) {
        if (!strcasecmp(md_config->reg_methods.rmtd[i], "rcache")) {
            rcache_params.region_struct_size = sizeof(ucs_rcache_region_t) +
                                               md->memh_struct_size;
            rcache_params.alignment          = md_config->rcache.alignment;
            rcache_params.max_alignment      = ucs_get_page_size();
            rcache_params.ucm_events         = UCM_EVENT_VM_UNMAPPED;
            if (md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) {
                rcache_params.ucm_events     |= UCM_EVENT_MEM_TYPE_FREE;
            }
            rcache_params.ucm_event_priority = md_config->rcache.event_prio;
            rcache_params.context            = md;
            rcache_params.ops                = &uct_ib_rcache_ops;
            rcache_params.flags              = 0;

            status = ucs_rcache_create(&rcache_params, uct_ib_device_name(&md->dev),
                                       UCS_STATS_RVAL(md->stats), &md->rcache);
            if (status != UCS_OK) {
                ucs_debug("%s: failed to create registration cache: %s",
                          uct_ib_device_name(&md->dev),
                          ucs_status_string(status));
                continue;
            }

            md->super.ops = &uct_ib_md_rcache_ops;
            md->reg_cost  = ucs_linear_func_make(md_config->rcache.overhead, 0);
            ucs_debug("%s: using registration cache",
                      uct_ib_device_name(&md->dev));
            return UCS_OK;
#if HAVE_ODP_IMPLICIT
        } else if (!strcasecmp(md_config->reg_methods.rmtd[i], "odp")) {
            if (!(md->dev.flags & UCT_IB_DEVICE_FLAG_ODP_IMPLICIT)) {
                ucs_debug("%s: on-demand-paging with global memory region is "
                          "not supported", uct_ib_device_name(&md->dev));
                continue;
            }

            status = uct_ib_md_global_odp_init(md, &md->global_odp);
            if (status != UCS_OK) {
                continue;
            }

            md->super.ops = &uct_ib_md_global_odp_ops;
            md->reg_cost  = ucs_linear_func_make(10e-9, 0);
            ucs_debug("%s: using odp global key", uct_ib_device_name(&md->dev));
            return UCS_OK;
#endif
        } else if (!strcmp(md_config->reg_methods.rmtd[i], "direct")) {
            md->super.ops = &uct_ib_md_ops;
            md->reg_cost  = md_config->uc_reg_cost;
            ucs_debug("%s: using direct registration",
                      uct_ib_device_name(&md->dev));
            return UCS_OK;
        }
    }

    return UCS_ERR_INVALID_PARAM;
}

static ucs_status_t
uct_ib_md_parse_device_config(uct_ib_md_t *md, const uct_ib_md_config_t *md_config)
{
    uct_ib_device_spec_t *spec;
    ucs_status_t status;
    char *flags_str, *p;
    unsigned i, count;
    int nfields;

    count = md->custom_devices.count = md_config->custom_devices.count;
    if (count == 0) {
        md->custom_devices.specs = NULL;
        md->custom_devices.count = 0;
        return UCS_OK;
    }

    md->custom_devices.specs = ucs_calloc(count, sizeof(*md->custom_devices.specs),
                                          "ib_custom_devices");
    if (md->custom_devices.specs == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    for (i = 0; i < count; ++i) {
        spec = &md->custom_devices.specs[i];
        nfields = sscanf(md_config->custom_devices.spec[i],
                         "%hi:%hi:%m[^:]:%m[^:]:%hhu",
                         &spec->pci_id.vendor, &spec->pci_id.device, &spec->name,
                         &flags_str, &spec->priority);
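        /*
         * For example (illustrative), the spec string "0x02c9:4115:ConnectX4:5d"
         * parses into pci_id.vendor = 0x2c9, pci_id.device = 4115,
         * name = "ConnectX4" and flags_str = "5d" (nfields == 4); the priority
         * field is absent, so it keeps its zero default from the calloc above.
         */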
        if (nfields < 2) {
            ucs_error("failed to parse device config '%s' (parsed: %d/%d)",
                      md_config->custom_devices.spec[i], nfields, 5);
            status = UCS_ERR_INVALID_PARAM;
            goto err_free;
        }

        if (nfields >= 4) {
            for (p = flags_str; *p != 0; ++p) {
                if (*p == '4') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_MLX4_PRM;
                } else if (*p == '5') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM;
                } else if (*p == 'd') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_DC_V1;
                } else if (*p == 'D') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_DC_V2;
                } else if (*p == 'a') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_AV;
                } else {
                    ucs_error("invalid device flag: '%c'", *p);
                    free(flags_str);
                    status = UCS_ERR_INVALID_PARAM;
                    goto err_free;
                }
            }
            free(flags_str);
        }

        ucs_trace("added device '%s' vendor_id 0x%x device_id %d flags %c%c prio %d",
                  spec->name, spec->pci_id.vendor, spec->pci_id.device,
                  (spec->flags & UCT_IB_DEVICE_FLAG_MLX4_PRM) ? '4' : '-',
                  (spec->flags & UCT_IB_DEVICE_FLAG_MLX5_PRM) ? '5' : '-',
                  spec->priority);
    }

    return UCS_OK;

err_free:
    uct_ib_md_release_device_config(md);
err:
    return status;
}

static void uct_ib_md_release_reg_method(uct_ib_md_t *md)
{
    if (md->rcache != NULL) {
        ucs_rcache_destroy(md->rcache);
    }
    if (md->global_odp != NULL) {
        uct_ib_mem_dereg(&md->super, md->global_odp);
    }
}

static ucs_status_t
uct_ib_md_parse_subnet_prefix(const char *subnet_prefix_str,
                              uint64_t *subnet_prefix)
{
    uint16_t pfx[4] = {0};
    uint64_t pfx64 = 0;
    int res, i;

    res = sscanf(subnet_prefix_str, "%hx:%hx:%hx:%hx",
                 &pfx[0], &pfx[1], &pfx[2], &pfx[3]);
    if (res != 4) {
        ucs_error("subnet filter '%s' is invalid", subnet_prefix_str);
        return UCS_ERR_INVALID_PARAM;
    }

    for (i = 0; i < 4; i++) {
        pfx64 = pfx[i] + (pfx64 << 16);
    }

    *subnet_prefix = htobe64(pfx64);
    return UCS_OK;
}
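
/*
 * Worked example (illustrative): for the default prefix "fe80:0:0:0" the
 * loop above folds pfx[] into pfx64 = 0xfe80000000000000, and htobe64()
 * stores it so that the 0xfe80 group occupies the leading bytes in memory,
 * matching the canonical InfiniBand subnet prefix byte order.
 */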

static double uct_ib_md_read_pci_bw(struct ibv_device *ib_device)
{
    const char *pci_width_file_name = "current_link_width";
    const char *pci_speed_file_name = "current_link_speed";
    char pci_width_str[16];
    char pci_speed_str[16];
    char gts[16];
    const uct_ib_md_pci_info_t *p;
    double bw, effective_bw;
    unsigned width;
    ssize_t len;
    size_t i;

    len = ucs_read_file(pci_width_str, sizeof(pci_width_str) - 1, 1,
                        UCT_IB_DEVICE_SYSFS_FMT, ib_device->name,
                        pci_width_file_name);
    if (len < 1) {
        ucs_debug("failed to read file: " UCT_IB_DEVICE_SYSFS_FMT,
                  ib_device->name, pci_width_file_name);
        return DBL_MAX; /* failed to read file */
    }
    pci_width_str[len] = '\0';

    len = ucs_read_file(pci_speed_str, sizeof(pci_speed_str) - 1, 1,
                        UCT_IB_DEVICE_SYSFS_FMT, ib_device->name,
                        pci_speed_file_name);
    if (len < 1) {
        ucs_debug("failed to read file: " UCT_IB_DEVICE_SYSFS_FMT,
                  ib_device->name, pci_speed_file_name);
        return DBL_MAX; /* failed to read file */
    }
    pci_speed_str[len] = '\0';

    if (sscanf(pci_width_str, "%u", &width) < 1) {
        ucs_debug("incorrect format of %s file: expected <unsigned integer>, actual: %s",
                  pci_width_file_name, pci_width_str);
        return DBL_MAX;
    }

    if ((sscanf(pci_speed_str, "%lf%s", &bw, gts) < 2) ||
        strcasecmp("GT/s", ucs_strtrim(gts))) {
        ucs_debug("incorrect format of %s file: expected <double> GT/s, actual: %s",
                  pci_speed_file_name, pci_speed_str);
        return DBL_MAX;
    }

    bw *= UCS_GBYTE / 8; /* gigabit/s -> byte/s */

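    /*
     * Estimate the effective PCIe bandwidth from the raw link rate:
     *
     *   effective_bw = bw * width * (payload * nack) /
     *                  (((payload + overhead) * nack) + ctrl) *
     *                  encoding / decoding
     *
     * i.e. the per-lane rate times the lane count, scaled by the TLP payload
     * efficiency (per-TLP overhead plus one control TLP amortized over 'nack'
     * TLPs per ACK) and by the line-code ratio (8b/10b for gen1/2, 128b/130b
     * for gen3). Rough worked example, assuming a gen3 x16 link reporting
     * "8.0 GT/s": bw = 8 * UCS_GBYTE / 8 = 1 GiB/s per lane, so 16 lanes give
     * ~16 GiB/s raw; the 128/130 encoding alone trims that to ~15.75 GiB/s,
     * before the TLP overhead from the uct_ib_md_pci_info table is applied.
     */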
    for (i = 0; i < ucs_static_array_size(uct_ib_md_pci_info); i++) {
        if (bw < (uct_ib_md_pci_info[i].bw * 1.2)) { /* 1.2 multiplier to absorb rounding errors */
            p = &uct_ib_md_pci_info[i]; /* use a pointer to keep the equation short */
            /* coverity[overflow] */
            effective_bw = bw * width *
                           (p->payload * p->nack) /
                           (((p->payload + p->overhead) * p->nack) + p->ctrl) *
                           p->encoding / p->decoding;
            ucs_trace("%s: pcie %ux %s, effective throughput %.3lfMB/s (%.3lfGb/s)",
                      ib_device->name, width, p->name,
                      (effective_bw / UCS_MBYTE), (effective_bw * 8 / UCS_GBYTE));
            return effective_bw;
        }
    }

    return DBL_MAX;
}

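/*
 * Return the PCI bandwidth to assume for a device: an explicit value from the
 * PCI_BW config list (presumably set via UCX_IB_PCI_BW=<dev>:<bw>, with the
 * exact syntax defined by UCS_CONFIG_TYPE_BW_SPEC) takes precedence; an
 * "auto" entry, or no entry at all, falls back to reading the link width and
 * speed from sysfs.
 */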
static double uct_ib_md_pci_bw(const uct_ib_md_config_t *md_config,
                               struct ibv_device *ib_device)
{
    unsigned i;

    for (i = 0; i < md_config->pci_bw.count; i++) {
        if (!strcmp(ib_device->name, md_config->pci_bw.device[i].name)) {
            if (UCS_CONFIG_BW_IS_AUTO(md_config->pci_bw.device[i].bw)) {
                break; /* read data from the system */
            }
            return md_config->pci_bw.device[i].bw;
        }
    }

    return uct_ib_md_read_pci_bw(ib_device);
}

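/*
 * Open an IB memory domain by device name: enumerate the verbs device list,
 * optionally arm fork safety via ibv_fork_init(), then probe the registered
 * backend implementations until one claims the device.
 */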
ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name,
                            const uct_md_config_t *uct_md_config, uct_md_h *md_p)
{
    const uct_ib_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_ib_md_config_t);
    ucs_status_t status = UCS_ERR_UNSUPPORTED;
    uct_ib_md_t *md = NULL;
    struct ibv_device **ib_device_list, *ib_device;
    uct_ib_md_ops_entry_t *md_ops_entry;
    int i, num_devices, ret, fork_init = 0;

    ucs_trace("opening IB device %s", md_name);

#if !HAVE_DEVX
    if (md_config->devx == UCS_YES) {
        ucs_error("DEVX requested but not supported");
        status = UCS_ERR_NO_DEVICE;
        goto out;
    }
#endif

    /* Get device list from driver */
    ib_device_list = ibv_get_device_list(&num_devices);
    if (ib_device_list == NULL) {
        ucs_debug("Failed to get IB device list, assuming no devices are present");
        status = UCS_ERR_NO_DEVICE;
        goto out;
    }

    ib_device = NULL;
    for (i = 0; i < num_devices; ++i) {
        if (!strcmp(ibv_get_device_name(ib_device_list[i]), md_name)) {
            ib_device = ib_device_list[i];
            break;
        }
    }

    if (ib_device == NULL) {
        ucs_debug("IB device %s not found", md_name);
        status = UCS_ERR_NO_DEVICE;
        goto out_free_dev_list;
    }

    if (md_config->fork_init != UCS_NO) {
        ret = ibv_fork_init();
        if (ret) {
            if (md_config->fork_init == UCS_YES) {
                ucs_error("ibv_fork_init() failed: %m");
                status = UCS_ERR_IO_ERROR;
                goto out_free_dev_list;
            }
            ucs_debug("ibv_fork_init() failed: %m, continuing, but fork may be unsafe.");
            uct_ib_fork_warn_enable();
        } else {
            fork_init = 1;
        }
    } else {
        uct_ib_fork_warn_enable();
    }

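    /* Probe the registered backends in list order: each one either claims the
     * device (UCS_OK), declines it (UCS_ERR_UNSUPPORTED, try the next entry),
     * or fails hard (any other status aborts the open). */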
    ucs_list_for_each(md_ops_entry, &uct_ib_md_ops_list, list) {
        status = md_ops_entry->ops->open(ib_device, md_config, &md);
        if (status == UCS_OK) {
            ucs_debug("%s: md successfully opened by '%s'", md_name,
                      md_ops_entry->name);
            md->ops = md_ops_entry->ops;
            break;
        } else if (status != UCS_ERR_UNSUPPORTED) {
            goto out_free_dev_list;
        }
        ucs_debug("%s: md open by '%s' failed, trying next", md_name,
                  md_ops_entry->name);
    }

    if (status != UCS_OK) {
        ucs_assert(status == UCS_ERR_UNSUPPORTED);
        ucs_debug("Unsupported IB device %s", md_name);
        goto out_free_dev_list;
    }

    /* cppcheck-suppress autoVariables */
    *md_p         = &md->super;
    md->fork_init = fork_init;
    status        = UCS_OK;

out_free_dev_list:
    ibv_free_device_list(ib_device_list);
out:
    return status;
}

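/*
 * Decide whether memory regions are registered with relaxed PCI ordering:
 * "on" takes effect only when the verbs library exposes
 * IBV_ACCESS_RELAXED_ORDERING, while "auto" enables it when the CPU is known
 * to prefer relaxed-order DMA (see ucs_cpu_prefer_relaxed_order()).
 */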
void uct_ib_md_parse_relaxed_order(uct_ib_md_t *md,
                                   const uct_ib_md_config_t *md_config)
{
    if (md_config->mr_relaxed_order == UCS_CONFIG_ON) {
        if (IBV_ACCESS_RELAXED_ORDERING) {
            md->relaxed_order = 1;
        } else {
            ucs_warn("relaxed order memory access requested but not supported");
        }
    } else if (md_config->mr_relaxed_order == UCS_CONFIG_AUTO) {
        if (ucs_cpu_prefer_relaxed_order()) {
            md->relaxed_order = 1;
        }
    }
}

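/*
 * Initialization shared by all backends: statistics node, device init with
 * optional async-event handling, subnet filter parsing, protection domain
 * allocation, and selection of registration methods from the queried MD
 * capabilities.
 */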
ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
                                   struct ibv_device *ib_device,
                                   const uct_ib_md_config_t *md_config)
{
    uct_md_attr_t md_attr;
    ucs_status_t status;

    md->super.ops       = &uct_ib_md_ops;
    md->super.component = &uct_ib_component;

    if (md->config.odp.max_size == UCS_MEMUNITS_AUTO) {
        md->config.odp.max_size = uct_ib_device_odp_max_size(&md->dev);
    }

    /* Create statistics */
    status = UCS_STATS_NODE_ALLOC(&md->stats, &uct_ib_md_stats_class,
                                  ucs_stats_get_root(),
                                  "%s-%p", ibv_get_device_name(ib_device), md);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events
                                UCS_STATS_ARG(md->stats));
    if (status != UCS_OK) {
        goto err_release_stats;
    }

#if HAVE_DECL_IBV_EXP_SETENV
    ibv_exp_setenv(md->dev.ibv_context, "MLX_QP_ALLOC_TYPE", "ANON", 0);
    ibv_exp_setenv(md->dev.ibv_context, "MLX_CQ_ALLOC_TYPE", "ANON", 0);
#endif

    if (strlen(md_config->subnet_prefix) > 0) {
        status = uct_ib_md_parse_subnet_prefix(md_config->subnet_prefix,
                                               &md->subnet_filter);

        if (status != UCS_OK) {
            goto err_cleanup_device;
        }

        md->check_subnet_filter = 1;
    }

    /* Allocate protection domain */
    md->pd = ibv_alloc_pd(md->dev.ibv_context);
    if (md->pd == NULL) {
        ucs_error("ibv_alloc_pd() failed: %m");
        status = UCS_ERR_NO_MEMORY;
        goto err_cleanup_device;
    }

    status = uct_md_query(&md->super, &md_attr);
    if (status != UCS_OK) {
        goto err_dealloc_pd;
    }

    status = uct_ib_md_parse_reg_methods(md, &md_attr, md_config);
    if (status != UCS_OK) {
        goto err_dealloc_pd;
    }

    /* Limit the zero-copy SGE list length when non-host memory types can be
     * registered */
    md->dev.max_zcopy_log_sge = INT_MAX;
    if (md_attr.cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) {
        md->dev.max_zcopy_log_sge = 1;
    }

    md->pci_bw = uct_ib_md_pci_bw(md_config, ib_device);
    return UCS_OK;

err_dealloc_pd:
    ibv_dealloc_pd(md->pd);
err_cleanup_device:
    uct_ib_device_cleanup(&md->dev);
err_release_stats:
    UCS_STATS_NODE_FREE(md->stats);
err:
    return status;
}

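/*
 * Tear down an IB memory domain, releasing resources in roughly the reverse
 * order of their acquisition in uct_ib_md_open_common() and the backend open.
 */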
void uct_ib_md_close(uct_md_h uct_md)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    md->ops->cleanup(md);
    uct_ib_md_release_device_config(md);
    uct_ib_md_release_reg_method(md);
    uct_ib_device_cleanup_ah_cached(&md->dev);
    ibv_dealloc_pd(md->pd);
    uct_ib_device_cleanup(&md->dev);
    ibv_close_device(md->dev.ibv_context);
    UCS_STATS_NODE_FREE(md->stats);
    ucs_free(md);
}

static uct_ib_md_ops_t uct_ib_verbs_md_ops;

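/*
 * Plain-verbs backend open. This is the portable fallback path, registered at
 * the lowest priority (see UCT_IB_MD_OPS below), used when no accelerated
 * backend claims the device.
 */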
static ucs_status_t uct_ib_verbs_md_open(struct ibv_device *ibv_device,
                                         const uct_ib_md_config_t *md_config,
                                         uct_ib_md_t **p_md)
{
    uct_ib_device_t *dev;
    ucs_status_t status;
    uct_ib_md_t *md;
    int num_mrs;

    md = ucs_calloc(1, sizeof(*md), "ib_md");
    if (md == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    /* Open verbs context */
    dev              = &md->dev;
    dev->ibv_context = ibv_open_device(ibv_device);
    if (dev->ibv_context == NULL) {
        ucs_error("ibv_open_device(%s) failed: %m",
                  ibv_get_device_name(ibv_device));
        status = UCS_ERR_IO_ERROR;
        goto err;
    }

    md->config = md_config->ext;

    status = uct_ib_device_query(dev, ibv_device);
    if (status != UCS_OK) {
        goto err_free_context;
    }

    if (UCT_IB_HAVE_ODP_IMPLICIT(&dev->dev_attr)) {
        md->dev.flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT;
    }

    if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr)) {
        dev->atomic_arg_sizes = sizeof(uint64_t);
    }

    md->ops = &uct_ib_verbs_md_ops;
    status  = uct_ib_md_parse_device_config(md, md_config);
    if (status != UCS_OK) {
        goto err_free_context;
    }

    uct_ib_md_parse_relaxed_order(md, md_config);
    num_mrs = 1;      /* UCT_IB_MR_DEFAULT */

    if (md->relaxed_order) {
        ++num_mrs;    /* UCT_IB_MR_STRICT_ORDER */
    }

    md->memh_struct_size = sizeof(uct_ib_verbs_mem_t) +
                           (sizeof(uct_ib_mr_t) * num_mrs);

    status = uct_ib_md_open_common(md, ibv_device, md_config);
    if (status != UCS_OK) {
        goto err_dev_cfg;
    }

    md->dev.flags = uct_ib_device_spec(&md->dev)->flags;
    *p_md         = md;
    return UCS_OK;

err_dev_cfg:
    uct_ib_md_release_device_config(md);
err_free_context:
    ibv_close_device(dev->ibv_context);
err:
    ucs_free(md);
    return status;
}

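/* Plain-verbs ops table: operations that cannot be implemented with plain
 * verbs are stubbed to return success (no-op) or UCS_ERR_UNSUPPORTED. */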
static uct_ib_md_ops_t uct_ib_verbs_md_ops = {
    .open                = uct_ib_verbs_md_open,
    .cleanup             = (uct_ib_md_cleanup_func_t)ucs_empty_function,
    .reg_key             = uct_ib_verbs_reg_key,
    .dereg_key           = uct_ib_verbs_dereg_key,
    .reg_atomic_key      = uct_ib_verbs_reg_atomic_key,
    .dereg_atomic_key    = (uct_ib_md_dereg_atomic_key_func_t)ucs_empty_function_return_success,
    .reg_multithreaded   = (uct_ib_md_reg_multithreaded_func_t)ucs_empty_function_return_unsupported,
    .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported,
    .mem_prefetch        = (uct_ib_md_mem_prefetch_func_t)ucs_empty_function_return_success,
    .get_atomic_mr_id    = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported,
};
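
/* Register the plain-verbs backend; assuming the second argument is the
 * registration priority, 0 makes it the last backend to be probed. */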
UCT_IB_MD_OPS(uct_ib_verbs_md_ops, 0);

uct_component_t uct_ib_component = {
    .query_md_resources = uct_ib_query_md_resources,
    .md_open            = uct_ib_md_open,
    .cm_open            = ucs_empty_function_return_unsupported,
    .rkey_unpack        = uct_ib_rkey_unpack,
    .rkey_ptr           = ucs_empty_function_return_unsupported,
    .rkey_release       = ucs_empty_function_return_success,
    .name               = "ib",
    .md_config          = {
        .name           = "IB memory domain",
        .prefix         = UCT_IB_CONFIG_PREFIX,
        .table          = uct_ib_md_config_table,
        .size           = sizeof(uct_ib_md_config_t),
    },
    .cm_config          = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY,
    .tl_list            = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_ib_component),
    .flags              = 0
};
UCT_COMPONENT_REGISTER(&uct_ib_component);