/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED.
 * Copyright (C) The University of Tennessee and The University
 *               of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "ib_md.h"
#include "ib_device.h"

#include <ucs/arch/atomic.h>
#include <ucs/profile/profile.h>
#include <ucs/sys/math.h>
#include <ucs/sys/module.h>
#include <ucs/sys/string.h>
#include <ucs/time/time.h>
#include <ucm/api/ucm.h>
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#include <sys/resource.h>
#include <float.h>

#define UCT_IB_MD_RCACHE_DEFAULT_ALIGN 16

typedef struct uct_ib_md_pci_info {
    double     bw;       /* bandwidth */
    uint16_t   payload;  /* payload size used for data transfer */
    uint16_t   overhead; /* PHY + data link layer + header + CRC */
    uint16_t   nack;     /* number of TLPs sent before an ACK is required */
    uint16_t   ctrl;     /* length of a control TLP */
    uint16_t   encoding; /* number of encoded symbol bits: 8 - gen1/2, 128 - gen3 */
    uint16_t   decoding; /* number of decoded symbol bits: 10 - gen1/2, 130 - gen3 */
    const char *name;    /* name of the PCI generation */
} uct_ib_md_pci_info_t;
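/*
 * Illustrative worked example (not part of the driver logic): plugging the
 * gen3 table entry below into the formula used by uct_ib_md_read_pci_bw(),
 * for a x16 link with bw = 8.0 * UCS_GBYTE / 8 per lane:
 *
 *   effective_bw = bw * 16 * (512 * 5) / (((512 + 30) * 5) + 256) * 128 / 130
 *                = bw * 16 * 2560 / 2966 * 128 / 130
 *               ~= 13.6 * UCS_GBYTE per second
 *
 * i.e. roughly 86% of the raw link rate after protocol overhead and encoding.
 */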

static UCS_CONFIG_DEFINE_ARRAY(pci_bw,
                               sizeof(ucs_config_bw_spec_t),
                               UCS_CONFIG_TYPE_BW_SPEC);

static const char *uct_ib_devx_objs[] = {
    [UCT_IB_DEVX_OBJ_RCQP]  = "rcqp",
    [UCT_IB_DEVX_OBJ_RCSRQ] = "rcsrq",
    [UCT_IB_DEVX_OBJ_DCT]   = "dct",
    [UCT_IB_DEVX_OBJ_DCSRQ] = "dcsrq",
    NULL
};

static ucs_config_field_t uct_ib_md_config_table[] = {
    {"", "", NULL,
     ucs_offsetof(uct_ib_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)},

    {"REG_METHODS", "rcache,odp,direct",
     "List of registration methods in order of preference. Supported methods are:\n"
     "  odp    - implicit on-demand paging\n"
     "  rcache - userspace registration cache\n"
     "  direct - direct registration\n",
     ucs_offsetof(uct_ib_md_config_t, reg_methods), UCS_CONFIG_TYPE_STRING_ARRAY},

    {"", "RCACHE_ADDR_ALIGN=" UCS_PP_MAKE_STRING(UCT_IB_MD_RCACHE_DEFAULT_ALIGN), NULL,
     ucs_offsetof(uct_ib_md_config_t, rcache),
     UCS_CONFIG_TYPE_TABLE(uct_md_config_rcache_table)},

    {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */
     ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.c), UCS_CONFIG_TYPE_TIME},

    {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
     ucs_offsetof(uct_ib_md_config_t, uc_reg_cost.m), UCS_CONFIG_TYPE_TIME},

    {"FORK_INIT", "try",
     "Initialize a fork-safe IB library with ibv_fork_init().",
     ucs_offsetof(uct_ib_md_config_t, fork_init), UCS_CONFIG_TYPE_TERNARY},

    {"ASYNC_EVENTS", "n",
     "Enable listening for async events on the device",
     ucs_offsetof(uct_ib_md_config_t, async_events), UCS_CONFIG_TYPE_BOOL},

    {"ETH_PAUSE_ON", "y",
     "Whether or not 'Pause Frame' is enabled on an Ethernet network.\n"
     "Pause frame is a mechanism for temporarily stopping the transmission of data to\n"
     "ensure zero loss under congestion on Ethernet family computer networks.\n"
     "This parameter, if set to 'no', will disqualify IB transports that may not perform\n"
     "well on a lossy fabric when working with RoCE.",
     ucs_offsetof(uct_ib_md_config_t, ext.eth_pause), UCS_CONFIG_TYPE_BOOL},

    {"ODP_NUMA_POLICY", "preferred",
     "Override NUMA policy for ODP regions, to avoid extra page migrations.\n"
     " - default: Do not change the existing policy.\n"
     " - preferred/bind:\n"
     "     Unless the memory policy of the current thread is MPOL_BIND, set the\n"
     "     policy of ODP regions to MPOL_PREFERRED/MPOL_BIND, respectively.\n"
     "     If the NUMA node mask of the current thread is not defined, use the NUMA\n"
     "     nodes which correspond to its CPU affinity mask.",
     ucs_offsetof(uct_ib_md_config_t, ext.odp.numa_policy),
     UCS_CONFIG_TYPE_ENUM(ucs_numa_policy_names)},

    {"ODP_PREFETCH", "n",
     "Force prefetch of memory regions created with ODP.\n",
     ucs_offsetof(uct_ib_md_config_t, ext.odp.prefetch), UCS_CONFIG_TYPE_BOOL},

    {"ODP_MAX_SIZE", "auto",
     "Maximal memory region size to enable ODP for. 0 - disable.\n",
     ucs_offsetof(uct_ib_md_config_t, ext.odp.max_size), UCS_CONFIG_TYPE_MEMUNITS},

    {"DEVICE_SPECS", "",
     "Array of custom device specifications. Each element is a string of the following format:\n"
     "  <vendor-id>:<device-id>[:name[:<flags>[:<priority>]]]\n"
     "where:\n"
     "  <vendor-id> - (mandatory) PCI vendor id, integer or hexadecimal.\n"
     "  <device-id> - (mandatory) PCI device id, integer or hexadecimal.\n"
     "  <name>      - (optional) device name.\n"
     "  <flags>     - (optional) empty, or a combination of:\n"
     "                 '4' - mlx4 device\n"
     "                 '5' - mlx5 device\n"
     "                 'd' - DC version 1 (Connect-IB, ConnectX-4)\n"
     "                 'D' - DC version 2 (ConnectX-5 and above)\n"
     "                 'a' - compact address vector support\n"
     "  <priority>  - (optional) device priority, integer.\n"
     "\n"
     "Example: the value '0x02c9:4115:ConnectX4:5d' specifies a device named ConnectX4,\n"
     "matching vendor id 0x2c9 and device id 4115, with DC version 1 support.",
     ucs_offsetof(uct_ib_md_config_t, custom_devices), UCS_CONFIG_TYPE_STRING_ARRAY},

    {"PREFER_NEAREST_DEVICE", "y",
     "Prefer the device nearest to the CPU when selecting a device from the NET_DEVICES list.\n",
     ucs_offsetof(uct_ib_md_config_t, ext.prefer_nearest_device), UCS_CONFIG_TYPE_BOOL},

    {"INDIRECT_ATOMIC", "y",
     "Use an indirect atomic memory key\n",
     ucs_offsetof(uct_ib_md_config_t, ext.enable_indirect_atomic), UCS_CONFIG_TYPE_BOOL},

    {"GID_INDEX", "auto",
     "Port GID index to use.",
     ucs_offsetof(uct_ib_md_config_t, ext.gid_index), UCS_CONFIG_TYPE_ULUNITS},

    {"SUBNET_PREFIX", "",
     "InfiniBand subnet prefix to filter ports by; empty means no filter.\n"
     "Relevant for the IB link layer only.\n"
     "For example, a filter for the default subnet prefix can be specified as: fe80:0:0:0",
     ucs_offsetof(uct_ib_md_config_t, subnet_prefix), UCS_CONFIG_TYPE_STRING},

    {"GPU_DIRECT_RDMA", "try",
     "Use GPUDirect RDMA so the HCA can access GPU pages directly\n",
     ucs_offsetof(uct_ib_md_config_t, ext.enable_gpudirect_rdma), UCS_CONFIG_TYPE_TERNARY},

#ifdef HAVE_EXP_UMR
    {"MAX_INLINE_KLM_LIST", "inf",
     "When posting a UMR, KLM lists shorter than or equal to this value will be posted inline.\n"
     "The actual maximal length is also limited by device capabilities.",
     ucs_offsetof(uct_ib_md_config_t, ext.max_inline_klm_list), UCS_CONFIG_TYPE_UINT},
#endif

    {"PCI_BW", "",
     "Maximum effective data transfer rate of the PCI bus connected to the HCA\n",
     ucs_offsetof(uct_ib_md_config_t, pci_bw), UCS_CONFIG_TYPE_ARRAY(pci_bw)},

    {"MLX5_DEVX", "try",
     "DEVX support\n",
     ucs_offsetof(uct_ib_md_config_t, devx), UCS_CONFIG_TYPE_TERNARY},

    {"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq",
     "Objects to be created by DevX\n",
     ucs_offsetof(uct_ib_md_config_t, devx_objs),
     UCS_CONFIG_TYPE_BITMAP(uct_ib_devx_objs)},

    {"REG_MT_THRESH", "4G",
     "Minimal MR size to be registered using multiple parallel threads.\n"
     "The number of threads used is determined by the number of CPUs to which\n"
     "the registering thread is bound by hard affinity.",
     ucs_offsetof(uct_ib_md_config_t, ext.min_mt_reg), UCS_CONFIG_TYPE_MEMUNITS},

    {"REG_MT_CHUNK", "2G",
     "Size of a single chunk used in multithreaded registration.\n"
     "Must be a power of 2.",
     ucs_offsetof(uct_ib_md_config_t, ext.mt_reg_chunk), UCS_CONFIG_TYPE_MEMUNITS},

    {"REG_MT_BIND", "n",
     "Enable setting the CPU affinity of memory registration threads.",
     ucs_offsetof(uct_ib_md_config_t, ext.mt_reg_bind), UCS_CONFIG_TYPE_BOOL},

    {"PCI_RELAXED_ORDERING", "auto",
     "Enable relaxed ordering for PCIe transactions to improve performance on some systems.",
     ucs_offsetof(uct_ib_md_config_t, mr_relaxed_order), UCS_CONFIG_TYPE_ON_OFF_AUTO},

    {NULL}
};
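/*
 * Note: each entry above is exposed as an environment variable with the
 * component prefix applied (see .prefix = UCT_IB_CONFIG_PREFIX at the bottom
 * of this file). Assuming the usual "UCX_" global prefix and "IB_" component
 * prefix, for example:
 *
 *   UCX_IB_REG_METHODS=rcache,direct
 *   UCX_IB_PCI_RELAXED_ORDERING=on
 */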

#ifdef ENABLE_STATS
static ucs_stats_class_t uct_ib_md_stats_class = {
    .name          = "",
    .num_counters  = UCT_IB_MD_STAT_LAST,
    .counter_names = {
        [UCT_IB_MD_STAT_MEM_ALLOC] = "mem_alloc",
        [UCT_IB_MD_STAT_MEM_REG]   = "mem_reg"
    }
};
#endif

static const uct_ib_md_pci_info_t uct_ib_md_pci_info[] = {
    { /* GEN 1 */
        .bw       = 2.5 * UCS_GBYTE / 8,
        .payload  = 512,
        .overhead = 28,
        .nack     = 5,
        .ctrl     = 256,
        .encoding = 8,
        .decoding = 10,
        .name     = "gen1"
    },
    { /* GEN 2 */
        .bw       = 5.0 * UCS_GBYTE / 8,
        .payload  = 512,
        .overhead = 28,
        .nack     = 5,
        .ctrl     = 256,
        .encoding = 8,
        .decoding = 10,
        .name     = "gen2"
    },
    { /* GEN 3 */
        .bw       = 8.0 * UCS_GBYTE / 8,
        .payload  = 512,
        .overhead = 30,
        .nack     = 5,
        .ctrl     = 256,
        .encoding = 128,
        .decoding = 130,
        .name     = "gen3"
    },
};

UCS_LIST_HEAD(uct_ib_md_ops_list);

typedef struct uct_ib_verbs_mem {
    uct_ib_mem_t super;
    uct_ib_mr_t  mrs[];
} uct_ib_verbs_mem_t;

typedef struct {
    pthread_t     thread;
    void          *addr;
    size_t        len;
    size_t        chunk;
    uint64_t      access;
    struct ibv_pd *pd;
    struct ibv_mr **mr;
} uct_ib_md_mem_reg_thread_t;
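/*
 * Each registration worker thread receives one uct_ib_md_mem_reg_thread_t
 * describing a contiguous slice of the user buffer. The slice is further cut
 * into 'chunk'-sized MRs (see REG_MT_CHUNK above), and 'access' doubles as a
 * sentinel: UCT_IB_MEM_DEREG requests deregistration instead of registration.
 */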

static void uct_ib_check_gpudirect_driver(uct_ib_md_t *md, uct_md_attr_t *md_attr,
                                          const char *file,
                                          ucs_memory_type_t mem_type)
{
    if (!access(file, F_OK)) {
        md_attr->cap.reg_mem_types |= UCS_BIT(mem_type);
    }

    ucs_debug("%s: %s GPUDirect RDMA is %s",
              uct_ib_device_name(&md->dev), ucs_memory_type_names[mem_type],
              md_attr->cap.reg_mem_types & UCS_BIT(mem_type) ?
              "enabled" : "disabled");
}

static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    md_attr->cap.max_alloc        = ULONG_MAX; /* TODO query device */
    md_attr->cap.max_reg          = ULONG_MAX; /* TODO query device */
    md_attr->cap.flags            = UCT_MD_FLAG_REG |
                                    UCT_MD_FLAG_NEED_MEMH |
                                    UCT_MD_FLAG_NEED_RKEY |
                                    UCT_MD_FLAG_ADVISE;
    md_attr->cap.reg_mem_types    = UCS_MEMORY_TYPES_CPU_ACCESSIBLE;
    md_attr->cap.access_mem_type  = UCS_MEMORY_TYPE_HOST;
    md_attr->cap.detect_mem_types = 0;

    if (md->config.enable_gpudirect_rdma != UCS_NO) {
        /* check if GDR driver is loaded */
        uct_ib_check_gpudirect_driver(md, md_attr,
                                      "/sys/kernel/mm/memory_peers/nv_mem/version",
                                      UCS_MEMORY_TYPE_CUDA);

        /* check if ROCM KFD driver is loaded */
        uct_ib_check_gpudirect_driver(md, md_attr, "/dev/kfd",
                                      UCS_MEMORY_TYPE_ROCM);

        if (!(md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) &&
            (md->config.enable_gpudirect_rdma == UCS_YES)) {
            ucs_error("%s: could not enable GPUDirect RDMA. Please make sure"
                      " the nv_peer_mem or amdgpu plugin is installed correctly.",
                      uct_ib_device_name(&md->dev));
            return UCS_ERR_UNSUPPORTED;
        }
    }

    md_attr->rkey_packed_size = UCT_IB_MD_PACKED_RKEY_SIZE;
    md_attr->reg_cost         = md->reg_cost;
    ucs_sys_cpuset_copy(&md_attr->local_cpus, &md->dev.local_cpus);

    return UCS_OK;
}

static void uct_ib_md_print_mem_reg_err_msg(ucs_log_level_t level, void *address,
                                            size_t length, uint64_t access_flags)
{
    char msg[200] = {0};
    struct rlimit limit_info;

    ucs_snprintf_zero(msg, sizeof(msg),
                      "%s(address=%p, length=%zu, access=0x%lx) failed: %m",
                      ibv_reg_mr_func_name, address, length, access_flags);

    /* Check the value of the max locked memory which is set on the system
     * (ulimit -l) */
    if (!getrlimit(RLIMIT_MEMLOCK, &limit_info) &&
        (limit_info.rlim_cur != RLIM_INFINITY)) {
        ucs_snprintf_zero(msg + strlen(msg), sizeof(msg) - strlen(msg),
                          ". Please set max locked memory (ulimit -l) to 'unlimited' "
                          "(current: %llu kbytes)", limit_info.rlim_cur / UCS_KBYTE);
    }

    ucs_log(level, "%s", msg);
}

void *uct_ib_md_mem_handle_thread_func(void *arg)
{
    uct_ib_md_mem_reg_thread_t *ctx = arg;
    ucs_status_t status;
    int mr_idx  = 0;
    size_t size = 0;
    ucs_time_t UCS_V_UNUSED t0 = ucs_get_time();

    while (ctx->len) {
        size = ucs_min(ctx->len, ctx->chunk);
        if (ctx->access != UCT_IB_MEM_DEREG) {
            ctx->mr[mr_idx] = UCS_PROFILE_NAMED_CALL(ibv_reg_mr_func_name,
                                                     ibv_reg_mr, ctx->pd,
                                                     ctx->addr, size,
                                                     ctx->access);
            if (ctx->mr[mr_idx] == NULL) {
                return UCS_STATUS_PTR(UCS_ERR_IO_ERROR);
            }
        } else {
            status = uct_ib_dereg_mr(ctx->mr[mr_idx]);
            if (status != UCS_OK) {
                return UCS_STATUS_PTR(status);
            }
        }
        ctx->addr  = UCS_PTR_BYTE_OFFSET(ctx->addr, size);
        ctx->len  -= size;
        mr_idx++;
    }

    ucs_trace("%s %p..%p took %f usec\n",
              (ctx->access == UCT_IB_MEM_DEREG) ? "dereg_mr" : "reg_mr",
              ctx->mr[0]->addr,
              UCS_PTR_BYTE_OFFSET(ctx->mr[mr_idx-1]->addr, size),
              ucs_time_to_usec(ucs_get_time() - t0));

    return UCS_STATUS_PTR(UCS_OK);
}

ucs_status_t
uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address,
                                       size_t length, uint64_t access_flags,
                                       size_t chunk, struct ibv_mr **mrs)
{
    int thread_num_mrs, thread_num, thread_idx, mr_idx = 0, cpu_id = 0;
    int mr_num = ucs_div_round_up(length, chunk);
    ucs_status_t status;
    void *thread_status;
    ucs_sys_cpuset_t parent_set, thread_set;
    uct_ib_md_mem_reg_thread_t *ctxs, *cur_ctx;
    pthread_attr_t attr;
    char UCS_V_UNUSED affinity_str[64];
    int ret;

    ret = pthread_getaffinity_np(pthread_self(), sizeof(ucs_sys_cpuset_t),
                                 &parent_set);
    if (ret != 0) {
        ucs_error("pthread_getaffinity_np() failed: %m");
        return UCS_ERR_INVALID_PARAM;
    }

    thread_num = ucs_min(CPU_COUNT(&parent_set), mr_num);

    ucs_trace("multithreaded handle %p..%p access %lx threads %d affinity %s\n",
              address, UCS_PTR_BYTE_OFFSET(address, length), access_flags, thread_num,
              ucs_make_affinity_str(&parent_set, affinity_str, sizeof(affinity_str)));

    if (thread_num == 1) {
        return UCS_ERR_UNSUPPORTED;
    }

    ctxs = ucs_calloc(thread_num, sizeof(*ctxs), "ib mr ctxs");
    if (ctxs == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    pthread_attr_init(&attr);

    status = UCS_OK;
    for (thread_idx = 0; thread_idx < thread_num; thread_idx++) {
        /* calculate the number of MRs for each thread so that each one
         * gets a proportional amount */
        thread_num_mrs  = ucs_div_round_up(mr_num - mr_idx, thread_num - thread_idx);

        cur_ctx         = &ctxs[thread_idx];
        cur_ctx->pd     = md->pd;
        cur_ctx->addr   = UCS_PTR_BYTE_OFFSET(address, mr_idx * chunk);
        cur_ctx->len    = ucs_min(thread_num_mrs * chunk, length - (mr_idx * chunk));
        cur_ctx->access = access_flags;
        cur_ctx->mr     = &mrs[mr_idx];
        cur_ctx->chunk  = chunk;

        if (md->config.mt_reg_bind) {
            while (!CPU_ISSET(cpu_id, &parent_set)) {
                cpu_id++;
            }

            CPU_ZERO(&thread_set);
            CPU_SET(cpu_id, &thread_set);
            cpu_id++;
            pthread_attr_setaffinity_np(&attr, sizeof(ucs_sys_cpuset_t), &thread_set);
        }

        ret = pthread_create(&cur_ctx->thread, &attr,
                             uct_ib_md_mem_handle_thread_func, cur_ctx);
        if (ret) {
            ucs_error("pthread_create() failed: %m");
            status     = UCS_ERR_IO_ERROR;
            thread_num = thread_idx;
            break;
        }

        mr_idx += thread_num_mrs;
    }

    for (thread_idx = 0; thread_idx < thread_num; thread_idx++) {
        cur_ctx = &ctxs[thread_idx];
        pthread_join(cur_ctx->thread, &thread_status);
        if (UCS_PTR_IS_ERR(thread_status)) {
            status = UCS_PTR_STATUS(thread_status);
        }
    }

    ucs_free(ctxs);
    pthread_attr_destroy(&attr);

    if (status != UCS_OK) {
        for (mr_idx = 0; mr_idx < mr_num; mr_idx++) {
            /* coverity[check_return] */
            uct_ib_dereg_mr(mrs[mr_idx]);
        }
    }

    return status;
}
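/*
 * Worked example of the proportional split above (illustrative): for a 10 GB
 * region with a 2 GB chunk, mr_num = 5. With 2 usable CPUs, thread 0 gets
 * ceil(5/2) = 3 chunks and thread 1 gets the remaining ceil(2/1) = 2, so the
 * per-thread workloads differ by at most one chunk regardless of CPU count.
 */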

static ucs_status_t uct_ib_md_reg_mr(uct_ib_md_t *md, void *address,
                                     size_t length, uint64_t access_flags,
                                     int silent, uct_ib_mem_t *memh,
                                     uct_ib_mr_type_t mr_type)
{
    ucs_log_level_t level = silent ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR;
    ucs_status_t status;

    if (length >= md->config.min_mt_reg) {
        UCS_PROFILE_CODE("reg ksm") {
            status = md->ops->reg_multithreaded(md, address, length,
                                                access_flags, memh, mr_type);
        }

        if (status != UCS_ERR_UNSUPPORTED) {
            if (status == UCS_OK) {
                memh->flags |= UCT_IB_MEM_MULTITHREADED;
            } else {
                uct_ib_md_print_mem_reg_err_msg(level, address, length,
                                                access_flags);
            }

            return status;
        } /* if unsupported - fall back to regular registration */
    }

    status = md->ops->reg_key(md, address, length, access_flags, memh, mr_type);
    if (status != UCS_OK) {
        uct_ib_md_print_mem_reg_err_msg(level, address, length, access_flags);
        return status;
    }

    return UCS_OK;
}

ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
                           uint64_t access_flags, struct ibv_mr **mr_p)
{
    struct ibv_mr *mr;
#if HAVE_DECL_IBV_EXP_REG_MR
    struct ibv_exp_reg_mr_in in = {};

    in.pd         = pd;
    in.addr       = addr;
    in.length     = length;
    in.exp_access = access_flags;
    mr = UCS_PROFILE_CALL(ibv_exp_reg_mr, &in);
#else
    mr = UCS_PROFILE_CALL(ibv_reg_mr, pd, addr, length, access_flags);
#endif
    if (mr == NULL) {
        return UCS_ERR_IO_ERROR;
    }

    *mr_p = mr;
    return UCS_OK;
}

ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr)
{
    int ret;

    if (mr == NULL) {
        return UCS_OK;
    }

    ret = UCS_PROFILE_CALL(ibv_dereg_mr, mr);
    if (ret != 0) {
        ucs_error("ibv_dereg_mr() failed: %m");
        return UCS_ERR_IO_ERROR;
    }

    return UCS_OK;
}

ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num)
{
    ucs_status_t s, status = UCS_OK;
    int i;

    for (i = 0; i < mr_num; i++) {
        s = uct_ib_dereg_mr(mrs[i]);
        if (s != UCS_OK) {
            status = s;
        }
    }

    return status;
}

static ucs_status_t uct_ib_memh_dereg_key(uct_ib_md_t *md, uct_ib_mem_t *memh,
                                          uct_ib_mr_type_t mr_type)
{
    if (memh->flags & UCT_IB_MEM_MULTITHREADED) {
        return md->ops->dereg_multithreaded(md, memh, mr_type);
    } else {
        return md->ops->dereg_key(md, memh, mr_type);
    }
}

static ucs_status_t uct_ib_memh_dereg(uct_ib_md_t *md, uct_ib_mem_t *memh)
{
    ucs_status_t s, status = UCS_OK;

    if (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) {
        s = md->ops->dereg_atomic_key(md, memh);
        memh->flags &= ~UCT_IB_MEM_FLAG_ATOMIC_MR;
        if (s != UCS_OK) {
            status = s;
        }
    }

    if (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING) {
        s = uct_ib_memh_dereg_key(md, memh, UCT_IB_MR_STRICT_ORDER);
        memh->flags &= ~UCT_IB_MEM_FLAG_RELAXED_ORDERING;
        if (s != UCS_OK) {
            status = s;
        }
    }

    s = uct_ib_memh_dereg_key(md, memh, UCT_IB_MR_DEFAULT);
    if (s != UCS_OK) {
        status = s;
    }

    return status;
}

static void uct_ib_memh_free(uct_ib_mem_t *memh)
{
    ucs_free(memh);
}

static uct_ib_mem_t *uct_ib_memh_alloc(uct_ib_md_t *md)
{
    return ucs_calloc(1, md->memh_struct_size, "ib_memh");
}

static uint64_t uct_ib_md_access_flags(uct_ib_md_t *md, unsigned flags,
                                       size_t length)
{
    uint64_t access_flags = UCT_IB_MEM_ACCESS_FLAGS;

    if ((flags & UCT_MD_MEM_FLAG_NONBLOCK) && (length > 0) &&
        (length <= md->config.odp.max_size)) {
        access_flags |= IBV_ACCESS_ON_DEMAND;
    }

    if (md->relaxed_order) {
        access_flags |= IBV_ACCESS_RELAXED_ORDERING;
    }

    return access_flags;
}
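/*
 * For example (illustrative): a 1 MB UCT_MD_MEM_FLAG_NONBLOCK registration on
 * an MD with relaxed ordering enabled and odp.max_size >= 1 MB yields
 * UCT_IB_MEM_ACCESS_FLAGS | IBV_ACCESS_ON_DEMAND | IBV_ACCESS_RELAXED_ORDERING.
 */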

#if HAVE_NUMA
static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, void *address,
                                               size_t length, uct_ib_mem_t *memh)
{
    int ret, old_policy, new_policy;
    struct bitmask *nodemask;
    uintptr_t start, end;
    ucs_status_t status;

    if (!(memh->flags & UCT_IB_MEM_FLAG_ODP) ||
        (md->config.odp.numa_policy == UCS_NUMA_POLICY_DEFAULT) ||
        (numa_available() < 0))
    {
        status = UCS_OK;
        goto out;
    }

    nodemask = numa_allocate_nodemask();
    if (nodemask == NULL) {
        ucs_warn("Failed to allocate numa node mask");
        status = UCS_ERR_NO_MEMORY;
        goto out;
    }

    ret = get_mempolicy(&old_policy, numa_nodemask_p(nodemask),
                        numa_nodemask_size(nodemask), NULL, 0);
    if (ret < 0) {
        ucs_warn("get_mempolicy(maxnode=%zu) failed: %m",
                 numa_nodemask_size(nodemask));
        status = UCS_ERR_INVALID_PARAM;
        goto out_free;
    }

    switch (old_policy) {
    case MPOL_DEFAULT:
        /* if no policy is defined, use the numa node of the current cpu */
        numa_get_thread_node_mask(&nodemask);
        break;
    case MPOL_BIND:
        /* if the current policy is BIND, keep it as-is */
        status = UCS_OK;
        goto out_free;
    default:
        break;
    }

    switch (md->config.odp.numa_policy) {
    case UCS_NUMA_POLICY_BIND:
        new_policy = MPOL_BIND;
        break;
    case UCS_NUMA_POLICY_PREFERRED:
        new_policy = MPOL_PREFERRED;
        break;
    default:
        ucs_error("unexpected numa policy %d", md->config.odp.numa_policy);
        status = UCS_ERR_INVALID_PARAM;
        goto out_free;
    }

    if (new_policy != old_policy) {
        start = ucs_align_down_pow2((uintptr_t)address, ucs_get_page_size());
        end   = ucs_align_up_pow2((uintptr_t)address + length,
                                  ucs_get_page_size());
        ucs_trace("0x%lx..0x%lx: changing numa policy from %d to %d, "
                  "nodemask[0]=0x%lx", start, end, old_policy, new_policy,
                  numa_nodemask_p(nodemask)[0]);

        ret = UCS_PROFILE_CALL(mbind, (void*)start, end - start, new_policy,
                               numa_nodemask_p(nodemask),
                               numa_nodemask_size(nodemask), 0);
        if (ret < 0) {
            ucs_warn("mbind(addr=0x%lx length=%ld policy=%d) failed: %m",
                     start, end - start, new_policy);
            status = UCS_ERR_IO_ERROR;
            goto out_free;
        }
    }

    status = UCS_OK;

out_free:
    numa_free_nodemask(nodemask);
out:
    return status;
}
#else
static ucs_status_t uct_ib_mem_set_numa_policy(uct_ib_md_t *md, void *address,
                                               size_t length, uct_ib_mem_t *memh)
{
    return UCS_OK;
}
#endif /* HAVE_NUMA */

static void uct_ib_mem_init(uct_ib_mem_t *memh, unsigned uct_flags,
                            uint64_t access_flags)
{
    memh->flags = 0;

    /* coverity[dead_error_condition] */
    if (access_flags & IBV_ACCESS_ON_DEMAND) {
        memh->flags |= UCT_IB_MEM_FLAG_ODP;
    }

    if (uct_flags & UCT_MD_MEM_ACCESS_REMOTE_ATOMIC) {
        memh->flags |= UCT_IB_MEM_ACCESS_REMOTE_ATOMIC;
    }
}

static ucs_status_t uct_ib_mem_reg_internal(uct_md_h uct_md, void *address,
                                            size_t length, unsigned flags,
                                            int silent, uct_ib_mem_t *memh)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    ucs_status_t status;
    uint64_t access_flags;

    access_flags = uct_ib_md_access_flags(md, flags, length);
    uct_ib_mem_init(memh, flags, access_flags);
    status = uct_ib_md_reg_mr(md, address, length, access_flags, silent, memh,
                              UCT_IB_MR_DEFAULT);
    if (status != UCS_OK) {
        return status;
    }

    if (md->relaxed_order) {
        status = uct_ib_md_reg_mr(md, address, length,
                                  access_flags & ~IBV_ACCESS_RELAXED_ORDERING,
                                  silent, memh, UCT_IB_MR_STRICT_ORDER);
        if (status != UCS_OK) {
            goto err;
        }

        memh->flags |= UCT_IB_MEM_FLAG_RELAXED_ORDERING;
    }

    ucs_debug("registered memory %p..%p on %s lkey 0x%x rkey 0x%x "
              "access 0x%lx flags 0x%x", address,
              UCS_PTR_BYTE_OFFSET(address, length),
              uct_ib_device_name(&md->dev), memh->lkey, memh->rkey,
              access_flags, flags);

    uct_ib_mem_set_numa_policy(md, address, length, memh);

    if (md->config.odp.prefetch) {
        md->ops->mem_prefetch(md, memh, address, length);
    }

    UCS_STATS_UPDATE_COUNTER(md->stats, UCT_IB_MD_STAT_MEM_REG, +1);
    return UCS_OK;

err:
    uct_ib_memh_dereg(md, memh);
    return status;
}

static ucs_status_t uct_ib_mem_reg(uct_md_h uct_md, void *address, size_t length,
                                   unsigned flags, uct_mem_h *memh_p)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    ucs_status_t status;
    uct_ib_mem_t *memh;

    memh = uct_ib_memh_alloc(md);
    if (memh == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    status = uct_ib_mem_reg_internal(uct_md, address, length, flags, 0, memh);
    if (status != UCS_OK) {
        uct_ib_memh_free(memh);
        return status;
    }
    *memh_p = memh;

    return UCS_OK;
}

static ucs_status_t uct_ib_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_ib_md_t *md       = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_mem_t *ib_memh = memh;
    ucs_status_t status;

    status = uct_ib_memh_dereg(md, ib_memh);
    uct_ib_memh_free(ib_memh);
    return status;
}

static ucs_status_t uct_ib_verbs_reg_key(uct_ib_md_t *md, void *address,
                                         size_t length, uint64_t access_flags,
                                         uct_ib_mem_t *ib_memh,
                                         uct_ib_mr_type_t mr_type)
{
    uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t);

    return uct_ib_reg_key_impl(md, address, length, access_flags,
                               ib_memh, &memh->mrs[mr_type], mr_type);
}

ucs_status_t uct_ib_reg_key_impl(uct_ib_md_t *md, void *address,
                                 size_t length, uint64_t access_flags,
                                 uct_ib_mem_t *memh, uct_ib_mr_t *mr,
                                 uct_ib_mr_type_t mr_type)
{
    ucs_status_t status;

    status = uct_ib_reg_mr(md->pd, address, length, access_flags, &mr->ib);
    if (status != UCS_OK) {
        return status;
    }

    if (mr_type == UCT_IB_MR_DEFAULT) {
        uct_ib_memh_init_keys(memh, mr->ib->lkey, mr->ib->rkey);
    }

    return UCS_OK;
}

static ucs_status_t uct_ib_verbs_dereg_key(uct_ib_md_t *md,
                                           uct_ib_mem_t *ib_memh,
                                           uct_ib_mr_type_t mr_type)
{
    uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t);

    return uct_ib_dereg_mr(memh->mrs[mr_type].ib);
}

static ucs_status_t uct_ib_verbs_reg_atomic_key(uct_ib_md_t *ibmd,
                                                uct_ib_mem_t *ib_memh)
{
    uct_ib_mr_type_t mr_type = uct_ib_memh_get_atomic_base_mr_type(ib_memh);
    uct_ib_verbs_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t);

    if (mr_type != UCT_IB_MR_STRICT_ORDER) {
        return UCS_ERR_UNSUPPORTED;
    }

    memh->super.atomic_rkey = memh->mrs[mr_type].ib->rkey;
    return UCS_OK;
}

static ucs_status_t
uct_ib_mem_advise(uct_md_h uct_md, uct_mem_h memh, void *addr,
                  size_t length, unsigned advice)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    ucs_debug("memh %p advice %d", memh, advice);
    if ((advice == UCT_MADV_WILLNEED) && !md->config.odp.prefetch) {
        return md->ops->mem_prefetch(md, memh, addr, length);
    }

    return UCS_OK;
}

static ucs_status_t uct_ib_mkey_pack(uct_md_h uct_md, uct_mem_h uct_memh,
                                     void *rkey_buffer)
{
    uct_ib_md_t *md    = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_mem_t *memh = uct_memh;
    uint32_t atomic_rkey;
    ucs_status_t status;

    /* create a UMR only if the user requested atomic access to the
     * memory region and the hardware supports it.
     */
    if (((memh->flags & UCT_IB_MEM_ACCESS_REMOTE_ATOMIC) ||
         (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING)) &&
        !(memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) &&
        (memh != md->global_odp))
    {
        /* create UMR on-demand */
        UCS_PROFILE_CODE("reg atomic key") {
            status = md->ops->reg_atomic_key(md, memh);
        }
        if (status == UCS_OK) {
            memh->flags |= UCT_IB_MEM_FLAG_ATOMIC_MR;
            ucs_trace("created atomic key 0x%x for 0x%x", memh->atomic_rkey,
                      memh->lkey);
        } else if (status != UCS_ERR_UNSUPPORTED) {
            return status;
        }
    }
    if (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) {
        atomic_rkey = memh->atomic_rkey;
    } else {
        atomic_rkey = UCT_IB_INVALID_RKEY;
    }

    uct_ib_md_pack_rkey(memh->rkey, atomic_rkey, rkey_buffer);
    return UCS_OK;
}

static ucs_status_t uct_ib_rkey_unpack(uct_component_t *component,
                                       const void *rkey_buffer, uct_rkey_t *rkey_p,
                                       void **handle_p)
{
    uint64_t packed_rkey = *(const uint64_t*)rkey_buffer;

    *rkey_p   = packed_rkey;
    *handle_p = NULL;
    ucs_trace("unpacked rkey 0x%llx: direct 0x%x indirect 0x%x",
              (unsigned long long)packed_rkey,
              uct_ib_md_direct_rkey(*rkey_p), uct_ib_md_indirect_rkey(*rkey_p));
    return UCS_OK;
}
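/*
 * Sketch of the packed rkey layout assumed by the pack/unpack pair above,
 * based on how uct_ib_md_direct_rkey()/uct_ib_md_indirect_rkey() split the
 * 64-bit value (see ib_md.h for the authoritative definition):
 *
 *   uint64_t packed = ((uint64_t)atomic_rkey << 32) | direct_rkey;
 *
 * where atomic_rkey is UCT_IB_INVALID_RKEY when no indirect/atomic key exists.
 */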

static uct_md_ops_t uct_ib_md_ops = {
    .close              = uct_ib_md_close,
    .query              = uct_ib_md_query,
    .mem_reg            = uct_ib_mem_reg,
    .mem_dereg          = uct_ib_mem_dereg,
    .mem_advise         = uct_ib_mem_advise,
    .mkey_pack          = uct_ib_mkey_pack,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};

static inline uct_ib_rcache_region_t* uct_ib_rcache_region_from_memh(uct_mem_h memh)
{
    return ucs_container_of(memh, uct_ib_rcache_region_t, memh);
}

static ucs_status_t uct_ib_mem_rcache_reg(uct_md_h uct_md, void *address,
                                          size_t length, unsigned flags,
                                          uct_mem_h *memh_p)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    ucs_rcache_region_t *rregion;
    ucs_status_t status;
    uct_ib_mem_t *memh;

    status = ucs_rcache_get(md->rcache, address, length, PROT_READ|PROT_WRITE,
                            &flags, &rregion);
    if (status != UCS_OK) {
        return status;
    }

    ucs_assert(rregion->refcount > 0);
    memh = &ucs_derived_of(rregion, uct_ib_rcache_region_t)->memh;
    /* The original region was registered without atomic access,
     * so update the access flags. Actual UMR creation will happen
     * when uct_ib_mkey_pack() is called.
     */
    if (flags & UCT_MD_MEM_ACCESS_REMOTE_ATOMIC) {
        memh->flags |= UCT_IB_MEM_ACCESS_REMOTE_ATOMIC;
    }
    *memh_p = memh;
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_ib_md_t *md                = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_rcache_region_t *region = uct_ib_rcache_region_from_memh(memh);

    ucs_rcache_region_put(md->rcache, &region->super);
    return UCS_OK;
}

static uct_md_ops_t uct_ib_md_rcache_ops = {
    .close              = uct_ib_md_close,
    .query              = uct_ib_md_query,
    .mem_reg            = uct_ib_mem_rcache_reg,
    .mem_dereg          = uct_ib_mem_rcache_dereg,
    .mem_advise         = uct_ib_mem_advise,
    .mkey_pack          = uct_ib_mkey_pack,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};

static ucs_status_t uct_ib_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache,
                                             void *arg, ucs_rcache_region_t *rregion,
                                             uint16_t rcache_mem_reg_flags)
{
    uct_ib_rcache_region_t *region = ucs_derived_of(rregion, uct_ib_rcache_region_t);
    uct_ib_md_t *md = context;
    int *flags      = arg;
    int silent      = (rcache_mem_reg_flags & UCS_RCACHE_MEM_REG_HIDE_ERRORS) ||
                      (*flags & UCT_MD_MEM_FLAG_HIDE_ERRORS);
    ucs_status_t status;

    status = uct_ib_mem_reg_internal(&md->super, (void*)region->super.super.start,
                                     region->super.super.end - region->super.super.start,
                                     *flags, silent, &region->memh);
    if (status != UCS_OK) {
        return status;
    }

    return UCS_OK;
}

static void uct_ib_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache,
                                       ucs_rcache_region_t *rregion)
{
    uct_ib_rcache_region_t *region = ucs_derived_of(rregion, uct_ib_rcache_region_t);
    uct_ib_md_t *md = (uct_ib_md_t *)context;

    (void)uct_ib_memh_dereg(md, &region->memh);
}

static void uct_ib_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache,
                                         ucs_rcache_region_t *rregion, char *buf,
                                         size_t max)
{
    uct_ib_rcache_region_t *region = ucs_derived_of(rregion, uct_ib_rcache_region_t);
    uct_ib_mem_t *memh = &region->memh;

    snprintf(buf, max, "lkey 0x%x rkey 0x%x atomic_rkey 0x%x",
             memh->lkey, memh->rkey,
             (memh->flags & UCT_IB_MEM_FLAG_ATOMIC_MR) ? memh->atomic_rkey :
             UCT_IB_INVALID_RKEY);
}

static ucs_rcache_ops_t uct_ib_rcache_ops = {
    .mem_reg     = uct_ib_rcache_mem_reg_cb,
    .mem_dereg   = uct_ib_rcache_mem_dereg_cb,
    .dump_region = uct_ib_rcache_dump_region_cb
};

static ucs_status_t uct_ib_md_odp_query(uct_md_h uct_md, uct_md_attr_t *md_attr)
{
    ucs_status_t status;

    status = uct_ib_md_query(uct_md, md_attr);
    if (status != UCS_OK) {
        return status;
    }

    /* ODP supports only host memory */
    md_attr->cap.reg_mem_types &= UCS_BIT(UCS_MEMORY_TYPE_HOST);
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_global_odp_reg(uct_md_h uct_md, void *address,
                                              size_t length, unsigned flags,
                                              uct_mem_h *memh_p)
{
    uct_ib_md_t *md    = ucs_derived_of(uct_md, uct_ib_md_t);
    uct_ib_mem_t *memh = md->global_odp;

    ucs_assert(md->global_odp != NULL);
    if (flags & UCT_MD_MEM_FLAG_LOCK) {
        return uct_ib_mem_reg(uct_md, address, length, flags, memh_p);
    }

    if (md->config.odp.prefetch) {
        md->ops->mem_prefetch(md, memh, address, length);
    }

    /* cppcheck-suppress autoVariables */
    *memh_p = md->global_odp;
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_global_odp_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    if (memh == md->global_odp) {
        return UCS_OK;
    }

    return uct_ib_mem_dereg(uct_md, memh);
}

static uct_md_ops_t UCS_V_UNUSED uct_ib_md_global_odp_ops = {
    .close              = uct_ib_md_close,
    .query              = uct_ib_md_odp_query,
    .mem_reg            = uct_ib_mem_global_odp_reg,
    .mem_dereg          = uct_ib_mem_global_odp_dereg,
    .mem_advise         = uct_ib_mem_advise,
    .mkey_pack          = uct_ib_mkey_pack,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};

static ucs_status_t uct_ib_query_md_resources(uct_component_t *component,
                                              uct_md_resource_desc_t **resources_p,
                                              unsigned *num_resources_p)
{
    UCS_MODULE_FRAMEWORK_DECLARE(uct_ib);
    uct_md_resource_desc_t *resources;
    struct ibv_device **device_list;
    ucs_status_t status;
    int i, num_devices;

    UCS_MODULE_FRAMEWORK_LOAD(uct_ib, 0);

    /* Get device list from driver */
    device_list = ibv_get_device_list(&num_devices);
    if (device_list == NULL) {
        ucs_debug("Failed to get IB device list, assuming no devices are present");
        *resources_p     = NULL;
        *num_resources_p = 0;
        return UCS_OK;
    }

    resources = ucs_calloc(num_devices, sizeof(*resources), "ib resources");
    if (resources == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto out_free_device_list;
    }

    for (i = 0; i < num_devices; ++i) {
        ucs_snprintf_zero(resources[i].md_name, sizeof(resources[i].md_name),
                          "%s", ibv_get_device_name(device_list[i]));
    }

    *resources_p     = resources;
    *num_resources_p = num_devices;
    status           = UCS_OK;

out_free_device_list:
    ibv_free_device_list(device_list);
    return status;
}

static void uct_ib_fork_warn()
{
    ucs_warn("IB: ibv_fork_init() was disabled or failed, yet a fork() has been issued.");
    ucs_warn("IB: data corruption might occur when using registered memory.");
}

static void uct_ib_fork_warn_enable()
{
    static volatile uint32_t enabled = 0;
    int ret;

    if (ucs_atomic_cswap32(&enabled, 0, 1) != 0) {
        return;
    }

    ret = pthread_atfork(uct_ib_fork_warn, NULL, NULL);
    if (ret) {
        ucs_warn("registering fork() warning failed: %m");
    }
}

static void uct_ib_md_release_device_config(uct_ib_md_t *md)
{
    unsigned i;

    for (i = 0; i < md->custom_devices.count; ++i) {
        free((char*)md->custom_devices.specs[i].name);
    }
    ucs_free(md->custom_devices.specs);
}

static ucs_status_t UCS_V_UNUSED
uct_ib_md_global_odp_init(uct_ib_md_t *md, uct_mem_h *memh_p)
{
    uct_ib_verbs_mem_t *global_odp;
    uct_ib_mr_t *mr;
    ucs_status_t status;

    global_odp = (uct_ib_verbs_mem_t *)uct_ib_memh_alloc(md);
    if (global_odp == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    mr = &global_odp->mrs[UCT_IB_MR_DEFAULT];
    status = uct_ib_reg_mr(md->pd, 0, UINT64_MAX,
                           UCT_IB_MEM_ACCESS_FLAGS | IBV_ACCESS_ON_DEMAND,
                           &mr->ib);
    if (status != UCS_OK) {
        ucs_debug("%s: failed to register global mr: %m",
                  uct_ib_device_name(&md->dev));
        goto err;
    }

    global_odp->super.flags = UCT_IB_MEM_FLAG_ODP;
    uct_ib_memh_init_keys(&global_odp->super, mr->ib->lkey, mr->ib->rkey);
    *memh_p = global_odp;
    return UCS_OK;

err:
    uct_ib_memh_free(&global_odp->super);
    return status;
}

static ucs_status_t
uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr,
                            const uct_ib_md_config_t *md_config)
{
    ucs_rcache_params_t rcache_params;
    ucs_status_t status;
    int i;

    for (i = 0; i < md_config->reg_methods.count; ++i) {
        if (!strcasecmp(md_config->reg_methods.rmtd[i], "rcache")) {
            rcache_params.region_struct_size = sizeof(ucs_rcache_region_t) +
                                               md->memh_struct_size;
            rcache_params.alignment          = md_config->rcache.alignment;
            rcache_params.max_alignment      = ucs_get_page_size();
            rcache_params.ucm_events         = UCM_EVENT_VM_UNMAPPED;
            if (md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) {
                rcache_params.ucm_events |= UCM_EVENT_MEM_TYPE_FREE;
            }
            rcache_params.ucm_event_priority = md_config->rcache.event_prio;
            rcache_params.context            = md;
            rcache_params.ops                = &uct_ib_rcache_ops;
            rcache_params.flags              = 0;

            status = ucs_rcache_create(&rcache_params, uct_ib_device_name(&md->dev),
                                       UCS_STATS_RVAL(md->stats), &md->rcache);
            if (status != UCS_OK) {
                ucs_debug("%s: failed to create registration cache: %s",
                          uct_ib_device_name(&md->dev),
                          ucs_status_string(status));
                continue;
            }

            md->super.ops = &uct_ib_md_rcache_ops;
            md->reg_cost  = ucs_linear_func_make(md_config->rcache.overhead, 0);
            ucs_debug("%s: using registration cache",
                      uct_ib_device_name(&md->dev));
            return UCS_OK;
#if HAVE_ODP_IMPLICIT
        } else if (!strcasecmp(md_config->reg_methods.rmtd[i], "odp")) {
            if (!(md->dev.flags & UCT_IB_DEVICE_FLAG_ODP_IMPLICIT)) {
                ucs_debug("%s: on-demand-paging with global memory region is "
                          "not supported", uct_ib_device_name(&md->dev));
                continue;
            }

            status = uct_ib_md_global_odp_init(md, &md->global_odp);
            if (status != UCS_OK) {
                continue;
            }

            md->super.ops = &uct_ib_md_global_odp_ops;
            md->reg_cost  = ucs_linear_func_make(10e-9, 0);
            ucs_debug("%s: using odp global key", uct_ib_device_name(&md->dev));
            return UCS_OK;
#endif
        } else if (!strcmp(md_config->reg_methods.rmtd[i], "direct")) {
            md->super.ops = &uct_ib_md_ops;
            md->reg_cost  = md_config->uc_reg_cost;
            ucs_debug("%s: using direct registration",
                      uct_ib_device_name(&md->dev));
            return UCS_OK;
        }
    }

    return UCS_ERR_INVALID_PARAM;
}
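/*
 * The methods above are tried in the order given by REG_METHODS; the first
 * one that initializes successfully wins. For example (illustrative), with
 * the default "rcache,odp,direct", a device without implicit ODP support
 * falls through the "odp" branch and still ends up with either the rcache or
 * direct registration, so the list acts as a preference-ordered fallback
 * chain.
 */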

static ucs_status_t
uct_ib_md_parse_device_config(uct_ib_md_t *md, const uct_ib_md_config_t *md_config)
{
    uct_ib_device_spec_t *spec;
    ucs_status_t status;
    char *flags_str, *p;
    unsigned i, count;
    int nfields;

    count = md->custom_devices.count = md_config->custom_devices.count;
    if (count == 0) {
        md->custom_devices.specs = NULL;
        md->custom_devices.count = 0;
        return UCS_OK;
    }

    md->custom_devices.specs = ucs_calloc(count, sizeof(*md->custom_devices.specs),
                                          "ib_custom_devices");
    if (md->custom_devices.specs == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    for (i = 0; i < count; ++i) {
        spec    = &md->custom_devices.specs[i];
        nfields = sscanf(md_config->custom_devices.spec[i],
                         "%hi:%hi:%m[^:]:%m[^:]:%hhu",
                         &spec->pci_id.vendor, &spec->pci_id.device, &spec->name,
                         &flags_str, &spec->priority);
        if (nfields < 2) {
            ucs_error("failed to parse device config '%s' (parsed: %d/%d)",
                      md_config->custom_devices.spec[i], nfields, 5);
            status = UCS_ERR_INVALID_PARAM;
            goto err_free;
        }

        if (nfields >= 4) {
            for (p = flags_str; *p != 0; ++p) {
                if (*p == '4') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_MLX4_PRM;
                } else if (*p == '5') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_MLX5_PRM;
                } else if (*p == 'd') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_DC_V1;
                } else if (*p == 'D') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_DC_V2;
                } else if (*p == 'a') {
                    spec->flags |= UCT_IB_DEVICE_FLAG_AV;
                } else {
                    ucs_error("invalid device flag: '%c'", *p);
                    free(flags_str);
                    status = UCS_ERR_INVALID_PARAM;
                    goto err_free;
                }
            }
            free(flags_str);
        }

        ucs_trace("added device '%s' vendor_id 0x%x device_id %d flags %c%c prio %d",
                  spec->name, spec->pci_id.vendor, spec->pci_id.device,
                  (spec->flags & UCT_IB_DEVICE_FLAG_MLX4_PRM) ? '4' : '-',
                  (spec->flags & UCT_IB_DEVICE_FLAG_MLX5_PRM) ? '5' : '-',
                  spec->priority);
    }

    return UCS_OK;

err_free:
    uct_ib_md_release_device_config(md);
err:
    return status;
}

static void uct_ib_md_release_reg_method(uct_ib_md_t *md)
{
    if (md->rcache != NULL) {
        ucs_rcache_destroy(md->rcache);
    }
    if (md->global_odp != NULL) {
        uct_ib_mem_dereg(&md->super, md->global_odp);
    }
}

static ucs_status_t
uct_ib_md_parse_subnet_prefix(const char *subnet_prefix_str,
                              uint64_t *subnet_prefix)
{
    uint16_t pfx[4] = {0};
    uint64_t pfx64  = 0;
    int res, i;

    res = sscanf(subnet_prefix_str, "%hx:%hx:%hx:%hx",
                 &pfx[0], &pfx[1], &pfx[2], &pfx[3]);
    if (res != 4) {
        ucs_error("subnet filter '%s' is invalid", subnet_prefix_str);
        return UCS_ERR_INVALID_PARAM;
    }

    for (i = 0; i < 4; i++) {
        pfx64 = pfx[i] + (pfx64 << 16);
    }

    *subnet_prefix = htobe64(pfx64);
    return UCS_OK;
}
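/*
 * For example, the default IB subnet prefix "fe80:0:0:0" parses to
 * pfx64 = 0xfe80000000000000, which htobe64() stores in network byte order
 * for comparison against the port's GID subnet prefix.
 */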

static double uct_ib_md_read_pci_bw(struct ibv_device *ib_device)
{
    const char *pci_width_file_name = "current_link_width";
    const char *pci_speed_file_name = "current_link_speed";
    char pci_width_str[16];
    char pci_speed_str[16];
    char gts[16];
    const uct_ib_md_pci_info_t *p;
    double bw, effective_bw;
    unsigned width;
    ssize_t len;
    size_t i;

    len = ucs_read_file(pci_width_str, sizeof(pci_width_str) - 1, 1,
                        UCT_IB_DEVICE_SYSFS_FMT, ib_device->name,
                        pci_width_file_name);
    if (len < 1) {
        ucs_debug("failed to read file: " UCT_IB_DEVICE_SYSFS_FMT,
                  ib_device->name, pci_width_file_name);
        return DBL_MAX; /* failed to read file */
    }
    pci_width_str[len] = '\0';

    len = ucs_read_file(pci_speed_str, sizeof(pci_speed_str) - 1, 1,
                        UCT_IB_DEVICE_SYSFS_FMT, ib_device->name,
                        pci_speed_file_name);
    if (len < 1) {
        ucs_debug("failed to read file: " UCT_IB_DEVICE_SYSFS_FMT,
                  ib_device->name, pci_speed_file_name);
        return DBL_MAX; /* failed to read file */
    }
    pci_speed_str[len] = '\0';

    if (sscanf(pci_width_str, "%u", &width) < 1) {
        ucs_debug("incorrect format of %s file: expected: <unsigned integer>, actual: %s\n",
                  pci_width_file_name, pci_width_str);
        return DBL_MAX;
    }

    if ((sscanf(pci_speed_str, "%lf%s", &bw, gts) < 2) ||
        strcasecmp("GT/s", ucs_strtrim(gts))) {
        ucs_debug("incorrect format of %s file: expected: <double> GT/s, actual: %s\n",
                  pci_speed_file_name, pci_speed_str);
        return DBL_MAX;
    }

    bw *= UCS_GBYTE / 8; /* gigabit -> gigabyte */

    for (i = 0; i < ucs_static_array_size(uct_ib_md_pci_info); i++) {
        if (bw < (uct_ib_md_pci_info[i].bw * 1.2)) { /* use a 1.2 multiplier to avoid rounding issues */
            p = &uct_ib_md_pci_info[i]; /* use a pointer to make the equation shorter */
            /* coverity[overflow] */
            effective_bw = bw * width *
                           (p->payload * p->nack) /
                           (((p->payload + p->overhead) * p->nack) + p->ctrl) *
                           p->encoding / p->decoding;
            ucs_trace("%s: pcie %ux %s, effective throughput %.3lfMB/s (%.3lfGb/s)",
                      ib_device->name, width, p->name,
                      (effective_bw / UCS_MBYTE), (effective_bw * 8 / UCS_GBYTE));
            return effective_bw;
        }
    }

    return DBL_MAX;
}

static double uct_ib_md_pci_bw(const uct_ib_md_config_t *md_config,
                               struct ibv_device *ib_device)
{
    unsigned i;

    for (i = 0; i < md_config->pci_bw.count; i++) {
        if (!strcmp(ib_device->name, md_config->pci_bw.device[i].name)) {
            if (UCS_CONFIG_BW_IS_AUTO(md_config->pci_bw.device[i].bw)) {
                break; /* read data from system */
            }
            return md_config->pci_bw.device[i].bw;
        }
    }

    return uct_ib_md_read_pci_bw(ib_device);
}

ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name,
                            const uct_md_config_t *uct_md_config, uct_md_h *md_p)
{
    const uct_ib_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_ib_md_config_t);
    ucs_status_t status = UCS_ERR_UNSUPPORTED;
    uct_ib_md_t *md     = NULL;
    struct ibv_device **ib_device_list, *ib_device;
    uct_ib_md_ops_entry_t *md_ops_entry;
    int i, num_devices, ret, fork_init = 0;

    ucs_trace("opening IB device %s", md_name);

#if !HAVE_DEVX
    if (md_config->devx == UCS_YES) {
        ucs_error("DEVX requested but not supported");
        status = UCS_ERR_NO_DEVICE;
        goto out;
    }
#endif

    /* Get device list from driver */
    ib_device_list = ibv_get_device_list(&num_devices);
    if (ib_device_list == NULL) {
        ucs_debug("Failed to get IB device list, assuming no devices are present");
        status = UCS_ERR_NO_DEVICE;
        goto out;
    }

    ib_device = NULL;
    for (i = 0; i < num_devices; ++i) {
        if (!strcmp(ibv_get_device_name(ib_device_list[i]), md_name)) {
            ib_device = ib_device_list[i];
            break;
        }
    }

    if (ib_device == NULL) {
        ucs_debug("IB device %s not found", md_name);
        status = UCS_ERR_NO_DEVICE;
        goto out_free_dev_list;
    }

    if (md_config->fork_init != UCS_NO) {
        ret = ibv_fork_init();
        if (ret) {
            if (md_config->fork_init == UCS_YES) {
                ucs_error("ibv_fork_init() failed: %m");
                status = UCS_ERR_IO_ERROR;
                goto out_free_dev_list;
            }
            ucs_debug("ibv_fork_init() failed: %m, continuing, but fork may be unsafe.");
            uct_ib_fork_warn_enable();
        } else {
            fork_init = 1;
        }
    } else {
        uct_ib_fork_warn_enable();
    }

    ucs_list_for_each(md_ops_entry, &uct_ib_md_ops_list, list) {
        status = md_ops_entry->ops->open(ib_device, md_config, &md);
        if (status == UCS_OK) {
            ucs_debug("%s: md opened successfully by '%s'", md_name,
                      md_ops_entry->name);
            md->ops = md_ops_entry->ops;
            break;
        } else if (status != UCS_ERR_UNSUPPORTED) {
            goto out_free_dev_list;
        }
        ucs_debug("%s: md open by '%s' failed, trying next", md_name,
                  md_ops_entry->name);
    }

    if (status != UCS_OK) {
        ucs_assert(status == UCS_ERR_UNSUPPORTED);
        ucs_debug("Unsupported IB device %s", md_name);
        goto out_free_dev_list;
    }

    /* cppcheck-suppress autoVariables */
    *md_p         = &md->super;
    md->fork_init = fork_init;
    status        = UCS_OK;

out_free_dev_list:
    ibv_free_device_list(ib_device_list);
out:
    return status;
}

void uct_ib_md_parse_relaxed_order(uct_ib_md_t *md,
                                   const uct_ib_md_config_t *md_config)
{
    if (md_config->mr_relaxed_order == UCS_CONFIG_ON) {
        if (IBV_ACCESS_RELAXED_ORDERING) {
            md->relaxed_order = 1;
        } else {
            ucs_warn("relaxed order memory access requested but not supported");
        }
    } else if (md_config->mr_relaxed_order == UCS_CONFIG_AUTO) {
        if (ucs_cpu_prefer_relaxed_order()) {
            md->relaxed_order = 1;
        }
    }
}

ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
                                   struct ibv_device *ib_device,
                                   const uct_ib_md_config_t *md_config)
{
    uct_md_attr_t md_attr;
    ucs_status_t status;

    md->super.ops       = &uct_ib_md_ops;
    md->super.component = &uct_ib_component;

    if (md->config.odp.max_size == UCS_MEMUNITS_AUTO) {
        md->config.odp.max_size = uct_ib_device_odp_max_size(&md->dev);
    }

    /* Create statistics */
    status = UCS_STATS_NODE_ALLOC(&md->stats, &uct_ib_md_stats_class,
                                  ucs_stats_get_root(),
                                  "%s-%p", ibv_get_device_name(ib_device), md);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events
                                UCS_STATS_ARG(md->stats));
    if (status != UCS_OK) {
        goto err_release_stats;
    }

#if HAVE_DECL_IBV_EXP_SETENV
    ibv_exp_setenv(md->dev.ibv_context, "MLX_QP_ALLOC_TYPE", "ANON", 0);
    ibv_exp_setenv(md->dev.ibv_context, "MLX_CQ_ALLOC_TYPE", "ANON", 0);
#endif

    if (strlen(md_config->subnet_prefix) > 0) {
        status = uct_ib_md_parse_subnet_prefix(md_config->subnet_prefix,
                                               &md->subnet_filter);

        if (status != UCS_OK) {
            goto err_cleanup_device;
        }

        md->check_subnet_filter = 1;
    }

    /* Allocate memory domain */
    md->pd = ibv_alloc_pd(md->dev.ibv_context);
    if (md->pd == NULL) {
        ucs_error("ibv_alloc_pd() failed: %m");
        status = UCS_ERR_NO_MEMORY;
        goto err_cleanup_device;
    }

    status = uct_md_query(&md->super, &md_attr);
    if (status != UCS_OK) {
        goto err_dealloc_pd;
    }

    status = uct_ib_md_parse_reg_methods(md, &md_attr, md_config);
    if (status != UCS_OK) {
        goto err_dealloc_pd;
    }

    md->dev.max_zcopy_log_sge = INT_MAX;
    if (md_attr.cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) {
        md->dev.max_zcopy_log_sge = 1;
    }

    md->pci_bw = uct_ib_md_pci_bw(md_config, ib_device);
    return UCS_OK;

err_dealloc_pd:
    ibv_dealloc_pd(md->pd);
err_cleanup_device:
    uct_ib_device_cleanup(&md->dev);
err_release_stats:
    UCS_STATS_NODE_FREE(md->stats);
err:
    return status;
}

void uct_ib_md_close(uct_md_h uct_md)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);

    md->ops->cleanup(md);
    uct_ib_md_release_device_config(md);
    uct_ib_md_release_reg_method(md);
    uct_ib_device_cleanup_ah_cached(&md->dev);
    ibv_dealloc_pd(md->pd);
    uct_ib_device_cleanup(&md->dev);
    ibv_close_device(md->dev.ibv_context);
    UCS_STATS_NODE_FREE(md->stats);
    ucs_free(md);
}

static uct_ib_md_ops_t uct_ib_verbs_md_ops;

static ucs_status_t uct_ib_verbs_md_open(struct ibv_device *ibv_device,
                                         const uct_ib_md_config_t *md_config,
                                         uct_ib_md_t **p_md)
{
    uct_ib_device_t *dev;
    ucs_status_t status;
    uct_ib_md_t *md;
    int num_mrs;

    md = ucs_calloc(1, sizeof(*md), "ib_md");
    if (md == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    /* Open verbs context */
    dev              = &md->dev;
    dev->ibv_context = ibv_open_device(ibv_device);
    if (dev->ibv_context == NULL) {
        ucs_error("ibv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device));
        status = UCS_ERR_IO_ERROR;
        goto err;
    }

    md->config = md_config->ext;

    status = uct_ib_device_query(dev, ibv_device);
    if (status != UCS_OK) {
        goto err_free_context;
    }

    if (UCT_IB_HAVE_ODP_IMPLICIT(&dev->dev_attr)) {
        md->dev.flags |= UCT_IB_DEVICE_FLAG_ODP_IMPLICIT;
    }

    if (IBV_EXP_HAVE_ATOMIC_HCA(&dev->dev_attr)) {
        dev->atomic_arg_sizes = sizeof(uint64_t);
    }

    md->ops = &uct_ib_verbs_md_ops;
    status  = uct_ib_md_parse_device_config(md, md_config);
    if (status != UCS_OK) {
        goto err_free_context;
    }

    uct_ib_md_parse_relaxed_order(md, md_config);
    num_mrs = 1;      /* UCT_IB_MR_DEFAULT */

    if (md->relaxed_order) {
        ++num_mrs;    /* UCT_IB_MR_STRICT_ORDER */
    }

    md->memh_struct_size = sizeof(uct_ib_verbs_mem_t) +
                           (sizeof(uct_ib_mr_t) * num_mrs);

    status = uct_ib_md_open_common(md, ibv_device, md_config);
    if (status != UCS_OK) {
        goto err_dev_cfg;
    }

    md->dev.flags = uct_ib_device_spec(&md->dev)->flags;
    *p_md = md;
    return UCS_OK;

err_dev_cfg:
    uct_ib_md_release_device_config(md);
err_free_context:
    ibv_close_device(dev->ibv_context);
err:
    ucs_free(md);
    return status;
}

static uct_ib_md_ops_t uct_ib_verbs_md_ops = {
    .open                = uct_ib_verbs_md_open,
    .cleanup             = (uct_ib_md_cleanup_func_t)ucs_empty_function,
    .reg_key             = uct_ib_verbs_reg_key,
    .dereg_key           = uct_ib_verbs_dereg_key,
    .reg_atomic_key      = uct_ib_verbs_reg_atomic_key,
    .dereg_atomic_key    = (uct_ib_md_dereg_atomic_key_func_t)ucs_empty_function_return_success,
    .reg_multithreaded   = (uct_ib_md_reg_multithreaded_func_t)ucs_empty_function_return_unsupported,
    .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported,
    .mem_prefetch        = (uct_ib_md_mem_prefetch_func_t)ucs_empty_function_return_success,
    .get_atomic_mr_id    = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported,
};

UCT_IB_MD_OPS(uct_ib_verbs_md_ops, 0);

uct_component_t uct_ib_component = {
    .query_md_resources = uct_ib_query_md_resources,
    .md_open            = uct_ib_md_open,
    .cm_open            = ucs_empty_function_return_unsupported,
    .rkey_unpack        = uct_ib_rkey_unpack,
    .rkey_ptr           = ucs_empty_function_return_unsupported,
    .rkey_release       = ucs_empty_function_return_success,
    .name               = "ib",
    .md_config          = {
        .name           = "IB memory domain",
        .prefix         = UCT_IB_CONFIG_PREFIX,
        .table          = uct_ib_md_config_table,
        .size           = sizeof(uct_ib_md_config_t),
    },
    .cm_config          = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY,
    .tl_list            = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_ib_component),
    .flags              = 0
};
UCT_COMPONENT_REGISTER(&uct_ib_component);