1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
3 * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED.
4 * See file LICENSE for terms.
5 */
6
7 #ifdef HAVE_CONFIG_H
8 # include "config.h"
9 #endif
10
11 #include "ib_device.h"
12 #include "ib_md.h"
13
14 #include <ucs/arch/bitops.h>
15 #include <ucs/debug/memtrack.h>
16 #include <ucs/debug/log.h>
17 #include <ucs/async/async.h>
18 #include <ucs/sys/compiler.h>
19 #include <ucs/sys/string.h>
20 #include <ucs/sys/sock.h>
21 #include <ucs/sys/sys.h>
22 #include <sys/poll.h>
23 #include <sched.h>
24 #include <libgen.h>
25
26
/* This table is according to "Encoding for RNR NAK Timer Field"
 * in IBTA specification */
/* Index 0 is the special encoding for the maximum delay (655.36 ms);
 * indices 1..31 grow monotonically. Values are in milliseconds. */
const double uct_ib_qp_rnr_time_ms[] = {
    655.36, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.12,
    0.16, 0.24, 0.32, 0.48, 0.64, 0.96, 1.28, 1.92,
    2.56, 3.84, 5.12, 7.68, 10.24, 15.36, 20.48, 30.72,
    40.96, 61.44, 81.92, 122.88, 163.84, 245.76, 327.68, 491.52
};
35
36
37 /* use both gid + lid data for key generarion (lid - ib based, gid - RoCE) */
38 static UCS_F_ALWAYS_INLINE
uct_ib_kh_ah_hash_func(struct ibv_ah_attr attr)39 khint32_t uct_ib_kh_ah_hash_func(struct ibv_ah_attr attr)
40 {
41 return kh_int64_hash_func(attr.grh.dgid.global.subnet_prefix ^
42 attr.grh.dgid.global.interface_id ^
43 attr.dlid);
44 }
45
46 static UCS_F_ALWAYS_INLINE
uct_ib_kh_ah_hash_equal(struct ibv_ah_attr a,struct ibv_ah_attr b)47 int uct_ib_kh_ah_hash_equal(struct ibv_ah_attr a, struct ibv_ah_attr b)
48 {
49 return !memcmp(&a, &b, sizeof(a));
50 }
51
/* Hash table mapping ibv_ah_attr (by value) to cached ibv_ah pointers */
KHASH_IMPL(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*, 1,
           uct_ib_kh_ah_hash_func, uct_ib_kh_ah_hash_equal)
54
55
#ifdef ENABLE_STATS
/* Per-device statistics class; currently counts only async events */
static ucs_stats_class_t uct_ib_device_stats_class = {
    .name = "",
    .num_counters = UCT_IB_DEVICE_STAT_LAST,
    .counter_names = {
        [UCT_IB_DEVICE_STAT_ASYNC_EVENT] = "async_event"
    }
};
#endif
65
/* Built-in table of known HCAs, keyed by PCI {vendor_id, device_id}.
 * Each entry: marketing name, PCI id pair, capability flags, and a numeric
 * value (presumably a ranking/priority — confirm against uct_ib_device_spec_t
 * declaration). Terminated by a catch-all "Generic HCA" entry (matches
 * vendor/device 0 only by exact comparison) and a NULL-name sentinel. */
static uct_ib_device_spec_t uct_ib_builtin_device_specs[] = {
  {"ConnectX-3", {0x15b3, 4099},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 10},
  {"ConnectX-3 Pro", {0x15b3, 4103},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 11},
  {"Connect-IB", {0x15b3, 4113},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 20},
  {"ConnectX-4", {0x15b3, 4115},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 30},
  {"ConnectX-4", {0x15b3, 4116},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 29},
  {"ConnectX-4 LX", {0x15b3, 4117},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 28},
  {"ConnectX-4 LX VF", {0x15b3, 4118},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 28},
  {"ConnectX-5", {0x15b3, 4119},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 38},
  {"ConnectX-5", {0x15b3, 4121},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 40},
  {"ConnectX-5", {0x15b3, 4120},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 39},
  {"ConnectX-5", {0x15b3, 41682},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 37},
  {"ConnectX-5", {0x15b3, 4122},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 36},
  {"ConnectX-6", {0x15b3, 4123},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"ConnectX-6 VF", {0x15b3, 4124},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"ConnectX-6 DX", {0x15b3, 4125},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"Generic HCA", {0, 0}, 0, 0},
  {NULL}
};
113
uct_ib_device_get_locality(const char * dev_name,ucs_sys_cpuset_t * cpu_mask,int * numa_node)114 static void uct_ib_device_get_locality(const char *dev_name,
115 ucs_sys_cpuset_t *cpu_mask,
116 int *numa_node)
117 {
118 char *p, buf[ucs_max(CPU_SETSIZE, 10)];
119 ucs_status_t status;
120 ssize_t nread;
121 uint32_t word;
122 int base, k;
123 long n;
124
125 /* Read list of CPUs close to the device */
126 CPU_ZERO(cpu_mask);
127 nread = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_DEVICE_SYSFS_FMT,
128 dev_name, "local_cpus");
129 if (nread >= 0) {
130 buf[CPU_SETSIZE - 1] = '\0';
131 base = 0;
132 do {
133 p = strrchr(buf, ',');
134 if (p == NULL) {
135 p = buf;
136 } else if (*p == ',') {
137 *(p++) = 0;
138 }
139
140 word = strtoul(p, 0, 16);
141 for (k = 0; word; ++k, word >>= 1) {
142 if (word & 1) {
143 CPU_SET(base + k, cpu_mask);
144 }
145 }
146 base += 32;
147 } while ((base < CPU_SETSIZE) && (p != buf));
148 } else {
149 /* If affinity file is not present, treat all CPUs as local */
150 for (k = 0; k < CPU_SETSIZE; ++k) {
151 CPU_SET(k, cpu_mask);
152 }
153 }
154
155 /* Read NUMA node number */
156 status = ucs_read_file_number(&n, 1,
157 "/sys/class/infiniband/%s/device/numa_node",
158 dev_name);
159 *numa_node = (status == UCS_OK) ? n : -1;
160 }
161
uct_ib_async_event_handler(int fd,int events,void * arg)162 static void uct_ib_async_event_handler(int fd, int events, void *arg)
163 {
164 uct_ib_device_t *dev = arg;
165 struct ibv_async_event ibevent;
166 uct_ib_async_event_t event;
167 int ret;
168
169 ret = ibv_get_async_event(dev->ibv_context, &ibevent);
170 if (ret != 0) {
171 if (errno != EAGAIN) {
172 ucs_warn("ibv_get_async_event() failed: %m");
173 }
174 return;
175 }
176
177 event.event_type = ibevent.event_type;
178 switch (event.event_type) {
179 case IBV_EVENT_CQ_ERR:
180 event.cookie = ibevent.element.cq;
181 break;
182 case IBV_EVENT_QP_FATAL:
183 case IBV_EVENT_QP_REQ_ERR:
184 case IBV_EVENT_QP_ACCESS_ERR:
185 case IBV_EVENT_COMM_EST:
186 case IBV_EVENT_SQ_DRAINED:
187 case IBV_EVENT_PATH_MIG:
188 case IBV_EVENT_PATH_MIG_ERR:
189 case IBV_EVENT_QP_LAST_WQE_REACHED:
190 event.qp_num = ibevent.element.qp->qp_num;
191 break;
192 case IBV_EVENT_SRQ_ERR:
193 case IBV_EVENT_SRQ_LIMIT_REACHED:
194 event.cookie = ibevent.element.srq;
195 break;
196 case IBV_EVENT_DEVICE_FATAL:
197 case IBV_EVENT_PORT_ERR:
198 case IBV_EVENT_PORT_ACTIVE:
199 #if HAVE_DECL_IBV_EVENT_GID_CHANGE
200 case IBV_EVENT_GID_CHANGE:
201 #endif
202 case IBV_EVENT_LID_CHANGE:
203 case IBV_EVENT_PKEY_CHANGE:
204 case IBV_EVENT_SM_CHANGE:
205 case IBV_EVENT_CLIENT_REREGISTER:
206 event.port_num = ibevent.element.port_num;
207 break;
208 #ifdef HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT
209 case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
210 case IBV_EXP_EVENT_DCT_ACCESS_ERR:
211 case IBV_EXP_EVENT_DCT_REQ_ERR:
212 if (ibevent.element.dct) {
213 event.dct_num = ibevent.element.dct->dct_num;
214 } else {
215 event.dct_num = 0;
216 }
217 break;
218 #endif
219 default:
220 break;
221 };
222
223 uct_ib_handle_async_event(dev, &event);
224 ibv_ack_async_event(&ibevent);
225 }
226
uct_ib_handle_async_event(uct_ib_device_t * dev,uct_ib_async_event_t * event)227 void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event)
228 {
229 char event_info[200];
230 ucs_log_level_t level;
231
232 switch (event->event_type) {
233 case IBV_EVENT_CQ_ERR:
234 snprintf(event_info, sizeof(event_info), "%s on CQ %p",
235 ibv_event_type_str(event->event_type), event->cookie);
236 level = UCS_LOG_LEVEL_ERROR;
237 break;
238 case IBV_EVENT_QP_FATAL:
239 case IBV_EVENT_QP_REQ_ERR:
240 case IBV_EVENT_QP_ACCESS_ERR:
241 case IBV_EVENT_COMM_EST:
242 case IBV_EVENT_SQ_DRAINED:
243 case IBV_EVENT_PATH_MIG:
244 case IBV_EVENT_PATH_MIG_ERR:
245 snprintf(event_info, sizeof(event_info), "%s on QPN 0x%x",
246 ibv_event_type_str(event->event_type), event->qp_num);
247 level = UCS_LOG_LEVEL_ERROR;
248 break;
249 case IBV_EVENT_QP_LAST_WQE_REACHED:
250 snprintf(event_info, sizeof(event_info), "SRQ-attached QP 0x%x was flushed",
251 event->qp_num);
252 level = UCS_LOG_LEVEL_DEBUG;
253 break;
254 case IBV_EVENT_SRQ_ERR:
255 level = UCS_LOG_LEVEL_ERROR;
256 snprintf(event_info, sizeof(event_info), "%s on SRQ %p",
257 ibv_event_type_str(event->event_type), event->cookie);
258 break;
259 case IBV_EVENT_SRQ_LIMIT_REACHED:
260 snprintf(event_info, sizeof(event_info), "%s on SRQ %p",
261 ibv_event_type_str(event->event_type), event->cookie);
262 level = UCS_LOG_LEVEL_DEBUG;
263 break;
264 case IBV_EVENT_DEVICE_FATAL:
265 case IBV_EVENT_PORT_ERR:
266 snprintf(event_info, sizeof(event_info), "%s on port %d",
267 ibv_event_type_str(event->event_type), event->port_num);
268 level = UCS_LOG_LEVEL_ERROR;
269 break;
270 case IBV_EVENT_PORT_ACTIVE:
271 #if HAVE_DECL_IBV_EVENT_GID_CHANGE
272 case IBV_EVENT_GID_CHANGE:
273 #endif
274 case IBV_EVENT_LID_CHANGE:
275 case IBV_EVENT_PKEY_CHANGE:
276 case IBV_EVENT_SM_CHANGE:
277 case IBV_EVENT_CLIENT_REREGISTER:
278 snprintf(event_info, sizeof(event_info), "%s on port %d",
279 ibv_event_type_str(event->event_type), event->port_num);
280 level = UCS_LOG_LEVEL_WARN;
281 break;
282 #ifdef HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT
283 case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
284 snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
285 "DCT key violation", event->dct_num);
286 level = UCS_LOG_LEVEL_ERROR;
287 break;
288 case IBV_EXP_EVENT_DCT_ACCESS_ERR:
289 if (event->dct_num) {
290 snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
291 "DCT access error", event->dct_num);
292 } else {
293 snprintf(event_info, sizeof(event_info), "%s on DCTN UNKNOWN",
294 "DCT access error");
295 }
296 level = UCS_LOG_LEVEL_ERROR;
297 break;
298 case IBV_EXP_EVENT_DCT_REQ_ERR:
299 snprintf(event_info, sizeof(event_info), "%s on DCTN 0x%x",
300 "DCT requester error", event->dct_num);
301 level = UCS_LOG_LEVEL_ERROR;
302 break;
303 #endif
304 default:
305 snprintf(event_info, sizeof(event_info), "%s (%d)",
306 ibv_event_type_str(event->event_type), event->event_type);
307 level = UCS_LOG_LEVEL_INFO;
308 break;
309 };
310
311 UCS_STATS_UPDATE_COUNTER(dev->stats, UCT_IB_DEVICE_STAT_ASYNC_EVENT, +1);
312 ucs_log(level, "IB Async event on %s: %s", uct_ib_device_name(dev), event_info);
313 }
314
uct_ib_device_get_ids(uct_ib_device_t * dev)315 static void uct_ib_device_get_ids(uct_ib_device_t *dev)
316 {
317 long vendor_id, device_id;
318
319 if ((ucs_read_file_number(&vendor_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
320 uct_ib_device_name(dev), "vendor") == UCS_OK) &&
321 (ucs_read_file_number(&device_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
322 uct_ib_device_name(dev), "device") == UCS_OK)) {
323 dev->pci_id.vendor = vendor_id;
324 dev->pci_id.device = device_id;
325 ucs_debug("%s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev),
326 dev->pci_id.vendor, dev->pci_id.device);
327 } else {
328 dev->pci_id.vendor = 0;
329 dev->pci_id.device = 0;
330 ucs_warn("%s: could not read device/vendor id from sysfs, "
331 "performance may be affected", uct_ib_device_name(dev));
332 }
333 }
334
uct_ib_device_query(uct_ib_device_t * dev,struct ibv_device * ibv_device)335 ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
336 struct ibv_device *ibv_device)
337 {
338 ucs_status_t status;
339 uint8_t i;
340 int ret;
341
342 status = uct_ib_query_device(dev->ibv_context, &dev->dev_attr);
343 if (status != UCS_OK) {
344 return status;
345 }
346
347 /* Check device type*/
348 switch (ibv_device->node_type) {
349 case IBV_NODE_SWITCH:
350 dev->first_port = 0;
351 dev->num_ports = 1;
352 break;
353 case IBV_NODE_CA:
354 default:
355 dev->first_port = 1;
356 dev->num_ports = IBV_DEV_ATTR(dev, phys_port_cnt);
357 break;
358 }
359
360 if (dev->num_ports > UCT_IB_DEV_MAX_PORTS) {
361 ucs_error("%s has %d ports, but only up to %d are supported",
362 ibv_get_device_name(ibv_device), dev->num_ports,
363 UCT_IB_DEV_MAX_PORTS);
364 return UCS_ERR_UNSUPPORTED;
365 }
366
367 /* Query all ports */
368 for (i = 0; i < dev->num_ports; ++i) {
369 ret = ibv_query_port(dev->ibv_context, i + dev->first_port,
370 &dev->port_attr[i]);
371 if (ret != 0) {
372 ucs_error("ibv_query_port() returned %d: %m", ret);
373 return UCS_ERR_IO_ERROR;
374 }
375 }
376
377 uct_ib_device_get_ids(dev);
378
379 return UCS_OK;
380 }
381
uct_ib_device_init(uct_ib_device_t * dev,struct ibv_device * ibv_device,int async_events UCS_STATS_ARG (ucs_stats_node_t * stats_parent))382 ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
383 struct ibv_device *ibv_device, int async_events
384 UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
385 {
386 ucs_status_t status;
387
388 dev->async_events = async_events;
389
390 uct_ib_device_get_locality(ibv_get_device_name(ibv_device), &dev->local_cpus,
391 &dev->numa_node);
392
393 status = UCS_STATS_NODE_ALLOC(&dev->stats, &uct_ib_device_stats_class,
394 stats_parent, "device");
395 if (status != UCS_OK) {
396 goto err;
397 }
398
399 status = ucs_sys_fcntl_modfl(dev->ibv_context->async_fd, O_NONBLOCK, 0);
400 if (status != UCS_OK) {
401 goto err_release_stats;
402 }
403
404 /* Register to IB async events */
405 if (dev->async_events) {
406 status = ucs_async_set_event_handler(UCS_ASYNC_THREAD_LOCK_TYPE,
407 dev->ibv_context->async_fd,
408 UCS_EVENT_SET_EVREAD,
409 uct_ib_async_event_handler, dev,
410 NULL);
411 if (status != UCS_OK) {
412 goto err_release_stats;
413 }
414 }
415
416 kh_init_inplace(uct_ib_ah, &dev->ah_hash);
417 ucs_recursive_spinlock_init(&dev->ah_lock, 0);
418
419 ucs_debug("initialized device '%s' (%s) with %d ports", uct_ib_device_name(dev),
420 ibv_node_type_str(ibv_device->node_type),
421 dev->num_ports);
422 return UCS_OK;
423
424 err_release_stats:
425 UCS_STATS_NODE_FREE(dev->stats);
426 err:
427 return status;
428 }
429
uct_ib_device_cleanup_ah_cached(uct_ib_device_t * dev)430 void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev)
431 {
432 struct ibv_ah *ah;
433
434 kh_foreach_value(&dev->ah_hash, ah, ibv_destroy_ah(ah));
435 }
436
uct_ib_device_cleanup(uct_ib_device_t * dev)437 void uct_ib_device_cleanup(uct_ib_device_t *dev)
438 {
439 ucs_status_t status;
440
441 ucs_debug("destroying ib device %s", uct_ib_device_name(dev));
442
443 kh_destroy_inplace(uct_ib_ah, &dev->ah_hash);
444
445 status = ucs_recursive_spinlock_destroy(&dev->ah_lock);
446 if (status != UCS_OK) {
447 ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", status);
448 }
449
450 if (dev->async_events) {
451 ucs_async_remove_handler(dev->ibv_context->async_fd, 1);
452 }
453 UCS_STATS_NODE_FREE(dev->stats);
454 }
455
uct_ib_device_spec_match(uct_ib_device_t * dev,const uct_ib_device_spec_t * spec)456 static inline int uct_ib_device_spec_match(uct_ib_device_t *dev,
457 const uct_ib_device_spec_t *spec)
458 {
459 return (spec->pci_id.vendor == dev->pci_id.vendor) &&
460 (spec->pci_id.device == dev->pci_id.device);
461 }
462
uct_ib_device_spec(uct_ib_device_t * dev)463 const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev)
464 {
465 uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
466 uct_ib_device_spec_t *spec;
467
468 /* search through devices specified in the configuration */
469 for (spec = md->custom_devices.specs;
470 spec < md->custom_devices.specs + md->custom_devices.count; ++spec) {
471 if (uct_ib_device_spec_match(dev, spec)) {
472 return spec;
473 }
474 }
475
476 /* search through built-in list of device specifications */
477 spec = uct_ib_builtin_device_specs;
478 while ((spec->name != NULL) && !uct_ib_device_spec_match(dev, spec)) {
479 ++spec;
480 }
481 return spec; /* if no match is found, return the last entry, which contains
482 default settings for unknown devices */
483 }
484
uct_ib_device_get_ib_gid_index(uct_ib_md_t * md)485 static size_t uct_ib_device_get_ib_gid_index(uct_ib_md_t *md)
486 {
487 if (md->config.gid_index == UCS_ULUNITS_AUTO) {
488 return UCT_IB_MD_DEFAULT_GID_INDEX;
489 } else {
490 return md->config.gid_index;
491 }
492 }
493
uct_ib_device_is_iwarp(uct_ib_device_t * dev)494 static int uct_ib_device_is_iwarp(uct_ib_device_t *dev)
495 {
496 return dev->ibv_context->device->transport_type == IBV_TRANSPORT_IWARP;
497 }
498
uct_ib_device_port_check(uct_ib_device_t * dev,uint8_t port_num,unsigned flags)499 ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
500 unsigned flags)
501 {
502 uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
503 const uct_ib_device_spec_t *dev_info;
504 uint8_t required_dev_flags;
505 ucs_status_t status;
506 union ibv_gid gid;
507
508 if (port_num < dev->first_port || port_num >= dev->first_port + dev->num_ports) {
509 return UCS_ERR_NO_DEVICE;
510 }
511
512 if (uct_ib_device_port_attr(dev, port_num)->state != IBV_PORT_ACTIVE) {
513 ucs_trace("%s:%d is not active (state: %d)", uct_ib_device_name(dev),
514 port_num, uct_ib_device_port_attr(dev, port_num)->state);
515 return UCS_ERR_UNREACHABLE;
516 }
517
518 if (uct_ib_device_is_iwarp(dev)) {
519 /* TODO: enable it when support is ready */
520 ucs_debug("iWarp device %s is not supported", uct_ib_device_name(dev));
521 return UCS_ERR_UNSUPPORTED;
522 }
523
524 if (!uct_ib_device_is_port_ib(dev, port_num) && (flags & UCT_IB_DEVICE_FLAG_LINK_IB)) {
525 ucs_debug("%s:%d is not IB link layer", uct_ib_device_name(dev),
526 port_num);
527 return UCS_ERR_UNSUPPORTED;
528 }
529
530 if (flags & UCT_IB_DEVICE_FLAG_DC) {
531 if (!IBV_DEVICE_HAS_DC(dev)) {
532 ucs_trace("%s:%d does not support DC", uct_ib_device_name(dev), port_num);
533 return UCS_ERR_UNSUPPORTED;
534 }
535 }
536
537 /* check generic device flags */
538 dev_info = uct_ib_device_spec(dev);
539 required_dev_flags = flags & (UCT_IB_DEVICE_FLAG_MLX4_PRM |
540 UCT_IB_DEVICE_FLAG_MLX5_PRM);
541 if (!ucs_test_all_flags(dev_info->flags, required_dev_flags)) {
542 ucs_trace("%s:%d (%s) does not support flags 0x%x", uct_ib_device_name(dev),
543 port_num, dev_info->name, required_dev_flags);
544 return UCS_ERR_UNSUPPORTED;
545 }
546
547 if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num)) {
548 status = uct_ib_device_query_gid(dev, port_num,
549 uct_ib_device_get_ib_gid_index(md), &gid);
550 if (status != UCS_OK) {
551 return status;
552 }
553
554 if (md->subnet_filter != gid.global.subnet_prefix) {
555 ucs_trace("%s:%d subnet_prefix does not match",
556 uct_ib_device_name(dev), port_num);
557 return UCS_ERR_UNSUPPORTED;
558 }
559 }
560
561 return UCS_OK;
562 }
563
uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver)564 const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver)
565 {
566 switch (roce_ver) {
567 case UCT_IB_DEVICE_ROCE_V1:
568 return "RoCE v1";
569 case UCT_IB_DEVICE_ROCE_V1_5:
570 return "RoCE v1.5";
571 case UCT_IB_DEVICE_ROCE_V2:
572 return "RoCE v2";
573 default:
574 return "<unknown RoCE version>";
575 }
576 }
577
uct_ib_gid_str(const union ibv_gid * gid,char * str,size_t max_size)578 const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size)
579 {
580 inet_ntop(AF_INET6, gid, str, max_size);
581 return str;
582 }
583
uct_ib_device_is_addr_ipv4_mcast(const struct in6_addr * raw,const uint32_t addr_last_bits)584 static int uct_ib_device_is_addr_ipv4_mcast(const struct in6_addr *raw,
585 const uint32_t addr_last_bits)
586 {
587 /* IPv4 encoded multicast addresses */
588 return (raw->s6_addr32[0] == htonl(0xff0e0000)) &&
589 !(raw->s6_addr32[1] | addr_last_bits);
590 }
591
uct_ib_device_get_addr_family(union ibv_gid * gid,int gid_index)592 static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid, int gid_index)
593 {
594 const struct in6_addr *raw = (struct in6_addr *)gid->raw;
595 const uint32_t addr_last_bits = raw->s6_addr32[2] ^ htonl(0x0000ffff);
596 char p[128];
597
598 ucs_debug("testing addr_family on gid index %d: %s",
599 gid_index, uct_ib_gid_str(gid, p, sizeof(p)));
600
601 if (!((raw->s6_addr32[0] | raw->s6_addr32[1]) | addr_last_bits) ||
602 uct_ib_device_is_addr_ipv4_mcast(raw, addr_last_bits)) {
603 return AF_INET;
604 } else {
605 return AF_INET6;
606 }
607 }
608
/* Query a single GID table entry, filling info with the GID, its index, the
 * RoCE version and the address family. Two implementations:
 *  - with the experimental verbs API, the GID type is reported directly;
 *  - otherwise the GID is read with ibv_query_gid() and the type parsed
 *    from the sysfs gid_attrs file (defaulting to RoCE v1 if absent). */
ucs_status_t
uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name,
                             uint8_t port_num, unsigned gid_index,
                             uct_ib_device_gid_info_t *info)
{
    int ret;

#if HAVE_DECL_IBV_EXP_QUERY_GID_ATTR
    struct ibv_exp_gid_attr attr;

    attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE | IBV_EXP_QUERY_GID_ATTR_GID;
    ret = ibv_exp_query_gid_attr(ctx, port_num, gid_index, &attr);
    if (ret == 0) {
        info->gid                  = attr.gid;
        info->gid_index            = gid_index;
        info->roce_info.addr_family =
                        uct_ib_device_get_addr_family(&info->gid, gid_index);
        /* Map the verbs GID type to our RoCE version enum */
        switch (attr.type) {
        case IBV_EXP_IB_ROCE_V1_GID_TYPE:
            info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
            return UCS_OK;
        case IBV_EXP_ROCE_V1_5_GID_TYPE:
            info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1_5;
            return UCS_OK;
        case IBV_EXP_ROCE_V2_GID_TYPE:
            info->roce_info.ver = UCT_IB_DEVICE_ROCE_V2;
            return UCS_OK;
        default:
            ucs_error("Invalid GID[%d] type on %s:%d: %d",
                      gid_index, dev_name, port_num, attr.type);
            return UCS_ERR_IO_ERROR;
        }
    }
#else
    char buf[16];

    ret = ibv_query_gid(ctx, port_num, gid_index, &info->gid);
    if (ret == 0) {
        /* Parse the GID type string exposed by sysfs */
        ret = ucs_read_file(buf, sizeof(buf) - 1, 1,
                            UCT_IB_DEVICE_SYSFS_GID_TYPE_FMT,
                            dev_name, port_num, gid_index);
        if (ret > 0) {
            if (!strncmp(buf, "IB/RoCE v1", 10)) {
                info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
            } else if (!strncmp(buf, "RoCE v2", 7)) {
                info->roce_info.ver = UCT_IB_DEVICE_ROCE_V2;
            } else {
                ucs_error("failed to parse gid type '%s' (dev=%s port=%d index=%d)",
                          buf, dev_name, port_num, gid_index);
                return UCS_ERR_INVALID_PARAM;
            }
        } else {
            /* No gid_attrs in sysfs - assume the oldest RoCE version */
            info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
        }

        info->roce_info.addr_family =
                        uct_ib_device_get_addr_family(&info->gid, gid_index);
        info->gid_index            = gid_index;
        return UCS_OK;
    }
#endif
    ucs_error("ibv_query_gid(dev=%s port=%d index=%d) failed: %m",
              dev_name, port_num, gid_index);
    return UCS_ERR_INVALID_PARAM;
}
674
uct_ib_device_test_roce_gid_index(uct_ib_device_t * dev,uint8_t port_num,const union ibv_gid * gid,uint8_t gid_index)675 int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
676 const union ibv_gid *gid,
677 uint8_t gid_index)
678 {
679 struct ibv_ah_attr ah_attr;
680 struct ibv_ah *ah;
681
682 ucs_assert(uct_ib_device_is_port_roce(dev, port_num));
683
684 memset(&ah_attr, 0, sizeof(ah_attr));
685 ah_attr.port_num = port_num;
686 ah_attr.is_global = 1;
687 ah_attr.grh.dgid = *gid;
688 ah_attr.grh.sgid_index = gid_index;
689 ah_attr.grh.hop_limit = 255;
690 ah_attr.grh.flow_label = 1;
691 ah_attr.dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE;
692
693 ah = ibv_create_ah(ucs_container_of(dev, uct_ib_md_t, dev)->pd, &ah_attr);
694 if (ah == NULL) {
695 return 0; /* gid entry is not operational */
696 }
697
698 ibv_destroy_ah(ah);
699 return 1;
700 }
701
uct_ib_device_select_gid(uct_ib_device_t * dev,uint8_t port_num,uct_ib_device_gid_info_t * gid_info)702 ucs_status_t uct_ib_device_select_gid(uct_ib_device_t *dev, uint8_t port_num,
703 uct_ib_device_gid_info_t *gid_info)
704 {
705 static const uct_ib_roce_version_info_t roce_prio[] = {
706 {UCT_IB_DEVICE_ROCE_V2, AF_INET},
707 {UCT_IB_DEVICE_ROCE_V2, AF_INET6},
708 {UCT_IB_DEVICE_ROCE_V1, AF_INET},
709 {UCT_IB_DEVICE_ROCE_V1, AF_INET6}
710 };
711 int gid_tbl_len = uct_ib_device_port_attr(dev, port_num)->gid_tbl_len;
712 ucs_status_t status = UCS_OK;
713 int priorities_arr_len = ucs_static_array_size(roce_prio);
714 uct_ib_device_gid_info_t gid_info_tmp;
715 int i, prio_idx;
716
717 ucs_assert(uct_ib_device_is_port_roce(dev, port_num));
718
719 /* search for matching GID table entries, according to the order defined
720 * in priorities array
721 */
722 for (prio_idx = 0; prio_idx < priorities_arr_len; prio_idx++) {
723 for (i = 0; i < gid_tbl_len; i++) {
724 status = uct_ib_device_query_gid_info(dev->ibv_context,
725 uct_ib_device_name(dev),
726 port_num, i, &gid_info_tmp);
727 if (status != UCS_OK) {
728 goto out;
729 }
730
731 if ((roce_prio[prio_idx].ver == gid_info_tmp.roce_info.ver) &&
732 (roce_prio[prio_idx].addr_family == gid_info_tmp.roce_info.addr_family) &&
733 uct_ib_device_test_roce_gid_index(dev, port_num, &gid_info_tmp.gid, i)) {
734
735 gid_info->gid_index = i;
736 gid_info->roce_info = gid_info_tmp.roce_info;
737 goto out_print;
738 }
739 }
740 }
741
742 gid_info->gid_index = UCT_IB_MD_DEFAULT_GID_INDEX;
743 gid_info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
744 gid_info->roce_info.addr_family = AF_INET;
745
746 out_print:
747 ucs_debug("%s:%d using gid_index %d", uct_ib_device_name(dev), port_num,
748 gid_info->gid_index);
749 out:
750 return status;
751 }
752
uct_ib_device_is_port_ib(uct_ib_device_t * dev,uint8_t port_num)753 int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num)
754 {
755 #if HAVE_DECL_IBV_LINK_LAYER_INFINIBAND
756 return uct_ib_device_port_attr(dev, port_num)->link_layer == IBV_LINK_LAYER_INFINIBAND;
757 #else
758 return 1;
759 #endif
760 }
761
uct_ib_device_is_port_roce(uct_ib_device_t * dev,uint8_t port_num)762 int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num)
763 {
764 return IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_device_port_attr(dev, port_num));
765 }
766
uct_ib_device_name(uct_ib_device_t * dev)767 const char *uct_ib_device_name(uct_ib_device_t *dev)
768 {
769 return ibv_get_device_name(dev->ibv_context->device);
770 }
771
uct_ib_device_bus(uct_ib_device_t * dev,int port_num,ucs_sys_bus_id_t * bus_id)772 ucs_status_t uct_ib_device_bus(uct_ib_device_t *dev, int port_num,
773 ucs_sys_bus_id_t *bus_id)
774 {
775 char ib_realpath[PATH_MAX];
776 char *pcie_bus;
777 char *tmp;
778 int i, bus_len;
779 int num_inputs;
780
781 if (NULL == realpath(dev->ibv_context->device->ibdev_path, ib_realpath)) {
782 return UCS_ERR_NO_RESOURCE;
783 }
784
785 /* realpath name is of form /sys/devices/.../0000:05:00.0/infiniband/mlx5_0
786 * and bus_id is constructed from 0000:05:00.0 */
787
788 /* Make sure there is /infiniband substring in ib_realpath*/
789 tmp = strstr(ib_realpath, "/infiniband");
790 if (NULL == tmp) {
791 return UCS_ERR_NO_RESOURCE;
792 }
793
794 pcie_bus = dirname(ib_realpath);
795 pcie_bus = dirname(pcie_bus);
796 pcie_bus = basename(pcie_bus);
797
798 bus_len = strlen(pcie_bus);
799 for (i = 0; i < bus_len; i++) {
800 if ((pcie_bus[i] == ':') || (pcie_bus[i] == '.')) {
801 pcie_bus[i] = ' ';
802 }
803 }
804
805 num_inputs = sscanf(pcie_bus, "%hx %hhx %hhx %hhx", &bus_id->domain,
806 &bus_id->bus,
807 &bus_id->slot,
808 &bus_id->function);
809 if (num_inputs != 4) {
810 return UCS_ERR_NO_RESOURCE;
811 }
812
813 ucs_debug("ib device = %s:%d, bus id = %hu:%hhu:%hhu.%hhu",
814 uct_ib_device_name(dev), port_num, bus_id->domain, bus_id->bus,
815 bus_id->slot, bus_id->function);
816
817 return UCS_OK;
818 }
819
uct_ib_mtu_value(enum ibv_mtu mtu)820 size_t uct_ib_mtu_value(enum ibv_mtu mtu)
821 {
822 switch (mtu) {
823 case IBV_MTU_256:
824 return 256;
825 case IBV_MTU_512:
826 return 512;
827 case IBV_MTU_1024:
828 return 1024;
829 case IBV_MTU_2048:
830 return 2048;
831 case IBV_MTU_4096:
832 return 4096;
833 }
834 ucs_fatal("Invalid MTU value (%d)", mtu);
835 }
836
uct_ib_to_qp_fabric_time(double t)837 uint8_t uct_ib_to_qp_fabric_time(double t)
838 {
839 double to;
840
841 to = log(t / 4.096e-6) / log(2.0);
842 if (to < 1) {
843 return 1; /* Very small timeout */
844 } else if ((long)(to + 0.5) >= UCT_IB_FABRIC_TIME_MAX) {
845 return 0; /* No timeout */
846 } else {
847 return (long)(to + 0.5);
848 }
849 }
850
uct_ib_to_rnr_fabric_time(double t)851 uint8_t uct_ib_to_rnr_fabric_time(double t)
852 {
853 double time_ms = t * UCS_MSEC_PER_SEC;
854 uint8_t idx, next_index;
855 double avg_ms;
856
857 for (idx = 1; idx < UCT_IB_FABRIC_TIME_MAX; idx++) {
858 next_index = (idx + 1) % UCT_IB_FABRIC_TIME_MAX;
859
860 if (time_ms <= uct_ib_qp_rnr_time_ms[next_index]) {
861 avg_ms = (uct_ib_qp_rnr_time_ms[idx] +
862 uct_ib_qp_rnr_time_ms[next_index]) * 0.5;
863
864 if (time_ms < avg_ms) {
865 /* return previous index */
866 return idx;
867 } else {
868 /* return current index */
869 return next_index;
870 }
871 }
872 }
873
874 return 0; /* this is a special value that means the maximum value */
875 }
876
uct_ib_modify_qp(struct ibv_qp * qp,enum ibv_qp_state state)877 ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state)
878 {
879 struct ibv_qp_attr qp_attr;
880
881 ucs_debug("modify QP 0x%x to state %d", qp->qp_num, state);
882 memset(&qp_attr, 0, sizeof(qp_attr));
883 qp_attr.qp_state = state;
884 if (ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE)) {
885 ucs_warn("modify qp 0x%x to state %d failed: %m", qp->qp_num, state);
886 return UCS_ERR_IO_ERROR;
887 }
888
889 return UCS_OK;
890 }
891
uct_ib_device_query_ports(uct_ib_device_t * dev,unsigned flags,uct_tl_device_resource_t ** tl_devices_p,unsigned * num_tl_devices_p)892 ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
893 uct_tl_device_resource_t **tl_devices_p,
894 unsigned *num_tl_devices_p)
895 {
896 uct_tl_device_resource_t *tl_devices;
897 unsigned num_tl_devices;
898 ucs_status_t status;
899 uint8_t port_num;
900
901 /* Allocate resources array
902 * We may allocate more memory than really required, but it's not so bad. */
903 tl_devices = ucs_calloc(dev->num_ports, sizeof(*tl_devices), "ib device resource");
904 if (tl_devices == NULL) {
905 status = UCS_ERR_NO_MEMORY;
906 goto err;
907 }
908
909 /* Second pass: fill port information */
910 num_tl_devices = 0;
911 for (port_num = dev->first_port; port_num < dev->first_port + dev->num_ports;
912 ++port_num)
913 {
914 /* Check port capabilities */
915 status = uct_ib_device_port_check(dev, port_num, flags);
916 if (status != UCS_OK) {
917 ucs_trace("%s:%d does not support flags 0x%x: %s",
918 uct_ib_device_name(dev), port_num, flags,
919 ucs_status_string(status));
920 continue;
921 }
922
923 /* Save device information */
924 ucs_snprintf_zero(tl_devices[num_tl_devices].name,
925 sizeof(tl_devices[num_tl_devices].name),
926 "%s:%d", uct_ib_device_name(dev), port_num);
927 tl_devices[num_tl_devices].type = UCT_DEVICE_TYPE_NET;
928 ++num_tl_devices;
929 }
930
931 if (num_tl_devices == 0) {
932 ucs_debug("no compatible IB ports found for flags 0x%x", flags);
933 status = UCS_ERR_NO_DEVICE;
934 goto err_free;
935 }
936
937 *num_tl_devices_p = num_tl_devices;
938 *tl_devices_p = tl_devices;
939 return UCS_OK;
940
941 err_free:
942 ucs_free(tl_devices);
943 err:
944 return status;
945 }
946
uct_ib_device_find_port(uct_ib_device_t * dev,const char * resource_dev_name,uint8_t * p_port_num)947 ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
948 const char *resource_dev_name,
949 uint8_t *p_port_num)
950 {
951 const char *ibdev_name;
952 unsigned port_num;
953 size_t devname_len;
954 char *p;
955
956 p = strrchr(resource_dev_name, ':');
957 if (p == NULL) {
958 goto err; /* Wrong device name format */
959 }
960 devname_len = p - resource_dev_name;
961
962 ibdev_name = uct_ib_device_name(dev);
963 if ((strlen(ibdev_name) != devname_len) ||
964 strncmp(ibdev_name, resource_dev_name, devname_len))
965 {
966 goto err; /* Device name is wrong */
967 }
968
969 port_num = strtod(p + 1, &p);
970 if (*p != '\0') {
971 goto err; /* Failed to parse port number */
972 }
973 if ((port_num < dev->first_port) || (port_num >= dev->first_port + dev->num_ports)) {
974 goto err; /* Port number out of range */
975 }
976
977 *p_port_num = port_num;
978 return UCS_OK;
979
980 err:
981 ucs_error("%s: failed to find port", resource_dev_name);
982 return UCS_ERR_NO_DEVICE;
983 }
984
uct_ib_device_mtu(const char * dev_name,uct_md_h md,int * p_mtu)985 ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu)
986 {
987 uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
988 uint8_t port_num;
989 ucs_status_t status;
990
991 status = uct_ib_device_find_port(dev, dev_name, &port_num);
992 if (status != UCS_OK) {
993 return status;
994 }
995
996 *p_mtu = uct_ib_mtu_value(uct_ib_device_port_attr(dev, port_num)->active_mtu);
997 return UCS_OK;
998 }
999
/**
 * Check whether a raw 16-byte GID is all-zero ("empty").
 *
 * @param gid_raw  Pointer to the 16 raw GID bytes.
 *
 * @return Nonzero if every byte is zero, 0 otherwise.
 */
int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw)
{
    /* Compare byte-wise with memcmp instead of casting to uint64_t*: the
     * cast violates strict aliasing and may perform misaligned 64-bit loads
     * on platforms that require natural alignment. */
    static const uint8_t zero_gid[16] = {0};

    return memcmp(gid_raw, zero_gid, sizeof(zero_gid)) == 0;
}
1004
uct_ib_device_query_gid(uct_ib_device_t * dev,uint8_t port_num,unsigned gid_index,union ibv_gid * gid)1005 ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
1006 unsigned gid_index, union ibv_gid *gid)
1007 {
1008 uct_ib_device_gid_info_t gid_info;
1009 ucs_status_t status;
1010
1011 status = uct_ib_device_query_gid_info(dev->ibv_context, uct_ib_device_name(dev),
1012 port_num, gid_index, &gid_info);
1013 if (status != UCS_OK) {
1014 return status;
1015 }
1016
1017 if (uct_ib_device_is_gid_raw_empty(gid_info.gid.raw)) {
1018 ucs_error("Invalid gid[%d] on %s:%d", gid_index,
1019 uct_ib_device_name(dev), port_num);
1020 return UCS_ERR_INVALID_ADDR;
1021 }
1022
1023 *gid = gid_info.gid;
1024 return UCS_OK;
1025 }
1026
/* Return the maximum region size registrable with ODP (on-demand paging) on
 * this device, or 0 when ODP must not be used. All capability checks below
 * rely on the experimental-verbs device attributes, so without
 * HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS the function always returns 0. */
size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev)
{
#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS
    const struct ibv_exp_device_attr *dev_attr = &dev->dev_attr;
    /* UD needs only send support; RC additionally needs RDMA write/read */
    uint32_t required_ud_odp_caps = IBV_EXP_ODP_SUPPORT_SEND;
    uint32_t required_rc_odp_caps = IBV_EXP_ODP_SUPPORT_SEND |
                                    IBV_EXP_ODP_SUPPORT_WRITE |
                                    IBV_EXP_ODP_SUPPORT_READ;

    /* Disable ODP under valgrind, or when the device lacks the required
     * RC/UD capability bits */
    if (RUNNING_ON_VALGRIND ||
        !IBV_EXP_HAVE_ODP(dev_attr) ||
        !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, rc), required_rc_odp_caps) ||
        !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, ud), required_ud_odp_caps))
    {
        return 0;
    }

    /* When the device supports DC, require the same caps as RC for DC too
     * (only when per-transport DC ODP caps can be queried) */
    if (IBV_DEVICE_HAS_DC(dev)
# if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS_PER_TRANSPORT_CAPS_DC_ODP_CAPS
        && !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, dc), required_rc_odp_caps)
# endif
        )
    {
        return 0;
    }

# if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE
    /* Use the device-reported maximum ODP MR size when available */
    return dev_attr->odp_mr_max_size;
# else
    return 1ul << 28; /* Limit ODP to 256 MB by default */
# endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE */

#else
    return 0;
#endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS */
}
1063
/**
 * Convert a work-completion status code to its string representation.
 * Delegates to libibverbs' ibv_wc_status_str().
 */
const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status)
{
    const char *status_str = ibv_wc_status_str(wc_status);

    return status_str;
}
1068
uct_ib_device_create_ah(uct_ib_device_t * dev,struct ibv_ah_attr * ah_attr,struct ibv_pd * pd,struct ibv_ah ** ah_p)1069 static ucs_status_t uct_ib_device_create_ah(uct_ib_device_t *dev,
1070 struct ibv_ah_attr *ah_attr,
1071 struct ibv_pd *pd,
1072 struct ibv_ah **ah_p)
1073 {
1074 struct ibv_ah *ah;
1075 char buf[128];
1076
1077 ah = ibv_create_ah(pd, ah_attr);
1078 if (ah == NULL) {
1079 ucs_error("ibv_create_ah(%s) on %s failed: %m",
1080 uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr),
1081 uct_ib_device_name(dev));
1082 return UCS_ERR_INVALID_ADDR;
1083 }
1084
1085 *ah_p = ah;
1086 return UCS_OK;
1087 }
1088
/* Return an address handle for @a ah_attr, creating it on first use and
 * caching it in dev->ah_hash (keyed by the full ibv_ah_attr contents) so
 * subsequent requests with identical attributes reuse the same AH. The
 * whole lookup/insert is serialized by dev->ah_lock.
 * NOTE(review): cached AHs are not destroyed here (except on insert
 * rollback) — presumably released during device cleanup; verify. */
ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
                                            struct ibv_ah_attr *ah_attr,
                                            struct ibv_pd *pd,
                                            struct ibv_ah **ah_p)
{
    ucs_status_t status = UCS_OK;
    khiter_t iter;
    int ret;

    ucs_recursive_spin_lock(&dev->ah_lock);

    /* looking for existing AH with same attributes */
    iter = kh_get(uct_ib_ah, &dev->ah_hash, *ah_attr);
    if (iter == kh_end(&dev->ah_hash)) {
        /* new AH */
        status = uct_ib_device_create_ah(dev, ah_attr, pd, ah_p);
        if (status != UCS_OK) {
            goto unlock;
        }

        /* store AH in hash */
        iter = kh_put(uct_ib_ah, &dev->ah_hash, *ah_attr, &ret);

        /* failed to store - rollback: destroy the AH we just created so it
         * is not leaked */
        if (iter == kh_end(&dev->ah_hash)) {
            ibv_destroy_ah(*ah_p);
            status = UCS_ERR_NO_MEMORY;
            goto unlock;
        }

        kh_value(&dev->ah_hash, iter) = *ah_p;
    } else {
        /* found existing AH */
        *ah_p = kh_value(&dev->ah_hash, iter);
    }

unlock:
    ucs_recursive_spin_unlock(&dev->ah_lock);
    return status;
}
1129
/* Pick a CQE size: at least @a cqe_size_min and the cache line size, at
 * least 64, and at most a platform-dependent maximum (computed once and
 * cached in a static).
 * NOTE(review): the lazy init of cqe_size_max is not synchronized; looks
 * benign since both racing writers would store the same value — confirm
 * callers' threading assumptions. */
int uct_ib_get_cqe_size(int cqe_size_min)
{
    static int cqe_size_max = -1;
    int cqe_size;

    if (cqe_size_max == -1) {
#ifdef __aarch64__
        char arm_board_vendor[128];
        ucs_aarch64_cpuid_t cpuid;
        ucs_aarch64_cpuid(&cpuid);

        /* Read the board vendor from sysfs DMI info (best-effort; the
         * buffer stays empty on failure) */
        arm_board_vendor[0] = '\0';
        ucs_read_file(arm_board_vendor, sizeof(arm_board_vendor), 1,
                      "/sys/devices/virtual/dmi/id/board_vendor");
        ucs_debug("arm_board_vendor is '%s'", arm_board_vendor);

        /* On one specific Huawei board/CPU combination (matched by CPUID
         * fields), cap the CQE size at 64; otherwise 128 */
        cqe_size_max = ((strcasestr(arm_board_vendor, "Huawei")) &&
                        (cpuid.implementer == 0x41) && (cpuid.architecture == 8) &&
                        (cpuid.variant == 0) && (cpuid.part == 0xd08) &&
                        (cpuid.revision == 2))
                       ? 64 : 128;
#else
        cqe_size_max = 128;
#endif
        ucs_debug("max IB CQE size is %d", cqe_size_max);
    }

    /* Set cqe size according to inline size and cache line size. */
    cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
    cqe_size = ucs_max(cqe_size, 64); /* at least 64 */
    cqe_size = ucs_min(cqe_size, cqe_size_max);

    return cqe_size;
}
1164
1165 static ucs_status_t
uct_ib_device_get_roce_ndev_name(uct_ib_device_t * dev,uint8_t port_num,char * ndev_name,size_t max)1166 uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num,
1167 char *ndev_name, size_t max)
1168 {
1169 ssize_t nread;
1170
1171 ucs_assert_always(uct_ib_device_is_port_roce(dev, port_num));
1172
1173 /* get the network device name which corresponds to a RoCE port */
1174 nread = ucs_read_file_str(ndev_name, max, 1,
1175 UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT,
1176 uct_ib_device_name(dev), port_num, 0);
1177 if (nread < 0) {
1178 ucs_diag("failed to read " UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT": %m",
1179 uct_ib_device_name(dev), port_num, 0);
1180 return UCS_ERR_NO_DEVICE;
1181 }
1182
1183 ucs_strtrim(ndev_name);
1184 return UCS_OK;
1185 }
1186
/* Return the bonding (LAG) level of the netdev behind a RoCE port, i.e. the
 * number of 802.3ad bonded ports; falls back to 1 when the netdev name
 * cannot be resolved. */
unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, uint8_t port_num)
{
    char netdev[IFNAMSIZ];
    unsigned lag_level;

    if (uct_ib_device_get_roce_ndev_name(dev, port_num, netdev,
                                         sizeof(netdev)) != UCS_OK) {
        return 1;
    }

    lag_level = ucs_netif_bond_ad_num_ports(netdev);
    ucs_debug("RoCE LAG level on %s:%d (%s) is %u", uct_ib_device_name(dev),
              port_num, netdev, lag_level);
    return lag_level;
}
1204
uct_ib_ah_attr_str(char * buf,size_t max,const struct ibv_ah_attr * ah_attr)1205 const char* uct_ib_ah_attr_str(char *buf, size_t max,
1206 const struct ibv_ah_attr *ah_attr)
1207 {
1208 char *p = buf;
1209 char *endp = buf + max;
1210
1211 snprintf(p, endp - p, "dlid=%d sl=%d port=%d src_path_bits=%d",
1212 ah_attr->dlid, ah_attr->sl,
1213 ah_attr->port_num, ah_attr->src_path_bits);
1214 p += strlen(p);
1215
1216 if (ah_attr->is_global) {
1217 snprintf(p, endp - p, " dgid=");
1218 p += strlen(p);
1219 uct_ib_gid_str(&ah_attr->grh.dgid, p, endp - p);
1220 p += strlen(p);
1221 snprintf(p, endp - p, " sgid_index=%d traffic_class=%d",
1222 ah_attr->grh.sgid_index, ah_attr->grh.traffic_class);
1223 }
1224
1225 return buf;
1226 }
1227