1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
3 * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED.
4 * See file LICENSE for terms.
5 */
6 
7 #ifdef HAVE_CONFIG_H
8 #  include "config.h"
9 #endif
10 
11 #include "ib_device.h"
12 #include "ib_md.h"
13 
14 #include <ucs/arch/bitops.h>
15 #include <ucs/debug/memtrack.h>
16 #include <ucs/debug/log.h>
17 #include <ucs/async/async.h>
18 #include <ucs/sys/compiler.h>
19 #include <ucs/sys/string.h>
20 #include <ucs/sys/sock.h>
21 #include <ucs/sys/sys.h>
22 #include <sys/poll.h>
23 #include <sched.h>
24 #include <libgen.h>
25 
26 
/* RNR NAK timer delays in milliseconds, indexed by the timer field encoding
 * (32 entries). This table is according to "Encoding for RNR NAK Timer Field"
 * in IBTA specification. Note that index 0 holds the largest delay (655.36),
 * so the values are increasing only from index 1 onwards. */
const double uct_ib_qp_rnr_time_ms[] = {
    655.36,  0.01,  0.02,   0.03,   0.04,   0.06,   0.08,   0.12,
      0.16,  0.24,  0.32,   0.48,   0.64,   0.96,   1.28,   1.92,
      2.56,  3.84,  5.12,   7.68,  10.24,  15.36,  20.48,  30.72,
     40.96, 61.44, 81.92, 122.88, 163.84, 245.76, 327.68, 491.52
};
35 
36 
37 /* use both gid + lid data for key generarion (lid - ib based, gid - RoCE) */
38 static UCS_F_ALWAYS_INLINE
uct_ib_kh_ah_hash_func(struct ibv_ah_attr attr)39 khint32_t uct_ib_kh_ah_hash_func(struct ibv_ah_attr attr)
40 {
41     return kh_int64_hash_func(attr.grh.dgid.global.subnet_prefix ^
42                               attr.grh.dgid.global.interface_id  ^
43                               attr.dlid);
44 }
45 
46 static UCS_F_ALWAYS_INLINE
uct_ib_kh_ah_hash_equal(struct ibv_ah_attr a,struct ibv_ah_attr b)47 int uct_ib_kh_ah_hash_equal(struct ibv_ah_attr a, struct ibv_ah_attr b)
48 {
49     return !memcmp(&a, &b, sizeof(a));
50 }
51 
/* Instantiate the "uct_ib_ah" hash table implementation: keys are
 * struct ibv_ah_attr, values are struct ibv_ah pointers, hashed and compared
 * by the two helpers above. It backs dev->ah_hash in this file. */
KHASH_IMPL(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*, 1,
           uct_ib_kh_ah_hash_func, uct_ib_kh_ah_hash_equal)
54 
55 
#ifdef ENABLE_STATS
/* Statistics class of an IB device node (compiled only with ENABLE_STATS).
 * It names the "async_event" counter, which uct_ib_handle_async_event()
 * increments once per handled event. */
static ucs_stats_class_t uct_ib_device_stats_class = {
    .name           = "",
    .num_counters   = UCT_IB_DEVICE_STAT_LAST,
    .counter_names = {
        [UCT_IB_DEVICE_STAT_ASYNC_EVENT] = "async_event"
    }
};
#endif
65 
/* Built-in list of known devices, scanned in order by uct_ib_device_spec().
 * Each entry holds: device name, PCI {vendor, device} id, device flags and a
 * trailing number (presumably the device priority - confirm against the
 * uct_ib_device_spec_t definition).
 * The "Generic HCA" entry has a zero PCI id, which is also what
 * uct_ib_device_get_ids() stores when the ids cannot be read from sysfs.
 * The list is terminated by an entry whose name is NULL. */
static uct_ib_device_spec_t uct_ib_builtin_device_specs[] = {
  {"ConnectX-3", {0x15b3, 4099},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 10},
  {"ConnectX-3 Pro", {0x15b3, 4103},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX4_PRM, 11},
  {"Connect-IB", {0x15b3, 4113},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 20},
  {"ConnectX-4", {0x15b3, 4115},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 30},
  {"ConnectX-4", {0x15b3, 4116},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 29},
  {"ConnectX-4 LX", {0x15b3, 4117},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 28},
  {"ConnectX-4 LX VF", {0x15b3, 4118},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V1, 28},
  {"ConnectX-5", {0x15b3, 4119},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 38},
  {"ConnectX-5", {0x15b3, 4121},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 40},
  {"ConnectX-5", {0x15b3, 4120},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 39},
  {"ConnectX-5", {0x15b3, 41682},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 37},
  {"ConnectX-5", {0x15b3, 4122},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 36},
  {"ConnectX-6", {0x15b3, 4123},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"ConnectX-6 VF", {0x15b3, 4124},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"ConnectX-6 DX", {0x15b3, 4125},
   UCT_IB_DEVICE_FLAG_MELLANOX | UCT_IB_DEVICE_FLAG_MLX5_PRM |
   UCT_IB_DEVICE_FLAG_DC_V2, 50},
  {"Generic HCA", {0, 0}, 0, 0},
  {NULL}
};
113 
/*
 * Find out which CPUs and which NUMA node are close to an IB device.
 *
 * @param dev_name   Device name, substituted into the sysfs paths.
 * @param cpu_mask   Filled with the CPUs listed in the device "local_cpus"
 *                   sysfs file; if the file cannot be read, all CPUs are set.
 * @param numa_node  Filled with the number read from the device "numa_node"
 *                   sysfs file, or -1 if it cannot be read.
 */
static void uct_ib_device_get_locality(const char *dev_name,
                                       ucs_sys_cpuset_t *cpu_mask,
                                       int *numa_node)
{
    char buf[ucs_max(CPU_SETSIZE, 10)];
    ucs_status_t status;
    char *sep, *token;
    uint32_t word;
    ssize_t nread;
    int base, bit;
    long node;

    /* Read list of CPUs close to the device */
    CPU_ZERO(cpu_mask);
    nread = ucs_read_file(buf, sizeof(buf) - 1, 1, UCT_IB_DEVICE_SYSFS_FMT,
                          dev_name, "local_cpus");
    if (nread < 0) {
        /* If affinity file is not present, treat all CPUs as local */
        for (bit = 0; bit < CPU_SETSIZE; ++bit) {
            CPU_SET(bit, cpu_mask);
        }
    } else {
        buf[CPU_SETSIZE - 1] = '\0';

        /* The mask is a comma-separated list of 32-bit hexadecimal words,
         * least significant word last. Consume it from the tail: cut the
         * string at the last comma and parse what follows it. */
        for (base = 0; base < CPU_SETSIZE; base += 32) {
            sep = strrchr(buf, ',');
            if (sep != NULL) {
                *sep  = '\0';
                token = sep + 1;
            } else {
                token = buf;
            }

            word = strtoul(token, NULL, 16);
            for (bit = 0; word != 0; ++bit, word >>= 1) {
                if (word & 1) {
                    CPU_SET(base + bit, cpu_mask);
                }
            }

            if (sep == NULL) {
                break; /* that was the first (most significant) word */
            }
        }
    }

    /* Read NUMA node number */
    status = ucs_read_file_number(&node, 1,
                                  "/sys/class/infiniband/%s/device/numa_node",
                                  dev_name);
    if (status == UCS_OK) {
        *numa_node = node;
    } else {
        *numa_node = -1;
    }
}
161 
uct_ib_async_event_handler(int fd,int events,void * arg)162 static void uct_ib_async_event_handler(int fd, int events, void *arg)
163 {
164     uct_ib_device_t *dev = arg;
165     struct ibv_async_event ibevent;
166     uct_ib_async_event_t event;
167     int ret;
168 
169     ret = ibv_get_async_event(dev->ibv_context, &ibevent);
170     if (ret != 0) {
171         if (errno != EAGAIN) {
172             ucs_warn("ibv_get_async_event() failed: %m");
173         }
174         return;
175     }
176 
177     event.event_type = ibevent.event_type;
178     switch (event.event_type) {
179     case IBV_EVENT_CQ_ERR:
180         event.cookie = ibevent.element.cq;
181         break;
182     case IBV_EVENT_QP_FATAL:
183     case IBV_EVENT_QP_REQ_ERR:
184     case IBV_EVENT_QP_ACCESS_ERR:
185     case IBV_EVENT_COMM_EST:
186     case IBV_EVENT_SQ_DRAINED:
187     case IBV_EVENT_PATH_MIG:
188     case IBV_EVENT_PATH_MIG_ERR:
189     case IBV_EVENT_QP_LAST_WQE_REACHED:
190         event.qp_num = ibevent.element.qp->qp_num;
191         break;
192     case IBV_EVENT_SRQ_ERR:
193     case IBV_EVENT_SRQ_LIMIT_REACHED:
194         event.cookie = ibevent.element.srq;
195         break;
196     case IBV_EVENT_DEVICE_FATAL:
197     case IBV_EVENT_PORT_ERR:
198     case IBV_EVENT_PORT_ACTIVE:
199 #if HAVE_DECL_IBV_EVENT_GID_CHANGE
200     case IBV_EVENT_GID_CHANGE:
201 #endif
202     case IBV_EVENT_LID_CHANGE:
203     case IBV_EVENT_PKEY_CHANGE:
204     case IBV_EVENT_SM_CHANGE:
205     case IBV_EVENT_CLIENT_REREGISTER:
206         event.port_num = ibevent.element.port_num;
207         break;
208 #ifdef HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT
209     case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
210     case IBV_EXP_EVENT_DCT_ACCESS_ERR:
211     case IBV_EXP_EVENT_DCT_REQ_ERR:
212         if (ibevent.element.dct) {
213             event.dct_num = ibevent.element.dct->dct_num;
214         } else {
215             event.dct_num = 0;
216         }
217         break;
218 #endif
219     default:
220         break;
221     };
222 
223     uct_ib_handle_async_event(dev, &event);
224     ibv_ack_async_event(&ibevent);
225 }
226 
/*
 * Report an IB asynchronous event: build a textual description according to
 * the event type, increment the device async event counter and write the
 * description to the log.
 *
 * Log level per event type:
 *  - error: CQ error, QP events (except last WQE reached), SRQ error,
 *           device fatal, port error, DCT events
 *  - warn:  port active and GID/LID/PKEY/SM/client re-register changes
 *  - debug: last WQE reached, SRQ limit reached
 *  - info:  any other event
 *
 * @param dev    Device on which the event was received.
 * @param event  Event type and the identifier which matches that type.
 */
void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event)
{
    ucs_log_level_t log_level;
    char desc[200];

    switch (event->event_type) {
    case IBV_EVENT_CQ_ERR:
        log_level = UCS_LOG_LEVEL_ERROR;
        snprintf(desc, sizeof(desc), "%s on CQ %p",
                 ibv_event_type_str(event->event_type), event->cookie);
        break;
    case IBV_EVENT_QP_FATAL:
    case IBV_EVENT_QP_REQ_ERR:
    case IBV_EVENT_QP_ACCESS_ERR:
    case IBV_EVENT_COMM_EST:
    case IBV_EVENT_SQ_DRAINED:
    case IBV_EVENT_PATH_MIG:
    case IBV_EVENT_PATH_MIG_ERR:
        log_level = UCS_LOG_LEVEL_ERROR;
        snprintf(desc, sizeof(desc), "%s on QPN 0x%x",
                 ibv_event_type_str(event->event_type), event->qp_num);
        break;
    case IBV_EVENT_QP_LAST_WQE_REACHED:
        log_level = UCS_LOG_LEVEL_DEBUG;
        snprintf(desc, sizeof(desc), "SRQ-attached QP 0x%x was flushed",
                 event->qp_num);
        break;
    case IBV_EVENT_SRQ_ERR:
    case IBV_EVENT_SRQ_LIMIT_REACHED:
        /* same description, only the error is reported loudly */
        log_level = (event->event_type == IBV_EVENT_SRQ_ERR) ?
                    UCS_LOG_LEVEL_ERROR : UCS_LOG_LEVEL_DEBUG;
        snprintf(desc, sizeof(desc), "%s on SRQ %p",
                 ibv_event_type_str(event->event_type), event->cookie);
        break;
    case IBV_EVENT_DEVICE_FATAL:
    case IBV_EVENT_PORT_ERR:
    case IBV_EVENT_PORT_ACTIVE:
#if HAVE_DECL_IBV_EVENT_GID_CHANGE
    case IBV_EVENT_GID_CHANGE:
#endif
    case IBV_EVENT_LID_CHANGE:
    case IBV_EVENT_PKEY_CHANGE:
    case IBV_EVENT_SM_CHANGE:
    case IBV_EVENT_CLIENT_REREGISTER:
        /* failures are errors, state/configuration changes are warnings */
        if ((event->event_type == IBV_EVENT_DEVICE_FATAL) ||
            (event->event_type == IBV_EVENT_PORT_ERR)) {
            log_level = UCS_LOG_LEVEL_ERROR;
        } else {
            log_level = UCS_LOG_LEVEL_WARN;
        }
        snprintf(desc, sizeof(desc), "%s on port %d",
                 ibv_event_type_str(event->event_type), event->port_num);
        break;
#ifdef HAVE_STRUCT_IBV_ASYNC_EVENT_ELEMENT_DCT
    case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
        log_level = UCS_LOG_LEVEL_ERROR;
        snprintf(desc, sizeof(desc), "%s on DCTN 0x%x",
                 "DCT key violation", event->dct_num);
        break;
    case IBV_EXP_EVENT_DCT_ACCESS_ERR:
        log_level = UCS_LOG_LEVEL_ERROR;
        if (event->dct_num) {
            snprintf(desc, sizeof(desc), "%s on DCTN 0x%x",
                     "DCT access error", event->dct_num);
        } else {
            /* zero means the DCT number is not known */
            snprintf(desc, sizeof(desc), "%s on DCTN UNKNOWN",
                     "DCT access error");
        }
        break;
    case IBV_EXP_EVENT_DCT_REQ_ERR:
        log_level = UCS_LOG_LEVEL_ERROR;
        snprintf(desc, sizeof(desc), "%s on DCTN 0x%x",
                 "DCT requester error", event->dct_num);
        break;
#endif
    default:
        log_level = UCS_LOG_LEVEL_INFO;
        snprintf(desc, sizeof(desc), "%s (%d)",
                 ibv_event_type_str(event->event_type), event->event_type);
        break;
    }

    UCS_STATS_UPDATE_COUNTER(dev->stats, UCT_IB_DEVICE_STAT_ASYNC_EVENT, +1);
    ucs_log(log_level, "IB Async event on %s: %s", uct_ib_device_name(dev), desc);
}
314 
/*
 * Read the PCI vendor and device ids of the device from sysfs and store them
 * in dev->pci_id. If either of them cannot be read, both are set to 0 and a
 * warning is printed.
 */
static void uct_ib_device_get_ids(uct_ib_device_t *dev)
{
    long vendor_id, device_id;
    ucs_status_t status;

    status = ucs_read_file_number(&vendor_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
                                  uct_ib_device_name(dev), "vendor");
    if (status == UCS_OK) {
        /* the device id is read only after the vendor id was read */
        status = ucs_read_file_number(&device_id, 1, UCT_IB_DEVICE_SYSFS_FMT,
                                      uct_ib_device_name(dev), "device");
    }

    if (status != UCS_OK) {
        dev->pci_id.vendor = 0;
        dev->pci_id.device = 0;
        ucs_warn("%s: could not read device/vendor id from sysfs, "
                 "performance may be affected", uct_ib_device_name(dev));
        return;
    }

    dev->pci_id.vendor = vendor_id;
    dev->pci_id.device = device_id;
    ucs_debug("%s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev),
              dev->pci_id.vendor, dev->pci_id.device);
}
334 
uct_ib_device_query(uct_ib_device_t * dev,struct ibv_device * ibv_device)335 ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
336                                  struct ibv_device *ibv_device)
337 {
338     ucs_status_t status;
339     uint8_t i;
340     int ret;
341 
342     status = uct_ib_query_device(dev->ibv_context, &dev->dev_attr);
343     if (status != UCS_OK) {
344         return status;
345     }
346 
347     /* Check device type*/
348     switch (ibv_device->node_type) {
349     case IBV_NODE_SWITCH:
350         dev->first_port = 0;
351         dev->num_ports  = 1;
352         break;
353     case IBV_NODE_CA:
354     default:
355         dev->first_port = 1;
356         dev->num_ports  = IBV_DEV_ATTR(dev, phys_port_cnt);
357         break;
358     }
359 
360     if (dev->num_ports > UCT_IB_DEV_MAX_PORTS) {
361         ucs_error("%s has %d ports, but only up to %d are supported",
362                   ibv_get_device_name(ibv_device), dev->num_ports,
363                   UCT_IB_DEV_MAX_PORTS);
364         return UCS_ERR_UNSUPPORTED;
365     }
366 
367     /* Query all ports */
368     for (i = 0; i < dev->num_ports; ++i) {
369         ret = ibv_query_port(dev->ibv_context, i + dev->first_port,
370                              &dev->port_attr[i]);
371         if (ret != 0) {
372             ucs_error("ibv_query_port() returned %d: %m", ret);
373             return UCS_ERR_IO_ERROR;
374         }
375     }
376 
377     uct_ib_device_get_ids(dev);
378 
379     return UCS_OK;
380 }
381 
uct_ib_device_init(uct_ib_device_t * dev,struct ibv_device * ibv_device,int async_events UCS_STATS_ARG (ucs_stats_node_t * stats_parent))382 ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
383                                 struct ibv_device *ibv_device, int async_events
384                                 UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
385 {
386     ucs_status_t status;
387 
388     dev->async_events = async_events;
389 
390     uct_ib_device_get_locality(ibv_get_device_name(ibv_device), &dev->local_cpus,
391                                &dev->numa_node);
392 
393     status = UCS_STATS_NODE_ALLOC(&dev->stats, &uct_ib_device_stats_class,
394                                   stats_parent, "device");
395     if (status != UCS_OK) {
396         goto err;
397     }
398 
399     status = ucs_sys_fcntl_modfl(dev->ibv_context->async_fd, O_NONBLOCK, 0);
400     if (status != UCS_OK) {
401         goto err_release_stats;
402     }
403 
404     /* Register to IB async events */
405     if (dev->async_events) {
406         status = ucs_async_set_event_handler(UCS_ASYNC_THREAD_LOCK_TYPE,
407                                              dev->ibv_context->async_fd,
408                                              UCS_EVENT_SET_EVREAD,
409                                              uct_ib_async_event_handler, dev,
410                                              NULL);
411         if (status != UCS_OK) {
412             goto err_release_stats;
413         }
414     }
415 
416     kh_init_inplace(uct_ib_ah, &dev->ah_hash);
417     ucs_recursive_spinlock_init(&dev->ah_lock, 0);
418 
419     ucs_debug("initialized device '%s' (%s) with %d ports", uct_ib_device_name(dev),
420               ibv_node_type_str(ibv_device->node_type),
421               dev->num_ports);
422     return UCS_OK;
423 
424 err_release_stats:
425     UCS_STATS_NODE_FREE(dev->stats);
426 err:
427     return status;
428 }
429 
/*
 * Destroy every address handle stored in the device address handle cache.
 * The hash table itself is not modified here.
 */
void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev)
{
    khiter_t iter;

    for (iter = kh_begin(&dev->ah_hash); iter != kh_end(&dev->ah_hash);
         ++iter) {
        if (!kh_exist(&dev->ah_hash, iter)) {
            continue; /* empty or deleted bucket */
        }

        ibv_destroy_ah(kh_value(&dev->ah_hash, iter));
    }
}
436 
/*
 * Release the resources of the device object: the address handle hash table,
 * its lock, the async event handler (if one was registered) and the
 * statistics node. Cached address handles are not destroyed here, see
 * uct_ib_device_cleanup_ah_cached().
 */
void uct_ib_device_cleanup(uct_ib_device_t *dev)
{
    ucs_status_t lock_status;

    ucs_debug("destroying ib device %s", uct_ib_device_name(dev));

    kh_destroy_inplace(uct_ib_ah, &dev->ah_hash);

    /* failure to destroy the lock is reported, cleanup goes on */
    lock_status = ucs_recursive_spinlock_destroy(&dev->ah_lock);
    if (lock_status != UCS_OK) {
        ucs_warn("ucs_recursive_spinlock_destroy() failed (%d)", lock_status);
    }

    /* the handler exists only if async events were requested */
    if (dev->async_events) {
        ucs_async_remove_handler(dev->ibv_context->async_fd, 1);
    }

    UCS_STATS_NODE_FREE(dev->stats);
}
455 
/* Return nonzero if the PCI vendor and device ids of the specification are
 * both equal to those of the device, zero otherwise */
static inline int uct_ib_device_spec_match(uct_ib_device_t *dev,
                                           const uct_ib_device_spec_t *spec)
{
    if (spec->pci_id.vendor != dev->pci_id.vendor) {
        return 0;
    }

    return spec->pci_id.device == dev->pci_id.device;
}
462 
uct_ib_device_spec(uct_ib_device_t * dev)463 const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev)
464 {
465     uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
466     uct_ib_device_spec_t *spec;
467 
468     /* search through devices specified in the configuration */
469     for (spec = md->custom_devices.specs;
470          spec < md->custom_devices.specs + md->custom_devices.count; ++spec) {
471         if (uct_ib_device_spec_match(dev, spec)) {
472             return spec;
473         }
474     }
475 
476     /* search through built-in list of device specifications */
477     spec = uct_ib_builtin_device_specs;
478     while ((spec->name != NULL) && !uct_ib_device_spec_match(dev, spec)) {
479         ++spec;
480     }
481     return spec; /* if no match is found, return the last entry, which contains
482                     default settings for unknown devices */
483 }
484 
/* Return the GID index to use on IB ports: the configured value, or
 * UCT_IB_MD_DEFAULT_GID_INDEX if the configuration is set to "auto" */
static size_t uct_ib_device_get_ib_gid_index(uct_ib_md_t *md)
{
    if (md->config.gid_index != UCS_ULUNITS_AUTO) {
        return md->config.gid_index;
    }

    return UCT_IB_MD_DEFAULT_GID_INDEX;
}
493 
/* Return nonzero if the transport type of the underlying verbs device is
 * iWarp */
static int uct_ib_device_is_iwarp(uct_ib_device_t *dev)
{
    const struct ibv_device *ibv_dev = dev->ibv_context->device;

    return ibv_dev->transport_type == IBV_TRANSPORT_IWARP;
}
498 
uct_ib_device_port_check(uct_ib_device_t * dev,uint8_t port_num,unsigned flags)499 ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
500                                       unsigned flags)
501 {
502     uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
503     const uct_ib_device_spec_t *dev_info;
504     uint8_t required_dev_flags;
505     ucs_status_t status;
506     union ibv_gid gid;
507 
508     if (port_num < dev->first_port || port_num >= dev->first_port + dev->num_ports) {
509         return UCS_ERR_NO_DEVICE;
510     }
511 
512     if (uct_ib_device_port_attr(dev, port_num)->state != IBV_PORT_ACTIVE) {
513         ucs_trace("%s:%d is not active (state: %d)", uct_ib_device_name(dev),
514                   port_num, uct_ib_device_port_attr(dev, port_num)->state);
515         return UCS_ERR_UNREACHABLE;
516     }
517 
518     if (uct_ib_device_is_iwarp(dev)) {
519         /* TODO: enable it when support is ready */
520         ucs_debug("iWarp device %s is not supported", uct_ib_device_name(dev));
521         return UCS_ERR_UNSUPPORTED;
522     }
523 
524     if (!uct_ib_device_is_port_ib(dev, port_num) && (flags & UCT_IB_DEVICE_FLAG_LINK_IB)) {
525         ucs_debug("%s:%d is not IB link layer", uct_ib_device_name(dev),
526                   port_num);
527         return UCS_ERR_UNSUPPORTED;
528     }
529 
530     if (flags & UCT_IB_DEVICE_FLAG_DC) {
531         if (!IBV_DEVICE_HAS_DC(dev)) {
532             ucs_trace("%s:%d does not support DC", uct_ib_device_name(dev), port_num);
533             return UCS_ERR_UNSUPPORTED;
534         }
535     }
536 
537     /* check generic device flags */
538     dev_info           = uct_ib_device_spec(dev);
539     required_dev_flags = flags & (UCT_IB_DEVICE_FLAG_MLX4_PRM |
540                                   UCT_IB_DEVICE_FLAG_MLX5_PRM);
541     if (!ucs_test_all_flags(dev_info->flags, required_dev_flags)) {
542         ucs_trace("%s:%d (%s) does not support flags 0x%x", uct_ib_device_name(dev),
543                   port_num, dev_info->name, required_dev_flags);
544         return UCS_ERR_UNSUPPORTED;
545     }
546 
547     if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num)) {
548         status = uct_ib_device_query_gid(dev, port_num,
549                                          uct_ib_device_get_ib_gid_index(md), &gid);
550         if (status != UCS_OK) {
551             return status;
552         }
553 
554         if (md->subnet_filter != gid.global.subnet_prefix) {
555             ucs_trace("%s:%d subnet_prefix does not match",
556                       uct_ib_device_name(dev), port_num);
557             return UCS_ERR_UNSUPPORTED;
558         }
559     }
560 
561     return UCS_OK;
562 }
563 
/* Return a constant, human-readable name of a RoCE version; a placeholder
 * string is returned for a value which is not one of the known versions */
const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver)
{
    if (roce_ver == UCT_IB_DEVICE_ROCE_V1) {
        return "RoCE v1";
    } else if (roce_ver == UCT_IB_DEVICE_ROCE_V1_5) {
        return "RoCE v1.5";
    } else if (roce_ver == UCT_IB_DEVICE_ROCE_V2) {
        return "RoCE v2";
    }

    return "<unknown RoCE version>";
}
577 
/*
 * Format a GID as an IPv6 address string.
 *
 * @param gid       GID to format.
 * @param str       Buffer which receives the string.
 * @param max_size  Size of the buffer, in bytes.
 *
 * @return Always @a str. If the GID cannot be formatted (for example, the
 *         buffer is too small) and the buffer is not of zero size, it
 *         contains an empty string, so the result can be safely printed.
 */
const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size)
{
    if ((inet_ntop(AF_INET6, gid, str, max_size) == NULL) && (max_size > 0)) {
        /* the content of the buffer is unspecified after a failure, do not
         * hand it over to the caller as if it was a string */
        str[0] = '\0';
    }

    return str;
}
583 
/*
 * Check for an IPv4 encoded multicast address: the first 32-bit word of the
 * address is ff0e:0000 (in network byte order), and both the second word and
 * addr_last_bits are zero.
 *
 * @param raw             Address to check.
 * @param addr_last_bits  Value computed by the caller from the third word of
 *                        the address.
 *
 * @return 1 if the address matches, 0 otherwise.
 */
static int uct_ib_device_is_addr_ipv4_mcast(const struct in6_addr *raw,
                                            const uint32_t addr_last_bits)
{
    if (raw->s6_addr32[0] != htonl(0xff0e0000)) {
        return 0;
    }

    return (raw->s6_addr32[1] | addr_last_bits) == 0;
}
591 
uct_ib_device_get_addr_family(union ibv_gid * gid,int gid_index)592 static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid, int gid_index)
593 {
594     const struct in6_addr *raw    = (struct in6_addr *)gid->raw;
595     const uint32_t addr_last_bits = raw->s6_addr32[2] ^ htonl(0x0000ffff);
596     char p[128];
597 
598     ucs_debug("testing addr_family on gid index %d: %s",
599               gid_index, uct_ib_gid_str(gid, p, sizeof(p)));
600 
601     if (!((raw->s6_addr32[0] | raw->s6_addr32[1]) | addr_last_bits) ||
602         uct_ib_device_is_addr_ipv4_mcast(raw, addr_last_bits)) {
603         return AF_INET;
604     } else {
605         return AF_INET6;
606     }
607 }
608 
609 ucs_status_t
uct_ib_device_query_gid_info(struct ibv_context * ctx,const char * dev_name,uint8_t port_num,unsigned gid_index,uct_ib_device_gid_info_t * info)610 uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name,
611                              uint8_t port_num, unsigned gid_index,
612                              uct_ib_device_gid_info_t *info)
613 {
614     int ret;
615 
616 #if HAVE_DECL_IBV_EXP_QUERY_GID_ATTR
617     struct ibv_exp_gid_attr attr;
618 
619     attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE | IBV_EXP_QUERY_GID_ATTR_GID;
620     ret = ibv_exp_query_gid_attr(ctx, port_num, gid_index, &attr);
621     if (ret == 0) {
622         info->gid                  = attr.gid;
623         info->gid_index            = gid_index;
624         info->roce_info.addr_family =
625                         uct_ib_device_get_addr_family(&info->gid, gid_index);
626         switch (attr.type) {
627         case IBV_EXP_IB_ROCE_V1_GID_TYPE:
628             info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
629             return UCS_OK;
630         case IBV_EXP_ROCE_V1_5_GID_TYPE:
631             info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1_5;
632             return UCS_OK;
633         case IBV_EXP_ROCE_V2_GID_TYPE:
634             info->roce_info.ver = UCT_IB_DEVICE_ROCE_V2;
635             return UCS_OK;
636         default:
637             ucs_error("Invalid GID[%d] type on %s:%d: %d",
638                       gid_index, dev_name, port_num, attr.type);
639             return UCS_ERR_IO_ERROR;
640         }
641     }
642 #else
643     char buf[16];
644 
645     ret = ibv_query_gid(ctx, port_num, gid_index, &info->gid);
646     if (ret == 0) {
647         ret = ucs_read_file(buf, sizeof(buf) - 1, 1,
648                             UCT_IB_DEVICE_SYSFS_GID_TYPE_FMT,
649                             dev_name, port_num, gid_index);
650         if (ret > 0) {
651             if (!strncmp(buf, "IB/RoCE v1", 10)) {
652                 info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
653             } else if (!strncmp(buf, "RoCE v2", 7)) {
654                 info->roce_info.ver = UCT_IB_DEVICE_ROCE_V2;
655             } else {
656                 ucs_error("failed to parse gid type '%s' (dev=%s port=%d index=%d)",
657                           buf, dev_name, port_num, gid_index);
658                 return UCS_ERR_INVALID_PARAM;
659             }
660         } else {
661             info->roce_info.ver = UCT_IB_DEVICE_ROCE_V1;
662         }
663 
664         info->roce_info.addr_family =
665                         uct_ib_device_get_addr_family(&info->gid, gid_index);
666         info->gid_index            = gid_index;
667         return UCS_OK;
668     }
669 #endif
670     ucs_error("ibv_query_gid(dev=%s port=%d index=%d) failed: %m",
671               dev_name, port_num, gid_index);
672     return UCS_ERR_INVALID_PARAM;
673 }
674 
/*
 * Test whether a GID table entry of a RoCE port is operational, by trying to
 * create (and immediately destroy) an address handle which uses it as the
 * source GID index.
 *
 * @param dev        Device; must be embedded in a uct_ib_md_t, whose
 *                   protection domain is used for the address handle.
 * @param port_num   Port number; the port must be RoCE.
 * @param gid        GID which is used as the destination of the handle.
 * @param gid_index  GID table index to test.
 *
 * @return 1 if the address handle could be created, 0 otherwise.
 */
int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
                                      const union ibv_gid *gid,
                                      uint8_t gid_index)
{
    uct_ib_md_t *md = ucs_container_of(dev, uct_ib_md_t, dev);
    struct ibv_ah_attr ah_attr;
    struct ibv_ah *ah;

    ucs_assert(uct_ib_device_is_port_roce(dev, port_num));

    memset(&ah_attr, 0, sizeof(ah_attr));
    ah_attr.dlid           = UCT_IB_ROCE_UDP_SRC_PORT_BASE;
    ah_attr.port_num       = port_num;
    ah_attr.is_global      = 1;
    ah_attr.grh.dgid       = *gid;
    ah_attr.grh.sgid_index = gid_index;
    ah_attr.grh.flow_label = 1;
    ah_attr.grh.hop_limit  = 255;

    ah = ibv_create_ah(md->pd, &ah_attr);
    if (ah != NULL) {
        ibv_destroy_ah(ah);
        return 1;
    }

    return 0; /* gid entry is not operational */
}
701 
uct_ib_device_select_gid(uct_ib_device_t * dev,uint8_t port_num,uct_ib_device_gid_info_t * gid_info)702 ucs_status_t uct_ib_device_select_gid(uct_ib_device_t *dev, uint8_t port_num,
703                                       uct_ib_device_gid_info_t *gid_info)
704 {
705     static const uct_ib_roce_version_info_t roce_prio[] = {
706         {UCT_IB_DEVICE_ROCE_V2, AF_INET},
707         {UCT_IB_DEVICE_ROCE_V2, AF_INET6},
708         {UCT_IB_DEVICE_ROCE_V1, AF_INET},
709         {UCT_IB_DEVICE_ROCE_V1, AF_INET6}
710     };
711     int gid_tbl_len         = uct_ib_device_port_attr(dev, port_num)->gid_tbl_len;
712     ucs_status_t status     = UCS_OK;
713     int priorities_arr_len  = ucs_static_array_size(roce_prio);
714     uct_ib_device_gid_info_t gid_info_tmp;
715     int i, prio_idx;
716 
717     ucs_assert(uct_ib_device_is_port_roce(dev, port_num));
718 
719     /* search for matching GID table entries, according to the order defined
720      * in priorities array
721      */
722     for (prio_idx = 0; prio_idx < priorities_arr_len; prio_idx++) {
723         for (i = 0; i < gid_tbl_len; i++) {
724             status = uct_ib_device_query_gid_info(dev->ibv_context,
725                                                   uct_ib_device_name(dev),
726                                                   port_num, i, &gid_info_tmp);
727             if (status != UCS_OK) {
728                 goto out;
729             }
730 
731             if ((roce_prio[prio_idx].ver         == gid_info_tmp.roce_info.ver) &&
732                 (roce_prio[prio_idx].addr_family == gid_info_tmp.roce_info.addr_family) &&
733                 uct_ib_device_test_roce_gid_index(dev, port_num, &gid_info_tmp.gid, i)) {
734 
735                 gid_info->gid_index = i;
736                 gid_info->roce_info = gid_info_tmp.roce_info;
737                 goto out_print;
738             }
739         }
740     }
741 
742     gid_info->gid_index             = UCT_IB_MD_DEFAULT_GID_INDEX;
743     gid_info->roce_info.ver         = UCT_IB_DEVICE_ROCE_V1;
744     gid_info->roce_info.addr_family = AF_INET;
745 
746 out_print:
747     ucs_debug("%s:%d using gid_index %d", uct_ib_device_name(dev), port_num,
748               gid_info->gid_index);
749 out:
750     return status;
751 }
752 
/* Return nonzero if the link layer of the port is InfiniBand. If the verbs
 * library does not declare IBV_LINK_LAYER_INFINIBAND, every port is
 * considered to be InfiniBand. */
int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num)
{
#if !HAVE_DECL_IBV_LINK_LAYER_INFINIBAND
    return 1;
#else
    return uct_ib_device_port_attr(dev, port_num)->link_layer ==
           IBV_LINK_LAYER_INFINIBAND;
#endif
}
761 
/* Return the result of IBV_PORT_IS_LINK_LAYER_ETHERNET() for the attributes
 * of the given port, i.e. nonzero for a RoCE port (presumably; the macro is
 * defined elsewhere - confirm there how ports without link layer information
 * are treated). */
int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num)
{
    return IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_device_port_attr(dev, port_num));
}
766 
/* Return the name of the verbs device behind the context of this device, as
 * reported by ibv_get_device_name() */
const char *uct_ib_device_name(uct_ib_device_t *dev)
{
    struct ibv_device *ibv_dev = dev->ibv_context->device;

    return ibv_get_device_name(ibv_dev);
}
771 
uct_ib_device_bus(uct_ib_device_t * dev,int port_num,ucs_sys_bus_id_t * bus_id)772 ucs_status_t uct_ib_device_bus(uct_ib_device_t *dev, int port_num,
773                                ucs_sys_bus_id_t *bus_id)
774 {
775     char ib_realpath[PATH_MAX];
776     char *pcie_bus;
777     char *tmp;
778     int i, bus_len;
779     int num_inputs;
780 
781     if (NULL == realpath(dev->ibv_context->device->ibdev_path, ib_realpath)) {
782         return UCS_ERR_NO_RESOURCE;
783     }
784 
785     /* realpath name is of form /sys/devices/.../0000:05:00.0/infiniband/mlx5_0
786      * and bus_id is constructed from 0000:05:00.0 */
787 
788     /* Make sure there is /infiniband substring in ib_realpath*/
789     tmp = strstr(ib_realpath, "/infiniband");
790     if (NULL == tmp) {
791         return UCS_ERR_NO_RESOURCE;
792     }
793 
794     pcie_bus = dirname(ib_realpath);
795     pcie_bus = dirname(pcie_bus);
796     pcie_bus = basename(pcie_bus);
797 
798     bus_len = strlen(pcie_bus);
799     for (i = 0; i < bus_len; i++) {
800         if ((pcie_bus[i] == ':') || (pcie_bus[i] == '.')) {
801             pcie_bus[i] = ' ';
802         }
803     }
804 
805     num_inputs = sscanf(pcie_bus, "%hx %hhx %hhx %hhx", &bus_id->domain,
806                                                         &bus_id->bus,
807                                                         &bus_id->slot,
808                                                         &bus_id->function);
809     if (num_inputs != 4) {
810         return UCS_ERR_NO_RESOURCE;
811     }
812 
813     ucs_debug("ib device = %s:%d, bus id = %hu:%hhu:%hhu.%hhu",
814                uct_ib_device_name(dev), port_num, bus_id->domain, bus_id->bus,
815                bus_id->slot, bus_id->function);
816 
817     return UCS_OK;
818 }
819 
uct_ib_mtu_value(enum ibv_mtu mtu)820 size_t uct_ib_mtu_value(enum ibv_mtu mtu)
821 {
822     switch (mtu) {
823     case IBV_MTU_256:
824         return 256;
825     case IBV_MTU_512:
826         return 512;
827     case IBV_MTU_1024:
828         return 1024;
829     case IBV_MTU_2048:
830         return 2048;
831     case IBV_MTU_4096:
832         return 4096;
833     }
834     ucs_fatal("Invalid MTU value (%d)", mtu);
835 }
836 
uct_ib_to_qp_fabric_time(double t)837 uint8_t uct_ib_to_qp_fabric_time(double t)
838 {
839     double to;
840 
841     to = log(t / 4.096e-6) / log(2.0);
842     if (to < 1) {
843         return 1; /* Very small timeout */
844     } else if ((long)(to + 0.5) >= UCT_IB_FABRIC_TIME_MAX) {
845         return 0; /* No timeout */
846     } else {
847         return (long)(to + 0.5);
848     }
849 }
850 
uct_ib_to_rnr_fabric_time(double t)851 uint8_t uct_ib_to_rnr_fabric_time(double t)
852 {
853     double time_ms = t * UCS_MSEC_PER_SEC;
854     uint8_t idx, next_index;
855     double avg_ms;
856 
857     for (idx = 1; idx < UCT_IB_FABRIC_TIME_MAX; idx++) {
858         next_index = (idx + 1) % UCT_IB_FABRIC_TIME_MAX;
859 
860         if (time_ms <= uct_ib_qp_rnr_time_ms[next_index]) {
861             avg_ms = (uct_ib_qp_rnr_time_ms[idx] +
862                       uct_ib_qp_rnr_time_ms[next_index]) * 0.5;
863 
864             if (time_ms < avg_ms) {
865                 /* return previous index */
866                 return idx;
867             } else {
868                 /* return current index */
869                 return next_index;
870             }
871         }
872     }
873 
874     return 0; /* this is a special value that means the maximum value */
875 }
876 
uct_ib_modify_qp(struct ibv_qp * qp,enum ibv_qp_state state)877 ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state)
878 {
879     struct ibv_qp_attr qp_attr;
880 
881     ucs_debug("modify QP 0x%x to state %d", qp->qp_num, state);
882     memset(&qp_attr, 0, sizeof(qp_attr));
883     qp_attr.qp_state = state;
884     if (ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE)) {
885         ucs_warn("modify qp 0x%x to state %d failed: %m", qp->qp_num, state);
886         return UCS_ERR_IO_ERROR;
887     }
888 
889     return UCS_OK;
890 }
891 
uct_ib_device_query_ports(uct_ib_device_t * dev,unsigned flags,uct_tl_device_resource_t ** tl_devices_p,unsigned * num_tl_devices_p)892 ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
893                                        uct_tl_device_resource_t **tl_devices_p,
894                                        unsigned *num_tl_devices_p)
895 {
896     uct_tl_device_resource_t *tl_devices;
897     unsigned num_tl_devices;
898     ucs_status_t status;
899     uint8_t port_num;
900 
901     /* Allocate resources array
902      * We may allocate more memory than really required, but it's not so bad. */
903     tl_devices = ucs_calloc(dev->num_ports, sizeof(*tl_devices), "ib device resource");
904     if (tl_devices == NULL) {
905         status = UCS_ERR_NO_MEMORY;
906         goto err;
907     }
908 
909     /* Second pass: fill port information */
910     num_tl_devices = 0;
911     for (port_num = dev->first_port; port_num < dev->first_port + dev->num_ports;
912          ++port_num)
913     {
914         /* Check port capabilities */
915         status = uct_ib_device_port_check(dev, port_num, flags);
916         if (status != UCS_OK) {
917            ucs_trace("%s:%d does not support flags 0x%x: %s",
918                      uct_ib_device_name(dev), port_num, flags,
919                      ucs_status_string(status));
920            continue;
921         }
922 
923         /* Save device information */
924         ucs_snprintf_zero(tl_devices[num_tl_devices].name,
925                           sizeof(tl_devices[num_tl_devices].name),
926                           "%s:%d", uct_ib_device_name(dev), port_num);
927         tl_devices[num_tl_devices].type = UCT_DEVICE_TYPE_NET;
928         ++num_tl_devices;
929     }
930 
931     if (num_tl_devices == 0) {
932         ucs_debug("no compatible IB ports found for flags 0x%x", flags);
933         status = UCS_ERR_NO_DEVICE;
934         goto err_free;
935     }
936 
937     *num_tl_devices_p = num_tl_devices;
938     *tl_devices_p     = tl_devices;
939     return UCS_OK;
940 
941 err_free:
942     ucs_free(tl_devices);
943 err:
944     return status;
945 }
946 
uct_ib_device_find_port(uct_ib_device_t * dev,const char * resource_dev_name,uint8_t * p_port_num)947 ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
948                                      const char *resource_dev_name,
949                                      uint8_t *p_port_num)
950 {
951     const char *ibdev_name;
952     unsigned port_num;
953     size_t devname_len;
954     char *p;
955 
956     p = strrchr(resource_dev_name, ':');
957     if (p == NULL) {
958         goto err; /* Wrong device name format */
959     }
960     devname_len = p - resource_dev_name;
961 
962     ibdev_name = uct_ib_device_name(dev);
963     if ((strlen(ibdev_name) != devname_len) ||
964         strncmp(ibdev_name, resource_dev_name, devname_len))
965     {
966         goto err; /* Device name is wrong */
967     }
968 
969     port_num = strtod(p + 1, &p);
970     if (*p != '\0') {
971         goto err; /* Failed to parse port number */
972     }
973     if ((port_num < dev->first_port) || (port_num >= dev->first_port + dev->num_ports)) {
974         goto err; /* Port number out of range */
975     }
976 
977     *p_port_num = port_num;
978     return UCS_OK;
979 
980 err:
981     ucs_error("%s: failed to find port", resource_dev_name);
982     return UCS_ERR_NO_DEVICE;
983 }
984 
uct_ib_device_mtu(const char * dev_name,uct_md_h md,int * p_mtu)985 ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu)
986 {
987     uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
988     uint8_t port_num;
989     ucs_status_t status;
990 
991     status = uct_ib_device_find_port(dev, dev_name, &port_num);
992     if (status != UCS_OK) {
993         return status;
994     }
995 
996     *p_mtu = uct_ib_mtu_value(uct_ib_device_port_attr(dev, port_num)->active_mtu);
997     return UCS_OK;
998 }
999 
/*
 * Check whether a raw GID consists only of zero bytes.
 *
 * @param gid_raw  Pointer to the 16 raw GID bytes. No alignment is required.
 *
 * @return Non-zero if all 16 bytes are zero, 0 otherwise.
 */
int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw)
{
    uint64_t words[2];

    /* Copy the bytes out instead of dereferencing gid_raw through a
     * uint64_t pointer: the byte buffer is not guaranteed to be 8-byte
     * aligned, and such a cast would also break strict aliasing rules */
    memcpy(words, gid_raw, sizeof(words));

    return (words[0] == 0) && (words[1] == 0);
}
1004 
uct_ib_device_query_gid(uct_ib_device_t * dev,uint8_t port_num,unsigned gid_index,union ibv_gid * gid)1005 ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
1006                                      unsigned gid_index, union ibv_gid *gid)
1007 {
1008     uct_ib_device_gid_info_t gid_info;
1009     ucs_status_t status;
1010 
1011     status = uct_ib_device_query_gid_info(dev->ibv_context, uct_ib_device_name(dev),
1012                                           port_num, gid_index, &gid_info);
1013     if (status != UCS_OK) {
1014         return status;
1015     }
1016 
1017     if (uct_ib_device_is_gid_raw_empty(gid_info.gid.raw)) {
1018         ucs_error("Invalid gid[%d] on %s:%d", gid_index,
1019                   uct_ib_device_name(dev), port_num);
1020         return UCS_ERR_INVALID_ADDR;
1021     }
1022 
1023     *gid = gid_info.gid;
1024     return UCS_OK;
1025 }
1026 
/*
 * Return the maximal memory region size for which on-demand paging (ODP) may
 * be used on the device, or 0 if ODP should not be used.
 *
 * When built without HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS the function
 * always returns 0. Otherwise 0 is returned when any of these holds:
 *  - RUNNING_ON_VALGRIND is true;
 *  - IBV_EXP_HAVE_ODP() is false for the device attributes;
 *  - the RC ODP caps lack any of SEND, WRITE or READ support;
 *  - the UD ODP caps lack SEND support;
 *  - the device has DC and either the DC ODP caps field is not available at
 *    build time, or the DC ODP caps lack any of the flags required for RC.
 *
 * If none of the above applies, the result is dev_attr->odp_mr_max_size when
 * that field is available at build time, and 256 MB otherwise.
 *
 * @param dev  IB device.
 *
 * @return Maximal ODP region size in bytes, 0 if ODP is not to be used.
 */
size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev)
{
#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS
    const struct ibv_exp_device_attr *dev_attr = &dev->dev_attr;
    uint32_t required_ud_odp_caps = IBV_EXP_ODP_SUPPORT_SEND;
    uint32_t required_rc_odp_caps = IBV_EXP_ODP_SUPPORT_SEND |
                                    IBV_EXP_ODP_SUPPORT_WRITE |
                                    IBV_EXP_ODP_SUPPORT_READ;

    /* ODP is disabled unless both RC and UD provide the required caps */
    if (RUNNING_ON_VALGRIND ||
        !IBV_EXP_HAVE_ODP(dev_attr) ||
        !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, rc), required_rc_odp_caps) ||
        !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, ud), required_ud_odp_caps))
    {
        return 0;
    }

    /* For a DC-capable device: if the DC ODP caps field exists, DC must
     * provide the same caps as RC; if the field does not exist, the condition
     * reduces to IBV_DEVICE_HAS_DC(dev) and ODP is disabled */
    if (IBV_DEVICE_HAS_DC(dev)
#  if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS_PER_TRANSPORT_CAPS_DC_ODP_CAPS
        && !ucs_test_all_flags(IBV_EXP_ODP_CAPS(dev_attr, dc), required_rc_odp_caps)
#  endif
        )
    {
        return 0;
    }

#  if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE
    return dev_attr->odp_mr_max_size;
#  else
    return 1ul << 28; /* Limit ODP to 256 MB by default */
#  endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_MR_MAX_SIZE */

#else
    return 0;
#endif /* HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_ODP_CAPS */
}
1063 
/*
 * Get a printable description of a work completion status.
 *
 * @param wc_status  Work completion status.
 *
 * @return The string returned by ibv_wc_status_str() for wc_status.
 */
const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status)
{
    const char *status_str = ibv_wc_status_str(wc_status);

    return status_str;
}
1068 
uct_ib_device_create_ah(uct_ib_device_t * dev,struct ibv_ah_attr * ah_attr,struct ibv_pd * pd,struct ibv_ah ** ah_p)1069 static ucs_status_t uct_ib_device_create_ah(uct_ib_device_t *dev,
1070                                             struct ibv_ah_attr *ah_attr,
1071                                             struct ibv_pd *pd,
1072                                             struct ibv_ah **ah_p)
1073 {
1074     struct ibv_ah *ah;
1075     char buf[128];
1076 
1077     ah = ibv_create_ah(pd, ah_attr);
1078     if (ah == NULL) {
1079         ucs_error("ibv_create_ah(%s) on %s failed: %m",
1080                   uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr),
1081                   uct_ib_device_name(dev));
1082         return UCS_ERR_INVALID_ADDR;
1083     }
1084 
1085     *ah_p = ah;
1086     return UCS_OK;
1087 }
1088 
uct_ib_device_create_ah_cached(uct_ib_device_t * dev,struct ibv_ah_attr * ah_attr,struct ibv_pd * pd,struct ibv_ah ** ah_p)1089 ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
1090                                             struct ibv_ah_attr *ah_attr,
1091                                             struct ibv_pd *pd,
1092                                             struct ibv_ah **ah_p)
1093 {
1094     ucs_status_t status = UCS_OK;
1095     khiter_t iter;
1096     int ret;
1097 
1098     ucs_recursive_spin_lock(&dev->ah_lock);
1099 
1100     /* looking for existing AH with same attributes */
1101     iter = kh_get(uct_ib_ah, &dev->ah_hash, *ah_attr);
1102     if (iter == kh_end(&dev->ah_hash)) {
1103         /* new AH */
1104         status = uct_ib_device_create_ah(dev, ah_attr, pd, ah_p);
1105         if (status != UCS_OK) {
1106             goto unlock;
1107         }
1108 
1109         /* store AH in hash */
1110         iter = kh_put(uct_ib_ah, &dev->ah_hash, *ah_attr, &ret);
1111 
1112         /* failed to store - rollback */
1113         if (iter == kh_end(&dev->ah_hash)) {
1114             ibv_destroy_ah(*ah_p);
1115             status = UCS_ERR_NO_MEMORY;
1116             goto unlock;
1117         }
1118 
1119         kh_value(&dev->ah_hash, iter) = *ah_p;
1120     } else {
1121         /* found existing AH */
1122         *ah_p = kh_value(&dev->ah_hash, iter);
1123     }
1124 
1125 unlock:
1126     ucs_recursive_spin_unlock(&dev->ah_lock);
1127     return status;
1128 }
1129 
uct_ib_get_cqe_size(int cqe_size_min)1130 int uct_ib_get_cqe_size(int cqe_size_min)
1131 {
1132     static int cqe_size_max = -1;
1133     int cqe_size;
1134 
1135     if (cqe_size_max == -1) {
1136 #ifdef __aarch64__
1137         char arm_board_vendor[128];
1138         ucs_aarch64_cpuid_t cpuid;
1139         ucs_aarch64_cpuid(&cpuid);
1140 
1141         arm_board_vendor[0] = '\0';
1142         ucs_read_file(arm_board_vendor, sizeof(arm_board_vendor), 1,
1143                       "/sys/devices/virtual/dmi/id/board_vendor");
1144         ucs_debug("arm_board_vendor is '%s'", arm_board_vendor);
1145 
1146         cqe_size_max = ((strcasestr(arm_board_vendor, "Huawei")) &&
1147                         (cpuid.implementer == 0x41) && (cpuid.architecture == 8) &&
1148                         (cpuid.variant == 0)        && (cpuid.part == 0xd08)     &&
1149                         (cpuid.revision == 2))
1150                        ? 64 : 128;
1151 #else
1152         cqe_size_max = 128;
1153 #endif
1154         ucs_debug("max IB CQE size is %d", cqe_size_max);
1155     }
1156 
1157     /* Set cqe size according to inline size and cache line size. */
1158     cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
1159     cqe_size = ucs_max(cqe_size, 64);  /* at least 64 */
1160     cqe_size = ucs_min(cqe_size, cqe_size_max);
1161 
1162     return cqe_size;
1163 }
1164 
1165 static ucs_status_t
uct_ib_device_get_roce_ndev_name(uct_ib_device_t * dev,uint8_t port_num,char * ndev_name,size_t max)1166 uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num,
1167                                  char *ndev_name, size_t max)
1168 {
1169     ssize_t nread;
1170 
1171     ucs_assert_always(uct_ib_device_is_port_roce(dev, port_num));
1172 
1173     /* get the network device name which corresponds to a RoCE port */
1174     nread = ucs_read_file_str(ndev_name, max, 1,
1175                               UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT,
1176                               uct_ib_device_name(dev), port_num, 0);
1177     if (nread < 0) {
1178         ucs_diag("failed to read " UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT": %m",
1179                  uct_ib_device_name(dev), port_num, 0);
1180         return UCS_ERR_NO_DEVICE;
1181     }
1182 
1183     ucs_strtrim(ndev_name);
1184     return UCS_OK;
1185 }
1186 
/*
 * Get the RoCE LAG level of a port.
 *
 * The network device behind the port is looked up with
 * uct_ib_device_get_roce_ndev_name(), and the value reported for it by
 * ucs_netif_bond_ad_num_ports() is returned.
 *
 * @param dev       IB device.
 * @param port_num  RoCE port number.
 *
 * @return Value of ucs_netif_bond_ad_num_ports() for the port's network
 *         device, or 1 if the network device name could not be obtained.
 */
unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, uint8_t port_num)
{
    char netdev[IFNAMSIZ];
    unsigned lag_level;

    if (uct_ib_device_get_roce_ndev_name(dev, port_num, netdev,
                                         sizeof(netdev)) != UCS_OK) {
        return 1;
    }

    lag_level = ucs_netif_bond_ad_num_ports(netdev);
    ucs_debug("RoCE LAG level on %s:%d (%s) is %u", uct_ib_device_name(dev),
              port_num, netdev, lag_level);
    return lag_level;
}
1204 
uct_ib_ah_attr_str(char * buf,size_t max,const struct ibv_ah_attr * ah_attr)1205 const char* uct_ib_ah_attr_str(char *buf, size_t max,
1206                                const struct ibv_ah_attr *ah_attr)
1207 {
1208     char *p    = buf;
1209     char *endp = buf + max;
1210 
1211     snprintf(p, endp - p, "dlid=%d sl=%d port=%d src_path_bits=%d",
1212              ah_attr->dlid, ah_attr->sl,
1213              ah_attr->port_num, ah_attr->src_path_bits);
1214     p += strlen(p);
1215 
1216     if (ah_attr->is_global) {
1217         snprintf(p, endp - p, " dgid=");
1218         p += strlen(p);
1219         uct_ib_gid_str(&ah_attr->grh.dgid, p, endp - p);
1220         p += strlen(p);
1221         snprintf(p, endp - p, " sgid_index=%d traffic_class=%d",
1222                  ah_attr->grh.sgid_index, ah_attr->grh.traffic_class);
1223     }
1224 
1225     return buf;
1226 }
1227