1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6 
7 #ifdef HAVE_CONFIG_H
8 #  include "config.h"
9 #endif
10 
11 #include "ib_iface.h"
12 #include "ib_log.h"
13 
14 #include <uct/base/uct_md.h>
15 #include <ucs/arch/bitops.h>
16 #include <ucs/arch/cpu.h>
17 #include <ucs/type/class.h>
18 #include <ucs/type/cpu_set.h>
19 #include <ucs/debug/log.h>
20 #include <ucs/time/time.h>
21 #include <ucs/memory/numa.h>
22 #include <ucs/sys/sock.h>
23 #include <ucm/util/sys.h>
24 #include <string.h>
25 #include <stdlib.h>
26 #include <poll.h>
27 
28 
/* Config-parser array type describing a comma-separated list of path-bits
 * ranges; used by the LID_PATH_BITS field in the table below */
static UCS_CONFIG_DEFINE_ARRAY(path_bits_spec,
                               sizeof(ucs_range_spec_t),
                               UCS_CONFIG_TYPE_RANGE_SPEC);
32 
/* String names for the path-MTU enumeration, indexed by the UCT_IB_MTU_*
 * values; NULL-terminated at UCT_IB_MTU_LAST so it can be used as a
 * UCS_CONFIG_TYPE_ENUM table (see PATH_MTU field below) */
const char *uct_ib_mtu_values[] = {
    [UCT_IB_MTU_DEFAULT]    = "default",
    [UCT_IB_MTU_512]        = "512",
    [UCT_IB_MTU_1024]       = "1024",
    [UCT_IB_MTU_2048]       = "2048",
    [UCT_IB_MTU_4096]       = "4096",
    [UCT_IB_MTU_LAST]       = NULL
};
41 
/* Interface address scope types. The first four are real wire-address types;
 * AUTO is a configuration-only alias (same numeric value as TYPE_LAST) that
 * requests detection from the link layer and subnet prefix. */
enum {
    UCT_IB_ADDRESS_TYPE_LINK_LOCAL,
    UCT_IB_ADDRESS_TYPE_SITE_LOCAL,
    UCT_IB_ADDRESS_TYPE_GLOBAL,
    UCT_IB_ADDRESS_TYPE_ETH,
    UCT_IB_ADDRESS_TYPE_LAST,
    UCT_IB_IFACE_ADDRESS_TYPE_AUTO  = UCT_IB_ADDRESS_TYPE_LAST,
    UCT_IB_IFACE_ADDRESS_TYPE_LAST
};
51 
/* Names for the address-type enum above; NULL-terminated for use as the
 * UCS_CONFIG_TYPE_ENUM table of the (deprecated) ADDR_TYPE config field */
static const char *uct_ib_iface_addr_types[] = {
   [UCT_IB_ADDRESS_TYPE_LINK_LOCAL] = "ib_local",
   [UCT_IB_ADDRESS_TYPE_SITE_LOCAL] = "ib_site_local",
   [UCT_IB_ADDRESS_TYPE_GLOBAL]     = "ib_global",
   [UCT_IB_ADDRESS_TYPE_ETH]        = "eth",
   [UCT_IB_IFACE_ADDRESS_TYPE_AUTO] = "auto",
   [UCT_IB_IFACE_ADDRESS_TYPE_LAST] = NULL
};
60 
/* Configuration table for all IB interfaces. Each entry maps an environment
 * variable suffix (prefixed elsewhere with the UCT IB config prefix) to a
 * field of uct_ib_iface_config_t. The first, empty-named entry chains the
 * generic uct_iface_config_table so base iface options are parsed too.
 * Terminated by the {NULL} sentinel. */
ucs_config_field_t uct_ib_iface_config_table[] = {
  {"", "", NULL,
   ucs_offsetof(uct_ib_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)},

  {"SEG_SIZE", "8192",
   "Size of bounce buffers used for post_send and post_recv.",
   ucs_offsetof(uct_ib_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_QUEUE_LEN", "256",
   "Length of send queue in the QP.",
   ucs_offsetof(uct_ib_iface_config_t, tx.queue_len), UCS_CONFIG_TYPE_UINT},

  {"TX_MAX_BATCH", "16",
   "Number of send WQEs to batch in one post-send list. Larger values reduce\n"
   "the CPU usage, but increase the latency and pipelining between sender and\n"
   "receiver.",
   ucs_offsetof(uct_ib_iface_config_t, tx.max_batch), UCS_CONFIG_TYPE_UINT},

  {"TX_MAX_POLL", "16",
   "Max number of receive completions to pick during TX poll",
   ucs_offsetof(uct_ib_iface_config_t, tx.max_poll), UCS_CONFIG_TYPE_UINT},

  {"TX_MIN_INLINE", "64",
   "Bytes to reserve in send WQE for inline data. Messages which are small\n"
   "enough will be sent inline.",
   ucs_offsetof(uct_ib_iface_config_t, tx.min_inline), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_INLINE_RESP", "0",
   "Bytes to reserve in send WQE for inline response. Responses which are small\n"
   "enough, such as of atomic operations and small reads, will be received inline.",
   ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_TX]), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_MIN_SGE", "3",
   "Number of SG entries to reserve in the send WQE.",
   ucs_offsetof(uct_ib_iface_config_t, tx.min_sge), UCS_CONFIG_TYPE_UINT},

/* CQ moderation fields exist only when the verbs library exposes
 * IBV_EXP_CQ_MODERATION */
#if HAVE_DECL_IBV_EXP_CQ_MODERATION
  {"TX_EVENT_MOD_COUNT", "0",
   "Number of send completions for which an event would be generated (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation_count), UCS_CONFIG_TYPE_UINT},

  {"TX_EVENT_MOD_PERIOD", "0us",
   "Time period to generate send event (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation_period), UCS_CONFIG_TYPE_TIME},

  {"RX_EVENT_MOD_COUNT", "0",
   "Number of received messages for which an event would be generated (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, rx.cq_moderation_count), UCS_CONFIG_TYPE_UINT},

  {"RX_EVENT_MOD_PERIOD", "0us",
   "Time period to generate receive event (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, rx.cq_moderation_period), UCS_CONFIG_TYPE_TIME},
#endif /* HAVE_DECL_IBV_EXP_CQ_MODERATION */

  UCT_IFACE_MPOOL_CONFIG_FIELDS("TX_", -1, 1024, "send",
                                ucs_offsetof(uct_ib_iface_config_t, tx.mp),
      "\nAttention: Setting this param with value != -1 is a dangerous thing\n"
      "in RC/DC and could cause deadlock or performance degradation."),

  {"RX_QUEUE_LEN", "4096",
   "Length of receive queue in the QPs.",
   ucs_offsetof(uct_ib_iface_config_t, rx.queue_len), UCS_CONFIG_TYPE_UINT},

  {"RX_MAX_BATCH", "16",
   "How many post-receives to perform in one batch.",
   ucs_offsetof(uct_ib_iface_config_t, rx.max_batch), UCS_CONFIG_TYPE_UINT},

  {"RX_MAX_POLL", "16",
   "Max number of receive completions to pick during RX poll",
   ucs_offsetof(uct_ib_iface_config_t, rx.max_poll), UCS_CONFIG_TYPE_UINT},

  {"RX_INLINE", "0",
   "Number of bytes to request for inline receive. If the maximal supported size\n"
   "is smaller, it will be used instead. If it is possible to support a larger\n"
   "size than requested with the same hardware resources, it will be used instead.",
   ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_RX]), UCS_CONFIG_TYPE_MEMUNITS},

  UCT_IFACE_MPOOL_CONFIG_FIELDS("RX_", -1, 0, "receive",
                                ucs_offsetof(uct_ib_iface_config_t, rx.mp), ""),

  {"ADDR_TYPE", "auto",
   "Set the interface address type. \"auto\" mode detects the type according to\n"
   "link layer type and IB subnet prefix.\n"
   "Deprecated. To force use of global routing use IS_GLOBAL.",
   ucs_offsetof(uct_ib_iface_config_t, addr_type),
   UCS_CONFIG_TYPE_ENUM(uct_ib_iface_addr_types)},

  {"IS_GLOBAL", "n",
   "Force interface to use global routing.",
   ucs_offsetof(uct_ib_iface_config_t, is_global), UCS_CONFIG_TYPE_BOOL},

  {"SL", "0",
   "IB Service Level / RoCEv2 Ethernet Priority.\n",
   ucs_offsetof(uct_ib_iface_config_t, sl), UCS_CONFIG_TYPE_UINT},

  {"TRAFFIC_CLASS", "auto",
   "IB Traffic Class / RoCEv2 Differentiated Services Code Point (DSCP).\n"
   "\"auto\" option selects 106 on RoCEv2 and 0 otherwise.",
   ucs_offsetof(uct_ib_iface_config_t, traffic_class), UCS_CONFIG_TYPE_ULUNITS},

  {"HOP_LIMIT", "255",
   "IB Hop limit / RoCEv2 Time to Live. Should be between 0 and 255.\n",
   ucs_offsetof(uct_ib_iface_config_t, hop_limit), UCS_CONFIG_TYPE_UINT},

  {"NUM_PATHS", "auto",
   "Number of connections that should be created between a pair of communicating\n"
   "endpoints for optimal performance. The default value 'auto' behaves according\n"
   "to the port link layer:\n"
   " RoCE       - "UCS_PP_MAKE_STRING(UCT_IB_DEV_MAX_PORTS) " for LAG port, otherwise - 1.\n"
   " InfiniBand - As the number of path bits enabled by fabric's LMC value and selected\n"
   "              by "UCS_DEFAULT_ENV_PREFIX UCT_IB_CONFIG_PREFIX"LID_PATH_BITS configuration.",
   ucs_offsetof(uct_ib_iface_config_t, num_paths), UCS_CONFIG_TYPE_ULUNITS},

  {"ROCE_PATH_FACTOR", "1",
   "Multiplier for RoCE LAG UDP source port calculation. The UDP source port\n"
   "is typically used by switches and network adapters to select a different\n"
   "path for the same pair of endpoints.",
   ucs_offsetof(uct_ib_iface_config_t, roce_path_factor), UCS_CONFIG_TYPE_UINT},

  {"LID_PATH_BITS", "0",
   "List of IB Path bits separated by comma (a,b,c) "
   "which will be the low portion of the LID, according to the LMC in the fabric.",
   ucs_offsetof(uct_ib_iface_config_t, lid_path_bits), UCS_CONFIG_TYPE_ARRAY(path_bits_spec)},

  {"PKEY", "auto",
   "Which pkey value to use. Should be between 0 and 0x7fff.\n"
   "\"auto\" option selects a first valid pkey value with full membership.",
   ucs_offsetof(uct_ib_iface_config_t, pkey), UCS_CONFIG_TYPE_HEX},

#ifdef HAVE_IBV_EXP_RES_DOMAIN
  {"RESOURCE_DOMAIN", "y",
   "Enable multiple resource domains (experimental).",
   ucs_offsetof(uct_ib_iface_config_t, enable_res_domain), UCS_CONFIG_TYPE_BOOL},
#endif

  {"PATH_MTU", "default",
   "Path MTU. \"default\" will select the best MTU for the device.",
   ucs_offsetof(uct_ib_iface_config_t, path_mtu),
                UCS_CONFIG_TYPE_ENUM(uct_ib_mtu_values)},

  {"ENABLE_CUDA_AFFINITY", "y",
   "Prefer IB devices closest to detected CUDA device\n",
   ucs_offsetof(uct_ib_iface_config_t, enable_cuda_affinity), UCS_CONFIG_TYPE_BOOL},

  {NULL}
};
207 
/* Return nonzero if the interface's port link layer is RoCE (Ethernet) */
int uct_ib_iface_is_roce(uct_ib_iface_t *iface)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    return uct_ib_device_is_port_roce(dev, iface->config.port_num);
}
213 
/* Return nonzero if the interface's port link layer is InfiniBand */
int uct_ib_iface_is_ib(uct_ib_iface_t *iface)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    return uct_ib_device_is_port_ib(dev, iface->config.port_num);
}
219 
/* Memory-pool element initializer: caches the local key of the registered
 * memory region in each receive descriptor */
static void uct_ib_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh)
{
    uct_ib_iface_recv_desc_t *rx_desc = (uct_ib_iface_recv_desc_t*)obj;

    rx_desc->lkey = uct_ib_memh_get_lkey(memh);
}
226 
uct_ib_iface_recv_mpool_init(uct_ib_iface_t * iface,const uct_ib_iface_config_t * config,const char * name,ucs_mpool_t * mp)227 ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
228                                           const uct_ib_iface_config_t *config,
229                                           const char *name, ucs_mpool_t *mp)
230 {
231     unsigned grow;
232 
233     if (config->rx.queue_len < 1024) {
234         grow = 1024;
235     } else {
236         /* We want to have some free (+10%) elements to avoid mem pool expansion */
237         grow = ucs_min( (int)(1.1 * config->rx.queue_len + 0.5),
238                         config->rx.mp.max_bufs);
239     }
240 
241     return uct_iface_mpool_init(&iface->super, mp,
242                                 iface->config.rx_payload_offset + iface->config.seg_size,
243                                 iface->config.rx_hdr_offset,
244                                 UCS_SYS_CACHE_LINE_SIZE,
245                                 &config->rx.mp, grow,
246                                 uct_ib_iface_recv_desc_init,
247                                 name);
248 }
249 
/* uct_recv_desc_t release callback: returns a receive descriptor to its
 * memory pool. 'desc' points at the user-visible payload, so step back by
 * the configured RX headroom offset to recover the mpool element. */
void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc)
{
    uct_ib_iface_t *iface = ucs_container_of(self, uct_ib_iface_t, release_desc);
    void *mpool_elem;

    mpool_elem = UCS_PTR_BYTE_OFFSET(desc,
                                     -(ptrdiff_t)iface->config.rx_headroom_offset);
    ucs_mpool_put_inline(mpool_elem);
}
258 
259 static inline uct_ib_roce_version_t
uct_ib_address_flags_get_roce_version(uint8_t flags)260 uct_ib_address_flags_get_roce_version(uint8_t flags)
261 {
262     ucs_assert(flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH);
263     return (uct_ib_roce_version_t)(flags >> ucs_ilog2(UCT_IB_ADDRESS_FLAG_ETH_LAST));
264 }
265 
266 static inline sa_family_t
uct_ib_address_flags_get_roce_af(uint8_t flags)267 uct_ib_address_flags_get_roce_af(uint8_t flags)
268 {
269     ucs_assert(flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH);
270     return (flags & UCT_IB_ADDRESS_FLAG_ROCE_IPV6) ?
271            AF_INET6 : AF_INET;
272 }
273 
uct_ib_address_size(const uct_ib_address_pack_params_t * params)274 size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params)
275 {
276     size_t size = sizeof(uct_ib_address_t);
277 
278     if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
279         /* Ethernet: address contains only raw GID */
280         size += sizeof(union ibv_gid);
281     } else {
282         /* InfiniBand: address always contains LID */
283         size += sizeof(uint16_t); /* lid */
284 
285         if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) {
286             /* Add GUID */
287             UCS_STATIC_ASSERT(sizeof(params->gid.global.interface_id) == sizeof(uint64_t));
288             size += sizeof(uint64_t);
289         }
290 
291         if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
292             if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
293                                                     UCT_IB_SITE_LOCAL_PREFIX) {
294                 /* 16-bit subnet prefix */
295                 size += sizeof(uint16_t);
296             } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
297                 /* 64-bit subnet prefix */
298                 size += sizeof(uint64_t);
299             }
300             /* Note: if subnet prefix is LINK_LOCAL, no need to pack it because
301              * it's a well-known value defined by IB specification.
302              */
303         }
304     }
305 
306     if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
307         size += sizeof(uint8_t);
308     }
309 
310     if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
311         size += sizeof(uint8_t);
312     }
313 
314     if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) {
315         size += sizeof(uint16_t);
316     }
317 
318     return size;
319 }
320 
/* Pack an IB/RoCE address into the wire format described by
 * uct_ib_address_size(); the two functions (and uct_ib_address_unpack())
 * must agree on the layout byte-for-byte.
 *
 * @param params   Fields to pack and UCT_IB_ADDRESS_PACK_FLAG_* selection.
 * @param ib_addr  Output buffer; must be at least uct_ib_address_size(params)
 *                 bytes. The payload starts right after the flags header.
 */
void uct_ib_address_pack(const uct_ib_address_pack_params_t *params,
                         uct_ib_address_t *ib_addr)
{
    void *ptr = ib_addr + 1;

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
        /* RoCE, in this case we don't use the lid, we pack the gid, the RoCE
         * version, address family and set the ETH flag */
        ib_addr->flags = UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH |
                         (params->roce_info.ver <<
                          ucs_ilog2(UCT_IB_ADDRESS_FLAG_ETH_LAST));

        if (params->roce_info.addr_family == AF_INET6) {
            ib_addr->flags |= UCT_IB_ADDRESS_FLAG_ROCE_IPV6;
        }

        /* uint8_t raw[16]; */
        memcpy(ptr, params->gid.raw, sizeof(params->gid.raw));
        ptr = UCS_PTR_TYPE_OFFSET(ptr, params->gid.raw);
    } else {
        /* IB, LID */
        ib_addr->flags   = 0;
        *(uint16_t*)ptr  = params->lid;
        ptr              = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);

        if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) {
            /* Pack GUID */
            ib_addr->flags  |= UCT_IB_ADDRESS_FLAG_IF_ID;
            *(uint64_t*) ptr = params->gid.global.interface_id;
            ptr              = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
        }

        if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
            if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
                                                    UCT_IB_SITE_LOCAL_PREFIX) {
                /* Site-local */
                ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET16;
                *(uint16_t*)ptr = params->gid.global.subnet_prefix >> 48;
                ptr             = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);
            } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
                /* Global */
                ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET64;
                *(uint64_t*)ptr = params->gid.global.subnet_prefix;
                ptr             = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
            }
        }
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
        ucs_assert((int)params->path_mtu < UINT8_MAX);
        ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PATH_MTU;
        *(uint8_t*)ptr  = (uint8_t)params->path_mtu;
        ptr             = UCS_PTR_TYPE_OFFSET(ptr, uint8_t);
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
        ib_addr->flags |= UCT_IB_ADDRESS_FLAG_GID_INDEX;
        *(uint8_t*)ptr  = params->gid_index;
        /* Bugfix: advance past the gid index. Without this, a subsequent
         * pkey write lands on the same byte, clobbering the gid index and
         * packing fewer bytes than uct_ib_address_size() reports
         * (uct_ib_address_unpack() also expects the gid index to occupy
         * its own byte). */
        ptr             = UCS_PTR_TYPE_OFFSET(ptr, uint8_t);
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) {
        ucs_assert(params->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY);
        ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PKEY;
        *(uint16_t*)ptr = params->pkey;
    }
}
387 
/* Select which UCT_IB_ADDRESS_PACK_FLAG_* fields this interface publishes:
 * RoCE packs the raw Ethernet GID; IB packs the subnet prefix, plus the
 * interface ID when global routing is forced. A non-default pkey is always
 * included. */
unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface)
{
    unsigned flags;

    if (uct_ib_iface_is_roce(iface)) {
        /* pack Ethernet address */
        flags = UCT_IB_ADDRESS_PACK_FLAG_ETH;
    } else if (iface->config.force_global_addr) {
        /* pack full IB address */
        flags = UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX |
                UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID;
    } else {
        /* pack only subnet prefix for reachability test */
        flags = UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX;
    }

    if (iface->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY) {
        flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY;
    }

    return flags;
}
410 
/* Size in bytes of this interface's packed device address, derived from the
 * same pack flags and GID info that uct_ib_iface_address_pack() uses */
size_t uct_ib_iface_address_size(uct_ib_iface_t *iface)
{
    uct_ib_address_pack_params_t params;

    params.roce_info = iface->gid_info.roce_info;
    params.gid       = iface->gid_info.gid;
    params.flags     = uct_ib_iface_address_pack_flags(iface);
    return uct_ib_address_size(&params);
}
420 
/* Pack this interface's address (GID/LID/pkey and friends) into 'ib_addr',
 * using the flag set chosen by uct_ib_iface_address_pack_flags(). The buffer
 * must be at least uct_ib_iface_address_size(iface) bytes. */
void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr)
{
    uct_ib_address_pack_params_t params;

    params.flags     = uct_ib_iface_address_pack_flags(iface);
    params.gid       = iface->gid_info.gid;
    params.lid       = uct_ib_iface_port_attr(iface)->lid;
    params.roce_info = iface->gid_info.roce_info;
    params.pkey      = iface->pkey;
    /* path_mtu/gid_index are not packed here; initialize them anyway
     * (silences a gcc 4.3.4 warning) */
    params.path_mtu  = UCT_IB_ADDRESS_INVALID_PATH_MTU;
    params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX;
    uct_ib_address_pack(&params, ib_addr);
}
435 
/* Unpack a wire-format IB/RoCE address (see uct_ib_address_pack()) into
 * pack-params form. Fields absent from the packed address get documented
 * defaults: link-local subnet prefix, invalid path-MTU/GID-index, default
 * pkey. params_p->flags reports which fields were actually present
 * (PKEY flag is always set since a default is substituted). */
void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
                           uct_ib_address_pack_params_t *params_p)
{
    const void *ptr                     = ib_addr + 1;
    /* silence cppcheck warning */
    uct_ib_address_pack_params_t params = {0};

    params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX;
    params.path_mtu  = UCT_IB_ADDRESS_INVALID_PATH_MTU;
    params.pkey      = UCT_IB_ADDRESS_DEFAULT_PKEY;

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH) {
        /* uint8_t raw[16]; */
        memcpy(params.gid.raw, ptr, sizeof(params.gid.raw));
        ptr           = UCS_PTR_BYTE_OFFSET(ptr, sizeof(params.gid.raw));
        params.flags |= UCT_IB_ADDRESS_PACK_FLAG_ETH;

        params.roce_info.addr_family =
            uct_ib_address_flags_get_roce_af(ib_addr->flags);
        params.roce_info.ver         =
            uct_ib_address_flags_get_roce_version(ib_addr->flags);
    } else {
        /* Default prefix */
        params.gid.global.subnet_prefix = UCT_IB_LINK_LOCAL_PREFIX;
        params.gid.global.interface_id  = 0;
        params.flags                   |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX |
                                          UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID;

        /* If the link layer is not ETHERNET, then it is IB and a lid
         * must be present */
        params.lid                      = *(const uint16_t*)ptr;
        ptr                             = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);

        if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_IF_ID) {
            params.gid.global.interface_id = *(uint64_t*)ptr;
            ptr                            = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
        }

        if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) {
            params.gid.global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX |
                                              ((uint64_t)*(uint16_t*)ptr << 48);
            ptr                             = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);
            ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64));
        }

        if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) {
            params.gid.global.subnet_prefix = *(uint64_t*)ptr;
            ptr                             = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
            params.flags                   |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX;
        }
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PATH_MTU) {
        params.path_mtu = *(const uint8_t*)ptr;
        ptr             = UCS_PTR_TYPE_OFFSET(ptr, const uint8_t);
        params.flags   |= UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU;
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID_INDEX) {
        params.gid_index = *(const uint8_t*)ptr;
        /* Bugfix: the gid index is packed as a single byte, so advance by
         * uint8_t; advancing by uint16_t would misread a following pkey */
        ptr              = UCS_PTR_TYPE_OFFSET(ptr, const uint8_t);
        params.flags    |= UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX;
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PKEY) {
        params.pkey = *(const uint16_t*)ptr;
    }
    /* PKEY is always in params */
    params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY;

    *params_p = params;
}
508 
uct_ib_address_str(const uct_ib_address_t * ib_addr,char * buf,size_t max)509 const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
510                                size_t max)
511 {
512     uct_ib_address_pack_params_t params;
513     char *p, *endp;
514 
515     uct_ib_address_unpack(ib_addr, &params);
516 
517     p    = buf;
518     endp = buf + max;
519     if (params.lid != 0) {
520         snprintf(p, endp - p, "lid %d ", params.lid);
521         p += strlen(p);
522     }
523 
524     uct_ib_gid_str(&params.gid, p, endp - p);
525     p += strlen(p);
526 
527     if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
528         ucs_assert(params.gid_index != UCT_IB_ADDRESS_INVALID_GID_INDEX);
529         snprintf(p, endp - p, "gid index %u ", params.gid_index);
530         p += strlen(p);
531     }
532 
533     if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
534         ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU);
535         snprintf(p, endp - p, "mtu %zu ", uct_ib_mtu_value(params.path_mtu));
536         p += strlen(p);
537     }
538 
539     ucs_assert((params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) &&
540                (params.flags != UCT_IB_ADDRESS_INVALID_PKEY));
541     snprintf(p, endp - p, "pkey 0x%x ", params.pkey);
542 
543     return buf;
544 }
545 
uct_ib_iface_get_device_address(uct_iface_h tl_iface,uct_device_addr_t * dev_addr)546 ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
547                                              uct_device_addr_t *dev_addr)
548 {
549     uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
550 
551     uct_ib_iface_address_pack(iface, (void*)dev_addr);
552 
553     return UCS_OK;
554 }
555 
uct_ib_iface_roce_is_reachable(const uct_ib_device_gid_info_t * local_gid_info,const uct_ib_address_t * remote_ib_addr)556 static int uct_ib_iface_roce_is_reachable(const uct_ib_device_gid_info_t *local_gid_info,
557                                           const uct_ib_address_t *remote_ib_addr)
558 {
559     sa_family_t local_ib_addr_af         = local_gid_info->roce_info.addr_family;
560     uct_ib_roce_version_t local_roce_ver = local_gid_info->roce_info.ver;
561     uint8_t remote_ib_addr_flags         = remote_ib_addr->flags;
562     uct_ib_roce_version_t remote_roce_ver;
563     sa_family_t remote_ib_addr_af;
564     char local_gid_str[128], remote_gid_str[128];
565 
566     if ((uct_ib_address_flags_get_roce_version(remote_ib_addr_flags)) ==
567          UCT_IB_DEVICE_ROCE_ANY) {
568         return 1;
569     }
570 
571     /* check the address family */
572     remote_ib_addr_af = uct_ib_address_flags_get_roce_af(remote_ib_addr_flags);
573 
574     if (local_ib_addr_af != remote_ib_addr_af) {
575         ucs_assert(local_ib_addr_af != 0);
576         ucs_debug("different addr_family detected. local %s remote %s",
577                   ucs_sockaddr_address_family_str(local_ib_addr_af),
578                   ucs_sockaddr_address_family_str(remote_ib_addr_af));
579         return 0;
580     }
581 
582     /* check the RoCE version */
583     ucs_assert(local_roce_ver != UCT_IB_DEVICE_ROCE_ANY);
584 
585     remote_roce_ver = uct_ib_address_flags_get_roce_version(remote_ib_addr_flags);
586 
587     if (local_roce_ver != remote_roce_ver) {
588         ucs_trace("different RoCE versions detected. local %s (gid=%s)"
589                   "remote %s (gid=%s)",
590                   uct_ib_roce_version_str(local_roce_ver),
591                   uct_ib_gid_str(&local_gid_info->gid, local_gid_str,
592                                  sizeof(local_gid_str)),
593                   uct_ib_roce_version_str(remote_roce_ver),
594                   uct_ib_gid_str((union ibv_gid *)(remote_ib_addr + 1), remote_gid_str,
595                                  sizeof(remote_gid_str)));
596         return 0;
597     }
598 
599     return 1;
600 }
601 
uct_ib_iface_is_reachable(const uct_iface_h tl_iface,const uct_device_addr_t * dev_addr,const uct_iface_addr_t * iface_addr)602 int uct_ib_iface_is_reachable(const uct_iface_h tl_iface,
603                               const uct_device_addr_t *dev_addr,
604                               const uct_iface_addr_t *iface_addr)
605 {
606     uct_ib_iface_t *iface           = ucs_derived_of(tl_iface, uct_ib_iface_t);
607     int is_local_eth                = uct_ib_iface_is_roce(iface);
608     const uct_ib_address_t *ib_addr = (const void*)dev_addr;
609     uct_ib_address_pack_params_t params;
610 
611     uct_ib_address_unpack(ib_addr, &params);
612 
613     if (/* at least one PKEY has to be with full membership */
614         !((params.pkey | iface->pkey) & UCT_IB_PKEY_MEMBERSHIP_MASK) ||
615         /* PKEY values have to be equal */
616         ((params.pkey ^ iface->pkey) & UCT_IB_PKEY_PARTITION_MASK)) {
617         return 0;
618     }
619 
620     if (!is_local_eth && !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
621         /* same subnet prefix */
622         return params.gid.global.subnet_prefix ==
623                iface->gid_info.gid.global.subnet_prefix;
624     } else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
625         /* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH
626          * flag should be on. If reachable, the remote and local RoCE versions
627          * and address families have to be the same */
628         return uct_ib_iface_roce_is_reachable(&iface->gid_info, ib_addr);
629     } else {
630         /* local and remote have different link layers and therefore are unreachable */
631         return 0;
632     }
633 }
634 
uct_ib_iface_create_ah(uct_ib_iface_t * iface,struct ibv_ah_attr * ah_attr,struct ibv_ah ** ah_p)635 ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
636                                     struct ibv_ah_attr *ah_attr,
637                                     struct ibv_ah **ah_p)
638 {
639     return uct_ib_device_create_ah_cached(uct_ib_iface_device(iface), ah_attr,
640                                           uct_ib_iface_md(iface)->pd, ah_p);
641 }
642 
/* Fill an ibv_ah_attr for a destination given by gid/lid.
 * On RoCE the dlid field carries the UDP source port used for multi-path
 * selection; on IB the configured LMC path bits are mixed into the dlid.
 * GRH (global routing) is enabled when forced by configuration or when the
 * destination is on a different subnet. */
void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
                                            const union ibv_gid *gid,
                                            uint8_t gid_index,
                                            unsigned path_index,
                                            struct ibv_ah_attr *ah_attr)
{
    uint8_t lmc_bits;
    char ah_buf[128];

    memset(ah_attr, 0, sizeof(*ah_attr));

    /* Attributes common to IB and RoCE, from the interface configuration */
    ah_attr->sl                = iface->config.sl;
    ah_attr->port_num          = iface->config.port_num;
    ah_attr->grh.traffic_class = iface->config.traffic_class;

    if (!uct_ib_iface_is_roce(iface)) {
        /* TODO iface->path_bits should be removed and replaced by path_index */
        lmc_bits               = iface->path_bits[path_index %
                                                  iface->path_bits_count];
        ah_attr->dlid          = lid | lmc_bits;
        ah_attr->src_path_bits = lmc_bits;
    } else {
        ah_attr->dlid           = UCT_IB_ROCE_UDP_SRC_PORT_BASE |
                                  (iface->config.roce_path_factor * path_index);
        /* Workaround rdma-core issue of calling rand() which affects global
         * random state in glibc */
        ah_attr->grh.flow_label = 1;
    }

    if (!iface->config.force_global_addr &&
        (iface->gid_info.gid.global.subnet_prefix == gid->global.subnet_prefix)) {
        ah_attr->is_global      = 0;
    } else {
        ucs_assert_always(gid->global.interface_id != 0);
        ah_attr->is_global      = 1;
        ah_attr->grh.dgid       = *gid;
        ah_attr->grh.sgid_index = gid_index;
        ah_attr->grh.hop_limit  = iface->config.hop_limit;
    }

    ucs_debug("iface %p: ah_attr %s", iface,
              uct_ib_ah_attr_str(ah_buf, sizeof(ah_buf), ah_attr));
}
686 
/* Fill an ibv_ah_attr (and report the path MTU) from a packed remote
 * address. Fields absent from the packed address fall back to the local
 * interface configuration: path MTU and GID index default to the local
 * values. The remote link layer must match the local one. */
void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
                                         const uct_ib_address_t *ib_addr,
                                         unsigned path_index,
                                         struct ibv_ah_attr *ah_attr,
                                         enum ibv_mtu *path_mtu)
{
    uct_ib_address_pack_params_t params;

    ucs_assert(!uct_ib_iface_is_roce(iface) ==
               !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH));

    uct_ib_address_unpack(ib_addr, &params);

    if (!(params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU)) {
        *path_mtu = iface->config.path_mtu;
    } else {
        ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU);
        *path_mtu = params.path_mtu;
    }

    if (!(params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX)) {
        params.gid_index = iface->gid_info.gid_index;
    } else {
        ucs_assert(params.gid_index != UCT_IB_ADDRESS_INVALID_GID_INDEX);
    }

    uct_ib_iface_fill_ah_attr_from_gid_lid(iface, params.lid, &params.gid,
                                           params.gid_index, path_index,
                                           ah_attr);
}
717 
/*
 * Select the pkey and pkey-table index the interface will use.
 *
 * If config->pkey is UCS_HEXUNITS_AUTO, the first valid full-membership pkey
 * in the port's table is chosen; otherwise the table is searched for an entry
 * whose lower 15 bits match the requested value. A limited-membership match
 * (membership bit clear) is remembered as a fallback and used only if no
 * full-membership match exists.
 *
 * @return UCS_OK on success, UCS_ERR_INVALID_PARAM if the requested pkey is
 *         out of range, UCS_ERR_NO_ELEM if no usable pkey was found.
 */
static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
                                           const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev    = uct_ib_iface_device(iface);
    uint16_t pkey_tbl_len   = uct_ib_iface_port_attr(iface)->pkey_tbl_len;
    int pkey_found          = 0;
    uint16_t lim_pkey       = UCT_IB_ADDRESS_INVALID_PKEY;
    uint16_t lim_pkey_index = UINT16_MAX; /* sentinel: no limited match yet */
    uint16_t pkey_index, port_pkey, pkey;

    /* The partition key itself is only 15 bits; reject anything wider */
    if ((config->pkey != UCS_HEXUNITS_AUTO) &&
        (config->pkey > UCT_IB_PKEY_PARTITION_MASK)) {
        ucs_error("requested pkey 0x%x is invalid, should be in the range 0..0x%x",
                  config->pkey, UCT_IB_PKEY_PARTITION_MASK);
        return UCS_ERR_INVALID_PARAM;
    }

    /* get the user's pkey value and find its index in the port's pkey table */
    for (pkey_index = 0; pkey_index < pkey_tbl_len; ++pkey_index) {
        /* get the pkey values from the port's pkeys table */
        if (ibv_query_pkey(dev->ibv_context, iface->config.port_num, pkey_index,
                           &port_pkey))
        {
            ucs_debug("ibv_query_pkey("UCT_IB_IFACE_FMT", index=%d) failed: %m",
                      UCT_IB_IFACE_ARG(iface), pkey_index);
            continue;
        }

        /* pkeys are stored in network byte order */
        pkey = ntohs(port_pkey);
        /* if pkey = 0x0, just skip it w/o debug trace, because 0x0
         * means that there is no real pkey configured at this index */
        if (pkey == UCT_IB_ADDRESS_INVALID_PKEY) {
            continue;
        }

        if ((config->pkey == UCS_HEXUNITS_AUTO) ||
            /* take only the lower 15 bits for the comparison */
            ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey)) {
            if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK) &&
                /* limited PKEY has not yet been found */
                (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY)) {
                /* remember the first limited-membership match but keep
                 * scanning for a full-membership one */
                lim_pkey_index = pkey_index;
                lim_pkey       = pkey;
                continue;
            }

            iface->pkey_index = pkey_index;
            iface->pkey       = pkey;
            pkey_found        = 1;
            break;
        }
    }

    if (!pkey_found) {
        if (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY) {
            /* PKEY neither with full nor with limited membership was found */
            if (config->pkey == UCS_HEXUNITS_AUTO) {
                ucs_error("there is no valid pkey to use on "
                          UCT_IB_IFACE_FMT, UCT_IB_IFACE_ARG(iface));
            } else {
                ucs_error("unable to find specified pkey 0x%x on "UCT_IB_IFACE_FMT,
                          config->pkey, UCT_IB_IFACE_ARG(iface));
            }

            return UCS_ERR_NO_ELEM;
        } else {
            /* fall back to the limited-membership pkey found above */
            ucs_assert(lim_pkey_index != UINT16_MAX);
            iface->pkey_index = lim_pkey_index;
            iface->pkey       = lim_pkey;
        }
    }

    ucs_debug("using pkey[%d] 0x%x on "UCT_IB_IFACE_FMT, iface->pkey_index,
              iface->pkey, UCT_IB_IFACE_ARG(iface));

    return UCS_OK;
}
795 
uct_ib_iface_init_lmc(uct_ib_iface_t * iface,const uct_ib_iface_config_t * config)796 static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface,
797                                           const uct_ib_iface_config_t *config)
798 {
799     unsigned i, j, num_path_bits;
800     unsigned first, last;
801     uint8_t lmc;
802     int step;
803 
804     if (config->lid_path_bits.count == 0) {
805         ucs_error("List of path bits must not be empty");
806         return UCS_ERR_INVALID_PARAM;
807     }
808 
809     /* count the number of lid_path_bits */
810     num_path_bits = 0;
811     for (i = 0; i < config->lid_path_bits.count; i++) {
812         num_path_bits += 1 + abs((int)(config->lid_path_bits.ranges[i].first -
813                                        config->lid_path_bits.ranges[i].last));
814     }
815 
816     iface->path_bits = ucs_calloc(1, num_path_bits * sizeof(*iface->path_bits),
817                                   "ib_path_bits");
818     if (iface->path_bits == NULL) {
819         return UCS_ERR_NO_MEMORY;
820     }
821 
822     lmc = uct_ib_iface_port_attr(iface)->lmc;
823 
824     /* go over the list of values (ranges) for the lid_path_bits and set them */
825     iface->path_bits_count = 0;
826     for (i = 0; i < config->lid_path_bits.count; ++i) {
827 
828         first = config->lid_path_bits.ranges[i].first;
829         last  = config->lid_path_bits.ranges[i].last;
830 
831         /* range of values or one value */
832         if (first < last) {
833             step = 1;
834         } else {
835             step = -1;
836         }
837 
838         /* fill the value/s */
839         for (j = first; j != (last + step); j += step) {
840             if (j >= UCS_BIT(lmc)) {
841                 ucs_debug("Not using value %d for path_bits - must be < 2^lmc (lmc=%d)",
842                           j, lmc);
843                 if (step == 1) {
844                     break;
845                 } else {
846                     continue;
847                 }
848             }
849 
850             ucs_assert(iface->path_bits_count < num_path_bits);
851             iface->path_bits[iface->path_bits_count] = j;
852             iface->path_bits_count++;
853         }
854     }
855 
856     return UCS_OK;
857 }
858 
/*
 * Translate the transport-independent QP attributes in 'attr' into the
 * verbs attribute structure (attr->ibv), attaching the interface's CQs,
 * PD and port. For non-UD QP types, also enables big-endian atomic replies
 * when the device reports IBV_EXP_ATOMIC_HCA_REPLY_BE.
 */
void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr)
{
    attr->ibv.send_cq             = iface->cq[UCT_IB_DIR_TX];
    attr->ibv.recv_cq             = iface->cq[UCT_IB_DIR_RX];

    attr->ibv.srq                 = attr->srq;
    attr->ibv.cap                 = attr->cap;
    attr->ibv.qp_type             = (enum ibv_qp_type)attr->qp_type;
    attr->ibv.sq_sig_all          = attr->sq_sig_all;

#if HAVE_DECL_IBV_EXP_CREATE_QP
    if (!(attr->ibv.comp_mask & IBV_EXP_QP_INIT_ATTR_PD)) {
        /* OR the PD bit in; a plain assignment would discard comp_mask bits
         * the caller may have set before invoking this function */
        attr->ibv.comp_mask      |= IBV_EXP_QP_INIT_ATTR_PD;
        attr->ibv.pd              = uct_ib_iface_md(iface)->pd;
    }
#elif HAVE_DECL_IBV_CREATE_QP_EX
    if (!(attr->ibv.comp_mask & IBV_QP_INIT_ATTR_PD)) {
        /* OR the PD bit in; a plain assignment would discard comp_mask bits
         * the caller may have set before invoking this function */
        attr->ibv.comp_mask      |= IBV_QP_INIT_ATTR_PD;
        attr->ibv.pd              = uct_ib_iface_md(iface)->pd;
    }
#endif

    attr->port                    = iface->config.port_num;

    if (attr->qp_type == IBV_QPT_UD) {
        return;
    }

    /* MOFED requires this to enable IB spec atomic */
#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
    if (uct_ib_iface_device(iface)->dev_attr.exp_atomic_cap ==
                                     IBV_EXP_ATOMIC_HCA_REPLY_BE) {
        attr->ibv.comp_mask       |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
        attr->ibv.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY;
    }
#endif
}
896 
/*
 * Create a QP for the interface using whichever verbs creation API is
 * available (exp / extended / plain), after filling the verbs attributes
 * via uct_ib_iface_fill_attr().
 *
 * On success, attr->cap is updated with the capabilities the device actually
 * granted (may exceed the requested values) and *qp_p receives the new QP.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if QP creation failed.
 */
ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface,
                                    uct_ib_qp_attr_t *attr,
                                    struct ibv_qp **qp_p)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    struct ibv_qp *qp;

    uct_ib_iface_fill_attr(iface, attr);

    /* Pick the QP creation entry point supported by the installed verbs */
#if HAVE_DECL_IBV_EXP_CREATE_QP
    qp = ibv_exp_create_qp(dev->ibv_context, &attr->ibv);
#elif HAVE_DECL_IBV_CREATE_QP_EX
    qp = ibv_create_qp_ex(dev->ibv_context, &attr->ibv);
#else
    qp = ibv_create_qp(uct_ib_iface_md(iface)->pd, &attr->ibv);
#endif
    if (qp == NULL) {
        ucs_error("iface=%p: failed to create %s QP "
                  "TX wr:%d sge:%d inl:%d resp:%d RX wr:%d sge:%d resp:%d: %m",
                  iface, uct_ib_qp_type_str(attr->qp_type),
                  attr->cap.max_send_wr, attr->cap.max_send_sge,
                  attr->cap.max_inline_data, attr->max_inl_cqe[UCT_IB_DIR_TX],
                  attr->cap.max_recv_wr, attr->cap.max_recv_sge,
                  attr->max_inl_cqe[UCT_IB_DIR_RX]);
        return UCS_ERR_IO_ERROR;
    }

    /* report back the actual capabilities granted by the device */
    attr->cap  = attr->ibv.cap;
    *qp_p      = qp;

    ucs_debug("iface=%p: created %s QP 0x%x on %s:%d "
              "TX wr:%d sge:%d inl:%d resp:%d RX wr:%d sge:%d resp:%d",
              iface, uct_ib_qp_type_str(attr->qp_type), qp->qp_num,
              uct_ib_device_name(dev), iface->config.port_num,
              attr->cap.max_send_wr, attr->cap.max_send_sge,
              attr->cap.max_inline_data, attr->max_inl_cqe[UCT_IB_DIR_TX],
              attr->cap.max_recv_wr, attr->cap.max_recv_sge,
              attr->max_inl_cqe[UCT_IB_DIR_RX]);

    return UCS_OK;
}
938 
/*
 * Create a completion queue for the given direction, preferring the extended
 * API (ibv_create_cq_ex, which supports the ignore-overrun flag) and falling
 * back to plain ibv_create_cq when the extended call is unsupported.
 *
 * @param preferred_cpu  Completion vector to bind the CQ to.
 * @param inl            Inline-receive size recorded in config.max_inl_cqe.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if CQ creation failed.
 */
ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
                                    const uct_ib_iface_init_attr_t *init_attr,
                                    int preferred_cpu, size_t inl)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    struct ibv_cq *cq;
#if HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN
    struct ibv_cq_init_attr_ex cq_attr = {};

    cq_attr.cqe         = init_attr->cq_len[dir];
    cq_attr.channel     = iface->comp_channel;
    cq_attr.comp_vector = preferred_cpu;
    if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) {
        cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS;
        cq_attr.flags     = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN;
    }

    cq = ibv_cq_ex_to_cq(ibv_create_cq_ex(dev->ibv_context, &cq_attr));
    /* fall back to the legacy API only when the provider reports ENOSYS;
     * other errors are reported below */
    if (!cq && (errno == ENOSYS))
#endif
    {
        /* NOTE(review): this zeroing is overwritten by the unconditional
         * assignment of 'inl' below - confirm whether the fallback path
         * should really keep max_inl_cqe at 0 */
        iface->config.max_inl_cqe[dir] = 0;
        cq = ibv_create_cq(dev->ibv_context, init_attr->cq_len[dir], NULL,
                           iface->comp_channel, preferred_cpu);
    }

    if (!cq) {
        ucs_error("ibv_create_cq(cqe=%d) failed: %m", init_attr->cq_len[dir]);
        return UCS_ERR_IO_ERROR;
    }

    iface->cq[dir]                 = cq;
    iface->config.max_inl_cqe[dir] = inl;
    return UCS_OK;
}
974 
975 static ucs_status_t
uct_ib_iface_create_cq(uct_ib_iface_t * iface,uct_ib_dir_t dir,const uct_ib_iface_init_attr_t * init_attr,const uct_ib_iface_config_t * config,int preferred_cpu)976 uct_ib_iface_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
977                        const uct_ib_iface_init_attr_t *init_attr,
978                        const uct_ib_iface_config_t *config,
979                        int preferred_cpu)
980 {
981     ucs_status_t status;
982     size_t inl                          = config->inl[dir];
983 #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
984     uct_ib_device_t *dev                = uct_ib_iface_device(iface);
985     static const char *cqe_size_env_var = "MLX5_CQE_SIZE";
986     size_t cqe_size                     = 64;
987     int env_var_added                   = 0;
988     const char *cqe_size_env_value;
989     size_t cqe_size_min;
990     char cqe_size_buf[32];
991     int ret;
992 
993     cqe_size_min       = (inl > 32) ? 128 : 64;
994     cqe_size_env_value = getenv(cqe_size_env_var);
995 
996     if (cqe_size_env_value != NULL) {
997         cqe_size = atol(cqe_size_env_value);
998         if (cqe_size < cqe_size_min) {
999             ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)",
1000                       cqe_size_env_var, cqe_size, cqe_size_min, inl);
1001             return UCS_ERR_INVALID_PARAM;
1002         }
1003     } else {
1004         cqe_size = uct_ib_get_cqe_size(cqe_size_min);
1005         snprintf(cqe_size_buf, sizeof(cqe_size_buf),"%zu", cqe_size);
1006         ucs_debug("%s: setting %s=%s", uct_ib_device_name(dev), cqe_size_env_var,
1007                   cqe_size_buf);
1008         ret = ibv_exp_setenv(dev->ibv_context, cqe_size_env_var, cqe_size_buf, 1);
1009         if (ret) {
1010             ucs_error("ibv_exp_setenv(%s=%s) failed: %m", cqe_size_env_var,
1011                       cqe_size_buf);
1012             return UCS_ERR_INVALID_PARAM;
1013         }
1014 
1015         env_var_added = 1;
1016     }
1017 #endif
1018     status = iface->ops->create_cq(iface, dir, init_attr, preferred_cpu, inl);
1019     if (status != UCS_OK) {
1020         goto out_unsetenv;
1021     }
1022 
1023     status = UCS_OK;
1024 
1025 out_unsetenv:
1026 #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
1027     iface->config.max_inl_cqe[dir] = cqe_size / 2;
1028     if (env_var_added) {
1029         /* if we created a new environment variable, remove it */
1030         ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var);
1031         if (ret) {
1032             ucs_warn("unsetenv(%s) failed: %m", cqe_size_env_var);
1033         }
1034     }
1035 #endif
1036     return status;
1037 }
1038 
1039 
/*
 * Configure CQ event moderation (interrupt coalescing) when the exp verbs
 * API supports it; a no-op otherwise.
 *
 * @param count        Completions to accumulate before an event
 *                     (0 = unchanged default, mapped to UINT16_MAX).
 * @param period_usec  Maximum time to wait before an event, in microseconds
 *                     (0 = unchanged default, mapped to UINT16_MAX).
 *
 * @return UCS_OK, UCS_ERR_INVALID_PARAM for out-of-range values, or
 *         UCS_ERR_IO_ERROR if ibv_exp_modify_cq() fails.
 */
static ucs_status_t uct_ib_iface_set_moderation(struct ibv_cq *cq,
                                                unsigned count, double period_usec)
{
#if HAVE_DECL_IBV_EXP_CQ_MODERATION
    unsigned period = (unsigned)(period_usec * UCS_USEC_PER_SEC);

    if (count > UINT16_MAX) {
        ucs_error("CQ moderation count is too high: %u, max value: %u", count, UINT16_MAX);
        return UCS_ERR_INVALID_PARAM;
    } else if (count == 0) {
        /* in case if count value is 0 (unchanged default value) - set it to maximum
         * possible value */
        count = UINT16_MAX;
    }

    if (period > UINT16_MAX) {
        ucs_error("CQ moderation period is too high: %u, max value: %uus", period, UINT16_MAX);
        return UCS_ERR_INVALID_PARAM;
    } else if (period == 0) {
        /* in case if period value is 0 (unchanged default value) - set it to maximum
         * possible value, the same behavior as counter */
        period = UINT16_MAX;
    }

    /* only touch the CQ if at least one parameter was explicitly set */
    if ((count < UINT16_MAX) || (period < UINT16_MAX)) {
        struct ibv_exp_cq_attr cq_attr = {
            .comp_mask            = IBV_EXP_CQ_ATTR_MODERATION,
            .moderation.cq_count  = (uint16_t)(count),
            .moderation.cq_period = (uint16_t)(period),
            .cq_cap_flags         = 0
        };
        if (ibv_exp_modify_cq(cq, &cq_attr, IBV_EXP_CQ_MODERATION)) {
            ucs_error("ibv_exp_modify_cq(count=%d, period=%d) failed: %m", count, period);
            return UCS_ERR_IO_ERROR;
        }
    }
#endif /* HAVE_DECL_IBV_EXP_CQ_MODERATION */

    return UCS_OK;
}
1080 
/*
 * Determine how many network paths the interface exposes: an explicit user
 * setting wins; otherwise the RoCE LAG level (RoCE) or the number of
 * configured path bits (IB) is used.
 */
static void uct_ib_iface_set_num_paths(uct_ib_iface_t *iface,
                                       const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    if (config->num_paths != UCS_ULUNITS_AUTO) {
        /* explicit user configuration wins */
        iface->num_paths = config->num_paths;
        return;
    }

    if (uct_ib_iface_is_roce(iface)) {
        /* RoCE - number of paths is RoCE LAG level */
        iface->num_paths = uct_ib_device_get_roce_lag_level(
                                   dev, iface->config.port_num);
    } else {
        /* IB - number of paths is LMC level */
        ucs_assert(iface->path_bits_count > 0);
        iface->num_paths = iface->path_bits_count;
    }
}
1100 
/* Return nonzero iff the interface runs RoCE and the selected GID entry is
 * RoCE v2. The 'dev' argument is unused but kept for interface stability. */
int uct_ib_iface_is_roce_v2(uct_ib_iface_t *iface, uct_ib_device_t *dev)
{
    if (!uct_ib_iface_is_roce(iface)) {
        return 0;
    }

    return iface->gid_info.roce_info.ver == UCT_IB_DEVICE_ROCE_V2;
}
1106 
uct_ib_iface_init_roce_gid_info(uct_ib_iface_t * iface,size_t md_config_index)1107 ucs_status_t uct_ib_iface_init_roce_gid_info(uct_ib_iface_t *iface,
1108                                              size_t md_config_index)
1109 {
1110     uct_ib_device_t *dev = uct_ib_iface_device(iface);
1111     uint8_t port_num     = iface->config.port_num;
1112 
1113     ucs_assert(uct_ib_iface_is_roce(iface));
1114 
1115     if (md_config_index == UCS_ULUNITS_AUTO) {
1116         return uct_ib_device_select_gid(dev, port_num, &iface->gid_info);
1117     }
1118 
1119     return uct_ib_device_query_gid_info(dev->ibv_context, uct_ib_device_name(dev),
1120                                         port_num, md_config_index,
1121                                         &iface->gid_info);
1122 }
1123 
uct_ib_iface_init_gid_info(uct_ib_iface_t * iface,size_t md_config_index)1124 static ucs_status_t uct_ib_iface_init_gid_info(uct_ib_iface_t *iface,
1125                                                size_t md_config_index)
1126 {
1127     uct_ib_device_gid_info_t *gid_info = &iface->gid_info;
1128     ucs_status_t status;
1129 
1130     /* Fill the gid index and the RoCE version */
1131     if (uct_ib_iface_is_roce(iface)) {
1132         status = uct_ib_iface_init_roce_gid_info(iface, md_config_index);
1133         if (status != UCS_OK) {
1134             goto out;
1135         }
1136     } else {
1137         gid_info->gid_index             = (md_config_index ==
1138                                            UCS_ULUNITS_AUTO) ?
1139                                           UCT_IB_MD_DEFAULT_GID_INDEX :
1140                                           md_config_index;
1141         gid_info->roce_info.ver         = UCT_IB_DEVICE_ROCE_ANY;
1142         gid_info->roce_info.addr_family = 0;
1143     }
1144 
1145     /* Fill the gid */
1146     status = uct_ib_device_query_gid(uct_ib_iface_device(iface),
1147                                      iface->config.port_num,
1148                                      gid_info->gid_index, &gid_info->gid);
1149     if (status != UCS_OK) {
1150         goto out;
1151     }
1152 
1153 out:
1154     return status;
1155 }
1156 
/*
 * Choose the interface path MTU: an explicit user setting wins; otherwise
 * the port's active MTU is used, capped at 2048 on certain Mellanox devices
 * where that is the optimal value.
 */
static void uct_ib_iface_set_path_mtu(uct_ib_iface_t *iface,
                                      const uct_ib_iface_config_t *config)
{
    enum ibv_mtu port_mtu = uct_ib_iface_port_attr(iface)->active_mtu;
    uct_ib_device_t *dev  = uct_ib_iface_device(iface);
    int prefer_2048       = 0;

    if (config->path_mtu != UCT_IB_MTU_DEFAULT) {
        /* MTU is set by user configuration; cast from uct_ib_mtu_t to
         * ibv_mtu by shifting into the verbs enum range */
        iface->config.path_mtu = (enum ibv_mtu)(config->path_mtu +
                                                (IBV_MTU_512 - UCT_IB_MTU_512));
        return;
    }

    /* On some devices (vendor 0x02c9) optimal path_mtu is 2048 */
    if (IBV_DEV_ATTR(dev, vendor_id) == 0x02c9) {
        switch (IBV_DEV_ATTR(dev, vendor_part_id)) {
        case 4099:
        case 4100:
        case 4103:
        case 4104:
            prefer_2048 = 1;
            break;
        default:
            break;
        }
    }

    if (prefer_2048 && (port_mtu > IBV_MTU_2048)) {
        iface->config.path_mtu = IBV_MTU_2048;
    } else {
        iface->config.path_mtu = port_mtu;
    }
}
1180 
/*
 * Constructor for the common IB interface object.
 *
 * Resolves the device port from the requested device name, fills the
 * interface configuration (buffer layout, polling limits, pkey, GID/LMC,
 * path MTU, number of paths), then creates the completion channel and the
 * send/receive CQs with the configured event moderation.
 *
 * Error handling uses fall-through labels: each label undoes one resource
 * and falls into the next, so a failure at any stage releases everything
 * acquired before it.
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    const uct_ib_iface_config_t *config,
                    const uct_ib_iface_init_attr_t *init_attr)
{
    uct_ib_md_t *ib_md   = ucs_derived_of(md, uct_ib_md_t);
    uct_ib_device_t *dev = &ib_md->dev;
    size_t rx_headroom   = (params->field_mask &
                            UCT_IFACE_PARAM_FIELD_RX_HEADROOM) ?
                           params->rx_headroom : 0;
    ucs_cpu_set_t cpu_mask;
    int preferred_cpu;
    ucs_status_t status;
    uint8_t port_num;

    /* this transport can only be opened on a specific device */
    if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) {
        return UCS_ERR_UNSUPPORTED;
    }

    if (params->field_mask & UCT_IFACE_PARAM_FIELD_CPU_MASK) {
        cpu_mask = params->cpu_mask;
    } else {
        /* no mask provided - use an empty set */
        memset(&cpu_mask, 0, sizeof(cpu_mask));
    }

    preferred_cpu = ucs_cpu_set_find_lcs(&cpu_mask);

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              params, &config->super
                              UCS_STATS_ARG(((params->field_mask &
                                              UCT_IFACE_PARAM_FIELD_STATS_ROOT) &&
                                             (params->stats_root != NULL)) ?
                                            params->stats_root :
                                            dev->stats)
                              UCS_STATS_ARG(params->mode.device.dev_name));

    status = uct_ib_device_find_port(dev, params->mode.device.dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops                         = ops;

    /* Receive buffer layout: [desc][headroom / priv+hdr][payload].
     * The payload offset leaves room for the larger of the user headroom
     * and the transport's private + header area. */
    self->config.rx_payload_offset    = sizeof(uct_ib_iface_recv_desc_t) +
                                        ucs_max(sizeof(uct_recv_desc_t) +
                                                rx_headroom,
                                                init_attr->rx_priv_len +
                                                init_attr->rx_hdr_len);
    self->config.rx_hdr_offset        = self->config.rx_payload_offset -
                                        init_attr->rx_hdr_len;
    self->config.rx_headroom_offset   = self->config.rx_payload_offset -
                                        rx_headroom;
    self->config.seg_size             = init_attr->seg_size;
    self->config.roce_path_factor     = config->roce_path_factor;
    self->config.tx_max_poll          = config->tx.max_poll;
    self->config.rx_max_poll          = config->rx.max_poll;
    self->config.rx_max_batch         = ucs_min(config->rx.max_batch,
                                                config->rx.queue_len / 4);
    self->config.port_num             = port_num;
    self->config.sl                   = config->sl;
    self->config.hop_limit            = config->hop_limit;
    self->release_desc.cb             = uct_ib_iface_release_desc;
    self->config.enable_res_domain    = config->enable_res_domain;
    self->config.enable_cuda_affinity = config->enable_cuda_affinity;
    self->config.qp_type              = init_attr->qp_type;
    uct_ib_iface_set_path_mtu(self, config);

    if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) {
        ucs_error("IB transports do not support multi-threaded worker");
        return UCS_ERR_INVALID_PARAM;
    }

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_gid_info(self, ib_md->config.gid_index);
    if (status != UCS_OK) {
        goto err;
    }

    /* default traffic class: DSCP for RoCE v2, 0 otherwise */
    if (config->traffic_class == UCS_ULUNITS_AUTO) {
        self->config.traffic_class = uct_ib_iface_is_roce_v2(self, dev) ?
                                     UCT_IB_DEFAULT_ROCEV2_DSCP : 0;
    } else {
        self->config.traffic_class = config->traffic_class;
    }

    /* allocates self->path_bits - freed on the error path below */
    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    uct_ib_iface_set_num_paths(self, config);

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_cleanup;
    }

    /* the event channel fd is polled, so it must not block */
    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, UCT_IB_DIR_TX, init_attr,
                                    config, preferred_cpu);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_TX],
                                         config->tx.cq_moderation_count,
                                         config->tx.cq_moderation_period);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    status = uct_ib_iface_create_cq(self, UCT_IB_DIR_RX, init_attr,
                                    config, preferred_cpu);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_RX],
                                         config->rx.cq_moderation_count,
                                         config->rx.cq_moderation_period);
    if (status != UCS_OK) {
        goto err_destroy_recv_cq;
    }

    /* Address scope and size */
    if (uct_ib_iface_is_roce(self) || config->is_global ||
        uct_ib_grh_required(uct_ib_iface_port_attr(self)) ||
        /* check ADDR_TYPE for backward compatibility */
        (config->addr_type == UCT_IB_ADDRESS_TYPE_SITE_LOCAL) ||
        (config->addr_type == UCT_IB_ADDRESS_TYPE_GLOBAL)) {
        self->config.force_global_addr = 1;
    } else {
        self->config.force_global_addr = 0;
    }

    self->addr_size  = uct_ib_iface_address_size(self);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);

    return UCS_OK;

    /* fall-through cleanup: each label releases one resource */
err_destroy_recv_cq:
    ibv_destroy_cq(self->cq[UCT_IB_DIR_RX]);
err_destroy_send_cq:
    ibv_destroy_cq(self->cq[UCT_IB_DIR_TX]);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_cleanup:
    ucs_free(self->path_bits);
err:
    return status;
}
1345 
/*
 * Destructor for the common IB interface object: destroys both CQs, the
 * completion channel and the path-bits array. Failures are only warned
 * about, since a destructor has no way to report errors.
 */
static UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t)
{
    int ret;

    ret = ibv_destroy_cq(self->cq[UCT_IB_DIR_RX]);
    if (ret != 0) {
        ucs_warn("ibv_destroy_cq(recv_cq) returned %d: %m", ret);
    }

    ret = ibv_destroy_cq(self->cq[UCT_IB_DIR_TX]);
    if (ret != 0) {
        ucs_warn("ibv_destroy_cq(send_cq) returned %d: %m", ret);
    }

    ret = ibv_destroy_comp_channel(self->comp_channel);
    if (ret != 0) {
        ucs_warn("ibv_destroy_comp_channel(comp_channel) returned %d: %m", ret);
    }

    ucs_free(self->path_bits);
}
1367 
/* Instantiate the class glue for uct_ib_iface_t, derived from uct_base_iface_t */
UCS_CLASS_DEFINE(uct_ib_iface_t, uct_base_iface_t);
1369 
/*
 * Fill up to 'n' receive work requests with fresh descriptors from the
 * memory pool and chain them into a linked list.
 *
 * @param iface  Interface providing the buffer layout parameters.
 * @param mp     Memory pool receive descriptors are taken from.
 * @param wrs    Output array with room for at least 'n' entries.
 * @param n      Maximum number of WRs to prepare.
 *
 * @return Number of WRs actually prepared; may be less than 'n' if the
 *         memory pool runs out of descriptors.
 */
int uct_ib_iface_prepare_rx_wrs(uct_ib_iface_t *iface, ucs_mpool_t *mp,
                                uct_ib_recv_wr_t *wrs, unsigned n)
{
    uct_ib_iface_recv_desc_t *desc;
    unsigned count;

    count = 0;
    while (count < n) {
        /* the macro breaks out of the loop when the pool is exhausted */
        UCT_TL_IFACE_GET_RX_DESC(&iface->super, mp, desc, break);
        wrs[count].sg.addr   = (uintptr_t)uct_ib_iface_recv_desc_hdr(iface, desc);
        wrs[count].sg.length = iface->config.rx_payload_offset + iface->config.seg_size;
        wrs[count].sg.lkey   = desc->lkey;
        wrs[count].ibwr.num_sge = 1;
        wrs[count].ibwr.wr_id   = (uintptr_t)desc;
        wrs[count].ibwr.sg_list = &wrs[count].sg;
        /* provisionally link to the next entry; fixed up below */
        wrs[count].ibwr.next    = &wrs[count + 1].ibwr;
        ++count;
    }

    /* terminate the chain at the last prepared WR */
    if (count > 0) {
        wrs[count - 1].ibwr.next = NULL;
    }

    return count;
}
1395 
/*
 * Estimate the extra latency to reach the device from the CPUs the process
 * may run on. Uses NUMA distance when available, otherwise a coarse check of
 * whether the process affinity fits inside the device's local CPU set.
 *
 * @param [out] latency  Extra latency in seconds (0 when the device is local
 *                       or nearest-device preference is disabled).
 *
 * @return UCS_OK, or UCS_ERR_INVALID_PARAM if the process affinity cannot
 *         be read.
 */
static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface,
                                                  double *latency)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    uct_ib_md_t *md      = uct_ib_iface_md(iface);
    ucs_sys_cpuset_t temp_cpu_mask, process_affinity;
#if HAVE_NUMA
    int distance, min_cpu_distance;
    int cpu, num_cpus;
#endif
    int ret;

    /* feature disabled - report no extra latency */
    if (!md->config.prefer_nearest_device) {
        *latency = 0;
        return UCS_OK;
    }

    ret = ucs_sys_getaffinity(&process_affinity);
    if (ret) {
        ucs_error("sched_getaffinity() failed: %m");
        return UCS_ERR_INVALID_PARAM;
    }

#if HAVE_NUMA
    /* Try to estimate the extra device latency according to NUMA distance */
    if (dev->numa_node != -1) {
        min_cpu_distance = INT_MAX;
        num_cpus         = ucs_min(CPU_SETSIZE, numa_num_configured_cpus());
        /* find the smallest NUMA distance over the CPUs we may run on */
        for (cpu = 0; cpu < num_cpus; ++cpu) {
            if (!CPU_ISSET(cpu, &process_affinity)) {
                continue;
            }
            distance = numa_distance(ucs_numa_node_of_cpu(cpu), dev->numa_node);
            if (distance >= UCS_NUMA_MIN_DISTANCE) {
                min_cpu_distance = ucs_min(min_cpu_distance, distance);
            }
        }

        if (min_cpu_distance != INT_MAX) {
            /* set the extra latency to (numa_distance - 10) * 20nsec */
            *latency = (min_cpu_distance - UCS_NUMA_MIN_DISTANCE) * 20e-9;
            return UCS_OK;
        }
    }
#endif

    /* Estimate the extra device latency according to its local CPUs mask:
     * zero when the whole affinity set is local to the device, otherwise a
     * fixed 200nsec penalty */
    CPU_AND(&temp_cpu_mask, &dev->local_cpus, &process_affinity);
    if (CPU_EQUAL(&process_affinity, &temp_cpu_mask)) {
        *latency = 0;
    } else {
        *latency = 200e-9;
    }
    return UCS_OK;
}
1451 
uct_ib_iface_get_cuda_latency(uct_ib_iface_t * iface,double * latency)1452 static ucs_status_t uct_ib_iface_get_cuda_latency(uct_ib_iface_t *iface,
1453                                                   double *latency)
1454 {
1455     ucs_sys_dev_distance_t dist = {0.0, 0.0};
1456     uct_ib_device_t *dev        = uct_ib_iface_device(iface);
1457     ucs_sys_device_t ib_sys_device;
1458     ucs_sys_device_t cuda_sys_device;
1459     ucs_sys_bus_id_t ib_bus_id;
1460     ucs_sys_bus_id_t cuda_bus_id;
1461     ucs_status_t status;
1462 
1463     status = ucm_get_mem_type_current_device_info(UCS_MEMORY_TYPE_CUDA,
1464                                                   &cuda_bus_id);
1465     if (status != UCS_OK) {
1466         *latency = 0.0;
1467         return UCS_OK;
1468     }
1469 
1470     status = ucs_topo_find_device_by_bus_id(&cuda_bus_id, &cuda_sys_device);
1471     if (status != UCS_OK) {
1472         return status;
1473     }
1474 
1475     status = uct_ib_device_bus(dev, iface->config.port_num, &ib_bus_id);
1476     if (status != UCS_OK) {
1477         return status;
1478     }
1479 
1480     status = ucs_topo_find_device_by_bus_id(&ib_bus_id, &ib_sys_device);
1481     if (status != UCS_OK) {
1482         return status;
1483     }
1484 
1485     status = ucs_topo_get_distance(ib_sys_device, cuda_sys_device, &dist);
1486     if (status != UCS_OK) {
1487         return status;
1488     }
1489 
1490     *latency = dist.latency;
1491 
1492     return UCS_OK;
1493 }
1494 
uct_ib_iface_query(uct_ib_iface_t * iface,size_t xport_hdr_len,uct_iface_attr_t * iface_attr)1495 ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
1496                                 uct_iface_attr_t *iface_attr)
1497 {
1498     uct_ib_device_t *dev = uct_ib_iface_device(iface);
1499     uct_ib_md_t     *md  = uct_ib_iface_md(iface);
1500     static const unsigned ib_port_widths[] = {
1501         [0] = 1,
1502         [1] = 4,
1503         [2] = 8,
1504         [3] = 12,
1505         [4] = 16
1506     };
1507     uint8_t active_width, active_speed, active_mtu, width_idx;
1508     double encoding, signal_rate, wire_speed;
1509     size_t mtu, width, extra_pkt_len;
1510     ucs_status_t status;
1511     double numa_latency;
1512     double cuda_latency;
1513 
1514     uct_base_iface_query(&iface->super, iface_attr);
1515 
1516     active_width = uct_ib_iface_port_attr(iface)->active_width;
1517     active_speed = uct_ib_iface_port_attr(iface)->active_speed;
1518     active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;
1519 
1520     /* Get active width */
1521     width_idx = ucs_ilog2(active_width);
1522     if (!ucs_is_pow2(active_width) ||
1523         (active_width < 1) || (width_idx > 4))
1524     {
1525         ucs_error("Invalid active_width on %s:%d: %d",
1526                   UCT_IB_IFACE_ARG(iface), active_width);
1527         return UCS_ERR_IO_ERROR;
1528     }
1529 
1530     iface_attr->device_addr_len = iface->addr_size;
1531     iface_attr->dev_num_paths   = iface->num_paths;
1532 
1533     switch (active_speed) {
1534     case 1: /* SDR */
1535         iface_attr->latency.c = 5000e-9;
1536         signal_rate           = 2.5e9;
1537         encoding              = 8.0/10.0;
1538         break;
1539     case 2: /* DDR */
1540         iface_attr->latency.c = 2500e-9;
1541         signal_rate           = 5.0e9;
1542         encoding              = 8.0/10.0;
1543         break;
1544     case 4:
1545         iface_attr->latency.c = 1300e-9;
1546         if (uct_ib_iface_is_roce(iface)) {
1547             /* 10/40g Eth  */
1548             signal_rate       = 10.3125e9;
1549             encoding          = 64.0/66.0;
1550         } else {
1551             /* QDR */
1552             signal_rate       = 10.0e9;
1553             encoding          = 8.0/10.0;
1554         }
1555         break;
1556     case 8: /* FDR10 */
1557         iface_attr->latency.c = 700e-9;
1558         signal_rate           = 10.3125e9;
1559         encoding              = 64.0/66.0;
1560         break;
1561     case 16: /* FDR */
1562         iface_attr->latency.c = 700e-9;
1563         signal_rate           = 14.0625e9;
1564         encoding              = 64.0/66.0;
1565         break;
1566     case 32: /* EDR / 100g Eth */
1567         iface_attr->latency.c = 600e-9;
1568         signal_rate           = 25.78125e9;
1569         encoding              = 64.0/66.0;
1570         break;
1571     case 64: /* 50g Eth */
1572         iface_attr->latency.c = 600e-9;
1573         signal_rate           = 25.78125e9 * 2;
1574         encoding              = 64.0/66.0;
1575         break;
1576     default:
1577         ucs_error("Invalid active_speed on %s:%d: %d",
1578                   UCT_IB_IFACE_ARG(iface), active_speed);
1579         return UCS_ERR_IO_ERROR;
1580     }
1581 
1582     status = uct_ib_iface_get_numa_latency(iface, &numa_latency);
1583     if (status != UCS_OK) {
1584         return status;
1585     }
1586 
1587     iface_attr->latency.c += numa_latency;
1588     iface_attr->latency.m  = 0;
1589 
1590     if (iface->config.enable_cuda_affinity != UCS_NO) {
1591         status = uct_ib_iface_get_cuda_latency(iface, &cuda_latency);
1592         if (status != UCS_OK) {
1593             return status;
1594         }
1595 
1596         iface_attr->latency.c += cuda_latency;
1597         iface_attr->latency.m  = 0;
1598     }
1599 
1600     /* Wire speed calculation: Width * SignalRate * Encoding */
1601     width                 = ib_port_widths[width_idx];
1602     wire_speed            = (width * signal_rate * encoding) / 8.0;
1603 
1604     /* Calculate packet overhead  */
1605     mtu                   = ucs_min(uct_ib_mtu_value((enum ibv_mtu)active_mtu),
1606                                     iface->config.seg_size);
1607 
1608     extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len +  UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;
1609 
1610     if (uct_ib_iface_is_roce(iface)) {
1611         extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN;
1612         iface_attr->latency.c += 200e-9;
1613     } else {
1614         /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */
1615         extra_pkt_len += UCT_IB_LRH_LEN;
1616     }
1617 
1618     iface_attr->bandwidth.shared    = ucs_min((wire_speed * mtu) / (mtu + extra_pkt_len), md->pci_bw);
1619     iface_attr->bandwidth.dedicated = 0;
1620     iface_attr->priority            = uct_ib_device_spec(dev)->priority;
1621 
1622     return UCS_OK;
1623 }
1624 
uct_ib_iface_event_fd_get(uct_iface_h tl_iface,int * fd_p)1625 ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p)
1626 {
1627     uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
1628     *fd_p                 = iface->comp_channel->fd;
1629     return UCS_OK;
1630 }
1631 
uct_ib_iface_pre_arm(uct_ib_iface_t * iface)1632 ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface)
1633 {
1634     int res, send_cq_count, recv_cq_count;
1635     struct ibv_cq *cq;
1636     void *cq_context;
1637 
1638     send_cq_count = 0;
1639     recv_cq_count = 0;
1640     do {
1641         res = ibv_get_cq_event(iface->comp_channel, &cq, &cq_context);
1642         if (0 == res) {
1643             if (iface->cq[UCT_IB_DIR_TX] == cq) {
1644                 iface->ops->event_cq(iface, UCT_IB_DIR_TX);
1645                 ++send_cq_count;
1646             }
1647             if (iface->cq[UCT_IB_DIR_RX] == cq) {
1648                 iface->ops->event_cq(iface, UCT_IB_DIR_RX);
1649                 ++recv_cq_count;
1650             }
1651         }
1652     } while (res == 0);
1653 
1654     if (errno != EAGAIN) {
1655         return UCS_ERR_IO_ERROR;
1656     }
1657 
1658     if (send_cq_count > 0) {
1659         ibv_ack_cq_events(iface->cq[UCT_IB_DIR_TX], send_cq_count);
1660     }
1661 
1662     if (recv_cq_count > 0) {
1663         ibv_ack_cq_events(iface->cq[UCT_IB_DIR_RX], recv_cq_count);
1664     }
1665 
1666     /* avoid re-arming the interface if any events exists */
1667     if ((send_cq_count > 0) || (recv_cq_count > 0)) {
1668         ucs_trace("arm_cq: got %d send and %d recv events, returning BUSY",
1669                   send_cq_count, recv_cq_count);
1670         return UCS_ERR_BUSY;
1671     }
1672 
1673     return UCS_OK;
1674 }
1675 
uct_ib_iface_arm_cq(uct_ib_iface_t * iface,uct_ib_dir_t dir,int solicited_only)1676 ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
1677                                  uct_ib_dir_t dir,
1678                                  int solicited_only)
1679 {
1680     int ret;
1681 
1682     ret = ibv_req_notify_cq(iface->cq[dir], solicited_only);
1683     if (ret != 0) {
1684         ucs_error("ibv_req_notify_cq("UCT_IB_IFACE_FMT", %d, sol=%d) failed: %m",
1685                   UCT_IB_IFACE_ARG(iface), dir, solicited_only);
1686         return UCS_ERR_IO_ERROR;
1687     }
1688     return UCS_OK;
1689 }
1690