1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6
7 #ifdef HAVE_CONFIG_H
8 # include "config.h"
9 #endif
10
11 #include "ib_iface.h"
12 #include "ib_log.h"
13
14 #include <uct/base/uct_md.h>
15 #include <ucs/arch/bitops.h>
16 #include <ucs/arch/cpu.h>
17 #include <ucs/type/class.h>
18 #include <ucs/type/cpu_set.h>
19 #include <ucs/debug/log.h>
20 #include <ucs/time/time.h>
21 #include <ucs/memory/numa.h>
22 #include <ucs/sys/sock.h>
23 #include <ucm/util/sys.h>
24 #include <string.h>
25 #include <stdlib.h>
26 #include <poll.h>
27
28
/* Config parser type for an array of LID path-bits ranges; used by the
 * LID_PATH_BITS entry in the configuration table below. */
static UCS_CONFIG_DEFINE_ARRAY(path_bits_spec,
                               sizeof(ucs_range_spec_t),
                               UCS_CONFIG_TYPE_RANGE_SPEC);
32
/* Human-readable names for the uct_ib_mtu enum, indexed by enum value;
 * consumed by the PATH_MTU configuration entry's enum parser. */
const char *uct_ib_mtu_values[] = {
    [UCT_IB_MTU_DEFAULT]    = "default",
    [UCT_IB_MTU_512]        = "512",
    [UCT_IB_MTU_1024]       = "1024",
    [UCT_IB_MTU_2048]       = "2048",
    [UCT_IB_MTU_4096]       = "4096",
    [UCT_IB_MTU_LAST]       = NULL
};
41
/* Interface address types. The first UCT_IB_ADDRESS_TYPE_LAST values are
 * real address types; "AUTO" is an extra configuration-only value that asks
 * the code to detect the type from the link layer and subnet prefix. */
enum {
    UCT_IB_ADDRESS_TYPE_LINK_LOCAL,
    UCT_IB_ADDRESS_TYPE_SITE_LOCAL,
    UCT_IB_ADDRESS_TYPE_GLOBAL,
    UCT_IB_ADDRESS_TYPE_ETH,
    UCT_IB_ADDRESS_TYPE_LAST,
    UCT_IB_IFACE_ADDRESS_TYPE_AUTO = UCT_IB_ADDRESS_TYPE_LAST,
    UCT_IB_IFACE_ADDRESS_TYPE_LAST
};
51
/* String names for the address types above; parser table for the
 * (deprecated) ADDR_TYPE configuration entry. */
static const char *uct_ib_iface_addr_types[] = {
   [UCT_IB_ADDRESS_TYPE_LINK_LOCAL] = "ib_local",
   [UCT_IB_ADDRESS_TYPE_SITE_LOCAL] = "ib_site_local",
   [UCT_IB_ADDRESS_TYPE_GLOBAL]     = "ib_global",
   [UCT_IB_ADDRESS_TYPE_ETH]        = "eth",
   [UCT_IB_IFACE_ADDRESS_TYPE_AUTO] = "auto",
   [UCT_IB_IFACE_ADDRESS_TYPE_LAST] = NULL
};
60
/*
 * Configuration table for IB interfaces. Each entry defines: the environment
 * variable suffix, its default value, a help string, the offset of the field
 * it fills inside uct_ib_iface_config_t, and the parser type. The first
 * entry chains the generic uct_iface configuration table.
 */
ucs_config_field_t uct_ib_iface_config_table[] = {
  {"", "", NULL,
   ucs_offsetof(uct_ib_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)},

  {"SEG_SIZE", "8192",
   "Size of bounce buffers used for post_send and post_recv.",
   ucs_offsetof(uct_ib_iface_config_t, seg_size), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_QUEUE_LEN", "256",
   "Length of send queue in the QP.",
   ucs_offsetof(uct_ib_iface_config_t, tx.queue_len), UCS_CONFIG_TYPE_UINT},

  {"TX_MAX_BATCH", "16",
   "Number of send WQEs to batch in one post-send list. Larger values reduce\n"
   "the CPU usage, but increase the latency and pipelining between sender and\n"
   "receiver.",
   ucs_offsetof(uct_ib_iface_config_t, tx.max_batch), UCS_CONFIG_TYPE_UINT},

  {"TX_MAX_POLL", "16",
   "Max number of receive completions to pick during TX poll",
   ucs_offsetof(uct_ib_iface_config_t, tx.max_poll), UCS_CONFIG_TYPE_UINT},

  {"TX_MIN_INLINE", "64",
   "Bytes to reserve in send WQE for inline data. Messages which are small\n"
   "enough will be sent inline.",
   ucs_offsetof(uct_ib_iface_config_t, tx.min_inline), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_INLINE_RESP", "0",
   "Bytes to reserve in send WQE for inline response. Responses which are small\n"
   "enough, such as of atomic operations and small reads, will be received inline.",
   ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_TX]), UCS_CONFIG_TYPE_MEMUNITS},

  {"TX_MIN_SGE", "3",
   "Number of SG entries to reserve in the send WQE.",
   ucs_offsetof(uct_ib_iface_config_t, tx.min_sge), UCS_CONFIG_TYPE_UINT},

#if HAVE_DECL_IBV_EXP_CQ_MODERATION
  {"TX_EVENT_MOD_COUNT", "0",
   "Number of send completions for which an event would be generated (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation_count), UCS_CONFIG_TYPE_UINT},

  {"TX_EVENT_MOD_PERIOD", "0us",
   "Time period to generate send event (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, tx.cq_moderation_period), UCS_CONFIG_TYPE_TIME},

  {"RX_EVENT_MOD_COUNT", "0",
   "Number of received messages for which an event would be generated (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, rx.cq_moderation_count), UCS_CONFIG_TYPE_UINT},

  {"RX_EVENT_MOD_PERIOD", "0us",
   "Time period to generate receive event (0 - disabled).",
   ucs_offsetof(uct_ib_iface_config_t, rx.cq_moderation_period), UCS_CONFIG_TYPE_TIME},
#endif /* HAVE_DECL_IBV_EXP_CQ_MODERATION */

  UCT_IFACE_MPOOL_CONFIG_FIELDS("TX_", -1, 1024, "send",
                                ucs_offsetof(uct_ib_iface_config_t, tx.mp),
      "\nAttention: Setting this param with value != -1 is a dangerous thing\n"
      "in RC/DC and could cause deadlock or performance degradation."),

  {"RX_QUEUE_LEN", "4096",
   "Length of receive queue in the QPs.",
   ucs_offsetof(uct_ib_iface_config_t, rx.queue_len), UCS_CONFIG_TYPE_UINT},

  {"RX_MAX_BATCH", "16",
   "How many post-receives to perform in one batch.",
   ucs_offsetof(uct_ib_iface_config_t, rx.max_batch), UCS_CONFIG_TYPE_UINT},

  {"RX_MAX_POLL", "16",
   "Max number of receive completions to pick during RX poll",
   ucs_offsetof(uct_ib_iface_config_t, rx.max_poll), UCS_CONFIG_TYPE_UINT},

  {"RX_INLINE", "0",
   "Number of bytes to request for inline receive. If the maximal supported size\n"
   "is smaller, it will be used instead. If it is possible to support a larger\n"
   "size than requested with the same hardware resources, it will be used instead.",
   ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_RX]), UCS_CONFIG_TYPE_MEMUNITS},

  UCT_IFACE_MPOOL_CONFIG_FIELDS("RX_", -1, 0, "receive",
                                ucs_offsetof(uct_ib_iface_config_t, rx.mp), ""),

  {"ADDR_TYPE", "auto",
   "Set the interface address type. \"auto\" mode detects the type according to\n"
   "link layer type and IB subnet prefix.\n"
   "Deprecated. To force use of global routing use IS_GLOBAL.",
   ucs_offsetof(uct_ib_iface_config_t, addr_type),
   UCS_CONFIG_TYPE_ENUM(uct_ib_iface_addr_types)},

  {"IS_GLOBAL", "n",
   "Force interface to use global routing.",
   ucs_offsetof(uct_ib_iface_config_t, is_global), UCS_CONFIG_TYPE_BOOL},

  {"SL", "0",
   "IB Service Level / RoCEv2 Ethernet Priority.\n",
   ucs_offsetof(uct_ib_iface_config_t, sl), UCS_CONFIG_TYPE_UINT},

  {"TRAFFIC_CLASS", "auto",
   "IB Traffic Class / RoCEv2 Differentiated Services Code Point (DSCP).\n"
   "\"auto\" option selects 106 on RoCEv2 and 0 otherwise.",
   ucs_offsetof(uct_ib_iface_config_t, traffic_class), UCS_CONFIG_TYPE_ULUNITS},

  {"HOP_LIMIT", "255",
   "IB Hop limit / RoCEv2 Time to Live. Should be between 0 and 255.\n",
   ucs_offsetof(uct_ib_iface_config_t, hop_limit), UCS_CONFIG_TYPE_UINT},

  {"NUM_PATHS", "auto",
   "Number of connections that should be created between a pair of communicating\n"
   "endpoints for optimal performance. The default value 'auto' behaves according\n"
   "to the port link layer:\n"
   " RoCE       - "UCS_PP_MAKE_STRING(UCT_IB_DEV_MAX_PORTS) " for LAG port, otherwise - 1.\n"
   " InfiniBand - As the number of path bits enabled by fabric's LMC value and selected\n"
   "              by "UCS_DEFAULT_ENV_PREFIX UCT_IB_CONFIG_PREFIX"LID_PATH_BITS configuration.",
   ucs_offsetof(uct_ib_iface_config_t, num_paths), UCS_CONFIG_TYPE_ULUNITS},

  {"ROCE_PATH_FACTOR", "1",
   "Multiplier for RoCE LAG UDP source port calculation. The UDP source port\n"
   "is typically used by switches and network adapters to select a different\n"
   "path for the same pair of endpoints.",
   ucs_offsetof(uct_ib_iface_config_t, roce_path_factor), UCS_CONFIG_TYPE_UINT},

  {"LID_PATH_BITS", "0",
   "List of IB Path bits separated by comma (a,b,c) "
   "which will be the low portion of the LID, according to the LMC in the fabric.",
   ucs_offsetof(uct_ib_iface_config_t, lid_path_bits), UCS_CONFIG_TYPE_ARRAY(path_bits_spec)},

  {"PKEY", "auto",
   "Which pkey value to use. Should be between 0 and 0x7fff.\n"
   "\"auto\" option selects a first valid pkey value with full membership.",
   ucs_offsetof(uct_ib_iface_config_t, pkey), UCS_CONFIG_TYPE_HEX},

#ifdef HAVE_IBV_EXP_RES_DOMAIN
  {"RESOURCE_DOMAIN", "y",
   "Enable multiple resource domains (experimental).",
   ucs_offsetof(uct_ib_iface_config_t, enable_res_domain), UCS_CONFIG_TYPE_BOOL},
#endif

  {"PATH_MTU", "default",
   "Path MTU. \"default\" will select the best MTU for the device.",
   ucs_offsetof(uct_ib_iface_config_t, path_mtu),
   UCS_CONFIG_TYPE_ENUM(uct_ib_mtu_values)},

  {"ENABLE_CUDA_AFFINITY", "y",
   "Prefer IB devices closest to detected CUDA device\n",
   ucs_offsetof(uct_ib_iface_config_t, enable_cuda_affinity), UCS_CONFIG_TYPE_BOOL},

  {NULL}
};
207
/* Return nonzero if this interface's port link layer is Ethernet (RoCE). */
int uct_ib_iface_is_roce(uct_ib_iface_t *iface)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    return uct_ib_device_is_port_roce(dev, iface->config.port_num);
}
213
/* Return nonzero if this interface's port link layer is InfiniBand. */
int uct_ib_iface_is_ib(uct_ib_iface_t *iface)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    return uct_ib_device_is_port_ib(dev, iface->config.port_num);
}
219
/* Mempool element initializer: cache the registration lkey in each receive
 * descriptor so posting it later needs no memh lookup. */
static void uct_ib_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh)
{
    ((uct_ib_iface_recv_desc_t*)obj)->lkey = uct_ib_memh_get_lkey(memh);
}
226
/*
 * Initialize the receive-descriptor memory pool of an IB interface.
 *
 * The pool's grow step is derived from the configured RX queue length:
 * at least 1024 elements; for longer queues, the queue length plus ~10%
 * (rounded), capped by the configured maximum number of buffers, so that
 * filling the receive queue does not immediately force a pool expansion.
 */
ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config,
                                          const char *name, ucs_mpool_t *mp)
{
    unsigned grow;

    if (config->rx.queue_len < 1024) {
        grow = 1024;
    } else {
        /* We want to have some free (+10%) elements to avoid mem pool expansion */
        grow = ucs_min( (int)(1.1 * config->rx.queue_len + 0.5),
                        config->rx.mp.max_bufs);
    }

    /* Element size = payload offset + one segment; elements are cache-line
     * aligned and each is initialized with the lkey of its registration. */
    return uct_iface_mpool_init(&iface->super, mp,
                                iface->config.rx_payload_offset + iface->config.seg_size,
                                iface->config.rx_hdr_offset,
                                UCS_SYS_CACHE_LINE_SIZE,
                                &config->rx.mp, grow,
                                uct_ib_iface_recv_desc_init,
                                name);
}
249
/*
 * uct_recv_desc_t release callback: translate a user-visible descriptor
 * pointer back to the start of its mempool element and return it to the pool.
 */
void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc)
{
    uct_ib_iface_t *iface = ucs_container_of(self, uct_ib_iface_t, release_desc);
    void *ib_desc;

    /* the user pointer is rx_headroom_offset bytes past the element start */
    ib_desc = UCS_PTR_BYTE_OFFSET(desc, -(ptrdiff_t)iface->config.rx_headroom_offset);
    ucs_mpool_put_inline(ib_desc);
}
258
/* Extract the RoCE version stored in the high bits of an address 'flags'
 * byte. Valid only for Ethernet link-layer addresses (asserted). */
static inline uct_ib_roce_version_t
uct_ib_address_flags_get_roce_version(uint8_t flags)
{
    ucs_assert(flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH);
    return (uct_ib_roce_version_t)(flags >> ucs_ilog2(UCT_IB_ADDRESS_FLAG_ETH_LAST));
}
265
266 static inline sa_family_t
uct_ib_address_flags_get_roce_af(uint8_t flags)267 uct_ib_address_flags_get_roce_af(uint8_t flags)
268 {
269 ucs_assert(flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH);
270 return (flags & UCT_IB_ADDRESS_FLAG_ROCE_IPV6) ?
271 AF_INET6 : AF_INET;
272 }
273
/*
 * Return the number of bytes needed to pack an address with the optional
 * fields selected by params->flags. The layout must be kept in sync with
 * uct_ib_address_pack() and uct_ib_address_unpack().
 */
size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params)
{
    size_t size = sizeof(uct_ib_address_t);

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
        /* Ethernet: address contains only raw GID */
        size += sizeof(union ibv_gid);
    } else {
        /* InfiniBand: address always contains LID */
        size += sizeof(uint16_t); /* lid */

        if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) {
            /* Add GUID */
            UCS_STATIC_ASSERT(sizeof(params->gid.global.interface_id) == sizeof(uint64_t));
            size += sizeof(uint64_t);
        }

        if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
            if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
                                                    UCT_IB_SITE_LOCAL_PREFIX) {
                /* 16-bit subnet prefix */
                size += sizeof(uint16_t);
            } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
                /* 64-bit subnet prefix */
                size += sizeof(uint64_t);
            }
            /* Note: if subnet prefix is LINK_LOCAL, no need to pack it because
             * it's a well-known value defined by IB specification.
             */
        }
    }

    /* optional trailing fields, each guarded by its own pack flag */
    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
        size += sizeof(uint8_t);
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
        size += sizeof(uint8_t);
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) {
        size += sizeof(uint16_t);
    }

    return size;
}
320
/*
 * Pack an IB/RoCE address into the wire layout described by
 * uct_ib_address_size(). Which optional fields are written is selected by
 * params->flags; ib_addr->flags records which fields follow the header so
 * uct_ib_address_unpack() can decode them.
 */
void uct_ib_address_pack(const uct_ib_address_pack_params_t *params,
                         uct_ib_address_t *ib_addr)
{
    void *ptr = ib_addr + 1;

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
        /* RoCE, in this case we don't use the lid, we pack the gid, the RoCE
         * version, address family and set the ETH flag */
        ib_addr->flags = UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH |
                         (params->roce_info.ver <<
                          ucs_ilog2(UCT_IB_ADDRESS_FLAG_ETH_LAST));

        if (params->roce_info.addr_family == AF_INET6) {
            ib_addr->flags |= UCT_IB_ADDRESS_FLAG_ROCE_IPV6;
        }

        /* uint8_t raw[16]; */
        memcpy(ptr, params->gid.raw, sizeof(params->gid.raw));
        ptr = UCS_PTR_TYPE_OFFSET(ptr, params->gid.raw);
    } else {
        /* IB, LID */
        ib_addr->flags   = 0;
        *(uint16_t*)ptr  = params->lid;
        ptr              = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);

        if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) {
            /* Pack GUID */
            ib_addr->flags  |= UCT_IB_ADDRESS_FLAG_IF_ID;
            *(uint64_t*) ptr = params->gid.global.interface_id;
            ptr              = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
        }

        if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
            if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
                                                    UCT_IB_SITE_LOCAL_PREFIX) {
                /* Site-local: only the high 16 bits vary */
                ib_addr->flags  |= UCT_IB_ADDRESS_FLAG_SUBNET16;
                *(uint16_t*)ptr  = params->gid.global.subnet_prefix >> 48;
                ptr              = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);
            } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
                /* Global: full 64-bit prefix (link-local is well-known and
                 * never packed) */
                ib_addr->flags  |= UCT_IB_ADDRESS_FLAG_SUBNET64;
                *(uint64_t*)ptr  = params->gid.global.subnet_prefix;
                ptr              = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
            }
        }
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
        ucs_assert((int)params->path_mtu < UINT8_MAX);
        ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PATH_MTU;
        *(uint8_t*)ptr  = (uint8_t)params->path_mtu;
        ptr             = UCS_PTR_TYPE_OFFSET(ptr, uint8_t);
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
        ib_addr->flags |= UCT_IB_ADDRESS_FLAG_GID_INDEX;
        *(uint8_t*)ptr  = params->gid_index;
        /* Bug fix: advance past the gid_index byte; previously ptr was not
         * advanced here, so a subsequently packed pkey overwrote gid_index
         * (uct_ib_address_size() reserves a byte for this field). */
        ptr             = UCS_PTR_TYPE_OFFSET(ptr, uint8_t);
    }

    if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) {
        ucs_assert(params->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY);
        ib_addr->flags  |= UCT_IB_ADDRESS_FLAG_PKEY;
        *(uint16_t*)ptr  = params->pkey;
    }
}
387
/*
 * Compute which optional address fields this interface packs into its
 * device address, based on its link layer, pkey and routing configuration.
 */
unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface)
{
    unsigned pack_flags;

    pack_flags = (iface->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY) ?
                 UCT_IB_ADDRESS_PACK_FLAG_PKEY : 0;

    if (uct_ib_iface_is_roce(iface)) {
        /* Ethernet: the raw GID carries the whole address */
        return pack_flags | UCT_IB_ADDRESS_PACK_FLAG_ETH;
    }

    if (iface->config.force_global_addr) {
        /* full IB address: subnet prefix and interface id */
        return pack_flags | UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX |
                            UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID;
    }

    /* only the subnet prefix, sufficient for the reachability test */
    return pack_flags | UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX;
}
410
uct_ib_iface_address_size(uct_ib_iface_t * iface)411 size_t uct_ib_iface_address_size(uct_ib_iface_t *iface)
412 {
413 uct_ib_address_pack_params_t params;
414
415 params.flags = uct_ib_iface_address_pack_flags(iface);
416 params.gid = iface->gid_info.gid;
417 params.roce_info = iface->gid_info.roce_info;
418 return uct_ib_address_size(¶ms);
419 }
420
/*
 * Pack this interface's device address into ib_addr. The caller must have
 * sized the buffer with uct_ib_iface_address_size().
 */
void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr)
{
    uct_ib_address_pack_params_t params;

    params.flags     = uct_ib_iface_address_pack_flags(iface);
    params.gid       = iface->gid_info.gid;
    params.lid       = uct_ib_iface_port_attr(iface)->lid;
    params.roce_info = iface->gid_info.roce_info;
    /* to suppress gcc 4.3.4 warning */
    params.path_mtu  = UCT_IB_ADDRESS_INVALID_PATH_MTU;
    params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX;
    params.pkey      = iface->pkey;
    uct_ib_address_pack(&params, ib_addr);
}
435
/*
 * Unpack a wire-format IB/RoCE address (the inverse of
 * uct_ib_address_pack()). Fields not present in the address are filled
 * with well-defined defaults, and params_p->flags reports which fields
 * carry meaningful values. PKEY is always reported: a default value is
 * substituted when the address did not carry one.
 */
void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
                           uct_ib_address_pack_params_t *params_p)
{
    const void *ptr                     = ib_addr + 1;
    /* silence cppcheck warning */
    uct_ib_address_pack_params_t params = {0};

    params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX;
    params.path_mtu  = UCT_IB_ADDRESS_INVALID_PATH_MTU;
    params.pkey      = UCT_IB_ADDRESS_DEFAULT_PKEY;

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH) {
        /* uint8_t raw[16]; */
        memcpy(params.gid.raw, ptr, sizeof(params.gid.raw));
        ptr           = UCS_PTR_BYTE_OFFSET(ptr, sizeof(params.gid.raw));
        params.flags |= UCT_IB_ADDRESS_PACK_FLAG_ETH;

        params.roce_info.addr_family =
            uct_ib_address_flags_get_roce_af(ib_addr->flags);
        params.roce_info.ver         =
            uct_ib_address_flags_get_roce_version(ib_addr->flags);
    } else {
        /* Default prefix */
        params.gid.global.subnet_prefix = UCT_IB_LINK_LOCAL_PREFIX;
        params.gid.global.interface_id  = 0;
        params.flags                   |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX |
                                          UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID;

        /* If the link layer is not ETHERNET, then it is IB and a lid
         * must be present */
        params.lid = *(const uint16_t*)ptr;
        ptr        = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);

        if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_IF_ID) {
            params.gid.global.interface_id = *(uint64_t*)ptr;
            ptr                            = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
        }

        if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) {
            params.gid.global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX |
                                              ((uint64_t)*(uint16_t*)ptr << 48);
            ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t);
            ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64));
        }

        if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) {
            params.gid.global.subnet_prefix = *(uint64_t*)ptr;
            ptr           = UCS_PTR_TYPE_OFFSET(ptr, uint64_t);
            params.flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX;
        }
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PATH_MTU) {
        params.path_mtu = *(const uint8_t*)ptr;
        ptr             = UCS_PTR_TYPE_OFFSET(ptr, const uint8_t);
        params.flags   |= UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU;
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID_INDEX) {
        params.gid_index = *(const uint8_t*)ptr;
        /* Bug fix: gid_index is packed as a single byte (see
         * uct_ib_address_size()/uct_ib_address_pack()); advancing by
         * uint16_t here desynchronized the subsequent pkey read. */
        ptr              = UCS_PTR_TYPE_OFFSET(ptr, const uint8_t);
        params.flags    |= UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX;
    }

    if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PKEY) {
        params.pkey = *(const uint16_t*)ptr;
    }
    /* PKEY is always in params */
    params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY;

    *params_p = params;
}
508
uct_ib_address_str(const uct_ib_address_t * ib_addr,char * buf,size_t max)509 const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
510 size_t max)
511 {
512 uct_ib_address_pack_params_t params;
513 char *p, *endp;
514
515 uct_ib_address_unpack(ib_addr, ¶ms);
516
517 p = buf;
518 endp = buf + max;
519 if (params.lid != 0) {
520 snprintf(p, endp - p, "lid %d ", params.lid);
521 p += strlen(p);
522 }
523
524 uct_ib_gid_str(¶ms.gid, p, endp - p);
525 p += strlen(p);
526
527 if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
528 ucs_assert(params.gid_index != UCT_IB_ADDRESS_INVALID_GID_INDEX);
529 snprintf(p, endp - p, "gid index %u ", params.gid_index);
530 p += strlen(p);
531 }
532
533 if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
534 ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU);
535 snprintf(p, endp - p, "mtu %zu ", uct_ib_mtu_value(params.path_mtu));
536 p += strlen(p);
537 }
538
539 ucs_assert((params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) &&
540 (params.flags != UCT_IB_ADDRESS_INVALID_PKEY));
541 snprintf(p, endp - p, "pkey 0x%x ", params.pkey);
542
543 return buf;
544 }
545
uct_ib_iface_get_device_address(uct_iface_h tl_iface,uct_device_addr_t * dev_addr)546 ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
547 uct_device_addr_t *dev_addr)
548 {
549 uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
550
551 uct_ib_iface_address_pack(iface, (void*)dev_addr);
552
553 return UCS_OK;
554 }
555
/*
 * Check whether a remote RoCE address is reachable from a local GID:
 * the remote must either advertise "any" RoCE version, or match the local
 * address family and RoCE version exactly.
 */
static int uct_ib_iface_roce_is_reachable(const uct_ib_device_gid_info_t *local_gid_info,
                                          const uct_ib_address_t *remote_ib_addr)
{
    sa_family_t local_ib_addr_af         = local_gid_info->roce_info.addr_family;
    uct_ib_roce_version_t local_roce_ver = local_gid_info->roce_info.ver;
    uint8_t remote_ib_addr_flags         = remote_ib_addr->flags;
    uct_ib_roce_version_t remote_roce_ver;
    sa_family_t remote_ib_addr_af;
    char local_gid_str[128], remote_gid_str[128];

    /* a remote that accepts any RoCE version is always reachable */
    if ((uct_ib_address_flags_get_roce_version(remote_ib_addr_flags)) ==
        UCT_IB_DEVICE_ROCE_ANY) {
        return 1;
    }

    /* check the address family */
    remote_ib_addr_af = uct_ib_address_flags_get_roce_af(remote_ib_addr_flags);

    if (local_ib_addr_af != remote_ib_addr_af) {
        ucs_assert(local_ib_addr_af != 0);
        ucs_debug("different addr_family detected. local %s remote %s",
                  ucs_sockaddr_address_family_str(local_ib_addr_af),
                  ucs_sockaddr_address_family_str(remote_ib_addr_af));
        return 0;
    }

    /* check the RoCE version */
    ucs_assert(local_roce_ver != UCT_IB_DEVICE_ROCE_ANY);

    remote_roce_ver = uct_ib_address_flags_get_roce_version(remote_ib_addr_flags);

    if (local_roce_ver != remote_roce_ver) {
        /* the remote raw GID immediately follows the address header */
        ucs_trace("different RoCE versions detected. local %s (gid=%s)"
                  "remote %s (gid=%s)",
                  uct_ib_roce_version_str(local_roce_ver),
                  uct_ib_gid_str(&local_gid_info->gid, local_gid_str,
                                 sizeof(local_gid_str)),
                  uct_ib_roce_version_str(remote_roce_ver),
                  uct_ib_gid_str((union ibv_gid *)(remote_ib_addr + 1), remote_gid_str,
                                 sizeof(remote_gid_str)));
        return 0;
    }

    return 1;
}
601
/*
 * uct_iface_ops::iface_is_reachable implementation. Two endpoints are
 * reachable if their pkeys are compatible and they share the same link
 * layer: IB peers must be on the same subnet prefix, RoCE peers must pass
 * the RoCE version / address family check.
 */
int uct_ib_iface_is_reachable(const uct_iface_h tl_iface,
                              const uct_device_addr_t *dev_addr,
                              const uct_iface_addr_t *iface_addr)
{
    uct_ib_iface_t *iface           = ucs_derived_of(tl_iface, uct_ib_iface_t);
    int is_local_eth                = uct_ib_iface_is_roce(iface);
    const uct_ib_address_t *ib_addr = (const void*)dev_addr;
    uct_ib_address_pack_params_t params;

    uct_ib_address_unpack(ib_addr, &params);

    if (/* at least one PKEY has to be with full membership */
        !((params.pkey | iface->pkey) & UCT_IB_PKEY_MEMBERSHIP_MASK) ||
        /* PKEY values have to be equal */
        ((params.pkey ^ iface->pkey) & UCT_IB_PKEY_PARTITION_MASK)) {
        return 0;
    }

    if (!is_local_eth && !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
        /* same subnet prefix */
        return params.gid.global.subnet_prefix ==
               iface->gid_info.gid.global.subnet_prefix;
    } else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
        /* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH
         * flag should be on. If reachable, the remote and local RoCE versions
         * and address families have to be the same */
        return uct_ib_iface_roce_is_reachable(&iface->gid_info, ib_addr);
    } else {
        /* local and remote have different link layers and therefore are unreachable */
        return 0;
    }
}
634
uct_ib_iface_create_ah(uct_ib_iface_t * iface,struct ibv_ah_attr * ah_attr,struct ibv_ah ** ah_p)635 ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
636 struct ibv_ah_attr *ah_attr,
637 struct ibv_ah **ah_p)
638 {
639 return uct_ib_device_create_ah_cached(uct_ib_iface_device(iface), ah_attr,
640 uct_ib_iface_md(iface)->pd, ah_p);
641 }
642
/*
 * Fill an ibv_ah_attr for a destination given by (lid, gid, gid_index).
 * path_index selects one of several network paths: on RoCE it perturbs the
 * UDP source port (via dlid), on IB it selects the LID path bits.
 */
void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
                                            const union ibv_gid *gid,
                                            uint8_t gid_index,
                                            unsigned path_index,
                                            struct ibv_ah_attr *ah_attr)
{
    uint8_t path_bits;
    char buf[128];

    memset(ah_attr, 0, sizeof(*ah_attr));

    ah_attr->sl                = iface->config.sl;
    ah_attr->port_num          = iface->config.port_num;
    ah_attr->grh.traffic_class = iface->config.traffic_class;

    if (uct_ib_iface_is_roce(iface)) {
        /* on RoCE the dlid field encodes the UDP source port used for
         * ECMP/LAG path selection */
        ah_attr->dlid          = UCT_IB_ROCE_UDP_SRC_PORT_BASE |
                                 (iface->config.roce_path_factor * path_index);
        /* Workaround rdma-core issue of calling rand() which affects global
         * random state in glibc */
        ah_attr->grh.flow_label = 1;
    } else {
        /* TODO iface->path_bits should be removed and replaced by path_index */
        path_bits              = iface->path_bits[path_index %
                                                  iface->path_bits_count];
        ah_attr->dlid          = lid | path_bits;
        ah_attr->src_path_bits = path_bits;
    }

    /* use GRH (global routing) when forced by configuration or when the
     * destination is on a different subnet */
    if (iface->config.force_global_addr ||
        (iface->gid_info.gid.global.subnet_prefix != gid->global.subnet_prefix)) {
        ucs_assert_always(gid->global.interface_id != 0);
        ah_attr->is_global      = 1;
        ah_attr->grh.dgid       = *gid;
        ah_attr->grh.sgid_index = gid_index;
        ah_attr->grh.hop_limit  = iface->config.hop_limit;
    } else {
        ah_attr->is_global      = 0;
    }

    ucs_debug("iface %p: ah_attr %s", iface,
              uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr));
}
686
/*
 * Fill an ibv_ah_attr (and the negotiated path MTU) from a packed remote
 * IB address. Missing optional fields fall back to this interface's
 * configured path MTU and GID index.
 */
void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
                                         const uct_ib_address_t *ib_addr,
                                         unsigned path_index,
                                         struct ibv_ah_attr *ah_attr,
                                         enum ibv_mtu *path_mtu)
{
    uct_ib_address_pack_params_t params;

    /* local and remote link layers must match */
    ucs_assert(!uct_ib_iface_is_roce(iface) ==
               !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH));

    uct_ib_address_unpack(ib_addr, &params);

    if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
        ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU);
        *path_mtu = params.path_mtu;
    } else {
        *path_mtu = iface->config.path_mtu;
    }

    if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) {
        ucs_assert(params.gid_index != UCT_IB_ADDRESS_INVALID_GID_INDEX);
    } else {
        params.gid_index = iface->gid_info.gid_index;
    }

    uct_ib_iface_fill_ah_attr_from_gid_lid(iface, params.lid, &params.gid,
                                           params.gid_index, path_index,
                                           ah_attr);
}
717
/*
 * Select the interface's pkey and pkey table index.
 *
 * Scans the port's pkey table for the configured pkey (or, with "auto",
 * any valid pkey). A full-membership pkey is preferred; the first matching
 * limited-membership pkey is remembered as a fallback and used only if no
 * full-membership match exists. Fails if the requested pkey is out of range
 * or no usable pkey is found.
 */
static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
                                           const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev    = uct_ib_iface_device(iface);
    uint16_t pkey_tbl_len   = uct_ib_iface_port_attr(iface)->pkey_tbl_len;
    int pkey_found          = 0;
    uint16_t lim_pkey       = UCT_IB_ADDRESS_INVALID_PKEY;
    uint16_t lim_pkey_index = UINT16_MAX;
    uint16_t pkey_index, port_pkey, pkey;

    /* an explicit pkey must fit in the 15-bit partition field */
    if ((config->pkey != UCS_HEXUNITS_AUTO) &&
        (config->pkey > UCT_IB_PKEY_PARTITION_MASK)) {
        ucs_error("requested pkey 0x%x is invalid, should be in the range 0..0x%x",
                  config->pkey, UCT_IB_PKEY_PARTITION_MASK);
        return UCS_ERR_INVALID_PARAM;
    }

    /* get the user's pkey value and find its index in the port's pkey table */
    for (pkey_index = 0; pkey_index < pkey_tbl_len; ++pkey_index) {
        /* get the pkey values from the port's pkeys table */
        if (ibv_query_pkey(dev->ibv_context, iface->config.port_num, pkey_index,
                           &port_pkey))
        {
            ucs_debug("ibv_query_pkey("UCT_IB_IFACE_FMT", index=%d) failed: %m",
                      UCT_IB_IFACE_ARG(iface), pkey_index);
            continue;
        }

        pkey = ntohs(port_pkey);
        /* if pkey = 0x0, just skip it w/o debug trace, because 0x0
         * means that there is no real pkey configured at this index */
        if (pkey == UCT_IB_ADDRESS_INVALID_PKEY) {
            continue;
        }

        if ((config->pkey == UCS_HEXUNITS_AUTO) ||
            /* take only the lower 15 bits for the comparison */
            ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey)) {
            if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK) &&
                /* limited PKEY has not yet been found */
                (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY)) {
                /* remember the first limited-membership match; keep
                 * scanning for a full-membership one */
                lim_pkey_index = pkey_index;
                lim_pkey       = pkey;
                continue;
            }

            iface->pkey_index = pkey_index;
            iface->pkey       = pkey;
            pkey_found        = 1;
            break;
        }
    }

    if (!pkey_found) {
        if (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY) {
            /* PKEY neither with full nor with limited membership was found */
            if (config->pkey == UCS_HEXUNITS_AUTO) {
                ucs_error("there is no valid pkey to use on "
                          UCT_IB_IFACE_FMT, UCT_IB_IFACE_ARG(iface));
            } else {
                ucs_error("unable to find specified pkey 0x%x on "UCT_IB_IFACE_FMT,
                          config->pkey, UCT_IB_IFACE_ARG(iface));
            }

            return UCS_ERR_NO_ELEM;
        } else {
            /* fall back to the limited-membership pkey */
            ucs_assert(lim_pkey_index != UINT16_MAX);
            iface->pkey_index = lim_pkey_index;
            iface->pkey       = lim_pkey;
        }
    }

    ucs_debug("using pkey[%d] 0x%x on "UCT_IB_IFACE_FMT, iface->pkey_index,
              iface->pkey, UCT_IB_IFACE_ARG(iface));

    return UCS_OK;
}
795
/*
 * Build iface->path_bits from the LID_PATH_BITS configuration: expand each
 * configured range (ascending or descending) and keep only values below
 * 2^lmc, where lmc is the port's LID mask count. The resulting array drives
 * per-path LID selection in uct_ib_iface_fill_ah_attr_from_gid_lid().
 */
static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config)
{
    unsigned i, j, num_path_bits;
    unsigned first, last;
    uint8_t lmc;
    int step;

    if (config->lid_path_bits.count == 0) {
        ucs_error("List of path bits must not be empty");
        return UCS_ERR_INVALID_PARAM;
    }

    /* count the number of lid_path_bits */
    num_path_bits = 0;
    for (i = 0; i < config->lid_path_bits.count; i++) {
        num_path_bits += 1 + abs((int)(config->lid_path_bits.ranges[i].first -
                                       config->lid_path_bits.ranges[i].last));
    }

    iface->path_bits = ucs_calloc(1, num_path_bits * sizeof(*iface->path_bits),
                                  "ib_path_bits");
    if (iface->path_bits == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    lmc = uct_ib_iface_port_attr(iface)->lmc;

    /* go over the list of values (ranges) for the lid_path_bits and set them */
    iface->path_bits_count = 0;
    for (i = 0; i < config->lid_path_bits.count; ++i) {

        first = config->lid_path_bits.ranges[i].first;
        last  = config->lid_path_bits.ranges[i].last;

        /* range of values or one value */
        if (first < last) {
            step = 1;
        } else {
            step = -1;
        }

        /* fill the value/s; the loop relies on well-defined unsigned
         * wraparound of 'j' for descending ranges */
        for (j = first; j != (last + step); j += step) {
            if (j >= UCS_BIT(lmc)) {
                ucs_debug("Not using value %d for path_bits - must be < 2^lmc (lmc=%d)",
                          j, lmc);
                if (step == 1) {
                    break;
                } else {
                    continue;
                }
            }

            ucs_assert(iface->path_bits_count < num_path_bits);
            iface->path_bits[iface->path_bits_count] = j;
            iface->path_bits_count++;
        }
    }

    return UCS_OK;
}
858
/*
 * Translate a uct_ib_qp_attr_t into the verbs QP-creation attributes,
 * attaching the interface's CQs and, where the verbs flavor supports it
 * (experimental verbs or ibv_create_qp_ex), the protection domain.
 * For non-UD QPs on MOFED, also request big-endian atomic replies.
 */
void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr)
{
    attr->ibv.send_cq             = iface->cq[UCT_IB_DIR_TX];
    attr->ibv.recv_cq             = iface->cq[UCT_IB_DIR_RX];

    attr->ibv.srq                 = attr->srq;
    attr->ibv.cap                 = attr->cap;
    attr->ibv.qp_type             = (enum ibv_qp_type)attr->qp_type;
    attr->ibv.sq_sig_all          = attr->sq_sig_all;

#if HAVE_DECL_IBV_EXP_CREATE_QP
    /* set the PD unless the caller already provided one */
    if (!(attr->ibv.comp_mask & IBV_EXP_QP_INIT_ATTR_PD)) {
        attr->ibv.comp_mask       = IBV_EXP_QP_INIT_ATTR_PD;
        attr->ibv.pd              = uct_ib_iface_md(iface)->pd;
    }
#elif HAVE_DECL_IBV_CREATE_QP_EX
    if (!(attr->ibv.comp_mask & IBV_QP_INIT_ATTR_PD)) {
        attr->ibv.comp_mask       = IBV_QP_INIT_ATTR_PD;
        attr->ibv.pd              = uct_ib_iface_md(iface)->pd;
    }
#endif

    attr->port                    = iface->config.port_num;

    if (attr->qp_type == IBV_QPT_UD) {
        return;
    }

    /* MOFED requires this to enable IB spec atomic */
#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
    if (uct_ib_iface_device(iface)->dev_attr.exp_atomic_cap ==
                                     IBV_EXP_ATOMIC_HCA_REPLY_BE) {
        attr->ibv.comp_mask       |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
        attr->ibv.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY;
    }
#endif
}
896
/**
 * Create a QP on the interface from @a attr, using the newest QP creation
 * API available at compile time (exp verbs / extended verbs / plain verbs).
 *
 * @param iface  IB interface to create the QP on.
 * @param attr   Requested QP attributes; updated on return with the actual
 *               capabilities granted by the driver.
 * @param qp_p   Filled with the created QP on success.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if QP creation failed.
 */
ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface,
                                    uct_ib_qp_attr_t *attr,
                                    struct ibv_qp **qp_p)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    struct ibv_qp *qp;

    /* attach CQs, port and PD before passing the attributes to verbs */
    uct_ib_iface_fill_attr(iface, attr);

#if HAVE_DECL_IBV_EXP_CREATE_QP
    qp = ibv_exp_create_qp(dev->ibv_context, &attr->ibv);
#elif HAVE_DECL_IBV_CREATE_QP_EX
    qp = ibv_create_qp_ex(dev->ibv_context, &attr->ibv);
#else
    qp = ibv_create_qp(uct_ib_iface_md(iface)->pd, &attr->ibv);
#endif
    if (qp == NULL) {
        ucs_error("iface=%p: failed to create %s QP "
                  "TX wr:%d sge:%d inl:%d resp:%d RX wr:%d sge:%d resp:%d: %m",
                  iface, uct_ib_qp_type_str(attr->qp_type),
                  attr->cap.max_send_wr, attr->cap.max_send_sge,
                  attr->cap.max_inline_data, attr->max_inl_cqe[UCT_IB_DIR_TX],
                  attr->cap.max_recv_wr, attr->cap.max_recv_sge,
                  attr->max_inl_cqe[UCT_IB_DIR_RX]);
        return UCS_ERR_IO_ERROR;
    }

    /* report back the capabilities actually granted by the driver (may be
     * larger than requested) */
    attr->cap  = attr->ibv.cap;
    *qp_p      = qp;

    ucs_debug("iface=%p: created %s QP 0x%x on %s:%d "
              "TX wr:%d sge:%d inl:%d resp:%d RX wr:%d sge:%d resp:%d",
              iface, uct_ib_qp_type_str(attr->qp_type), qp->qp_num,
              uct_ib_device_name(dev), iface->config.port_num,
              attr->cap.max_send_wr, attr->cap.max_send_sge,
              attr->cap.max_inline_data, attr->max_inl_cqe[UCT_IB_DIR_TX],
              attr->cap.max_recv_wr, attr->cap.max_recv_sge,
              attr->max_inl_cqe[UCT_IB_DIR_RX]);

    return UCS_OK;
}
938
uct_ib_verbs_create_cq(uct_ib_iface_t * iface,uct_ib_dir_t dir,const uct_ib_iface_init_attr_t * init_attr,int preferred_cpu,size_t inl)939 ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
940 const uct_ib_iface_init_attr_t *init_attr,
941 int preferred_cpu, size_t inl)
942 {
943 uct_ib_device_t *dev = uct_ib_iface_device(iface);
944 struct ibv_cq *cq;
945 #if HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN
946 struct ibv_cq_init_attr_ex cq_attr = {};
947
948 cq_attr.cqe = init_attr->cq_len[dir];
949 cq_attr.channel = iface->comp_channel;
950 cq_attr.comp_vector = preferred_cpu;
951 if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) {
952 cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS;
953 cq_attr.flags = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN;
954 }
955
956 cq = ibv_cq_ex_to_cq(ibv_create_cq_ex(dev->ibv_context, &cq_attr));
957 if (!cq && (errno == ENOSYS))
958 #endif
959 {
960 iface->config.max_inl_cqe[dir] = 0;
961 cq = ibv_create_cq(dev->ibv_context, init_attr->cq_len[dir], NULL,
962 iface->comp_channel, preferred_cpu);
963 }
964
965 if (!cq) {
966 ucs_error("ibv_create_cq(cqe=%d) failed: %m", init_attr->cq_len[dir]);
967 return UCS_ERR_IO_ERROR;
968 }
969
970 iface->cq[dir] = cq;
971 iface->config.max_inl_cqe[dir] = inl;
972 return UCS_OK;
973 }
974
975 static ucs_status_t
uct_ib_iface_create_cq(uct_ib_iface_t * iface,uct_ib_dir_t dir,const uct_ib_iface_init_attr_t * init_attr,const uct_ib_iface_config_t * config,int preferred_cpu)976 uct_ib_iface_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
977 const uct_ib_iface_init_attr_t *init_attr,
978 const uct_ib_iface_config_t *config,
979 int preferred_cpu)
980 {
981 ucs_status_t status;
982 size_t inl = config->inl[dir];
983 #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
984 uct_ib_device_t *dev = uct_ib_iface_device(iface);
985 static const char *cqe_size_env_var = "MLX5_CQE_SIZE";
986 size_t cqe_size = 64;
987 int env_var_added = 0;
988 const char *cqe_size_env_value;
989 size_t cqe_size_min;
990 char cqe_size_buf[32];
991 int ret;
992
993 cqe_size_min = (inl > 32) ? 128 : 64;
994 cqe_size_env_value = getenv(cqe_size_env_var);
995
996 if (cqe_size_env_value != NULL) {
997 cqe_size = atol(cqe_size_env_value);
998 if (cqe_size < cqe_size_min) {
999 ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)",
1000 cqe_size_env_var, cqe_size, cqe_size_min, inl);
1001 return UCS_ERR_INVALID_PARAM;
1002 }
1003 } else {
1004 cqe_size = uct_ib_get_cqe_size(cqe_size_min);
1005 snprintf(cqe_size_buf, sizeof(cqe_size_buf),"%zu", cqe_size);
1006 ucs_debug("%s: setting %s=%s", uct_ib_device_name(dev), cqe_size_env_var,
1007 cqe_size_buf);
1008 ret = ibv_exp_setenv(dev->ibv_context, cqe_size_env_var, cqe_size_buf, 1);
1009 if (ret) {
1010 ucs_error("ibv_exp_setenv(%s=%s) failed: %m", cqe_size_env_var,
1011 cqe_size_buf);
1012 return UCS_ERR_INVALID_PARAM;
1013 }
1014
1015 env_var_added = 1;
1016 }
1017 #endif
1018 status = iface->ops->create_cq(iface, dir, init_attr, preferred_cpu, inl);
1019 if (status != UCS_OK) {
1020 goto out_unsetenv;
1021 }
1022
1023 status = UCS_OK;
1024
1025 out_unsetenv:
1026 #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
1027 iface->config.max_inl_cqe[dir] = cqe_size / 2;
1028 if (env_var_added) {
1029 /* if we created a new environment variable, remove it */
1030 ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var);
1031 if (ret) {
1032 ucs_warn("unsetenv(%s) failed: %m", cqe_size_env_var);
1033 }
1034 }
1035 #endif
1036 return status;
1037 }
1038
1039
/* Configure CQ interrupt moderation (event coalescing): raise a CQ event only
 * after 'count' completions or 'period_usec' microseconds. Compiles to a no-op
 * returning UCS_OK when the experimental verbs API is unavailable. */
static ucs_status_t uct_ib_iface_set_moderation(struct ibv_cq *cq,
                                                unsigned count, double period_usec)
{
#if HAVE_DECL_IBV_EXP_CQ_MODERATION
    unsigned period = (unsigned)(period_usec * UCS_USEC_PER_SEC);

    if (count > UINT16_MAX) {
        ucs_error("CQ moderation count is too high: %u, max value: %u", count, UINT16_MAX);
        return UCS_ERR_INVALID_PARAM;
    } else if (count == 0) {
        /* in case if count value is 0 (unchanged default value) - set it to maximum
         * possible value */
        count = UINT16_MAX;
    }

    if (period > UINT16_MAX) {
        ucs_error("CQ moderation period is too high: %u, max value: %uus", period, UINT16_MAX);
        return UCS_ERR_INVALID_PARAM;
    } else if (period == 0) {
        /* in case if period value is 0 (unchanged default value) - set it to maximum
         * possible value, the same behavior as counter */
        period = UINT16_MAX;
    }

    /* both values at UINT16_MAX means defaults are unchanged - skip the call */
    if ((count < UINT16_MAX) || (period < UINT16_MAX)) {
        struct ibv_exp_cq_attr cq_attr = {
            .comp_mask            = IBV_EXP_CQ_ATTR_MODERATION,
            .moderation.cq_count  = (uint16_t)(count),
            .moderation.cq_period = (uint16_t)(period),
            .cq_cap_flags         = 0
        };
        if (ibv_exp_modify_cq(cq, &cq_attr, IBV_EXP_CQ_MODERATION)) {
            ucs_error("ibv_exp_modify_cq(count=%d, period=%d) failed: %m", count, period);
            return UCS_ERR_IO_ERROR;
        }
    }
#endif /* HAVE_DECL_IBV_EXP_CQ_MODERATION */

    return UCS_OK;
}
1080
/* Decide how many network paths the interface exposes: either the value from
 * the user configuration, or an automatic choice based on the fabric type. */
static void uct_ib_iface_set_num_paths(uct_ib_iface_t *iface,
                                       const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    if (config->num_paths != UCS_ULUNITS_AUTO) {
        /* explicit user setting wins */
        iface->num_paths = config->num_paths;
        return;
    }

    if (uct_ib_iface_is_roce(iface)) {
        /* RoCE - number of paths is RoCE LAG level */
        iface->num_paths = uct_ib_device_get_roce_lag_level(dev,
                                                            iface->config.port_num);
    } else {
        /* IB - number of paths is LMC level */
        ucs_assert(iface->path_bits_count > 0);
        iface->num_paths = iface->path_bits_count;
    }
}
1100
/* Return nonzero iff the interface runs RoCE version 2.
 * The 'dev' parameter is unused but kept for interface compatibility. */
int uct_ib_iface_is_roce_v2(uct_ib_iface_t *iface, uct_ib_device_t *dev)
{
    if (!uct_ib_iface_is_roce(iface)) {
        return 0;
    }

    return iface->gid_info.roce_info.ver == UCT_IB_DEVICE_ROCE_V2;
}
1106
uct_ib_iface_init_roce_gid_info(uct_ib_iface_t * iface,size_t md_config_index)1107 ucs_status_t uct_ib_iface_init_roce_gid_info(uct_ib_iface_t *iface,
1108 size_t md_config_index)
1109 {
1110 uct_ib_device_t *dev = uct_ib_iface_device(iface);
1111 uint8_t port_num = iface->config.port_num;
1112
1113 ucs_assert(uct_ib_iface_is_roce(iface));
1114
1115 if (md_config_index == UCS_ULUNITS_AUTO) {
1116 return uct_ib_device_select_gid(dev, port_num, &iface->gid_info);
1117 }
1118
1119 return uct_ib_device_query_gid_info(dev->ibv_context, uct_ib_device_name(dev),
1120 port_num, md_config_index,
1121 &iface->gid_info);
1122 }
1123
uct_ib_iface_init_gid_info(uct_ib_iface_t * iface,size_t md_config_index)1124 static ucs_status_t uct_ib_iface_init_gid_info(uct_ib_iface_t *iface,
1125 size_t md_config_index)
1126 {
1127 uct_ib_device_gid_info_t *gid_info = &iface->gid_info;
1128 ucs_status_t status;
1129
1130 /* Fill the gid index and the RoCE version */
1131 if (uct_ib_iface_is_roce(iface)) {
1132 status = uct_ib_iface_init_roce_gid_info(iface, md_config_index);
1133 if (status != UCS_OK) {
1134 goto out;
1135 }
1136 } else {
1137 gid_info->gid_index = (md_config_index ==
1138 UCS_ULUNITS_AUTO) ?
1139 UCT_IB_MD_DEFAULT_GID_INDEX :
1140 md_config_index;
1141 gid_info->roce_info.ver = UCT_IB_DEVICE_ROCE_ANY;
1142 gid_info->roce_info.addr_family = 0;
1143 }
1144
1145 /* Fill the gid */
1146 status = uct_ib_device_query_gid(uct_ib_iface_device(iface),
1147 iface->config.port_num,
1148 gid_info->gid_index, &gid_info->gid);
1149 if (status != UCS_OK) {
1150 goto out;
1151 }
1152
1153 out:
1154 return status;
1155 }
1156
/* Choose the path MTU for the interface: the user-configured value if set,
 * otherwise the port's active MTU, capped at 2048 on certain devices. */
static void uct_ib_iface_set_path_mtu(uct_ib_iface_t *iface,
                                      const uct_ib_iface_config_t *config)
{
    enum ibv_mtu port_mtu = uct_ib_iface_port_attr(iface)->active_mtu;
    uct_ib_device_t *dev  = uct_ib_iface_device(iface);
    int cap_to_2048;

    if (config->path_mtu != UCT_IB_MTU_DEFAULT) {
        /* MTU is set by user configuration; cast from uct_ib_mtu_t to ibv_mtu */
        iface->config.path_mtu = (enum ibv_mtu)(config->path_mtu +
                                                (IBV_MTU_512 - UCT_IB_MTU_512));
        return;
    }

    /* On some devices optimal path_mtu is 2048 */
    cap_to_2048 = (port_mtu > IBV_MTU_2048) &&
                  (IBV_DEV_ATTR(dev, vendor_id) == 0x02c9) &&
                  ((IBV_DEV_ATTR(dev, vendor_part_id) == 4099) ||
                   (IBV_DEV_ATTR(dev, vendor_part_id) == 4100) ||
                   (IBV_DEV_ATTR(dev, vendor_part_id) == 4103) ||
                   (IBV_DEV_ATTR(dev, vendor_part_id) == 4104));

    iface->config.path_mtu = cap_to_2048 ? IBV_MTU_2048 : port_mtu;
}
1180
/**
 * uct_ib_iface_t constructor: common initialization shared by all IB
 * transports. Resolves the device port, computes the receive buffer layout,
 * initializes pkey/GID/LMC/path data, and creates the completion channel and
 * the send/receive CQs (with optional interrupt moderation).
 *
 * On error, resources created so far are released via the goto-cleanup
 * ladder at the bottom.
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    const uct_ib_iface_config_t *config,
                    const uct_ib_iface_init_attr_t *init_attr)
{
    uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t);
    uct_ib_device_t *dev = &ib_md->dev;
    size_t rx_headroom = (params->field_mask &
                          UCT_IFACE_PARAM_FIELD_RX_HEADROOM) ?
                         params->rx_headroom : 0;
    ucs_cpu_set_t cpu_mask;
    int preferred_cpu;
    ucs_status_t status;
    uint8_t port_num;

    if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) {
        return UCS_ERR_UNSUPPORTED;
    }

    if (params->field_mask & UCT_IFACE_PARAM_FIELD_CPU_MASK) {
        cpu_mask = params->cpu_mask;
    } else {
        /* no affinity requested - use an empty mask */
        memset(&cpu_mask, 0, sizeof(cpu_mask));
    }

    preferred_cpu = ucs_cpu_set_find_lcs(&cpu_mask);

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              params, &config->super
                              UCS_STATS_ARG(((params->field_mask &
                                              UCT_IFACE_PARAM_FIELD_STATS_ROOT) &&
                                             (params->stats_root != NULL)) ?
                                            params->stats_root :
                                            dev->stats)
                              UCS_STATS_ARG(params->mode.device.dev_name));

    status = uct_ib_device_find_port(dev, params->mode.device.dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops = ops;

    /* Receive buffer layout: descriptor header, then the larger of
     * (uct_recv_desc_t + user headroom) and (transport private area + packet
     * header), then the payload. */
    self->config.rx_payload_offset = sizeof(uct_ib_iface_recv_desc_t) +
                                     ucs_max(sizeof(uct_recv_desc_t) +
                                             rx_headroom,
                                             init_attr->rx_priv_len +
                                             init_attr->rx_hdr_len);
    self->config.rx_hdr_offset = self->config.rx_payload_offset -
                                 init_attr->rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset -
                                      rx_headroom;
    self->config.seg_size = init_attr->seg_size;
    self->config.roce_path_factor = config->roce_path_factor;
    self->config.tx_max_poll = config->tx.max_poll;
    self->config.rx_max_poll = config->rx.max_poll;
    /* cap the RX batch at a quarter of the RX queue */
    self->config.rx_max_batch = ucs_min(config->rx.max_batch,
                                        config->rx.queue_len / 4);
    self->config.port_num = port_num;
    self->config.sl = config->sl;
    self->config.hop_limit = config->hop_limit;
    self->release_desc.cb = uct_ib_iface_release_desc;
    self->config.enable_res_domain = config->enable_res_domain;
    self->config.enable_cuda_affinity = config->enable_cuda_affinity;
    self->config.qp_type = init_attr->qp_type;
    uct_ib_iface_set_path_mtu(self, config);

    if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) {
        ucs_error("IB transports do not support multi-threaded worker");
        /* direct return is equivalent to 'goto err' here - nothing has been
         * allocated by this function yet */
        return UCS_ERR_INVALID_PARAM;
    }

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_gid_info(self, ib_md->config.gid_index);
    if (status != UCS_OK) {
        goto err;
    }

    if (config->traffic_class == UCS_ULUNITS_AUTO) {
        /* RoCEv2 defaults to the recommended DSCP value, otherwise 0 */
        self->config.traffic_class = uct_ib_iface_is_roce_v2(self, dev) ?
                                     UCT_IB_DEFAULT_ROCEV2_DSCP : 0;
    } else {
        self->config.traffic_class = config->traffic_class;
    }

    /* allocates self->path_bits - freed in err_cleanup / class cleanup */
    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    uct_ib_iface_set_num_paths(self, config);

    /* completion channel delivering CQ events for both directions */
    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_cleanup;
    }

    /* make the event fd non-blocking so event polling never stalls */
    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, UCT_IB_DIR_TX, init_attr,
                                    config, preferred_cpu);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_TX],
                                         config->tx.cq_moderation_count,
                                         config->tx.cq_moderation_period);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    status = uct_ib_iface_create_cq(self, UCT_IB_DIR_RX, init_attr,
                                    config, preferred_cpu);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    status = uct_ib_iface_set_moderation(self->cq[UCT_IB_DIR_RX],
                                         config->rx.cq_moderation_count,
                                         config->rx.cq_moderation_period);
    if (status != UCS_OK) {
        goto err_destroy_recv_cq;
    }

    /* Address scope and size */
    if (uct_ib_iface_is_roce(self) || config->is_global ||
        uct_ib_grh_required(uct_ib_iface_port_attr(self)) ||
        /* check ADDR_TYPE for backward compatibility */
        (config->addr_type == UCT_IB_ADDRESS_TYPE_SITE_LOCAL) ||
        (config->addr_type == UCT_IB_ADDRESS_TYPE_GLOBAL)) {
        self->config.force_global_addr = 1;
    } else {
        self->config.force_global_addr = 0;
    }

    self->addr_size = uct_ib_iface_address_size(self);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);

    return UCS_OK;

err_destroy_recv_cq:
    ibv_destroy_cq(self->cq[UCT_IB_DIR_RX]);
err_destroy_send_cq:
    ibv_destroy_cq(self->cq[UCT_IB_DIR_TX]);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_cleanup:
    ucs_free(self->path_bits);
err:
    return status;
}
1345
UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t)1346 static UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t)
1347 {
1348 int ret;
1349
1350 ret = ibv_destroy_cq(self->cq[UCT_IB_DIR_RX]);
1351 if (ret != 0) {
1352 ucs_warn("ibv_destroy_cq(recv_cq) returned %d: %m", ret);
1353 }
1354
1355 ret = ibv_destroy_cq(self->cq[UCT_IB_DIR_TX]);
1356 if (ret != 0) {
1357 ucs_warn("ibv_destroy_cq(send_cq) returned %d: %m", ret);
1358 }
1359
1360 ret = ibv_destroy_comp_channel(self->comp_channel);
1361 if (ret != 0) {
1362 ucs_warn("ibv_destroy_comp_channel(comp_channel) returned %d: %m", ret);
1363 }
1364
1365 ucs_free(self->path_bits);
1366 }
1367
/* Register uct_ib_iface_t in the UCS class framework, derived from uct_base_iface_t */
UCS_CLASS_DEFINE(uct_ib_iface_t, uct_base_iface_t);
1369
/* Take up to 'n' receive descriptors from the memory pool and chain them into
 * a linked list of receive work requests in 'wrs'. Stops early if the pool
 * runs out of descriptors. Returns the number of WRs actually prepared. */
int uct_ib_iface_prepare_rx_wrs(uct_ib_iface_t *iface, ucs_mpool_t *mp,
                                uct_ib_recv_wr_t *wrs, unsigned n)
{
    uct_ib_iface_recv_desc_t *desc;
    unsigned i;

    for (i = 0; i < n; ++i) {
        UCT_TL_IFACE_GET_RX_DESC(&iface->super, mp, desc, break);
        wrs[i].sg.addr      = (uintptr_t)uct_ib_iface_recv_desc_hdr(iface, desc);
        wrs[i].sg.length    = iface->config.rx_payload_offset +
                              iface->config.seg_size;
        wrs[i].sg.lkey      = desc->lkey;
        wrs[i].ibwr.num_sge = 1;
        wrs[i].ibwr.wr_id   = (uintptr_t)desc;
        wrs[i].ibwr.sg_list = &wrs[i].sg;
        wrs[i].ibwr.next    = &wrs[i + 1].ibwr;
    }

    /* terminate the WR chain */
    if (i > 0) {
        wrs[i - 1].ibwr.next = NULL;
    }

    return i;
}
1395
/* Estimate the extra latency (in seconds) incurred by the NUMA distance
 * between the IB device and the CPUs the current process may run on.
 * Returns 0 latency when nearest-device preference is disabled. */
static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface,
                                                  double *latency)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    uct_ib_md_t *md = uct_ib_iface_md(iface);
    ucs_sys_cpuset_t temp_cpu_mask, process_affinity;
#if HAVE_NUMA
    int distance, min_cpu_distance;
    int cpu, num_cpus;
#endif
    int ret;

    if (!md->config.prefer_nearest_device) {
        *latency = 0;
        return UCS_OK;
    }

    ret = ucs_sys_getaffinity(&process_affinity);
    if (ret) {
        ucs_error("sched_getaffinity() failed: %m");
        return UCS_ERR_INVALID_PARAM;
    }

#if HAVE_NUMA
    /* Try to estimate the extra device latency according to NUMA distance */
    if (dev->numa_node != -1) {
        min_cpu_distance = INT_MAX;
        /* find the smallest NUMA distance from any CPU in the process
         * affinity mask to the device's NUMA node */
        num_cpus = ucs_min(CPU_SETSIZE, numa_num_configured_cpus());
        for (cpu = 0; cpu < num_cpus; ++cpu) {
            if (!CPU_ISSET(cpu, &process_affinity)) {
                continue;
            }
            distance = numa_distance(ucs_numa_node_of_cpu(cpu), dev->numa_node);
            if (distance >= UCS_NUMA_MIN_DISTANCE) {
                min_cpu_distance = ucs_min(min_cpu_distance, distance);
            }
        }

        if (min_cpu_distance != INT_MAX) {
            /* set the extra latency to (numa_distance - 10) * 20nsec */
            *latency = (min_cpu_distance - UCS_NUMA_MIN_DISTANCE) * 20e-9;
            return UCS_OK;
        }
    }
#endif

    /* Estimate the extra device latency according to its local CPUs mask:
     * no penalty if the process is confined to CPUs local to the device,
     * otherwise assume a fixed 200nsec penalty */
    CPU_AND(&temp_cpu_mask, &dev->local_cpus, &process_affinity);
    if (CPU_EQUAL(&process_affinity, &temp_cpu_mask)) {
        *latency = 0;
    } else {
        *latency = 200e-9;
    }
    return UCS_OK;
}
1451
uct_ib_iface_get_cuda_latency(uct_ib_iface_t * iface,double * latency)1452 static ucs_status_t uct_ib_iface_get_cuda_latency(uct_ib_iface_t *iface,
1453 double *latency)
1454 {
1455 ucs_sys_dev_distance_t dist = {0.0, 0.0};
1456 uct_ib_device_t *dev = uct_ib_iface_device(iface);
1457 ucs_sys_device_t ib_sys_device;
1458 ucs_sys_device_t cuda_sys_device;
1459 ucs_sys_bus_id_t ib_bus_id;
1460 ucs_sys_bus_id_t cuda_bus_id;
1461 ucs_status_t status;
1462
1463 status = ucm_get_mem_type_current_device_info(UCS_MEMORY_TYPE_CUDA,
1464 &cuda_bus_id);
1465 if (status != UCS_OK) {
1466 *latency = 0.0;
1467 return UCS_OK;
1468 }
1469
1470 status = ucs_topo_find_device_by_bus_id(&cuda_bus_id, &cuda_sys_device);
1471 if (status != UCS_OK) {
1472 return status;
1473 }
1474
1475 status = uct_ib_device_bus(dev, iface->config.port_num, &ib_bus_id);
1476 if (status != UCS_OK) {
1477 return status;
1478 }
1479
1480 status = ucs_topo_find_device_by_bus_id(&ib_bus_id, &ib_sys_device);
1481 if (status != UCS_OK) {
1482 return status;
1483 }
1484
1485 status = ucs_topo_get_distance(ib_sys_device, cuda_sys_device, &dist);
1486 if (status != UCS_OK) {
1487 return status;
1488 }
1489
1490 *latency = dist.latency;
1491
1492 return UCS_OK;
1493 }
1494
uct_ib_iface_query(uct_ib_iface_t * iface,size_t xport_hdr_len,uct_iface_attr_t * iface_attr)1495 ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
1496 uct_iface_attr_t *iface_attr)
1497 {
1498 uct_ib_device_t *dev = uct_ib_iface_device(iface);
1499 uct_ib_md_t *md = uct_ib_iface_md(iface);
1500 static const unsigned ib_port_widths[] = {
1501 [0] = 1,
1502 [1] = 4,
1503 [2] = 8,
1504 [3] = 12,
1505 [4] = 16
1506 };
1507 uint8_t active_width, active_speed, active_mtu, width_idx;
1508 double encoding, signal_rate, wire_speed;
1509 size_t mtu, width, extra_pkt_len;
1510 ucs_status_t status;
1511 double numa_latency;
1512 double cuda_latency;
1513
1514 uct_base_iface_query(&iface->super, iface_attr);
1515
1516 active_width = uct_ib_iface_port_attr(iface)->active_width;
1517 active_speed = uct_ib_iface_port_attr(iface)->active_speed;
1518 active_mtu = uct_ib_iface_port_attr(iface)->active_mtu;
1519
1520 /* Get active width */
1521 width_idx = ucs_ilog2(active_width);
1522 if (!ucs_is_pow2(active_width) ||
1523 (active_width < 1) || (width_idx > 4))
1524 {
1525 ucs_error("Invalid active_width on %s:%d: %d",
1526 UCT_IB_IFACE_ARG(iface), active_width);
1527 return UCS_ERR_IO_ERROR;
1528 }
1529
1530 iface_attr->device_addr_len = iface->addr_size;
1531 iface_attr->dev_num_paths = iface->num_paths;
1532
1533 switch (active_speed) {
1534 case 1: /* SDR */
1535 iface_attr->latency.c = 5000e-9;
1536 signal_rate = 2.5e9;
1537 encoding = 8.0/10.0;
1538 break;
1539 case 2: /* DDR */
1540 iface_attr->latency.c = 2500e-9;
1541 signal_rate = 5.0e9;
1542 encoding = 8.0/10.0;
1543 break;
1544 case 4:
1545 iface_attr->latency.c = 1300e-9;
1546 if (uct_ib_iface_is_roce(iface)) {
1547 /* 10/40g Eth */
1548 signal_rate = 10.3125e9;
1549 encoding = 64.0/66.0;
1550 } else {
1551 /* QDR */
1552 signal_rate = 10.0e9;
1553 encoding = 8.0/10.0;
1554 }
1555 break;
1556 case 8: /* FDR10 */
1557 iface_attr->latency.c = 700e-9;
1558 signal_rate = 10.3125e9;
1559 encoding = 64.0/66.0;
1560 break;
1561 case 16: /* FDR */
1562 iface_attr->latency.c = 700e-9;
1563 signal_rate = 14.0625e9;
1564 encoding = 64.0/66.0;
1565 break;
1566 case 32: /* EDR / 100g Eth */
1567 iface_attr->latency.c = 600e-9;
1568 signal_rate = 25.78125e9;
1569 encoding = 64.0/66.0;
1570 break;
1571 case 64: /* 50g Eth */
1572 iface_attr->latency.c = 600e-9;
1573 signal_rate = 25.78125e9 * 2;
1574 encoding = 64.0/66.0;
1575 break;
1576 default:
1577 ucs_error("Invalid active_speed on %s:%d: %d",
1578 UCT_IB_IFACE_ARG(iface), active_speed);
1579 return UCS_ERR_IO_ERROR;
1580 }
1581
1582 status = uct_ib_iface_get_numa_latency(iface, &numa_latency);
1583 if (status != UCS_OK) {
1584 return status;
1585 }
1586
1587 iface_attr->latency.c += numa_latency;
1588 iface_attr->latency.m = 0;
1589
1590 if (iface->config.enable_cuda_affinity != UCS_NO) {
1591 status = uct_ib_iface_get_cuda_latency(iface, &cuda_latency);
1592 if (status != UCS_OK) {
1593 return status;
1594 }
1595
1596 iface_attr->latency.c += cuda_latency;
1597 iface_attr->latency.m = 0;
1598 }
1599
1600 /* Wire speed calculation: Width * SignalRate * Encoding */
1601 width = ib_port_widths[width_idx];
1602 wire_speed = (width * signal_rate * encoding) / 8.0;
1603
1604 /* Calculate packet overhead */
1605 mtu = ucs_min(uct_ib_mtu_value((enum ibv_mtu)active_mtu),
1606 iface->config.seg_size);
1607
1608 extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len + UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;
1609
1610 if (uct_ib_iface_is_roce(iface)) {
1611 extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN;
1612 iface_attr->latency.c += 200e-9;
1613 } else {
1614 /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */
1615 extra_pkt_len += UCT_IB_LRH_LEN;
1616 }
1617
1618 iface_attr->bandwidth.shared = ucs_min((wire_speed * mtu) / (mtu + extra_pkt_len), md->pci_bw);
1619 iface_attr->bandwidth.dedicated = 0;
1620 iface_attr->priority = uct_ib_device_spec(dev)->priority;
1621
1622 return UCS_OK;
1623 }
1624
uct_ib_iface_event_fd_get(uct_iface_h tl_iface,int * fd_p)1625 ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p)
1626 {
1627 uct_ib_iface_t *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
1628 *fd_p = iface->comp_channel->fd;
1629 return UCS_OK;
1630 }
1631
uct_ib_iface_pre_arm(uct_ib_iface_t * iface)1632 ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface)
1633 {
1634 int res, send_cq_count, recv_cq_count;
1635 struct ibv_cq *cq;
1636 void *cq_context;
1637
1638 send_cq_count = 0;
1639 recv_cq_count = 0;
1640 do {
1641 res = ibv_get_cq_event(iface->comp_channel, &cq, &cq_context);
1642 if (0 == res) {
1643 if (iface->cq[UCT_IB_DIR_TX] == cq) {
1644 iface->ops->event_cq(iface, UCT_IB_DIR_TX);
1645 ++send_cq_count;
1646 }
1647 if (iface->cq[UCT_IB_DIR_RX] == cq) {
1648 iface->ops->event_cq(iface, UCT_IB_DIR_RX);
1649 ++recv_cq_count;
1650 }
1651 }
1652 } while (res == 0);
1653
1654 if (errno != EAGAIN) {
1655 return UCS_ERR_IO_ERROR;
1656 }
1657
1658 if (send_cq_count > 0) {
1659 ibv_ack_cq_events(iface->cq[UCT_IB_DIR_TX], send_cq_count);
1660 }
1661
1662 if (recv_cq_count > 0) {
1663 ibv_ack_cq_events(iface->cq[UCT_IB_DIR_RX], recv_cq_count);
1664 }
1665
1666 /* avoid re-arming the interface if any events exists */
1667 if ((send_cq_count > 0) || (recv_cq_count > 0)) {
1668 ucs_trace("arm_cq: got %d send and %d recv events, returning BUSY",
1669 send_cq_count, recv_cq_count);
1670 return UCS_ERR_BUSY;
1671 }
1672
1673 return UCS_OK;
1674 }
1675
uct_ib_iface_arm_cq(uct_ib_iface_t * iface,uct_ib_dir_t dir,int solicited_only)1676 ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
1677 uct_ib_dir_t dir,
1678 int solicited_only)
1679 {
1680 int ret;
1681
1682 ret = ibv_req_notify_cq(iface->cq[dir], solicited_only);
1683 if (ret != 0) {
1684 ucs_error("ibv_req_notify_cq("UCT_IB_IFACE_FMT", %d, sol=%d) failed: %m",
1685 UCT_IB_IFACE_ARG(iface), dir, solicited_only);
1686 return UCS_ERR_IO_ERROR;
1687 }
1688 return UCS_OK;
1689 }
1690