1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6
7 #ifndef UCT_IB_IFACE_H
8 #define UCT_IB_IFACE_H
9
10 #include "ib_md.h"
11
12 #include <uct/api/uct.h>
13 #include <uct/base/uct_iface.h>
14 #include <uct/base/uct_iov.inl>
15 #include <ucs/sys/compiler.h>
16 #include <ucs/sys/string.h>
17 #include <ucs/sys/math.h>
18 #include <ucs/datastruct/mpool.inl>
19
/* Maximum number of scatter-gather entries accepted per zero-copy operation */
#define UCT_IB_MAX_IOV 8UL
/* Key meaning "no resource domain is associated" */
#define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u
/* Largest operand size supported by IB atomic operations */
#define UCT_IB_MAX_ATOMIC_SIZE sizeof(uint64_t)
/* Sentinel values used inside packed IB addresses for fields that were not
 * packed (see uct_ib_address_pack/unpack) */
#define UCT_IB_ADDRESS_INVALID_GID_INDEX UINT8_MAX
#define UCT_IB_ADDRESS_INVALID_PATH_MTU 0
#define UCT_IB_ADDRESS_INVALID_PKEY 0
/* PKEY used when the address does not carry one (IB default full-membership
 * partition key) */
#define UCT_IB_ADDRESS_DEFAULT_PKEY 0xffff
27
28 /* Forward declarations */
29 typedef struct uct_ib_iface_config uct_ib_iface_config_t;
30 typedef struct uct_ib_iface_ops uct_ib_iface_ops_t;
31 typedef struct uct_ib_iface uct_ib_iface_t;
32
33
34 /**
35 * IB port/path MTU.
36 */
/**
 * IB port/path MTU.
 * UCT_IB_MTU_DEFAULT selects the device/port default; the explicit values
 * name the supported path MTU sizes in bytes.
 */
typedef enum uct_ib_mtu {
    UCT_IB_MTU_DEFAULT = 0,
    UCT_IB_MTU_512 = 1,
    UCT_IB_MTU_1024 = 2,
    UCT_IB_MTU_2048 = 3,
    UCT_IB_MTU_4096 = 4,
    UCT_IB_MTU_LAST      /* number of valid MTU values */
} uct_ib_mtu_t;
45
46
47 /**
48 * Traffic direction.
49 */
/**
 * Traffic direction. Used as an index into per-direction arrays such as
 * cq[UCT_IB_DIR_NUM] and max_inl_cqe[UCT_IB_DIR_NUM].
 */
typedef enum {
    UCT_IB_DIR_RX,   /* receive direction */
    UCT_IB_DIR_TX,   /* transmit direction */
    UCT_IB_DIR_NUM   /* number of directions - for array sizing */
} uct_ib_dir_t;
55
/*
 * QP type constant for DC initiator (DCI) QPs. The underlying verbs value
 * depends on which DC API the verbs library provides; when neither is
 * available only UCT_IB_QPT_UNKNOWN is defined.
 */
enum {
    UCT_IB_QPT_UNKNOWN,
#ifdef HAVE_DC_EXP
    /* Experimental (Mellanox OFED) verbs expose an explicit DC-INI QP type */
    UCT_IB_QPT_DCI = IBV_EXP_QPT_DC_INI,
#elif HAVE_DC_DV
    /* Upstream rdma-core exposes DC via the driver-specific QP type */
    UCT_IB_QPT_DCI = IBV_QPT_DRIVER,
#endif
};
64
65
66 /**
67 * IB address packing flags
68 */
69 enum {
70 UCT_IB_ADDRESS_PACK_FLAG_ETH = UCS_BIT(0),
71 UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID = UCS_BIT(1),
72 UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX = UCS_BIT(2),
73 UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU = UCS_BIT(3),
74 UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX = UCS_BIT(4),
75 UCT_IB_ADDRESS_PACK_FLAG_PKEY = UCS_BIT(5)
76 };
77
78
/**
 * Parameters for packing/unpacking an IB address. @a flags states which of
 * the optional fields below are valid.
 */
typedef struct uct_ib_address_pack_params {
    /* Packing flags, UCT_IB_ADDRESS_PACK_FLAG_xx. */
    uint64_t flags;
    /* GID address to pack/unpack. */
    union ibv_gid gid;
    /* LID address to pack/unpack. */
    uint16_t lid;
    /* RoCE version to pack/unpack in case of an Ethernet link layer,
       must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_ETH is set. */
    uct_ib_roce_version_info_t roce_info;
    /* path MTU size as defined in enum ibv_mtu,
       must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU is set. */
    enum ibv_mtu path_mtu;
    /* GID index,
       must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX is set. */
    uint8_t gid_index;
    /* PKEY value,
       must be valid if @ref UCT_IB_ADDRESS_PACK_FLAG_PKEY is set. */
    uint16_t pkey;
} uct_ib_address_pack_params_t;
99
100
/**
 * Configuration options common to all IB transport interfaces
 * (parsed via uct_ib_iface_config_table).
 */
struct uct_ib_iface_config {
    uct_iface_config_t super;

    size_t seg_size; /* Maximal size of copy-out sends */

    /* Transmit-side settings */
    struct {
        unsigned queue_len; /* Queue length */
        unsigned max_batch; /* How many fragments can be batched to one post send */
        unsigned max_poll; /* How many wcs can be picked when polling tx cq */
        size_t min_inline; /* Inline space to reserve for sends */
        unsigned min_sge; /* How many SG entries to support */
        uct_iface_mpool_config_t mp; /* Send descriptor memory pool settings */

        /* Event moderation parameters */
        unsigned cq_moderation_count;
        double cq_moderation_period;
    } tx;

    /* Receive-side settings */
    struct {
        unsigned queue_len; /* Queue length */
        unsigned max_batch; /* How many buffers can be batched to one post receive */
        unsigned max_poll; /* How many wcs can be picked when polling rx cq */
        uct_iface_mpool_config_t mp; /* Receive descriptor memory pool settings */

        /* Event moderation parameters */
        unsigned cq_moderation_count;
        double cq_moderation_period;
    } rx;

    /* Inline space to reserve in CQ */
    size_t inl[UCT_IB_DIR_NUM];

    /* Change the address type */
    int addr_type;

    /* Force global routing */
    int is_global;

    /* IB SL to use */
    unsigned sl;

    /* IB Traffic Class to use */
    unsigned long traffic_class;

    /* IB hop limit / TTL */
    unsigned hop_limit;

    /* Number of paths to expose for the interface */
    unsigned long num_paths;

    /* Multiplier for RoCE LAG UDP source port calculation */
    unsigned roce_path_factor;

    /* Ranges of path bits */
    UCS_CONFIG_ARRAY_FIELD(ucs_range_spec_t, ranges) lid_path_bits;

    /* IB PKEY to use */
    unsigned pkey;

    /* Multiple resource domains */
    int enable_res_domain;

    /* Path MTU size */
    uct_ib_mtu_t path_mtu;

    /* Allow IB devices to be penalized based on distance from CUDA device */
    int enable_cuda_affinity;
};
169
170
/*
 * Interface creation flags, passed through uct_ib_iface_init_attr_t::flags.
 */
enum {
    UCT_IB_CQ_IGNORE_OVERRUN = UCS_BIT(0), /* create CQs that ignore overrun errors */
    UCT_IB_TM_SUPPORTED      = UCS_BIT(1)  /* HW tag matching is supported */
};
175
176
/**
 * Transport-supplied parameters for initializing the base IB interface.
 */
typedef struct uct_ib_iface_init_attr {
    unsigned    rx_priv_len;             /* Length of transport private data to reserve */
    unsigned    rx_hdr_len;              /* Length of transport network header */
    unsigned    cq_len[UCT_IB_DIR_NUM];  /* CQ length */
    size_t      seg_size;                /* Transport segment size */
    unsigned    fc_req_size;             /* Flow control request size */
    int         qp_type;                 /* IB QP type */
    int         flags;                   /* Various flags (see enum) */
} uct_ib_iface_init_attr_t;
186
187
/**
 * Parameters used when creating an IB QP (see uct_ib_iface_create_qp and
 * uct_ib_iface_fill_attr).
 */
typedef struct uct_ib_qp_attr {
    int qp_type;                           /* IB QP type */
    struct ibv_qp_cap cap;                 /* requested QP capabilities */
    int port;                              /* IB port number */
    struct ibv_srq *srq;                   /* shared receive queue, if used */
    uint32_t srq_num;                      /* SRQ number, if used */
    unsigned sq_sig_all;                   /* request a completion for every send WR */
    unsigned max_inl_cqe[UCT_IB_DIR_NUM];  /* requested inline data in CQEs, per direction */
    /* Underlying verbs init-attr structure; the variant depends on which
       QP-creation API the verbs library provides */
#if HAVE_DECL_IBV_EXP_CREATE_QP
    struct ibv_exp_qp_init_attr ibv;
#elif HAVE_DECL_IBV_CREATE_QP_EX
    struct ibv_qp_init_attr_ex ibv;
#else
    struct ibv_qp_init_attr ibv;
#endif
} uct_ib_qp_attr_t;
204
205
/* Create the completion queue for the given direction */
typedef ucs_status_t (*uct_ib_iface_create_cq_func_t)(uct_ib_iface_t *iface,
                                                      uct_ib_dir_t dir,
                                                      const uct_ib_iface_init_attr_t *init_attr,
                                                      int preferred_cpu,
                                                      size_t inl);

/* Arm the CQ of the given direction to generate an event, optionally only
 * for solicited completions */
typedef ucs_status_t (*uct_ib_iface_arm_cq_func_t)(uct_ib_iface_t *iface,
                                                   uct_ib_dir_t dir,
                                                   int solicited_only);

/* Hook invoked when a CQ event occurs for the given direction */
typedef void (*uct_ib_iface_event_cq_func_t)(uct_ib_iface_t *iface,
                                             uct_ib_dir_t dir);

/* Transport-specific handler for a failed work completion */
typedef void (*uct_ib_iface_handle_failure_func_t)(uct_ib_iface_t *iface, void *arg,
                                                   ucs_status_t status);

/* Move an endpoint to the failed state with the given error status */
typedef ucs_status_t (*uct_ib_iface_set_ep_failed_func_t)(uct_ib_iface_t *iface, uct_ep_h ep,
                                                          ucs_status_t status);
224
225
/**
 * IB interface virtual function table: extends the generic interface ops
 * with IB-specific CQ management and error-handling hooks.
 */
struct uct_ib_iface_ops {
    uct_iface_ops_t                    super;
    uct_ib_iface_create_cq_func_t      create_cq;      /* create a CQ */
    uct_ib_iface_arm_cq_func_t         arm_cq;         /* arm a CQ for events */
    uct_ib_iface_event_cq_func_t       event_cq;       /* CQ event notification */
    uct_ib_iface_handle_failure_func_t handle_failure; /* failed-completion handler */
    uct_ib_iface_set_ep_failed_func_t  set_ep_failed;  /* mark endpoint as failed */
};
234
235
/**
 * Base structure for all IB transport interfaces. Holds the completion
 * queues, the completion event channel, addressing state, and configuration
 * values cached from uct_ib_iface_config_t.
 */
struct uct_ib_iface {
    uct_base_iface_t super;

    struct ibv_cq *cq[UCT_IB_DIR_NUM];       /* TX/RX completion queues */
    struct ibv_comp_channel *comp_channel;   /* CQ event notification channel */
    uct_recv_desc_t release_desc;            /* release hook installed on descriptors
                                                retained by user AM callbacks
                                                (see uct_ib_iface_invoke_am_desc) */

    uint8_t *path_bits;                      /* LID path bits to use */
    unsigned path_bits_count;                /* number of entries in path_bits */
    unsigned num_paths;                      /* number of paths exposed */
    uint16_t pkey_index;                     /* index of pkey in the port pkey table */
    uint16_t pkey;                           /* partition key value */
    uint8_t addr_size;                       /* size of this iface's packed address */
    uct_ib_device_gid_info_t gid_info;       /* GID and RoCE version information */

    struct {
        unsigned rx_payload_offset; /* offset from desc to payload */
        unsigned rx_hdr_offset; /* offset from desc to network header */
        unsigned rx_headroom_offset; /* offset from desc to user headroom */
        unsigned rx_max_batch;
        unsigned rx_max_poll;
        unsigned tx_max_poll;
        unsigned seg_size;
        unsigned roce_path_factor;
        uint8_t max_inl_cqe[UCT_IB_DIR_NUM];
        uint8_t port_num;
        uint8_t sl;
        uint8_t traffic_class;
        uint8_t hop_limit;
        uint8_t enable_res_domain; /* Enable multiple resource domains */
        uint8_t enable_cuda_affinity;
        uint8_t qp_type;
        uint8_t force_global_addr;
        enum ibv_mtu path_mtu;
    } config;

    uct_ib_iface_ops_t *ops; /* IB-specific virtual function table */
};
274
275
/**
 * Fence tracking state (see uct_ib_fence_info_init).
 */
typedef struct uct_ib_fence_info {
    uint16_t fence_beat; /* 16bit is enough because if it wraps around,
                          * it means the older ops are already completed
                          * because QP size is less than 64k */
} uct_ib_fence_info_t;
281
282
283 UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_md_h, uct_worker_h,
284 const uct_iface_params_t*, const uct_ib_iface_config_t*,
285 const uct_ib_iface_init_attr_t*);
286
287 /*
288 * The offset to the payload is the maximum between user-requested headroom
289 * and transport-specific data/header. When the active message callback is invoked,
290 * it gets a pointer to the beginning of the headroom.
291 * The headroom can be either smaller (1) or larger (2) than the transport data.
292 *
293 * (1)
294 *
295 * <rx_headroom_offset>
296 * |
297 * |
298 * uct_recv_desc_t |
299 * | |
300 * | am_callback/tag_unexp_callback
301 * | |
302 * +------+------+---+-----------+---------+
303 * | LKey | ??? | D | Head Room | Payload |
304 * +------+------+---+--+--------+---------+
305 * | LKey | TL data | TL hdr | Payload |
306 * +------+-------------+--------+---------+
307 * |
308 * post_receive
309 *
310 * (2)
311 * am_callback/tag_unexp_callback
312 * |
313 * +------+---+------------------+---------+
314 * | LKey | D | Head Room | Payload |
315 * +------+---+-----+---+--------+---------+
316 * | LKey | TL data | ? | TL hdr | Payload |
317 * +------+---------+---+--------+---------+
318 * |
319 * post_receive
320 * <dsc>
321 * <--- rx_headroom -->
322 * <------- rx_payload_offset --->
323 * <--- rx_hdr_offset -->
324 *
325 */
typedef struct uct_ib_iface_recv_desc {
    uint32_t lkey; /* local memory key of the receive buffer, stored at the
                      beginning of every receive descriptor (see the layout
                      diagram above) */
} UCS_S_PACKED uct_ib_iface_recv_desc_t;
329
330
331
332 extern ucs_config_field_t uct_ib_iface_config_table[];
333 extern const char *uct_ib_mtu_values[];
334
335
336 /**
337 * Create memory pool of receive descriptors.
338 */
339 ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
340 const uct_ib_iface_config_t *config,
341 const char *name, ucs_mpool_t *mp);
342
343 void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc);
344
345
346 static UCS_F_ALWAYS_INLINE void
uct_ib_iface_invoke_am_desc(uct_ib_iface_t * iface,uint8_t am_id,void * data,unsigned length,uct_ib_iface_recv_desc_t * ib_desc)347 uct_ib_iface_invoke_am_desc(uct_ib_iface_t *iface, uint8_t am_id, void *data,
348 unsigned length, uct_ib_iface_recv_desc_t *ib_desc)
349 {
350 void *desc = (char*)ib_desc + iface->config.rx_headroom_offset;
351 ucs_status_t status;
352
353 status = uct_iface_invoke_am(&iface->super, am_id, data, length,
354 UCT_CB_PARAM_FLAG_DESC);
355 if (status == UCS_OK) {
356 ucs_mpool_put_inline(ib_desc);
357 } else {
358 uct_recv_desc(desc) = &iface->release_desc;
359 }
360 }
361
362
363 /**
364 * @return Whether the port used by this interface is RoCE
365 */
366 int uct_ib_iface_is_roce(uct_ib_iface_t *iface);
367
368
369 /**
370 * @return Whether the port used by this interface is IB
371 */
372 int uct_ib_iface_is_ib(uct_ib_iface_t *iface);
373
374
375 /**
376 * Get the expected size of IB packed address.
377 *
378 * @param [in] params Address parameters as defined in
379 * @ref uct_ib_address_pack_params_t.
380 *
381 * @return IB address size of the given link scope.
382 */
383 size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params);
384
385
386 /**
387 * @return IB address packing flags of the given iface.
388 */
389 unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface);
390
391
392 /**
393 * @return IB address size of the given iface.
394 */
395 size_t uct_ib_iface_address_size(uct_ib_iface_t *iface);
396
397
398 /**
399 * Pack IB address.
400 *
401 * @param [in] params Address parameters as defined in
402 * @ref uct_ib_address_pack_params_t.
403 * @param [in/out] ib_addr Filled with packed ib address. Size of the structure
404 * must be at least what @ref uct_ib_address_size()
405 * returns for the given scope.
406 */
407 void uct_ib_address_pack(const uct_ib_address_pack_params_t *params,
408 uct_ib_address_t *ib_addr);
409
410
411
412 /**
413 * Pack the IB address of the given iface.
414 *
415 * @param [in] iface Iface whose IB address to pack.
416 * @param [in/out] ib_addr Filled with packed ib address. Size of the structure
417 * must be at least what @ref uct_ib_address_size()
418 * returns for the given scope.
419 */
420 void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr);
421
422
423 /**
424 * Unpack IB address.
425 *
426 * @param [in] ib_addr IB address to unpack.
427 * @param [out] params_p Filled with address attributes as in
428 * @ref uct_ib_address_pack_params_t.
429 */
430 void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
431 uct_ib_address_pack_params_t *params_p);
432
433
434 /**
435 * Convert IB address to a human-readable string.
436 */
437 const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
438 size_t max);
439
/**
 * Fill @a dev_addr with the packed device address of this interface.
 */
ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
                                             uct_device_addr_t *dev_addr);

/**
 * @return Whether the remote device address is reachable from this interface.
 */
int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr,
                              const uct_iface_addr_t *iface_addr);

/*
 * Query common IB interface attributes.
 *
 * @param xport_hdr_len How many bytes this transport adds on top of IB header (LRH+BTH+iCRC+vCRC)
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr);


/**
 * @return Whether the RoCE port used by this interface runs RoCE v2.
 */
int uct_ib_iface_is_roce_v2(uct_ib_iface_t *iface, uct_ib_device_t *dev);
454
455
456 /**
457 * Select the IB gid index and RoCE version to use for a RoCE port.
458 *
459 * @param iface IB interface
460 * @param md_config_index Gid index from the md configuration.
461 */
462 ucs_status_t uct_ib_iface_init_roce_gid_info(uct_ib_iface_t *iface,
463 size_t md_config_index);
464
465
/**
 * @return The IB memory domain this interface belongs to.
 */
static inline uct_ib_md_t* uct_ib_iface_md(uct_ib_iface_t *iface)
{
    return ucs_derived_of(iface->super.md, uct_ib_md_t);
}
470
uct_ib_iface_device(uct_ib_iface_t * iface)471 static inline uct_ib_device_t* uct_ib_iface_device(uct_ib_iface_t *iface)
472 {
473 return &uct_ib_iface_md(iface)->dev;
474 }
475
/**
 * @return Attributes of the IB port used by this interface.
 */
static inline struct ibv_port_attr* uct_ib_iface_port_attr(uct_ib_iface_t *iface)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    return uct_ib_device_port_attr(dev, iface->config.port_num);
}
480
/**
 * @return Pointer to the transport network header inside a receive
 *         descriptor (desc + config.rx_hdr_offset).
 */
static inline void* uct_ib_iface_recv_desc_hdr(uct_ib_iface_t *iface,
                                               uct_ib_iface_recv_desc_t *desc)
{
    char *base = (char*)desc;

    return base + iface->config.rx_hdr_offset;
}
486
/* Receive work request bundled with its scatter-gather entry, so a batch of
 * WRs can be prepared and chained for ibv_post_recv() (presumably ibwr.sg_list
 * points at sg - set up in uct_ib_iface_prepare_rx_wrs; verify there) */
typedef struct uct_ib_recv_wr {
    struct ibv_recv_wr ibwr;  /* verbs receive work request */
    struct ibv_sge sg;        /* single SGE used by this WR */
} uct_ib_recv_wr_t;
491
492 /**
493 * prepare a list of n work requests that can be passed to
494 * ibv_post_recv()
495 *
496 * @return number of prepared wrs
497 */
498 int uct_ib_iface_prepare_rx_wrs(uct_ib_iface_t *iface, ucs_mpool_t *mp,
499 uct_ib_recv_wr_t *wrs, unsigned n);
500
501 ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
502 struct ibv_ah_attr *ah_attr,
503 struct ibv_ah **ah_p);
504
505 void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
506 const union ibv_gid *gid,
507 uint8_t gid_index,
508 unsigned path_index,
509 struct ibv_ah_attr *ah_attr);
510
511 void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
512 const uct_ib_address_t *ib_addr,
513 unsigned path_index,
514 struct ibv_ah_attr *ah_attr,
515 enum ibv_mtu *path_mtu);
516
517 ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface);
518
519 ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h iface, int *fd_p);
520
521 ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
522 uct_ib_dir_t dir,
523 int solicited_only);
524
525 ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
526 const uct_ib_iface_init_attr_t *init_attr,
527 int preferred_cpu, size_t inl);
528
529 ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface,
530 uct_ib_qp_attr_t *attr,
531 struct ibv_qp **qp_p);
532
533 void uct_ib_iface_fill_attr(uct_ib_iface_t *iface,
534 uct_ib_qp_attr_t *attr);
535
536
/* printf format/argument helpers that identify an IB interface as
 * "<device name>:<port number>" in log messages */
#define UCT_IB_IFACE_FMT \
    "%s:%d"
#define UCT_IB_IFACE_ARG(_iface) \
    uct_ib_device_name(uct_ib_iface_device(_iface)), (_iface)->config.port_num
541
542
/* Abort with a fatal error describing work completion _wc[_i] on _iface.
 * BUGFIX: the body previously referenced `i` instead of the macro parameter
 * `_i`, so it only compiled when the caller's index variable happened to be
 * named `i`; all accesses now use `_i`. */
#define UCT_IB_IFACE_VERBS_COMPLETION_ERR(_type, _iface, _i, _wc) \
    ucs_fatal("%s completion[%d] with error on %s/%p: %s, vendor_err 0x%x wr_id 0x%lx", \
              _type, _i, uct_ib_device_name(uct_ib_iface_device(_iface)), _iface, \
              uct_ib_wc_status_str(_wc[_i].status), _wc[_i].vendor_err, \
              _wc[_i].wr_id);
548
/* Iterate over _wc_count receive work completions: aborts on a failed
 * completion, otherwise sets _hdr to the network header of the received
 * descriptor and marks the received bytes defined for valgrind.
 * BUGFIX: the body previously indexed `_wc[i]` instead of `_wc[_i]`, silently
 * depending on the caller naming its index variable `i`; all accesses now use
 * the `_i` parameter. (Uses the GCC/Clang statement-expression extension.) */
#define UCT_IB_IFACE_VERBS_FOREACH_RXWQE(_iface, _i, _hdr, _wc, _wc_count) \
    for (_i = 0; _i < _wc_count && ({ \
        if (ucs_unlikely(_wc[_i].status != IBV_WC_SUCCESS)) { \
            UCT_IB_IFACE_VERBS_COMPLETION_ERR("receive", _iface, _i, _wc); \
        } \
        _hdr = (typeof(_hdr))uct_ib_iface_recv_desc_hdr(_iface, \
                (uct_ib_iface_recv_desc_t *)(uintptr_t)_wc[_i].wr_id); \
        VALGRIND_MAKE_MEM_DEFINED(_hdr, _wc[_i].byte_len); \
        1; }); ++_i)
558
/* Device-specific limit: log2 of the maximal number of SG entries to log for
 * zero-copy operations (stored on the uct_ib_device_t) */
#define UCT_IB_MAX_ZCOPY_LOG_SGE(_iface) \
    (uct_ib_iface_device(_iface)->max_zcopy_log_sge)
561
562 /**
563 * Fill ibv_sge data structure by data provided in uct_iov_t
564 * The function avoids copying IOVs with zero length
565 *
566 * @return Number of elements in sge[]
567 */
568 static UCS_F_ALWAYS_INLINE
uct_ib_verbs_sge_fill_iov(struct ibv_sge * sge,const uct_iov_t * iov,size_t iovcnt)569 size_t uct_ib_verbs_sge_fill_iov(struct ibv_sge *sge, const uct_iov_t *iov,
570 size_t iovcnt)
571 {
572 size_t iov_it, sge_it = 0;
573
574 for (iov_it = 0; iov_it < iovcnt; ++iov_it) {
575 sge[sge_it].length = uct_iov_get_length(&iov[iov_it]);
576 if (sge[sge_it].length > 0) {
577 sge[sge_it].addr = (uintptr_t)(iov[iov_it].buffer);
578 } else {
579 continue; /* to avoid zero length elements in sge */
580 }
581
582 if (iov[sge_it].memh == UCT_MEM_HANDLE_NULL) {
583 sge[sge_it].lkey = 0;
584 } else {
585 sge[sge_it].lkey = uct_ib_memh_get_lkey(iov[iov_it].memh);
586 }
587 ++sge_it;
588 }
589
590 return sge_it;
591 }
592
593 static UCS_F_ALWAYS_INLINE
uct_ib_iface_hdr_size(size_t max_inline,size_t min_size)594 size_t uct_ib_iface_hdr_size(size_t max_inline, size_t min_size)
595 {
596 return (size_t)ucs_max((ssize_t)(max_inline - min_size), 0);
597 }
598
599 static UCS_F_ALWAYS_INLINE void
uct_ib_fence_info_init(uct_ib_fence_info_t * fence)600 uct_ib_fence_info_init(uct_ib_fence_info_t* fence)
601 {
602 fence->fence_beat = 0;
603 }
604
605 #endif
606