/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
 * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCT_IB_MLX5_H_
#define UCT_IB_MLX5_H_


#include <uct/base/uct_worker.h>
#include <uct/ib/base/ib_log.h>
#include <uct/ib/base/ib_device.h>
#include <ucs/arch/cpu.h>
#include <ucs/debug/log.h>
#include <ucs/type/status.h>

/**
 * When using clang version 3.0 or higher, __GNUC_MINOR__ is set to 2, which
 * affects the offsets of several fields in the mlx5_qp struct that UCX uses
 * from the libmlx5 library.
 * According to libmlx5, resetting __GNUC_MINOR__ to 3 makes the offsets of
 * these fields inside libmlx5 (when compiled with GCC) the same as the ones
 * used by UCX (when compiled with clang).
 */
#ifdef __clang__
#  define CLANG_VERSION ( __clang_major__ * 100 + __clang_minor__)
#  if CLANG_VERSION >= 300
#    undef __GNUC_MINOR__
#    define __GNUC_MINOR__ 3
#  endif
#endif

#if HAVE_INFINIBAND_MLX5DV_H
#  include <infiniband/mlx5dv.h>
#else
#  include <infiniband/mlx5_hw.h>
#  include <uct/ib/mlx5/exp/ib_mlx5_hw.h>
#endif
#include <uct/ib/mlx5/dv/ib_mlx5_dv.h>

#include <netinet/in.h>
#include <endian.h>
#include <string.h>

#define UCT_IB_MLX5_WQE_SEG_SIZE         16 /* Size of a segment in a WQE */
#define UCT_IB_MLX5_CQE64_MAX_INL        32 /* Inline scatter size in 64-byte CQE */
#define UCT_IB_MLX5_CQE128_MAX_INL       64 /* Inline scatter size in 128-byte CQE */
#define UCT_IB_MLX5_CQE64_SIZE_LOG       6
#define UCT_IB_MLX5_CQE128_SIZE_LOG      7
#define UCT_IB_MLX5_MAX_BB               4
#define UCT_IB_MLX5_WORKER_BF_KEY        0x00c1b7e8u
#define UCT_IB_MLX5_DEVX_UAR_KEY         0xdea1ab1eU
#define UCT_IB_MLX5_RES_DOMAIN_KEY       0x1b1bda7aU
#define UCT_IB_MLX5_WORKER_DM_KEY        0xacdf1245u
#define UCT_IB_MLX5_EXTENDED_UD_AV       0x80 /* htonl(0x80000000) */
#define UCT_IB_MLX5_AV_GRH_PRESENT       0x40 /* htonl(UCS_BIT(30)) */
#define UCT_IB_MLX5_BF_REG_SIZE          256
#define UCT_IB_MLX5_CQE_VENDOR_SYND_ODP  0x93
#define UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK  0x80
#define UCT_IB_MLX5_MAX_SEND_WQE_SIZE    (UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB)
#define UCT_IB_MLX5_CQ_SET_CI            0
#define UCT_IB_MLX5_CQ_ARM_DB            1
#define UCT_IB_MLX5_LOG_MAX_MSG_SIZE     30
#define UCT_IB_MLX5_ATOMIC_MODE          3
#define UCT_IB_MLX5_CQE_FLAG_L3_IN_DATA  UCS_BIT(28) /* GRH/IP in the receive buffer */
#define UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE   UCS_BIT(29) /* GRH/IP in the CQE */


#define UCT_IB_MLX5_OPMOD_EXT_ATOMIC(_log_arg_size) \
    ((8) | ((_log_arg_size) - 2))
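
/*
 * Worked example (illustrative only): for an extended atomic operating on an
 * 8-byte argument, _log_arg_size is log2(8) == 3, so
 * UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3) evaluates to (8 | (3 - 2)) == 9.
 */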

#ifdef HAVE_STRUCT_MLX5_WQE_AV_BASE

#  define mlx5_av_base(_av)         (&(_av)->base)
#  define mlx5_av_grh(_av)          (&(_av)->grh_sec)
#  define UCT_IB_MLX5_AV_BASE_SIZE  sizeof(struct mlx5_base_av)
#  define UCT_IB_MLX5_AV_FULL_SIZE  sizeof(struct mlx5_wqe_av)

#else

#  define mlx5_av_base(_av)         (_av)
/* Do not cast the address of reserved0 directly, to avoid compilation warnings */
#  define mlx5_av_grh(_av)          ((struct mlx5_grh_av *)(((char*)(_av)) + \
                                     ucs_offsetof(struct mlx5_wqe_av, reserved0[0])))
#  define UCT_IB_MLX5_AV_BASE_SIZE  ucs_offsetof(struct mlx5_wqe_av, reserved0[0])
#  define UCT_IB_MLX5_AV_FULL_SIZE  sizeof(struct mlx5_wqe_av)

#  define mlx5_base_av              mlx5_wqe_av

struct mlx5_grh_av {
    uint8_t  reserved0[4];
    uint8_t  rmac[6];
    uint8_t  tclass;
    uint8_t  hop_limit;
    uint32_t grh_gid_fl;
    uint8_t  rgid[16];
};

#  define HAVE_STRUCT_MLX5_GRH_AV_RMAC 1

#endif

#ifndef MLX5_WQE_CTRL_SOLICITED
#  define MLX5_WQE_CTRL_SOLICITED (1<<1)
#endif

#define UCT_IB_MLX5_WQE_CTRL_FLAG_FENCE        (2<<5)
#define UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER (3<<5)

#define UCT_IB_MLX5_AM_ZCOPY_MAX_IOV 3UL

#define UCT_IB_MLX5_AM_MAX_SHORT(_av_size) \
    (UCT_IB_MLX5_MAX_SEND_WQE_SIZE - \
     (sizeof(struct mlx5_wqe_ctrl_seg) + \
      (_av_size) + \
      sizeof(struct mlx5_wqe_inl_data_seg)))

#define UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(_av_size) \
    (UCT_IB_MLX5_AM_MAX_SHORT(_av_size) - \
     UCT_IB_MLX5_AM_ZCOPY_MAX_IOV * sizeof(struct mlx5_wqe_data_seg))

#define UCT_IB_MLX5_PUT_MAX_SHORT(_av_size) \
    (UCT_IB_MLX5_AM_MAX_SHORT(_av_size) - sizeof(struct mlx5_wqe_raddr_seg))
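
/*
 * Worked example (illustrative; assumes the standard mlx5 segment sizes:
 * MLX5_SEND_WQE_BB == 64, sizeof(struct mlx5_wqe_ctrl_seg) == 16,
 * sizeof(struct mlx5_wqe_inl_data_seg) == 4 and
 * sizeof(struct mlx5_wqe_raddr_seg) == 16):
 * UCT_IB_MLX5_MAX_SEND_WQE_SIZE is 4 * 64 == 256 bytes, so with _av_size == 0
 * UCT_IB_MLX5_AM_MAX_SHORT(0)  == 256 - (16 + 0 + 4) == 236 bytes, and
 * UCT_IB_MLX5_PUT_MAX_SHORT(0) == 236 - 16 == 220 bytes.
 */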

#define UCT_IB_MLX5_XRQ_MIN_UWQ_POST 33

#define UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(_devx_objs) \
    ((_devx_objs) << UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT)

#define UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(_obj) \
    UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(UCS_BIT(UCT_IB_DEVX_OBJ_ ## _obj))
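
/*
 * Expansion example (illustrative only):
 * UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT) expands to
 * UCS_BIT(UCT_IB_DEVX_OBJ_DCT) << UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT,
 * i.e. a per-object-type capability bit placed above the generic MD flags.
 */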

#define UCT_IB_MLX5_DEVX_EVENT_TYPE_MASK  0xffff
#define UCT_IB_MLX5_DEVX_EVENT_DATA_SHIFT 16

enum {
    /* Device supports KSM */
    UCT_IB_MLX5_MD_FLAG_KSM              = UCS_BIT(0),
    /* Device supports DEVX */
    UCT_IB_MLX5_MD_FLAG_DEVX             = UCS_BIT(1),
    /* Device supports TM DC */
    UCT_IB_MLX5_MD_FLAG_DC_TM            = UCS_BIT(2),
    /* Device supports MP RQ */
    UCT_IB_MLX5_MD_FLAG_MP_RQ            = UCS_BIT(3),
    /* Device supports creation of indirect MR with atomic access rights */
    UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS = UCS_BIT(4),
    /* Device supports RMP to create SRQ for AM */
    UCT_IB_MLX5_MD_FLAG_RMP              = UCS_BIT(5),

    /* Object to be created by DevX */
    UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT  = 6,
    UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP       = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP),
    UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ      = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCSRQ),
    UCT_IB_MLX5_MD_FLAG_DEVX_DCT         = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT),
    UCT_IB_MLX5_MD_FLAG_DEVX_DC_SRQ      = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCSRQ),
};
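
/*
 * Capability checks are plain bit tests; a minimal sketch (illustrative only,
 * "md" being an already-initialized uct_ib_mlx5_md_t):
 *
 *   if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DCT) {
 *       ... the device can create a DCT object through DevX ...
 *   }
 */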


enum {
    UCT_IB_MLX5_SRQ_TOPO_LIST         = 0x0,
    UCT_IB_MLX5_SRQ_TOPO_CYCLIC       = 0x1,
    UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ   = 0x2,
    UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ = 0x3
};

#if HAVE_DEVX
typedef struct uct_ib_mlx5_devx_umem {
    struct mlx5dv_devx_umem  *mem;
    size_t                   size;
} uct_ib_mlx5_devx_umem_t;
#endif

/**
 * MLX5 IB memory domain.
 */
typedef struct uct_ib_mlx5_md {
    uct_ib_md_t               super;
    uint32_t                  flags;
    ucs_mpool_t               dbrec_pool;
    ucs_recursive_spinlock_t  dbrec_lock;
#if HAVE_EXP_UMR
    struct ibv_qp             *umr_qp; /* special QP for creating UMR */
    struct ibv_cq             *umr_cq; /* special CQ for creating UMR */
#endif

#if HAVE_DEVX
    void                      *zero_buf;
    uct_ib_mlx5_devx_umem_t   zero_mem;
#endif
} uct_ib_mlx5_md_t;


typedef enum {
    UCT_IB_MLX5_MMIO_MODE_BF_POST,    /* BF without flush, can be used only
                                         from one thread */
    UCT_IB_MLX5_MMIO_MODE_BF_POST_MT, /* BF with order, can be used by multiple
                                         serialized threads */
    UCT_IB_MLX5_MMIO_MODE_DB,         /* 8-byte doorbell (with the mandatory
                                         flush) */
    UCT_IB_MLX5_MMIO_MODE_AUTO,       /* Auto-select according to driver/HW
                                         capabilities and multi-thread support
                                         level */
    UCT_IB_MLX5_MMIO_MODE_LAST
} uct_ib_mlx5_mmio_mode_t;


typedef struct uct_ib_mlx5_iface_config {
#if HAVE_IBV_DM
    struct {
        size_t               seg_len;
        unsigned             count;
    } dm;
#endif
    uct_ib_mlx5_mmio_mode_t  mmio_mode;
} uct_ib_mlx5_iface_config_t;


/**
 * MLX5 DoorBell record
 */
typedef struct uct_ib_mlx5_dbrec {
    volatile uint32_t  db[2];
    uint32_t           mem_id;
    size_t             offset;
    uct_ib_mlx5_md_t   *md;
} uct_ib_mlx5_dbrec_t;


typedef enum {
    UCT_IB_MLX5_OBJ_TYPE_VERBS,
    UCT_IB_MLX5_OBJ_TYPE_DEVX,
    UCT_IB_MLX5_OBJ_TYPE_LAST
} uct_ib_mlx5_obj_type_t;


/* Shared receive queue */
typedef struct uct_ib_mlx5_srq {
    uct_ib_mlx5_obj_type_t  type;
    uint32_t                srq_num;
    void                    *buf;
    volatile uint32_t       *db;
    uint16_t                free_idx;  /* what is completed contiguously */
    uint16_t                ready_idx; /* what is ready to be posted to hw */
    uint16_t                sw_pi;     /* what is posted to hw */
    uint16_t                mask;
    uint16_t                tail;      /* tail in the driver */
    uint16_t                stride;
    union {
        struct {
            struct ibv_srq  *srq;
        } verbs;
#if HAVE_DEVX
        struct {
            uct_ib_mlx5_dbrec_t      *dbrec;
            uct_ib_mlx5_devx_umem_t  mem;
            struct mlx5dv_devx_obj   *obj;
        } devx;
#endif
    };
} uct_ib_mlx5_srq_t;


/* Completion queue */
typedef struct uct_ib_mlx5_cq {
    void               *cq_buf;
    unsigned           cq_ci;
    unsigned           cq_sn;
    unsigned           cq_length;
    unsigned           cqe_size_log;
    unsigned           cq_num;
    void               *uar;
    volatile uint32_t  *dbrec;
} uct_ib_mlx5_cq_t;


/* Blue flame register */
typedef struct uct_ib_mlx5_mmio_reg {
    uct_worker_tl_data_t     super;
    union {
        void                 *ptr;
        uintptr_t            uint;
    } addr;
    uct_ib_mlx5_mmio_mode_t  mode;
} uct_ib_mlx5_mmio_reg_t;


typedef struct uct_ib_mlx5_devx_uar {
    uct_ib_mlx5_mmio_reg_t  super;
#if HAVE_DEVX
    struct mlx5dv_devx_uar  *uar;
#endif
    struct ibv_context      *ctx;
} uct_ib_mlx5_devx_uar_t;


/* resource domain */
typedef struct uct_ib_mlx5_res_domain {
    uct_worker_tl_data_t       super;
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    struct ibv_exp_res_domain  *ibv_domain;
#elif HAVE_DECL_IBV_ALLOC_TD
    struct ibv_td              *td;
    struct ibv_pd              *pd;
#endif
} uct_ib_mlx5_res_domain_t;


typedef struct uct_ib_mlx5_qp_attr {
    uct_ib_qp_attr_t         super;
    uct_ib_mlx5_mmio_mode_t  mmio_mode;
} uct_ib_mlx5_qp_attr_t;


/* MLX5 QP wrapper */
typedef struct uct_ib_mlx5_qp {
    uct_ib_mlx5_obj_type_t  type;
    uint32_t                qp_num;
    union {
        struct {
            union {
                struct ibv_qp       *qp;
#ifdef HAVE_DC_EXP
                struct ibv_exp_dct  *dct;
#endif
            };
            uct_ib_mlx5_res_domain_t  *rd;
        } verbs;
#if HAVE_DEVX
        struct {
            void                     *wq_buf;
            uct_ib_mlx5_dbrec_t      *dbrec;
            uct_ib_mlx5_devx_umem_t  mem;
            struct mlx5dv_devx_obj   *obj;
        } devx;
#endif
    };
} uct_ib_mlx5_qp_t;

/* Send work-queue */
typedef struct uct_ib_mlx5_txwq {
    uct_ib_mlx5_qp_t        super;
    uint16_t                sw_pi;      /* PI for next WQE */
    uint16_t                prev_sw_pi; /* PI where last WQE *started* */
    uct_ib_mlx5_mmio_reg_t  *reg;
    void                    *curr;
    volatile uint32_t       *dbrec;
    void                    *qstart;
    void                    *qend;
    uint16_t                bb_max;
    uint16_t                sig_pi;     /* PI for last signaled WQE */
#if UCS_ENABLE_ASSERT
    uint16_t                hw_ci;
#endif
    uct_ib_fence_info_t     fi;
} uct_ib_mlx5_txwq_t;
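
/*
 * A minimal posting sketch (illustrative only; the txwq is normally driven by
 * the transport code, and "num_bb" below is a hypothetical variable holding
 * the number of basic blocks occupied by the WQE that was just written):
 *
 *   void *wqe = txwq->curr;          ... write the WQE segments here ...
 *   txwq->prev_sw_pi = txwq->sw_pi;  ... PI where this WQE starts ...
 *   txwq->sw_pi     += num_bb;       ... PI for the next WQE ...
 *   ... ring the doorbell through txwq->dbrec and txwq->reg ...
 */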


/* Receive work-queue */
typedef struct uct_ib_mlx5_rxwq {
    /* Producer index, updated when a new receive WQE is posted */
    uint16_t                  rq_wqe_counter;
    /* Consumer index. It is better to track it ourselves than to do ntohs()
     * on the index in the CQE
     */
    uint16_t                  cq_wqe_counter;
    uint16_t                  mask;
    volatile uint32_t         *dbrec;
    struct mlx5_wqe_data_seg  *wqes;
} uct_ib_mlx5_rxwq_t;


/* Address-vector for link-local scope */
typedef struct uct_ib_mlx5_base_av {
    uint32_t  dqp_dct;
    uint8_t   stat_rate_sl;
    uint8_t   fl_mlid;
    uint16_t  rlid;
} UCS_S_PACKED uct_ib_mlx5_base_av_t;


typedef struct uct_ib_mlx5_err_cqe {
    uint8_t   rsvd0[32];
    uint32_t  srqn;
    uint8_t   rsvd1[16];
    uint8_t   hw_err_synd;
    uint8_t   hw_synd_type;
    uint8_t   vendor_err_synd;
    uint8_t   syndrome;
    uint32_t  s_wqe_opcode_qpn;
    uint16_t  wqe_counter;
    uint8_t   signature;
    uint8_t   op_own;
} UCS_S_PACKED uct_ib_mlx5_err_cqe_t;


/**
 * SRQ segment
 *
 * We add some SW book-keeping information in the unused HW fields:
 *  - desc    - the receive descriptor.
 *  - strides - number of available strides in this WQE. When it is 0,
 *              this segment can be reposted to the HW. Relevant for
 *              Multi-Packet SRQ only.
 *  - free    - points to the next out-of-order completed segment.
 */
typedef struct uct_rc_mlx5_srq_seg {
    union {
        struct mlx5_wqe_srq_next_seg  mlx5_srq;
        struct {
            uint16_t                  ptr_mask;
            uint16_t                  next_wqe_index; /* Network byte order */
            uint8_t                   signature;
            uint8_t                   rsvd1[1];
            uint8_t                   strides;
            uint8_t                   free;           /* Released but not posted */
            uct_ib_iface_recv_desc_t  *desc;          /* Host byte order */
        } srq;
    };
    struct mlx5_wqe_data_seg          dptr[0];
} uct_ib_mlx5_srq_seg_t;


struct uct_ib_mlx5_atomic_masked_cswap32_seg {
    uint32_t  swap;
    uint32_t  compare;
    uint32_t  swap_mask;
    uint32_t  compare_mask;
} UCS_S_PACKED;


struct uct_ib_mlx5_atomic_masked_fadd32_seg {
    uint32_t  add;
    uint32_t  filed_boundary;
    uint32_t  reserved[2];
} UCS_S_PACKED;


struct uct_ib_mlx5_atomic_masked_cswap64_seg {
    uint64_t  swap;
    uint64_t  compare;
} UCS_S_PACKED;


struct uct_ib_mlx5_atomic_masked_fadd64_seg {
    uint64_t  add;
    uint64_t  filed_boundary;
} UCS_S_PACKED;

ucs_status_t uct_ib_mlx5_md_get_atomic_mr_id(uct_ib_md_t *md, uint8_t *mr_id);

ucs_status_t uct_ib_mlx5_iface_get_res_domain(uct_ib_iface_t *iface,
                                              uct_ib_mlx5_qp_t *txwq);

void uct_ib_mlx5_iface_put_res_domain(uct_ib_mlx5_qp_t *qp);

ucs_status_t uct_ib_mlx5_iface_create_qp(uct_ib_iface_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uct_ib_mlx5_qp_attr_t *attr);

ucs_status_t uct_ib_mlx5_modify_qp_state(uct_ib_mlx5_md_t *md,
                                         uct_ib_mlx5_qp_t *qp,
                                         enum ibv_qp_state state);

/**
 * Create a CQ with DV.
 */
ucs_status_t uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
                                   const uct_ib_iface_init_attr_t *init_attr,
                                   int preferred_cpu, size_t inl);

extern ucs_config_field_t uct_ib_mlx5_iface_config_table[];

/**
 * Get internal CQ information.
 */
ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq);

/**
 * Get the flag indicating compact AV support.
 */
ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av);

/**
 * Request completion notification.
 */
ucs_status_t uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited);

/**
 * Check for completion with error.
 */
void uct_ib_mlx5_check_completion(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq,
                                  struct mlx5_cqe64 *cqe);

ucs_status_t
uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker,
                          uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                          unsigned bf_size,
                          uct_ib_mlx5_mmio_mode_t *mmio_mode);

/**
 * Initialize the txwq structure.
 */
ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker,
                                   uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                                   uct_ib_mlx5_txwq_t *txwq,
                                   struct ibv_qp *verbs_qp);

void uct_ib_mlx5_txwq_cleanup(uct_ib_mlx5_txwq_t *txwq);

/**
 * Reset txwq contents and posting indices.
 */
void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq);

/**
 * Initialize the rxwq structure.
 */
ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *qp, uct_ib_mlx5_rxwq_t *wq);

/**
 * Initialize the srq structure.
 */
ucs_status_t
uct_ib_mlx5_verbs_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq,
                           size_t sg_byte_count, int num_sge);

void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head,
                               uint32_t tail, size_t sg_byte_count,
                               int num_sge);

void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq,
                                   struct ibv_srq *verbs_srq);

/**
 * DEVX UAR API.
 */
int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
                             uct_ib_mlx5_md_t *md,
                             uct_ib_mlx5_mmio_mode_t mmio_mode);

ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
                                       uct_ib_mlx5_md_t *md,
                                       uct_ib_mlx5_mmio_mode_t mmio_mode);

void uct_ib_mlx5_devx_uar_cleanup(uct_ib_mlx5_devx_uar_t *uar);

/**
 * DEVX QP API.
 */

#if HAVE_DEVX

ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface,
                                        uct_ib_mlx5_qp_t *qp,
                                        uct_ib_mlx5_txwq_t *tx,
                                        uct_ib_mlx5_qp_attr_t *attr);

ucs_status_t uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp,
                                        const void *in, size_t inlen,
                                        void *out, size_t outlen);

ucs_status_t uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp,
                                              enum ibv_qp_state state);

void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp);

static inline ucs_status_t
uct_ib_mlx5_md_buf_alloc(uct_ib_mlx5_md_t *md, size_t size, int silent,
                         void **buf_p, uct_ib_mlx5_devx_umem_t *mem,
                         char *name)
{
    ucs_log_level_t level = silent ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR;
    ucs_status_t status;
    void *buf;
    int ret;

    ret = ucs_posix_memalign(&buf, ucs_get_page_size(), size, name);
    if (ret != 0) {
        ucs_log(level, "failed to allocate buffer of %zu bytes: %m", size);
        return UCS_ERR_NO_MEMORY;
    }

    if (md->super.fork_init) {
        ret = madvise(buf, size, MADV_DONTFORK);
        if (ret != 0) {
            ucs_log(level, "madvise(DONTFORK, buf=%p, len=%zu) failed: %m",
                    buf, size);
            status = UCS_ERR_IO_ERROR;
            goto err_free;
        }
    }

    mem->size = size;
    mem->mem  = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, buf, size, 0);
    if (mem->mem == NULL) {
        ucs_log(level, "mlx5dv_devx_umem_reg() failed: %m");
        status = UCS_ERR_NO_MEMORY;
        goto err_dofork;
    }

    *buf_p = buf;
    return UCS_OK;

err_dofork:
    if (md->super.fork_init) {
        madvise(buf, size, MADV_DOFORK);
    }
err_free:
    ucs_free(buf);

    return status;
}

static inline void
uct_ib_mlx5_md_buf_free(uct_ib_mlx5_md_t *md, void *buf,
                        uct_ib_mlx5_devx_umem_t *mem)
{
    int ret;

    if (buf == NULL) {
        return;
    }

    mlx5dv_devx_umem_dereg(mem->mem);
    if (md->super.fork_init) {
        ret = madvise(buf, mem->size, MADV_DOFORK);
        if (ret != 0) {
            ucs_warn("madvise(DOFORK, buf=%p, len=%zu) failed: %m",
                     buf, mem->size);
        }
    }
    ucs_free(buf);
}
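
/*
 * A minimal usage sketch (illustrative only; "md" is an already-initialized
 * uct_ib_mlx5_md_t and 4096 is an arbitrary example size):
 *
 *   void                    *buf;
 *   uct_ib_mlx5_devx_umem_t mem;
 *
 *   if (uct_ib_mlx5_md_buf_alloc(md, 4096, 0, &buf, &mem, "example") == UCS_OK) {
 *       ... pass mem.mem (the registered umem) to DevX object creation ...
 *       uct_ib_mlx5_md_buf_free(md, buf, &mem);
 *   }
 */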

#else

static inline ucs_status_t
uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface,
                           uct_ib_mlx5_qp_t *qp,
                           uct_ib_mlx5_txwq_t *tx,
                           uct_ib_mlx5_qp_attr_t *attr)
{
    return UCS_ERR_UNSUPPORTED;
}

static inline ucs_status_t
uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp,
                           const void *in, size_t inlen,
                           void *out, size_t outlen)
{
    return UCS_ERR_UNSUPPORTED;
}

static inline ucs_status_t
uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp, enum ibv_qp_state state)
{
    return UCS_ERR_UNSUPPORTED;
}

static inline void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) { }

#endif

static inline uct_ib_mlx5_dbrec_t *uct_ib_mlx5_get_dbrec(uct_ib_mlx5_md_t *md)
{
    uct_ib_mlx5_dbrec_t *dbrec;

    ucs_recursive_spin_lock(&md->dbrec_lock);
    dbrec = (uct_ib_mlx5_dbrec_t *)ucs_mpool_get_inline(&md->dbrec_pool);
    ucs_recursive_spin_unlock(&md->dbrec_lock);
    if (dbrec != NULL) {
        dbrec->db[MLX5_SND_DBR] = 0;
        dbrec->db[MLX5_RCV_DBR] = 0;
        dbrec->md               = md;
    }

    return dbrec;
}

static inline void uct_ib_mlx5_put_dbrec(uct_ib_mlx5_dbrec_t *dbrec)
{
    uct_ib_mlx5_md_t *md = dbrec->md;

    ucs_recursive_spin_lock(&md->dbrec_lock);
    ucs_mpool_put_inline(dbrec);
    ucs_recursive_spin_unlock(&md->dbrec_lock);
}
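
/*
 * A minimal usage sketch (illustrative only): a doorbell record is taken from
 * the MD pool when a queue object is created and returned when it is
 * destroyed.
 *
 *   uct_ib_mlx5_dbrec_t *dbrec = uct_ib_mlx5_get_dbrec(md);
 *   if (dbrec == NULL) {
 *       return UCS_ERR_NO_MEMORY;
 *   }
 *   ... point the queue object at dbrec->db (e.g. via dbrec->mem_id and
 *       dbrec->offset) ...
 *   uct_ib_mlx5_put_dbrec(dbrec);
 */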

#endif