/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
 * Copyright (C) ARM Ltd. 2016.  ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCT_IB_MLX5_H_
#define UCT_IB_MLX5_H_


#include <uct/base/uct_worker.h>
#include <uct/ib/base/ib_log.h>
#include <uct/ib/base/ib_device.h>
#include <ucs/arch/cpu.h>
#include <ucs/debug/log.h>
#include <ucs/type/status.h>

/**
 * When compiling with a clang version higher than 3.0, __GNUC_MINOR__ is set
 * to 2, which changes the offsets of several fields of the mlx5_qp struct
 * that UCX uses from the libmlx5 library.
 * According to libmlx5, resetting __GNUC_MINOR__ to 3 makes the offsets of
 * these fields inside libmlx5 (when compiled with GCC) the same as the ones
 * used by UCX (when compiled with clang).
 */
#ifdef __clang__
#  define CLANG_VERSION ( __clang_major__ * 100 + __clang_minor__)
#  if CLANG_VERSION >= 300
#    undef __GNUC_MINOR__
#    define __GNUC_MINOR__ 3
#  endif
#endif

#if HAVE_INFINIBAND_MLX5DV_H
#  include <infiniband/mlx5dv.h>
#else
#  include <infiniband/mlx5_hw.h>
#  include <uct/ib/mlx5/exp/ib_mlx5_hw.h>
#endif
#include <uct/ib/mlx5/dv/ib_mlx5_dv.h>

#include <netinet/in.h>
#include <endian.h>
#include <string.h>


#define UCT_IB_MLX5_WQE_SEG_SIZE        16 /* Size of a segment in a WQE */
#define UCT_IB_MLX5_CQE64_MAX_INL       32 /* Inline scatter size in 64-byte CQE */
#define UCT_IB_MLX5_CQE128_MAX_INL      64 /* Inline scatter size in 128-byte CQE */
#define UCT_IB_MLX5_CQE64_SIZE_LOG      6
#define UCT_IB_MLX5_CQE128_SIZE_LOG     7
#define UCT_IB_MLX5_MAX_BB              4
#define UCT_IB_MLX5_WORKER_BF_KEY       0x00c1b7e8u
#define UCT_IB_MLX5_DEVX_UAR_KEY        0xdea1ab1eU
#define UCT_IB_MLX5_RES_DOMAIN_KEY      0x1b1bda7aU
#define UCT_IB_MLX5_WORKER_DM_KEY       0xacdf1245u
#define UCT_IB_MLX5_EXTENDED_UD_AV      0x80 /* htonl(0x80000000) */
#define UCT_IB_MLX5_AV_GRH_PRESENT      0x40 /* htonl(UCS_BIT(30)) */
#define UCT_IB_MLX5_BF_REG_SIZE         256
#define UCT_IB_MLX5_CQE_VENDOR_SYND_ODP 0x93
#define UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK 0x80
#define UCT_IB_MLX5_MAX_SEND_WQE_SIZE   (UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB)
#define UCT_IB_MLX5_CQ_SET_CI           0
#define UCT_IB_MLX5_CQ_ARM_DB           1
#define UCT_IB_MLX5_LOG_MAX_MSG_SIZE    30
#define UCT_IB_MLX5_ATOMIC_MODE         3
#define UCT_IB_MLX5_CQE_FLAG_L3_IN_DATA UCS_BIT(28) /* GRH/IP in the receive buffer */
#define UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE  UCS_BIT(29) /* GRH/IP in the CQE */


#define UCT_IB_MLX5_OPMOD_EXT_ATOMIC(_log_arg_size) \
    ((8) | ((_log_arg_size) - 2))

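/*
 * Editor's note (illustration only, not part of the original header): for an
 * extended-atomic argument of 8 bytes the log size is 3, so
 * UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3) expands to (8 | (3 - 2)) == 9; a 4-byte
 * argument (log size 2) yields (8 | 0) == 8.
 */
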
#ifdef HAVE_STRUCT_MLX5_WQE_AV_BASE

#  define mlx5_av_base(_av)         (&(_av)->base)
#  define mlx5_av_grh(_av)          (&(_av)->grh_sec)
#  define UCT_IB_MLX5_AV_BASE_SIZE  sizeof(struct mlx5_base_av)
#  define UCT_IB_MLX5_AV_FULL_SIZE  sizeof(struct mlx5_wqe_av)

#else

#  define mlx5_av_base(_av)         (_av)
/* do not use direct cast from address of reserved0 to avoid compilation warnings */
#  define mlx5_av_grh(_av)          ((struct mlx5_grh_av *)(((char*)(_av)) + \
                                     ucs_offsetof(struct mlx5_wqe_av, reserved0[0])))
#  define UCT_IB_MLX5_AV_BASE_SIZE  ucs_offsetof(struct mlx5_wqe_av, reserved0[0])
#  define UCT_IB_MLX5_AV_FULL_SIZE  sizeof(struct mlx5_wqe_av)

#  define mlx5_base_av              mlx5_wqe_av

struct mlx5_grh_av {
        uint8_t         reserved0[4];
        uint8_t         rmac[6];
        uint8_t         tclass;
        uint8_t         hop_limit;
        uint32_t        grh_gid_fl;
        uint8_t         rgid[16];
};

#  define HAVE_STRUCT_MLX5_GRH_AV_RMAC 1

#endif

#ifndef MLX5_WQE_CTRL_SOLICITED
#  define MLX5_WQE_CTRL_SOLICITED  (1<<1)
#endif

#define UCT_IB_MLX5_WQE_CTRL_FLAG_FENCE        (2<<5)
#define UCT_IB_MLX5_WQE_CTRL_FLAG_STRONG_ORDER (3<<5)

#define UCT_IB_MLX5_AM_ZCOPY_MAX_IOV  3UL

#define UCT_IB_MLX5_AM_MAX_SHORT(_av_size) \
    (UCT_IB_MLX5_MAX_SEND_WQE_SIZE - \
     (sizeof(struct mlx5_wqe_ctrl_seg) + \
      (_av_size) + \
      sizeof(struct mlx5_wqe_inl_data_seg)))

#define UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(_av_size) \
    (UCT_IB_MLX5_AM_MAX_SHORT(_av_size) - \
     UCT_IB_MLX5_AM_ZCOPY_MAX_IOV * sizeof(struct mlx5_wqe_data_seg))

#define UCT_IB_MLX5_PUT_MAX_SHORT(_av_size) \
    (UCT_IB_MLX5_AM_MAX_SHORT(_av_size) - sizeof(struct mlx5_wqe_raddr_seg))

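/*
 * Worked example (added for illustration; the segment sizes below are
 * assumptions about the mlx5 provider headers): with MLX5_SEND_WQE_BB == 64,
 * sizeof(struct mlx5_wqe_ctrl_seg) == 16, sizeof(struct mlx5_wqe_inl_data_seg)
 * == 4 and sizeof(struct mlx5_wqe_data_seg) == 16, UCT_IB_MLX5_MAX_SEND_WQE_SIZE
 * is 4 * 64 == 256, so for _av_size == 0 (no address vector, e.g. RC):
 *   UCT_IB_MLX5_AM_MAX_SHORT(0)     == 256 - (16 + 0 + 4) == 236
 *   UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(0) == 236 - 3 * 16       == 188
 *   UCT_IB_MLX5_PUT_MAX_SHORT(0)    == 236 - 16           == 220
 */
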
#define UCT_IB_MLX5_XRQ_MIN_UWQ_POST 33

#define UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(_devx_objs) \
    ((_devx_objs) << UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT)

#define UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(_obj) \
    UCT_IB_MLX5_MD_FLAGS_DEVX_OBJS(UCS_BIT(UCT_IB_DEVX_OBJ_ ## _obj))

#define UCT_IB_MLX5_DEVX_EVENT_TYPE_MASK  0xffff
#define UCT_IB_MLX5_DEVX_EVENT_DATA_SHIFT 16
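
/*
 * Example expansion (editor's illustration): UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP)
 * becomes UCS_BIT(UCT_IB_DEVX_OBJ_RCQP) << UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT.
 * The two event macros above suggest that a DEVX event word keeps the event
 * type in its low 16 bits and auxiliary data above bit 16; this reading is an
 * assumption based on the macro names.
 */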

enum {
    /* Device supports KSM */
    UCT_IB_MLX5_MD_FLAG_KSM              = UCS_BIT(0),
    /* Device supports DEVX */
    UCT_IB_MLX5_MD_FLAG_DEVX             = UCS_BIT(1),
    /* Device supports TM DC */
    UCT_IB_MLX5_MD_FLAG_DC_TM            = UCS_BIT(2),
    /* Device supports MP RQ */
    UCT_IB_MLX5_MD_FLAG_MP_RQ            = UCS_BIT(3),
    /* Device supports creation of indirect MRs with atomic access rights */
    UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS = UCS_BIT(4),
    /* Device supports RMP to create SRQ for AM */
    UCT_IB_MLX5_MD_FLAG_RMP              = UCS_BIT(5),

    /* Objects to be created by DevX */
    UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT  = 6,
    UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP       = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP),
    UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ      = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCSRQ),
    UCT_IB_MLX5_MD_FLAG_DEVX_DCT         = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT),
    UCT_IB_MLX5_MD_FLAG_DEVX_DC_SRQ      = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCSRQ),
};


enum {
    UCT_IB_MLX5_SRQ_TOPO_LIST         = 0x0,
    UCT_IB_MLX5_SRQ_TOPO_CYCLIC       = 0x1,
    UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ   = 0x2,
    UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ = 0x3
};

#if HAVE_DEVX
typedef struct uct_ib_mlx5_devx_umem {
    struct mlx5dv_devx_umem  *mem;
    size_t                   size;
} uct_ib_mlx5_devx_umem_t;
#endif

/**
 * MLX5 IB memory domain.
 */
typedef struct uct_ib_mlx5_md {
    uct_ib_md_t              super;
    uint32_t                 flags;
    ucs_mpool_t              dbrec_pool;
    ucs_recursive_spinlock_t dbrec_lock;
#if HAVE_EXP_UMR
    struct ibv_qp            *umr_qp;   /* special QP for creating UMR */
    struct ibv_cq            *umr_cq;   /* special CQ for creating UMR */
#endif

#if HAVE_DEVX
    void                     *zero_buf;
    uct_ib_mlx5_devx_umem_t  zero_mem;
#endif
} uct_ib_mlx5_md_t;


typedef enum {
    UCT_IB_MLX5_MMIO_MODE_BF_POST,    /* BF without flush, can be used only from
                                         one thread */
    UCT_IB_MLX5_MMIO_MODE_BF_POST_MT, /* BF with ordering, can be used by multiple
                                         serialized threads */
    UCT_IB_MLX5_MMIO_MODE_DB,         /* 8-byte doorbell (with the mandatory flush) */
    UCT_IB_MLX5_MMIO_MODE_AUTO,       /* Auto-select according to driver/HW capabilities
                                         and multi-thread support level */
    UCT_IB_MLX5_MMIO_MODE_LAST
} uct_ib_mlx5_mmio_mode_t;


typedef struct uct_ib_mlx5_iface_config {
#if HAVE_IBV_DM
    struct {
        size_t               seg_len;
        unsigned             count;
    } dm;
#endif
    uct_ib_mlx5_mmio_mode_t  mmio_mode;
} uct_ib_mlx5_iface_config_t;


/**
 * MLX5 DoorBell record
 */
typedef struct uct_ib_mlx5_dbrec {
   volatile uint32_t  db[2];
   uint32_t           mem_id;
   size_t             offset;
   uct_ib_mlx5_md_t   *md;
} uct_ib_mlx5_dbrec_t;


typedef enum {
    UCT_IB_MLX5_OBJ_TYPE_VERBS,
    UCT_IB_MLX5_OBJ_TYPE_DEVX,
    UCT_IB_MLX5_OBJ_TYPE_LAST
} uct_ib_mlx5_obj_type_t;


/* Shared receive queue */
typedef struct uct_ib_mlx5_srq {
    uct_ib_mlx5_obj_type_t             type;
    uint32_t                           srq_num;
    void                               *buf;
    volatile uint32_t                  *db;
    uint16_t                           free_idx;   /* what is completed contiguously */
    uint16_t                           ready_idx;  /* what is ready to be posted to hw */
    uint16_t                           sw_pi;      /* what is posted to hw */
    uint16_t                           mask;
    uint16_t                           tail;       /* tail in the driver */
    uint16_t                           stride;
    union {
        struct {
            struct ibv_srq             *srq;
        } verbs;
#if HAVE_DEVX
        struct {
            uct_ib_mlx5_dbrec_t        *dbrec;
            uct_ib_mlx5_devx_umem_t    mem;
            struct mlx5dv_devx_obj     *obj;
        } devx;
#endif
    };
} uct_ib_mlx5_srq_t;


/* Completion queue */
typedef struct uct_ib_mlx5_cq {
    void               *cq_buf;
    unsigned           cq_ci;
    unsigned           cq_sn;
    unsigned           cq_length;
    unsigned           cqe_size_log;
    unsigned           cq_num;
    void               *uar;
    volatile uint32_t  *dbrec;
} uct_ib_mlx5_cq_t;


/* Blue flame register */
typedef struct uct_ib_mlx5_mmio_reg {
    uct_worker_tl_data_t        super;
    union {
        void                    *ptr;
        uintptr_t               uint;
    } addr;
    uct_ib_mlx5_mmio_mode_t     mode;
} uct_ib_mlx5_mmio_reg_t;


typedef struct uct_ib_mlx5_devx_uar {
    uct_ib_mlx5_mmio_reg_t      super;
#if HAVE_DEVX
    struct mlx5dv_devx_uar      *uar;
#endif
    struct ibv_context          *ctx;
} uct_ib_mlx5_devx_uar_t;


/* resource domain */
typedef struct uct_ib_mlx5_res_domain {
    uct_worker_tl_data_t        super;
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    struct ibv_exp_res_domain   *ibv_domain;
#elif HAVE_DECL_IBV_ALLOC_TD
    struct ibv_td               *td;
    struct ibv_pd               *pd;
#endif
} uct_ib_mlx5_res_domain_t;


typedef struct uct_ib_mlx5_qp_attr {
    uct_ib_qp_attr_t            super;
    uct_ib_mlx5_mmio_mode_t     mmio_mode;
} uct_ib_mlx5_qp_attr_t;


/* MLX5 QP wrapper */
typedef struct uct_ib_mlx5_qp {
    uct_ib_mlx5_obj_type_t             type;
    uint32_t                           qp_num;
    union {
        struct {
            union {
                struct ibv_qp          *qp;
#ifdef HAVE_DC_EXP
                struct ibv_exp_dct     *dct;
#endif
            };
            uct_ib_mlx5_res_domain_t   *rd;
        } verbs;
#if HAVE_DEVX
        struct {
            void                       *wq_buf;
            uct_ib_mlx5_dbrec_t        *dbrec;
            uct_ib_mlx5_devx_umem_t    mem;
            struct mlx5dv_devx_obj     *obj;
        } devx;
#endif
    };
} uct_ib_mlx5_qp_t;

/* Send work-queue */
typedef struct uct_ib_mlx5_txwq {
    uct_ib_mlx5_qp_t            super;
    uint16_t                    sw_pi;      /* PI for the next WQE */
    uint16_t                    prev_sw_pi; /* PI where the last WQE *started* */
    uct_ib_mlx5_mmio_reg_t      *reg;
    void                        *curr;
    volatile uint32_t           *dbrec;
    void                        *qstart;
    void                        *qend;
    uint16_t                    bb_max;
    uint16_t                    sig_pi;     /* PI of the last signaled WQE */
#if UCS_ENABLE_ASSERT
    uint16_t                    hw_ci;
#endif
    uct_ib_fence_info_t         fi;
} uct_ib_mlx5_txwq_t;


/* Receive work-queue */
typedef struct uct_ib_mlx5_rxwq {
    /* Producer index, updated when a new receive WQE is posted */
    uint16_t                    rq_wqe_counter;
    /* Consumer index. It is better to track it ourselves than to do ntohs()
     * on the index in the CQE
     */
    uint16_t                    cq_wqe_counter;
    uint16_t                    mask;
    volatile uint32_t           *dbrec;
    struct mlx5_wqe_data_seg    *wqes;
} uct_ib_mlx5_rxwq_t;


/* Address-vector for link-local scope */
typedef struct uct_ib_mlx5_base_av {
    uint32_t                    dqp_dct;
    uint8_t                     stat_rate_sl;
    uint8_t                     fl_mlid;
    uint16_t                    rlid;
} UCS_S_PACKED uct_ib_mlx5_base_av_t;


typedef struct uct_ib_mlx5_err_cqe {
    uint8_t                     rsvd0[32];
    uint32_t                    srqn;
    uint8_t                     rsvd1[16];
    uint8_t                     hw_err_synd;
    uint8_t                     hw_synd_type;
    uint8_t                     vendor_err_synd;
    uint8_t                     syndrome;
    uint32_t                    s_wqe_opcode_qpn;
    uint16_t                    wqe_counter;
    uint8_t                     signature;
    uint8_t                     op_own;
} UCS_S_PACKED uct_ib_mlx5_err_cqe_t;


/**
 * SRQ segment
 *
 * We add some SW book-keeping information in the unused HW fields:
 *  - desc           - the receive descriptor.
 *  - strides        - Number of available strides in this WQE. When it is 0,
 *                     this segment can be reposted to the HW. Relevant for
 *                     Multi-Packet SRQ only.
 *  - free           - points to the next out-of-order completed segment.
 */
typedef struct uct_rc_mlx5_srq_seg {
    union {
        struct mlx5_wqe_srq_next_seg   mlx5_srq;
        struct {
            uint16_t                   ptr_mask;
            uint16_t                   next_wqe_index; /* Network byte order */
            uint8_t                    signature;
            uint8_t                    rsvd1[1];
            uint8_t                    strides;
            uint8_t                    free;           /* Released but not posted */
            uct_ib_iface_recv_desc_t   *desc;          /* Host byte order */
        } srq;
    };
    struct mlx5_wqe_data_seg           dptr[0];
} uct_ib_mlx5_srq_seg_t;


struct uct_ib_mlx5_atomic_masked_cswap32_seg {
    uint32_t           swap;
    uint32_t           compare;
    uint32_t           swap_mask;
    uint32_t           compare_mask;
} UCS_S_PACKED;


struct uct_ib_mlx5_atomic_masked_fadd32_seg {
    uint32_t           add;
    uint32_t           filed_boundary;
    uint32_t           reserved[2];
} UCS_S_PACKED;


struct uct_ib_mlx5_atomic_masked_cswap64_seg {
    uint64_t           swap;
    uint64_t           compare;
} UCS_S_PACKED;


struct uct_ib_mlx5_atomic_masked_fadd64_seg {
    uint64_t           add;
    uint64_t           filed_boundary;
} UCS_S_PACKED;

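/*
 * Editor's note (an assumption, not stated in the original header): with the
 * masked-atomic segments above, a plain compare-and-swap is presumably
 * expressed by setting both masks to all ones, e.g. for the 32-bit variant
 * ("expected" and "new_value" are hypothetical host-order values converted to
 * network byte order):
 *
 *   struct uct_ib_mlx5_atomic_masked_cswap32_seg seg = {
 *       .swap         = htonl(new_value),
 *       .compare      = htonl(expected),
 *       .swap_mask    = UINT32_MAX,
 *       .compare_mask = UINT32_MAX
 *   };
 */
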
ucs_status_t uct_ib_mlx5_md_get_atomic_mr_id(uct_ib_md_t *md, uint8_t *mr_id);

ucs_status_t uct_ib_mlx5_iface_get_res_domain(uct_ib_iface_t *iface,
                                              uct_ib_mlx5_qp_t *txwq);

void uct_ib_mlx5_iface_put_res_domain(uct_ib_mlx5_qp_t *qp);

ucs_status_t uct_ib_mlx5_iface_create_qp(uct_ib_iface_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uct_ib_mlx5_qp_attr_t *attr);

ucs_status_t uct_ib_mlx5_modify_qp_state(uct_ib_mlx5_md_t *md,
                                         uct_ib_mlx5_qp_t *qp,
                                         enum ibv_qp_state state);

/**
 * Create a CQ with DV.
 */
ucs_status_t uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
                                   const uct_ib_iface_init_attr_t *init_attr,
                                   int preferred_cpu, size_t inl);

extern ucs_config_field_t uct_ib_mlx5_iface_config_table[];

/**
 * Get internal CQ information.
 */
ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq);

/**
 * Get a flag indicating compact AV support.
 */
ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av);

/**
 * Request completion notification.
 */
ucs_status_t uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited);

/**
 * Check for a completion with error.
 */
void uct_ib_mlx5_check_completion(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq,
                                  struct mlx5_cqe64 *cqe);

ucs_status_t
uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker,
                          uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                          unsigned bf_size,
                          uct_ib_mlx5_mmio_mode_t *mmio_mode);

/**
 * Initialize the txwq structure.
 */
ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker,
                                   uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                                   uct_ib_mlx5_txwq_t *txwq, struct ibv_qp *verbs_qp);

void uct_ib_mlx5_txwq_cleanup(uct_ib_mlx5_txwq_t* txwq);

/**
 * Reset the txwq contents and posting indices.
 */
void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq);

/**
 * Initialize the rxwq structure.
 */
ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *qp, uct_ib_mlx5_rxwq_t *wq);

/**
 * Initialize the srq structure.
 */
ucs_status_t
uct_ib_mlx5_verbs_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq,
                           size_t sg_byte_count, int num_sge);

void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head,
                               uint32_t tail, size_t sg_byte_count, int num_sge);

void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq);

/**
 * DEVX UAR API
 */
int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
                             uct_ib_mlx5_md_t *md,
                             uct_ib_mlx5_mmio_mode_t mmio_mode);

ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
                                       uct_ib_mlx5_md_t *md,
                                       uct_ib_mlx5_mmio_mode_t mmio_mode);

void uct_ib_mlx5_devx_uar_cleanup(uct_ib_mlx5_devx_uar_t *uar);

/**
 * DEVX QP API
 */

#if HAVE_DEVX

ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface,
                                        uct_ib_mlx5_qp_t *qp,
                                        uct_ib_mlx5_txwq_t *tx,
                                        uct_ib_mlx5_qp_attr_t *attr);

ucs_status_t uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp,
                                        const void *in, size_t inlen,
                                        void *out, size_t outlen);

ucs_status_t uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp,
                                              enum ibv_qp_state state);

void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp);

static inline ucs_status_t
uct_ib_mlx5_md_buf_alloc(uct_ib_mlx5_md_t *md, size_t size, int silent,
                         void **buf_p, uct_ib_mlx5_devx_umem_t *mem,
                         char *name)
{
    ucs_log_level_t level = silent ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR;
    ucs_status_t status;
    void *buf;
    int ret;

    ret = ucs_posix_memalign(&buf, ucs_get_page_size(), size, name);
    if (ret != 0) {
        ucs_log(level, "failed to allocate buffer of %zu bytes: %m", size);
        return UCS_ERR_NO_MEMORY;
    }

    if (md->super.fork_init) {
        ret = madvise(buf, size, MADV_DONTFORK);
        if (ret != 0) {
            ucs_log(level, "madvise(DONTFORK, buf=%p, len=%zu) failed: %m", buf, size);
            status = UCS_ERR_IO_ERROR;
            goto err_free;
        }
    }

    mem->size = size;
    mem->mem  = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, buf, size, 0);
    if (mem->mem == NULL) {
        ucs_log(level, "mlx5dv_devx_umem_reg() failed: %m");
        status = UCS_ERR_NO_MEMORY;
        goto err_dofork;
    }

    *buf_p = buf;
    return UCS_OK;

err_dofork:
    if (md->super.fork_init) {
        madvise(buf, size, MADV_DOFORK);
    }
err_free:
    ucs_free(buf);

    return status;
}

static inline void
uct_ib_mlx5_md_buf_free(uct_ib_mlx5_md_t *md, void *buf, uct_ib_mlx5_devx_umem_t *mem)
{
    int ret;

    if (buf == NULL) {
        return;
    }

    mlx5dv_devx_umem_dereg(mem->mem);
    if (md->super.fork_init) {
        ret = madvise(buf, mem->size, MADV_DOFORK);
        if (ret != 0) {
            ucs_warn("madvise(DOFORK, buf=%p, len=%zu) failed: %m", buf, mem->size);
        }
    }
    ucs_free(buf);
}
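
/*
 * Usage sketch (editor's illustration; the size and name arguments are
 * hypothetical): the md->zero_buf / md->zero_mem pair declared above is
 * presumably managed with this helper pair, along the lines of:
 *
 *   status = uct_ib_mlx5_md_buf_alloc(md, ucs_get_page_size(), 0,
 *                                     &md->zero_buf, &md->zero_mem, "zero umem");
 *   ...
 *   uct_ib_mlx5_md_buf_free(md, md->zero_buf, &md->zero_mem);
 */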

#else

static inline ucs_status_t
uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface,
                           uct_ib_mlx5_qp_t *qp,
                           uct_ib_mlx5_txwq_t *tx,
                           uct_ib_mlx5_qp_attr_t *attr)
{
    return UCS_ERR_UNSUPPORTED;
}

static inline ucs_status_t
uct_ib_mlx5_devx_modify_qp(uct_ib_mlx5_qp_t *qp,
                           enum ibv_qp_state state)
{
    return UCS_ERR_UNSUPPORTED;
}

static inline ucs_status_t
uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp, enum ibv_qp_state state)
{
    return UCS_ERR_UNSUPPORTED;
}

static inline void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) { }

#endif

static inline uct_ib_mlx5_dbrec_t *uct_ib_mlx5_get_dbrec(uct_ib_mlx5_md_t *md)
{
    uct_ib_mlx5_dbrec_t *dbrec;

    ucs_recursive_spin_lock(&md->dbrec_lock);
    dbrec = (uct_ib_mlx5_dbrec_t *)ucs_mpool_get_inline(&md->dbrec_pool);
    ucs_recursive_spin_unlock(&md->dbrec_lock);
    if (dbrec != NULL) {
        dbrec->db[MLX5_SND_DBR] = 0;
        dbrec->db[MLX5_RCV_DBR] = 0;
        dbrec->md               = md;
    }

    return dbrec;
}

static inline void uct_ib_mlx5_put_dbrec(uct_ib_mlx5_dbrec_t *dbrec)
{
    uct_ib_mlx5_md_t *md = dbrec->md;

    ucs_recursive_spin_lock(&md->dbrec_lock);
    ucs_mpool_put_inline(dbrec);
    ucs_recursive_spin_unlock(&md->dbrec_lock);
}
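
/*
 * Usage sketch (editor's illustration): a DEVX-created QP or SRQ typically
 * takes its doorbell record from the MD pool and returns it on cleanup:
 *
 *   qp->devx.dbrec = uct_ib_mlx5_get_dbrec(md);
 *   if (qp->devx.dbrec == NULL) {
 *       return UCS_ERR_NO_MEMORY;
 *   }
 *   ...
 *   uct_ib_mlx5_put_dbrec(qp->devx.dbrec);
 */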

#endif