/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "ib_mlx5.h"
#include "ib_mlx5.inl"
#include "ib_mlx5_log.h"
#include <uct/ib/mlx5/exp/ib_exp.h>
#include <uct/ib/base/ib_verbs.h>
#include <uct/ib/base/ib_device.h>
#include <ucs/arch/bitops.h>
#include <ucs/debug/log.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/sys.h>
#include <string.h>


static const char *uct_ib_mlx5_mmio_modes[] = {
    [UCT_IB_MLX5_MMIO_MODE_BF_POST]    = "bf_post",
    [UCT_IB_MLX5_MMIO_MODE_BF_POST_MT] = "bf_post_mt",
    [UCT_IB_MLX5_MMIO_MODE_DB]         = "db",
    [UCT_IB_MLX5_MMIO_MODE_AUTO]       = "auto",
    [UCT_IB_MLX5_MMIO_MODE_LAST]       = NULL
};

ucs_config_field_t uct_ib_mlx5_iface_config_table[] = {
#if HAVE_IBV_DM
    {"DM_SIZE", "2k",
     "Device Memory segment size (0 - disabled)",
     ucs_offsetof(uct_ib_mlx5_iface_config_t, dm.seg_len), UCS_CONFIG_TYPE_MEMUNITS},
    {"DM_COUNT", "1",
     "Device Memory segment count (0 - disabled)",
     ucs_offsetof(uct_ib_mlx5_iface_config_t, dm.count), UCS_CONFIG_TYPE_UINT},
#endif

    {"MMIO_MODE", "auto",
     "How to write to the MMIO register when posting sends on a QP. One of the following:\n"
     " bf_post    - BlueFlame post, write the WQE fully to the MMIO register.\n"
     " bf_post_mt - Thread-safe BlueFlame, same as bf_post, but the same MMIO register can be\n"
     "              used by multiple threads.\n"
     " db         - Doorbell mode, write only 8 bytes to the MMIO register, followed by a memory\n"
     "              store fence, which makes sure the doorbell goes out on the bus.\n"
     " auto       - Select the best mode according to the worker thread mode.",
     ucs_offsetof(uct_ib_mlx5_iface_config_t, mmio_mode),
     UCS_CONFIG_TYPE_ENUM(uct_ib_mlx5_mmio_modes)},

    {NULL}
};

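/* Create a completion queue for the given direction. When mlx5dv supports
 * setting the CQE size, request 128-byte CQEs if more than 32 bytes of inline
 * receive data are needed, otherwise 64-byte CQEs; without that support, fall
 * back to the generic verbs CQ creation. */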
ucs_status_t uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
                                   const uct_ib_iface_init_attr_t *init_attr,
                                   int preferred_cpu, size_t inl)
{
#if HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    struct ibv_cq *cq;
    struct ibv_cq_init_attr_ex cq_attr = {};
    struct mlx5dv_cq_init_attr dv_attr = {};

    cq_attr.cqe         = init_attr->cq_len[dir];
    cq_attr.channel     = iface->comp_channel;
    cq_attr.comp_vector = preferred_cpu;
    if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) {
        cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS;
        cq_attr.flags     = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN;
    }
    dv_attr.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE;
    dv_attr.cqe_size  = uct_ib_get_cqe_size(inl > 32 ? 128 : 64);
    cq = ibv_cq_ex_to_cq(mlx5dv_create_cq(dev->ibv_context, &cq_attr, &dv_attr));
    if (!cq) {
        ucs_error("mlx5dv_create_cq(cqe=%d) failed: %m", cq_attr.cqe);
        return UCS_ERR_IO_ERROR;
    }

    iface->cq[dir]                 = cq;
    iface->config.max_inl_cqe[dir] = dv_attr.cqe_size / 2;
    return UCS_OK;
#else
    return uct_ib_verbs_create_cq(iface, dir, init_attr, preferred_cpu, inl);
#endif
}

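/* Expose the internals of a verbs CQ (buffer, doorbell record, UAR, CQE size)
 * through mlx5dv so the CQ can be polled directly, and mark all CQEs as
 * HW-owned so that polling an empty CQ returns immediately. */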
ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq)
{
    uct_ib_mlx5dv_cq_t dcq = {};
    uct_ib_mlx5dv_t obj = {};
    struct mlx5_cqe64 *cqe;
    unsigned cqe_size;
    ucs_status_t status;
    int ret, i;

    obj.dv.cq.in = cq;
    obj.dv.cq.out = &dcq.dv;
    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    mlx5_cq->cq_buf    = dcq.dv.buf;
    mlx5_cq->cq_ci     = 0;
    mlx5_cq->cq_sn     = 0;
    mlx5_cq->cq_length = dcq.dv.cqe_cnt;
    mlx5_cq->cq_num    = dcq.dv.cqn;
#if HAVE_STRUCT_MLX5DV_CQ_CQ_UAR
    mlx5_cq->uar       = dcq.dv.cq_uar;
#else
    /* coverity[var_deref_model] */
    mlx5_cq->uar       = uct_dv_get_info_uar0(dcq.dv.uar);
#endif
    mlx5_cq->dbrec     = dcq.dv.dbrec;
    cqe_size           = dcq.dv.cqe_size;

    /* Move the buffer pointer forward for 128b CQEs, so that polling gets a
     * pointer to the second 64b half of each CQE.
     */
    mlx5_cq->cq_buf = UCS_PTR_BYTE_OFFSET(mlx5_cq->cq_buf,
                                          cqe_size - sizeof(struct mlx5_cqe64));

    ret = ibv_exp_cq_ignore_overrun(cq);
    if (ret != 0) {
        ucs_error("Failed to modify send CQ to ignore overrun: %s", strerror(ret));
        return UCS_ERR_UNSUPPORTED;
    }

    mlx5_cq->cqe_size_log = ucs_ilog2(cqe_size);
    ucs_assert_always((1ul << mlx5_cq->cqe_size_log) == cqe_size);

    /* Set the owner bit for all CQEs, so that each CQE looks like it is in HW
     * ownership. This way, CQ polling functions return immediately when no
     * CQE is ready, without having to check the opcode for MLX5_CQE_INVALID. */
    for (i = 0; i < mlx5_cq->cq_length; ++i) {
        cqe = uct_ib_mlx5_get_cqe(mlx5_cq, i);
        cqe->op_own |= MLX5_CQE_OWNER_MASK;
    }

    return UCS_OK;
}

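/* A resource domain is a per-worker object shared by QPs created on the same
 * device context: with the experimental verbs API it wraps an
 * ibv_exp_res_domain, and with ibv_alloc_td() it wraps a thread domain plus a
 * parent protection domain. The cmp/init callbacks below are used by
 * uct_worker_tl_data_get() to look up or create the shared instance. */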
static int
uct_ib_mlx5_res_domain_cmp(uct_ib_mlx5_res_domain_t *res_domain,
                           uct_ib_md_t *md, uct_priv_worker_t *worker)
{
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    return res_domain->ibv_domain->context == md->dev.ibv_context;
#elif HAVE_DECL_IBV_ALLOC_TD
    return res_domain->pd->context == md->dev.ibv_context;
#else
    return 1;
#endif
}

static ucs_status_t
uct_ib_mlx5_res_domain_init(uct_ib_mlx5_res_domain_t *res_domain,
                            uct_ib_md_t *md, uct_priv_worker_t *worker)
{
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    struct ibv_exp_res_domain_init_attr attr;

    attr.comp_mask    = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
                        IBV_EXP_RES_DOMAIN_MSG_MODEL;
    attr.msg_model    = IBV_EXP_MSG_LOW_LATENCY;

    switch (worker->thread_mode) {
    case UCS_THREAD_MODE_SINGLE:
        attr.thread_model = IBV_EXP_THREAD_SINGLE;
        break;
    case UCS_THREAD_MODE_SERIALIZED:
        attr.thread_model = IBV_EXP_THREAD_UNSAFE;
        break;
    default:
        attr.thread_model = IBV_EXP_THREAD_SAFE;
        break;
    }

    res_domain->ibv_domain = ibv_exp_create_res_domain(md->dev.ibv_context, &attr);
    if (res_domain->ibv_domain == NULL) {
        ucs_error("ibv_exp_create_res_domain() on %s failed: %m",
                  uct_ib_device_name(&md->dev));
        return UCS_ERR_IO_ERROR;
    }
#elif HAVE_DECL_IBV_ALLOC_TD
    struct ibv_parent_domain_init_attr attr;
    struct ibv_td_init_attr td_attr;

    if (worker->thread_mode == UCS_THREAD_MODE_MULTI) {
        td_attr.comp_mask = 0;
        res_domain->td = ibv_alloc_td(md->dev.ibv_context, &td_attr);
        if (res_domain->td == NULL) {
            ucs_error("ibv_alloc_td() on %s failed: %m",
                      uct_ib_device_name(&md->dev));
            return UCS_ERR_IO_ERROR;
        }
    } else {
        res_domain->td = NULL;
        res_domain->pd = md->pd;
        return UCS_OK;
    }

    attr.td = res_domain->td;
    attr.pd = md->pd;
    attr.comp_mask = 0;
    res_domain->pd = ibv_alloc_parent_domain(md->dev.ibv_context, &attr);
    if (res_domain->pd == NULL) {
        ucs_error("ibv_alloc_parent_domain() on %s failed: %m",
                  uct_ib_device_name(&md->dev));
        ibv_dealloc_td(res_domain->td);
        return UCS_ERR_IO_ERROR;
    }
#endif
    return UCS_OK;
}

static void uct_ib_mlx5_res_domain_cleanup(uct_ib_mlx5_res_domain_t *res_domain)
{
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    struct ibv_exp_destroy_res_domain_attr attr;
    int ret;

    attr.comp_mask = 0;
    ret = ibv_exp_destroy_res_domain(res_domain->ibv_domain->context,
                                     res_domain->ibv_domain, &attr);
    if (ret != 0) {
        ucs_warn("ibv_exp_destroy_res_domain() failed: %m");
    }
#elif HAVE_DECL_IBV_ALLOC_TD
    int ret;

    if (res_domain->td != NULL) {
        ret = ibv_dealloc_pd(res_domain->pd);
        if (ret != 0) {
            ucs_warn("ibv_dealloc_pd() failed: %m");
            return;
        }

        ret = ibv_dealloc_td(res_domain->td);
        if (ret != 0) {
            ucs_warn("ibv_dealloc_td() failed: %m");
        }
    }
#endif
}

ucs_status_t uct_ib_mlx5_iface_get_res_domain(uct_ib_iface_t *iface,
                                              uct_ib_mlx5_qp_t *qp)
{
    qp->verbs.rd = uct_worker_tl_data_get(iface->super.worker,
                                          UCT_IB_MLX5_RES_DOMAIN_KEY,
                                          uct_ib_mlx5_res_domain_t,
                                          uct_ib_mlx5_res_domain_cmp,
                                          uct_ib_mlx5_res_domain_init,
                                          uct_ib_iface_md(iface),
                                          iface->super.worker);
    if (UCS_PTR_IS_ERR(qp->verbs.rd)) {
        return UCS_PTR_STATUS(qp->verbs.rd);
    }

    qp->type = UCT_IB_MLX5_OBJ_TYPE_VERBS;

    return UCS_OK;
}

void uct_ib_mlx5_iface_put_res_domain(uct_ib_mlx5_qp_t *qp)
{
    if (qp->type == UCT_IB_MLX5_OBJ_TYPE_VERBS) {
        uct_worker_tl_data_put(qp->verbs.rd, uct_ib_mlx5_res_domain_cleanup);
    }
}

ucs_status_t uct_ib_mlx5_iface_create_qp(uct_ib_iface_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uct_ib_mlx5_qp_attr_t *attr)
{
    ucs_status_t status;

    status = uct_ib_mlx5_iface_fill_attr(iface, qp, attr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_exp_qp_fill_attr(iface, &attr->super);
    status = uct_ib_iface_create_qp(iface, &attr->super, &qp->verbs.qp);
    if (status != UCS_OK) {
        return status;
    }

    qp->qp_num = qp->verbs.qp->qp_num;
    return UCS_OK;
}

#if !HAVE_DEVX
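/* Detect whether the device can use the compact address vector format by
 * creating an address handle for this interface's own address and checking
 * the extended-AV flag which the driver sets in the resulting mlx5 AV. */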
ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av)
{
    struct mlx5_wqe_av  mlx5_av;
    struct ibv_ah      *ah;
    uct_ib_address_t   *ib_addr;
    ucs_status_t        status;
    struct ibv_ah_attr  ah_attr;
    enum ibv_mtu        path_mtu;

    /* coverity[result_independent_of_operands] */
    ib_addr = ucs_alloca((size_t)iface->addr_size);

    status = uct_ib_iface_get_device_address(&iface->super.super,
                                             (uct_device_addr_t*)ib_addr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, 0, &ah_attr, &path_mtu);
    ah_attr.is_global = iface->config.force_global_addr;
    status = uct_ib_iface_create_ah(iface, &ah_attr, &ah);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_mlx5_get_av(ah, &mlx5_av);

    /* Check the MLX5_EXTENDED_UD_AV flag copied from the driver; if the flag
     * is not set, the device supports the compact address vector. */
    *compact_av = !(mlx5_av_base(&mlx5_av)->dqp_dct & UCT_IB_MLX5_EXTENDED_UD_AV);
    return UCS_OK;
}
#endif

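/* Handle a CQE whose opcode indicates that it is not a regular completion:
 * requester errors are reported through the error callback, responder errors
 * are fatal, and anything else is left to be re-examined on the next poll. */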
void uct_ib_mlx5_check_completion(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq,
                                  struct mlx5_cqe64 *cqe)
{
    ucs_status_t status;

    switch (cqe->op_own >> 4) {
    case MLX5_CQE_REQ_ERR:
        /* Update ci before invoking the error callback, since it may poll the CQ */
        UCS_STATIC_ASSERT(MLX5_CQE_REQ_ERR & (UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK >> 4));
        ++cq->cq_ci;
        status = uct_ib_mlx5_completion_with_err(iface, (void*)cqe, NULL,
                                                 UCS_LOG_LEVEL_DEBUG);
        iface->ops->handle_failure(iface, cqe, status);
        return;
    case MLX5_CQE_RESP_ERR:
        /* Local side failure - treat as fatal */
        UCS_STATIC_ASSERT(MLX5_CQE_RESP_ERR & (UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK >> 4));
        ++cq->cq_ci;
        uct_ib_mlx5_completion_with_err(iface, (void*)cqe, NULL,
                                        UCS_LOG_LEVEL_FATAL);
        return;
    default:
        /* The CQE might still be updated by HW. Skip it now; it will be
         * handled on the next polling round. */
        return;
    }
}

static int uct_ib_mlx5_mmio_cmp(uct_ib_mlx5_mmio_reg_t *reg, uintptr_t addr,
                                unsigned bf_size)
{
    return (reg->addr.uint & ~UCT_IB_MLX5_BF_REG_SIZE) ==
           (addr & ~UCT_IB_MLX5_BF_REG_SIZE);
}

static ucs_status_t uct_ib_mlx5_mmio_init(uct_ib_mlx5_mmio_reg_t *reg,
                                          uintptr_t addr,
                                          uct_ib_mlx5_mmio_mode_t mmio_mode)
{
    reg->addr.uint = addr;
    reg->mode      = mmio_mode;
    return UCS_OK;
}

static void uct_ib_mlx5_mmio_cleanup(uct_ib_mlx5_mmio_reg_t *reg)
{
}

int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
                             uct_ib_mlx5_md_t *md,
                             uct_ib_mlx5_mmio_mode_t mmio_mode)
{
    return uar->ctx == md->super.dev.ibv_context;
}

ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
                                       uct_ib_mlx5_md_t *md,
                                       uct_ib_mlx5_mmio_mode_t mmio_mode)
{
#if HAVE_DEVX
    uar->uar            = mlx5dv_devx_alloc_uar(md->super.dev.ibv_context, 0);
    if (uar->uar == NULL) {
        ucs_error("mlx5dv_devx_alloc_uar() failed: %m");
        return UCS_ERR_NO_MEMORY;
    }

    uar->super.addr.ptr = uar->uar->reg_addr;
    uar->super.mode     = mmio_mode;
    uar->ctx            = md->super.dev.ibv_context;

    return UCS_OK;
#else
    return UCS_ERR_UNSUPPORTED;
#endif
}

void uct_ib_mlx5_devx_uar_cleanup(uct_ib_mlx5_devx_uar_t *uar)
{
#if HAVE_DEVX
    mlx5dv_devx_free_uar(uar->uar);
#endif
}

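/* Return the TX work queue to its initial posting state and zero out the
 * whole WQE buffer. */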
void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq)
{
    txwq->curr       = txwq->qstart;
    txwq->sw_pi      = 0;
    txwq->prev_sw_pi = UINT16_MAX;
#if UCS_ENABLE_ASSERT
    txwq->hw_ci      = 0xFFFF;
#endif
    uct_ib_fence_info_init(&txwq->fi);
    memset(txwq->qstart, 0, UCS_PTR_BYTE_DIFF(txwq->qstart, txwq->qend));
}

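/* Resolve the MMIO mode to use for posting: honor an explicit configuration
 * value; otherwise, if a BlueFlame register is available, pick bf_post for
 * single-threaded workers and bf_post_mt for serialized ones, and fall back
 * to doorbell mode when there is no BlueFlame register. */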
ucs_status_t
uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker,
                          uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                          unsigned bf_size,
                          uct_ib_mlx5_mmio_mode_t *mmio_mode)
{
    ucs_assert(cfg_mmio_mode < UCT_IB_MLX5_MMIO_MODE_LAST);

    if (cfg_mmio_mode != UCT_IB_MLX5_MMIO_MODE_AUTO) {
        *mmio_mode = cfg_mmio_mode;
    } else if (bf_size > 0) {
        if (worker->thread_mode == UCS_THREAD_MODE_SINGLE) {
            *mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST;
        } else if (worker->thread_mode == UCS_THREAD_MODE_SERIALIZED) {
            *mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST_MT;
        } else {
            ucs_error("unsupported thread mode for mlx5: %d", worker->thread_mode);
            return UCS_ERR_UNSUPPORTED;
        }
    } else {
        *mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB;
    }

    return UCS_OK;
}

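/* Initialize the TX work queue descriptor from a verbs QP: query the send
 * queue layout via mlx5dv, validate the WQE stride and BlueFlame register
 * size, select the MMIO mode, and attach the per-worker MMIO register
 * object. */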
ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker,
                                   uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                                   uct_ib_mlx5_txwq_t *txwq,
                                   struct ibv_qp *verbs_qp)
{
    uct_ib_mlx5_mmio_mode_t mmio_mode;
    uct_ib_mlx5dv_qp_t qp_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;

    obj.dv.qp.in = verbs_qp;
    obj.dv.qp.out = &qp_info.dv;

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    if ((qp_info.dv.sq.stride != MLX5_SEND_WQE_BB) || !ucs_is_pow2(qp_info.dv.sq.wqe_cnt) ||
        ((qp_info.dv.bf.size != 0) && (qp_info.dv.bf.size != UCT_IB_MLX5_BF_REG_SIZE)))
    {
        ucs_error("mlx5 device parameters not suitable for transport "
                  "bf.size(%d) %d, sq.stride(%d) %d, wqe_cnt %d",
                  UCT_IB_MLX5_BF_REG_SIZE, qp_info.dv.bf.size,
                  MLX5_SEND_WQE_BB, qp_info.dv.sq.stride, qp_info.dv.sq.wqe_cnt);
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_mlx5_get_mmio_mode(worker, cfg_mmio_mode,
                                       qp_info.dv.bf.size, &mmio_mode);
    if (status != UCS_OK) {
        return status;
    }

    ucs_debug("tx wq %d bytes [bb=%d, nwqe=%d] mmio_mode %s",
              qp_info.dv.sq.stride * qp_info.dv.sq.wqe_cnt,
              qp_info.dv.sq.stride, qp_info.dv.sq.wqe_cnt,
              uct_ib_mlx5_mmio_modes[mmio_mode]);

    txwq->qstart     = qp_info.dv.sq.buf;
    txwq->qend       = UCS_PTR_BYTE_OFFSET(qp_info.dv.sq.buf,
                                           qp_info.dv.sq.stride * qp_info.dv.sq.wqe_cnt);
    txwq->reg        = uct_worker_tl_data_get(worker,
                                              UCT_IB_MLX5_WORKER_BF_KEY,
                                              uct_ib_mlx5_mmio_reg_t,
                                              uct_ib_mlx5_mmio_cmp,
                                              uct_ib_mlx5_mmio_init,
                                              (uintptr_t)qp_info.dv.bf.reg,
                                              mmio_mode);
    if (UCS_PTR_IS_ERR(txwq->reg)) {
        return UCS_PTR_STATUS(txwq->reg);
    }

    /* cppcheck-suppress autoVariables */
    txwq->dbrec      = &qp_info.dv.dbrec[MLX5_SND_DBR];
    /* Reserve 2x UCT_IB_MLX5_MAX_BB because:
     *  - on completion we only get the index of the last WQE, and we do not
     *    really know how many BBs it occupied (though no more than the maximum)
     *  - on send we only check that there is at least one free BB; the exact
     *    number of BBs is known only when the WQE is actually built.
     */
    txwq->bb_max     = qp_info.dv.sq.wqe_cnt - 2 * UCT_IB_MLX5_MAX_BB;
    ucs_assert_always(txwq->bb_max > 0);

    uct_ib_mlx5_txwq_reset(txwq);
    return UCS_OK;
}

void uct_ib_mlx5_txwq_cleanup(uct_ib_mlx5_txwq_t *txwq)
{
    uct_ib_mlx5_devx_uar_t *uar = ucs_derived_of(txwq->reg,
                                                 uct_ib_mlx5_devx_uar_t);
    switch (txwq->super.type) {
    case UCT_IB_MLX5_OBJ_TYPE_DEVX:
        uct_worker_tl_data_put(uar, uct_ib_mlx5_devx_uar_cleanup);
        break;
    case UCT_IB_MLX5_OBJ_TYPE_VERBS:
        uct_ib_mlx5_iface_put_res_domain(&txwq->super);
        uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup);
        break;
    case UCT_IB_MLX5_OBJ_TYPE_LAST:
        if (txwq->reg != NULL) {
            uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup);
        }
    }
}

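/* Extract the receive queue layout of a verbs QP via mlx5dv, verifying that
 * the WQE count is a power of two and that each WQE holds exactly one data
 * segment, and initialize the RX work queue descriptor. */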
ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *verbs_qp, uct_ib_mlx5_rxwq_t *rxwq)
{
    uct_ib_mlx5dv_qp_t qp_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;

    obj.dv.qp.in = verbs_qp;
    obj.dv.qp.out = &qp_info.dv;

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    if (!ucs_is_pow2(qp_info.dv.rq.wqe_cnt) ||
        qp_info.dv.rq.stride != sizeof(struct mlx5_wqe_data_seg)) {
        ucs_error("mlx5 rx wq [count=%d stride=%d] has invalid parameters",
                  qp_info.dv.rq.wqe_cnt,
                  qp_info.dv.rq.stride);
        return UCS_ERR_IO_ERROR;
    }
    rxwq->wqes            = qp_info.dv.rq.buf;
    rxwq->rq_wqe_counter  = 0;
    rxwq->cq_wqe_counter  = 0;
    rxwq->mask            = qp_info.dv.rq.wqe_cnt - 1;
    /* cppcheck-suppress autoVariables */
    rxwq->dbrec           = &qp_info.dv.dbrec[MLX5_RCV_DBR];
    memset(rxwq->wqes, 0, qp_info.dv.rq.wqe_cnt * sizeof(struct mlx5_wqe_data_seg));

    return UCS_OK;
}

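/* Initialize the SRQ descriptor from a verbs SRQ created by the driver,
 * verifying that the layout (head, stride, ring length) matches what the
 * transport expects before taking over the buffer for direct posting. */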
ucs_status_t
uct_ib_mlx5_verbs_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq,
                           size_t sg_byte_count, int sge_num)
{
    uct_ib_mlx5dv_srq_t srq_info = {};
    uct_ib_mlx5dv_t obj          = {};
    ucs_status_t status;
    uint16_t stride;

    obj.dv.srq.in         = verbs_srq;
    obj.dv.srq.out        = &srq_info.dv;
#if HAVE_DEVX
    srq_info.dv.comp_mask = MLX5DV_SRQ_MASK_SRQN;
#endif

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_SRQ);
    if (status != UCS_OK) {
        return status;
    }

#if HAVE_DEVX
    srq->srq_num = srq_info.dv.srqn;
#else
    srq->srq_num = 0;
#endif

    if (srq_info.dv.head != 0) {
        ucs_error("SRQ head is not 0 (%d)", srq_info.dv.head);
        return UCS_ERR_NO_DEVICE;
    }

    stride = uct_ib_mlx5_srq_stride(sge_num);
    if (srq_info.dv.stride != stride) {
        ucs_error("SRQ stride is not %u (%d), sge_num %d",
                  stride, srq_info.dv.stride, sge_num);
        return UCS_ERR_NO_DEVICE;
    }

    if (!ucs_is_pow2(srq_info.dv.tail + 1)) {
        ucs_error("SRQ length is not a power of 2 (%d)", srq_info.dv.tail + 1);
        return UCS_ERR_NO_DEVICE;
    }

    srq->buf = srq_info.dv.buf;
    srq->db  = srq_info.dv.dbrec;
    uct_ib_mlx5_srq_buff_init(srq, srq_info.dv.head, srq_info.dv.tail,
                              sg_byte_count, sge_num);

    return UCS_OK;
}

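/* Initialize the SRQ ring buffer: link the WQEs from head to tail into a
 * cyclic list and preset the byte count of every scatter entry. */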
void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head,
                               uint32_t tail, size_t sg_byte_count, int sge_num)
{
    uct_ib_mlx5_srq_seg_t *seg;
    unsigned i, j;

    srq->free_idx  = tail;
    srq->ready_idx = UINT16_MAX;
    srq->sw_pi     = UINT16_MAX;
    srq->mask      = tail;
    srq->tail      = tail;
    srq->stride    = uct_ib_mlx5_srq_stride(sge_num);

    for (i = head; i <= tail; ++i) {
        seg = uct_ib_mlx5_srq_get_wqe(srq, i);
        seg->srq.next_wqe_index = htons((i + 1) & tail);
        seg->srq.ptr_mask       = 0;
        seg->srq.free           = 0;
        seg->srq.desc           = NULL;
        seg->srq.strides        = sge_num;
        for (j = 0; j < sge_num; ++j) {
            seg->dptr[j].byte_count = htonl(sg_byte_count);
        }
    }
}

void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq,
                                   struct ibv_srq *verbs_srq)
{
    uct_ib_mlx5dv_srq_t srq_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;

    if (srq->type != UCT_IB_MLX5_OBJ_TYPE_VERBS) {
        return;
    }

    /* Check that the mlx5 driver has not modified the SRQ behind our back */
    obj.dv.srq.in = verbs_srq;
    obj.dv.srq.out = &srq_info.dv;

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_SRQ);
    ucs_assert_always(status == UCS_OK);
    ucs_assertv_always(srq->tail == srq_info.dv.tail, "srq->tail=%d srq_info.tail=%d",
                       srq->tail, srq_info.dv.tail);
}

ucs_status_t uct_ib_mlx5_modify_qp_state(uct_ib_mlx5_md_t *md,
                                         uct_ib_mlx5_qp_t *qp,
                                         enum ibv_qp_state state)
{
    if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) {
        return uct_ib_mlx5_devx_modify_qp_state(qp, state);
    } else {
        return uct_ib_modify_qp(qp->verbs.qp, state);
    }
}

ucs_status_t uct_ib_mlx5_md_get_atomic_mr_id(uct_ib_md_t *ibmd, uint8_t *mr_id)
{
    uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);

#if HAVE_EXP_UMR
    if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) {
        goto unsupported;
    }
#else
    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_DEVX)) {
        goto unsupported;
    }
#endif

    /* Generate the atomic UMR id. We want UMRs for the same virtual address to
     * have different ids across processes.
     *
     * Usually parallel processes running on the same node as part of a single
     * job have consecutive PIDs, for example MPI ranks or SLURM-spawned tasks.
     */
    *mr_id = getpid() % 256;
    return UCS_OK;

unsupported:
    *mr_id = 0;
    return UCS_ERR_UNSUPPORTED;
}