/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "ib_mlx5.h"
#include "ib_mlx5.inl"
#include "ib_mlx5_log.h"
#include <uct/ib/mlx5/exp/ib_exp.h>
#include <uct/ib/base/ib_verbs.h>
#include <uct/ib/base/ib_device.h>
#include <ucs/arch/bitops.h>
#include <ucs/debug/log.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/sys.h>
#include <string.h>


static const char *uct_ib_mlx5_mmio_modes[] = {
    [UCT_IB_MLX5_MMIO_MODE_BF_POST]    = "bf_post",
    [UCT_IB_MLX5_MMIO_MODE_BF_POST_MT] = "bf_post_mt",
    [UCT_IB_MLX5_MMIO_MODE_DB]         = "db",
    [UCT_IB_MLX5_MMIO_MODE_AUTO]       = "auto",
    [UCT_IB_MLX5_MMIO_MODE_LAST]       = NULL
};

ucs_config_field_t uct_ib_mlx5_iface_config_table[] = {
#if HAVE_IBV_DM
    {"DM_SIZE", "2k",
     "Device Memory segment size (0 - disabled)",
     ucs_offsetof(uct_ib_mlx5_iface_config_t, dm.seg_len), UCS_CONFIG_TYPE_MEMUNITS},
    {"DM_COUNT", "1",
     "Device Memory segments count (0 - disabled)",
     ucs_offsetof(uct_ib_mlx5_iface_config_t, dm.count), UCS_CONFIG_TYPE_UINT},
#endif

    {"MMIO_MODE", "auto",
     "How to write to the MMIO register when posting sends on a QP. One of the following:\n"
     " bf_post    - BlueFlame post, write the WQE fully to the MMIO register.\n"
     " bf_post_mt - Thread-safe BlueFlame, same as bf_post, but the same MMIO register\n"
     "              can be used by multiple threads.\n"
     " db         - Doorbell mode, write only 8 bytes to the MMIO register, followed by a\n"
     "              memory store fence, which makes sure the doorbell goes out on the bus.\n"
     " auto       - Select the best mode according to the worker thread mode.",
     ucs_offsetof(uct_ib_mlx5_iface_config_t, mmio_mode),
     UCS_CONFIG_TYPE_ENUM(uct_ib_mlx5_mmio_modes)},

    {NULL}
};

ucs_status_t uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
                                   const uct_ib_iface_init_attr_t *init_attr,
                                   int preferred_cpu, size_t inl)
{
#if HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    struct ibv_cq *cq;
    struct ibv_cq_init_attr_ex cq_attr = {};
    struct mlx5dv_cq_init_attr dv_attr = {};

    cq_attr.cqe         = init_attr->cq_len[dir];
    cq_attr.channel     = iface->comp_channel;
    cq_attr.comp_vector = preferred_cpu;
    if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) {
        cq_attr.comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS;
        cq_attr.flags     = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN;
    }
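    /* Use 128-byte CQEs when more than 32 bytes of data need to be delivered
     * inline in the CQE; otherwise the default 64-byte CQEs are sufficient. */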
    dv_attr.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE;
    dv_attr.cqe_size  = uct_ib_get_cqe_size(inl > 32 ? 128 : 64);
    cq = ibv_cq_ex_to_cq(mlx5dv_create_cq(dev->ibv_context, &cq_attr, &dv_attr));
    if (!cq) {
        ucs_error("mlx5dv_create_cq(cqe=%d) failed: %m", cq_attr.cqe);
        return UCS_ERR_IO_ERROR;
    }

    iface->cq[dir] = cq;
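    /* Up to half of the CQE can be used to scatter received data inline
     * (32 bytes with a 64-byte CQE, 64 bytes with a 128-byte CQE). */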
    iface->config.max_inl_cqe[dir] = dv_attr.cqe_size / 2;
    return UCS_OK;
#else
    return uct_ib_verbs_create_cq(iface, dir, init_attr, preferred_cpu, inl);
#endif
}

ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq)
{
    uct_ib_mlx5dv_cq_t dcq = {};
    uct_ib_mlx5dv_t obj = {};
    struct mlx5_cqe64 *cqe;
    unsigned cqe_size;
    ucs_status_t status;
    int ret, i;

    obj.dv.cq.in  = cq;
    obj.dv.cq.out = &dcq.dv;
    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    mlx5_cq->cq_buf    = dcq.dv.buf;
    mlx5_cq->cq_ci     = 0;
    mlx5_cq->cq_sn     = 0;
    mlx5_cq->cq_length = dcq.dv.cqe_cnt;
    mlx5_cq->cq_num    = dcq.dv.cqn;
#if HAVE_STRUCT_MLX5DV_CQ_CQ_UAR
    mlx5_cq->uar       = dcq.dv.cq_uar;
#else
    /* coverity[var_deref_model] */
    mlx5_cq->uar       = uct_dv_get_info_uar0(dcq.dv.uar);
#endif
    mlx5_cq->dbrec     = dcq.dv.dbrec;
    cqe_size           = dcq.dv.cqe_size;

    /* Move the buffer pointer forward for 128-byte CQEs, so that polling gets
     * a pointer to the second 64 bytes of each CQE. */
    mlx5_cq->cq_buf = UCS_PTR_BYTE_OFFSET(mlx5_cq->cq_buf,
                                          cqe_size - sizeof(struct mlx5_cqe64));

    ret = ibv_exp_cq_ignore_overrun(cq);
    if (ret != 0) {
        ucs_error("Failed to modify send CQ to ignore overrun: %s", strerror(ret));
        return UCS_ERR_UNSUPPORTED;
    }

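    /* The CQE size must be a power of two; store its log2 so that CQE lookup
     * can use a shift instead of a multiplication. */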
    mlx5_cq->cqe_size_log = ucs_ilog2(cqe_size);
    ucs_assert_always((1ul << mlx5_cq->cqe_size_log) == cqe_size);

    /* Set the owner bit for all CQEs, so that each CQE looks as if it is in HW
     * ownership. This way the CQ polling functions return immediately when no
     * CQE is ready, and there is no need to check the opcode for the
     * MLX5_CQE_INVALID value anymore. */
    for (i = 0; i < mlx5_cq->cq_length; ++i) {
        cqe = uct_ib_mlx5_get_cqe(mlx5_cq, i);
        cqe->op_own |= MLX5_CQE_OWNER_MASK;
    }

    return UCS_OK;
}

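/* Match a cached resource domain against the device it was created on. When
 * neither experimental resource domains nor thread domains are available,
 * there is nothing device-specific to compare, so any cached entry matches. */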
static int
uct_ib_mlx5_res_domain_cmp(uct_ib_mlx5_res_domain_t *res_domain,
                           uct_ib_md_t *md, uct_priv_worker_t *worker)
{
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    return res_domain->ibv_domain->context == md->dev.ibv_context;
#elif HAVE_DECL_IBV_ALLOC_TD
    return res_domain->pd->context == md->dev.ibv_context;
#else
    return 1;
#endif
}

static ucs_status_t
uct_ib_mlx5_res_domain_init(uct_ib_mlx5_res_domain_t *res_domain,
                            uct_ib_md_t *md, uct_priv_worker_t *worker)
{
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    struct ibv_exp_res_domain_init_attr attr;

    attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
                     IBV_EXP_RES_DOMAIN_MSG_MODEL;
    attr.msg_model = IBV_EXP_MSG_LOW_LATENCY;

    switch (worker->thread_mode) {
    case UCS_THREAD_MODE_SINGLE:
        attr.thread_model = IBV_EXP_THREAD_SINGLE;
        break;
    case UCS_THREAD_MODE_SERIALIZED:
        attr.thread_model = IBV_EXP_THREAD_UNSAFE;
        break;
    default:
        attr.thread_model = IBV_EXP_THREAD_SAFE;
        break;
    }

    res_domain->ibv_domain = ibv_exp_create_res_domain(md->dev.ibv_context, &attr);
    if (res_domain->ibv_domain == NULL) {
        ucs_error("ibv_exp_create_res_domain() on %s failed: %m",
                  uct_ib_device_name(&md->dev));
        return UCS_ERR_IO_ERROR;
    }
#elif HAVE_DECL_IBV_ALLOC_TD
    struct ibv_parent_domain_init_attr attr;
    struct ibv_td_init_attr td_attr;

    if (worker->thread_mode == UCS_THREAD_MODE_MULTI) {
        td_attr.comp_mask = 0;
        res_domain->td = ibv_alloc_td(md->dev.ibv_context, &td_attr);
        if (res_domain->td == NULL) {
            ucs_error("ibv_alloc_td() on %s failed: %m",
                      uct_ib_device_name(&md->dev));
            return UCS_ERR_IO_ERROR;
        }
    } else {
        res_domain->td = NULL;
        res_domain->pd = md->pd;
        return UCS_OK;
    }

    attr.td        = res_domain->td;
    attr.pd        = md->pd;
    attr.comp_mask = 0;
    res_domain->pd = ibv_alloc_parent_domain(md->dev.ibv_context, &attr);
    if (res_domain->pd == NULL) {
        ucs_error("ibv_alloc_parent_domain() on %s failed: %m",
                  uct_ib_device_name(&md->dev));
        ibv_dealloc_td(res_domain->td);
        return UCS_ERR_IO_ERROR;
    }
#endif
    return UCS_OK;
}

static void uct_ib_mlx5_res_domain_cleanup(uct_ib_mlx5_res_domain_t *res_domain)
{
#ifdef HAVE_IBV_EXP_RES_DOMAIN
    struct ibv_exp_destroy_res_domain_attr attr;
    int ret;

    attr.comp_mask = 0;
    ret = ibv_exp_destroy_res_domain(res_domain->ibv_domain->context,
                                     res_domain->ibv_domain, &attr);
    if (ret != 0) {
        ucs_warn("ibv_exp_destroy_res_domain() failed: %m");
    }
#elif HAVE_DECL_IBV_ALLOC_TD
    int ret;

    if (res_domain->td != NULL) {
        ret = ibv_dealloc_pd(res_domain->pd);
        if (ret != 0) {
            ucs_warn("ibv_dealloc_pd() failed: %m");
            return;
        }

        ret = ibv_dealloc_td(res_domain->td);
        if (ret != 0) {
            ucs_warn("ibv_dealloc_td() failed: %m");
        }
    }
#endif
}

ucs_status_t uct_ib_mlx5_iface_get_res_domain(uct_ib_iface_t *iface,
                                              uct_ib_mlx5_qp_t *qp)
{
    qp->verbs.rd = uct_worker_tl_data_get(iface->super.worker,
                                          UCT_IB_MLX5_RES_DOMAIN_KEY,
                                          uct_ib_mlx5_res_domain_t,
                                          uct_ib_mlx5_res_domain_cmp,
                                          uct_ib_mlx5_res_domain_init,
                                          uct_ib_iface_md(iface),
                                          iface->super.worker);
    if (UCS_PTR_IS_ERR(qp->verbs.rd)) {
        return UCS_PTR_STATUS(qp->verbs.rd);
    }

    qp->type = UCT_IB_MLX5_OBJ_TYPE_VERBS;

    return UCS_OK;
}

void uct_ib_mlx5_iface_put_res_domain(uct_ib_mlx5_qp_t *qp)
{
    if (qp->type == UCT_IB_MLX5_OBJ_TYPE_VERBS) {
        uct_worker_tl_data_put(qp->verbs.rd, uct_ib_mlx5_res_domain_cleanup);
    }
}

ucs_status_t uct_ib_mlx5_iface_create_qp(uct_ib_iface_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uct_ib_mlx5_qp_attr_t *attr)
{
    ucs_status_t status;

    status = uct_ib_mlx5_iface_fill_attr(iface, qp, attr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_exp_qp_fill_attr(iface, &attr->super);
    status = uct_ib_iface_create_qp(iface, &attr->super, &qp->verbs.qp);
    if (status != UCS_OK) {
        return status;
    }

    qp->qp_num = qp->verbs.qp->qp_num;
    return UCS_OK;
}

#if !HAVE_DEVX
ucs_status_t uct_ib_mlx5_get_compact_av(uct_ib_iface_t *iface, int *compact_av)
{
    struct mlx5_wqe_av mlx5_av;
    struct ibv_ah *ah;
    uct_ib_address_t *ib_addr;
    ucs_status_t status;
    struct ibv_ah_attr ah_attr;
    enum ibv_mtu path_mtu;

    /* coverity[result_independent_of_operands] */
    ib_addr = ucs_alloca((size_t)iface->addr_size);

    status = uct_ib_iface_get_device_address(&iface->super.super,
                                             (uct_device_addr_t*)ib_addr);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_iface_fill_ah_attr_from_addr(iface, ib_addr, 0, &ah_attr, &path_mtu);
    ah_attr.is_global = iface->config.force_global_addr;
    status = uct_ib_iface_create_ah(iface, &ah_attr, &ah);
    if (status != UCS_OK) {
        return status;
    }

    uct_ib_mlx5_get_av(ah, &mlx5_av);

    /* UCT_IB_MLX5_EXTENDED_UD_AV mirrors the driver's MLX5_EXTENDED_UD_AV
     * flag; if the flag is not set in the AV, the device supports the compact
     * address vector format. */
    *compact_av = !(mlx5_av_base(&mlx5_av)->dqp_dct & UCT_IB_MLX5_EXTENDED_UD_AV);
    return UCS_OK;
}
#endif

void uct_ib_mlx5_check_completion(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq,
                                  struct mlx5_cqe64 *cqe)
{
    ucs_status_t status;

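    /* The CQE opcode is kept in the upper nibble of the op_own field, hence
     * the shift by 4 below. */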
    switch (cqe->op_own >> 4) {
    case MLX5_CQE_REQ_ERR:
        /* Update the CI before invoking the error callback, since it may poll
         * the CQ */
        UCS_STATIC_ASSERT(MLX5_CQE_REQ_ERR & (UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK >> 4));
        ++cq->cq_ci;
        status = uct_ib_mlx5_completion_with_err(iface, (void*)cqe, NULL,
                                                 UCS_LOG_LEVEL_DEBUG);
        iface->ops->handle_failure(iface, cqe, status);
        return;
    case MLX5_CQE_RESP_ERR:
        /* Local side failure - treat as fatal */
        UCS_STATIC_ASSERT(MLX5_CQE_RESP_ERR & (UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK >> 4));
        ++cq->cq_ci;
        uct_ib_mlx5_completion_with_err(iface, (void*)cqe, NULL,
                                        UCS_LOG_LEVEL_FATAL);
        return;
    default:
        /* The CQE might have been updated by HW. Skip it for now; it will be
         * handled in the next polling round. */
        return;
    }
}

static int uct_ib_mlx5_mmio_cmp(uct_ib_mlx5_mmio_reg_t *reg, uintptr_t addr,
                                unsigned bf_size)
{
    return (reg->addr.uint & ~UCT_IB_MLX5_BF_REG_SIZE) ==
           (addr & ~UCT_IB_MLX5_BF_REG_SIZE);
}

static ucs_status_t uct_ib_mlx5_mmio_init(uct_ib_mlx5_mmio_reg_t *reg,
                                          uintptr_t addr,
                                          uct_ib_mlx5_mmio_mode_t mmio_mode)
{
    reg->addr.uint = addr;
    reg->mode      = mmio_mode;
    return UCS_OK;
}

static void uct_ib_mlx5_mmio_cleanup(uct_ib_mlx5_mmio_reg_t *reg)
{
}

int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
                             uct_ib_mlx5_md_t *md,
                             uct_ib_mlx5_mmio_mode_t mmio_mode)
{
    return uar->ctx == md->super.dev.ibv_context;
}

ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
                                       uct_ib_mlx5_md_t *md,
                                       uct_ib_mlx5_mmio_mode_t mmio_mode)
{
#if HAVE_DEVX
    uar->uar = mlx5dv_devx_alloc_uar(md->super.dev.ibv_context, 0);
    if (uar->uar == NULL) {
        ucs_error("mlx5dv_devx_alloc_uar() failed: %m");
        return UCS_ERR_NO_MEMORY;
    }

    uar->super.addr.ptr = uar->uar->reg_addr;
    uar->super.mode     = mmio_mode;
    uar->ctx            = md->super.dev.ibv_context;

    return UCS_OK;
#else
    return UCS_ERR_UNSUPPORTED;
#endif
}

void uct_ib_mlx5_devx_uar_cleanup(uct_ib_mlx5_devx_uar_t *uar)
{
#if HAVE_DEVX
    mlx5dv_devx_free_uar(uar->uar);
#endif
}

void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq)
{
    txwq->curr       = txwq->qstart;
    txwq->sw_pi      = 0;
    txwq->prev_sw_pi = UINT16_MAX;
#if UCS_ENABLE_ASSERT
    txwq->hw_ci      = 0xFFFF;
#endif
    uct_ib_fence_info_init(&txwq->fi);
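    /* Clear the whole send ring so that stale WQE contents from a previous use
     * of the buffer are not left behind. */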
    memset(txwq->qstart, 0, UCS_PTR_BYTE_DIFF(txwq->qstart, txwq->qend));
}

ucs_status_t
uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker,
                          uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                          unsigned bf_size,
                          uct_ib_mlx5_mmio_mode_t *mmio_mode)
{
    ucs_assert(cfg_mmio_mode < UCT_IB_MLX5_MMIO_MODE_LAST);

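    /* An explicit configuration always wins. In auto mode, use BlueFlame when
     * the device exposes a BF register (plain bf_post for single-threaded
     * workers, the thread-safe bf_post_mt variant for serialized workers), and
     * doorbell mode when no BF register is available. */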
    if (cfg_mmio_mode != UCT_IB_MLX5_MMIO_MODE_AUTO) {
        *mmio_mode = cfg_mmio_mode;
    } else if (bf_size > 0) {
        if (worker->thread_mode == UCS_THREAD_MODE_SINGLE) {
            *mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST;
        } else if (worker->thread_mode == UCS_THREAD_MODE_SERIALIZED) {
            *mmio_mode = UCT_IB_MLX5_MMIO_MODE_BF_POST_MT;
        } else {
            ucs_error("unsupported thread mode for mlx5: %d", worker->thread_mode);
            return UCS_ERR_UNSUPPORTED;
        }
    } else {
        *mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB;
    }

    return UCS_OK;
}

ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker,
                                   uct_ib_mlx5_mmio_mode_t cfg_mmio_mode,
                                   uct_ib_mlx5_txwq_t *txwq,
                                   struct ibv_qp *verbs_qp)
{
    uct_ib_mlx5_mmio_mode_t mmio_mode;
    uct_ib_mlx5dv_qp_t qp_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;

    obj.dv.qp.in  = verbs_qp;
    obj.dv.qp.out = &qp_info.dv;

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    if ((qp_info.dv.sq.stride != MLX5_SEND_WQE_BB) || !ucs_is_pow2(qp_info.dv.sq.wqe_cnt) ||
        ((qp_info.dv.bf.size != 0) && (qp_info.dv.bf.size != UCT_IB_MLX5_BF_REG_SIZE)))
    {
        ucs_error("mlx5 device parameters not suitable for transport "
                  "bf.size(%d) %d, sq.stride(%d) %d, wqe_cnt %d",
                  UCT_IB_MLX5_BF_REG_SIZE, qp_info.dv.bf.size,
                  MLX5_SEND_WQE_BB, qp_info.dv.sq.stride, qp_info.dv.sq.wqe_cnt);
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_mlx5_get_mmio_mode(worker, cfg_mmio_mode,
                                       qp_info.dv.bf.size, &mmio_mode);
    if (status != UCS_OK) {
        return status;
    }

    ucs_debug("tx wq %d bytes [bb=%d, nwqe=%d] mmio_mode %s",
              qp_info.dv.sq.stride * qp_info.dv.sq.wqe_cnt,
              qp_info.dv.sq.stride, qp_info.dv.sq.wqe_cnt,
              uct_ib_mlx5_mmio_modes[mmio_mode]);

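    /* The send ring spans [qstart, qend): wqe_cnt basic blocks of
     * MLX5_SEND_WQE_BB bytes each; qend is the first byte past the ring. */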
    txwq->qstart = qp_info.dv.sq.buf;
    txwq->qend   = UCS_PTR_BYTE_OFFSET(qp_info.dv.sq.buf,
                                       qp_info.dv.sq.stride * qp_info.dv.sq.wqe_cnt);
    txwq->reg    = uct_worker_tl_data_get(worker,
                                          UCT_IB_MLX5_WORKER_BF_KEY,
                                          uct_ib_mlx5_mmio_reg_t,
                                          uct_ib_mlx5_mmio_cmp,
                                          uct_ib_mlx5_mmio_init,
                                          (uintptr_t)qp_info.dv.bf.reg,
                                          mmio_mode);
    if (UCS_PTR_IS_ERR(txwq->reg)) {
        return UCS_PTR_STATUS(txwq->reg);
    }

    /* cppcheck-suppress autoVariables */
    txwq->dbrec = &qp_info.dv.dbrec[MLX5_SND_DBR];
    /* Reserve 2 * UCT_IB_MLX5_MAX_BB basic blocks (BBs) because:
     * - on completion we only get the index of the last WQE and do not really
     *   know how many BBs it consumed (though no more than the maximum);
     * - on send we only check that at least one BB is available; the exact
     *   number of BBs is known only when the WQE is actually posted.
     */
    txwq->bb_max = qp_info.dv.sq.wqe_cnt - 2 * UCT_IB_MLX5_MAX_BB;
    ucs_assert_always(txwq->bb_max > 0);

    uct_ib_mlx5_txwq_reset(txwq);
    return UCS_OK;
}

void uct_ib_mlx5_txwq_cleanup(uct_ib_mlx5_txwq_t* txwq)
{
    uct_ib_mlx5_devx_uar_t *uar = ucs_derived_of(txwq->reg,
                                                 uct_ib_mlx5_devx_uar_t);
    switch (txwq->super.type) {
    case UCT_IB_MLX5_OBJ_TYPE_DEVX:
        uct_worker_tl_data_put(uar, uct_ib_mlx5_devx_uar_cleanup);
        break;
    case UCT_IB_MLX5_OBJ_TYPE_VERBS:
        uct_ib_mlx5_iface_put_res_domain(&txwq->super);
        uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup);
        break;
    case UCT_IB_MLX5_OBJ_TYPE_LAST:
        if (txwq->reg != NULL) {
            uct_worker_tl_data_put(txwq->reg, uct_ib_mlx5_mmio_cleanup);
        }
    }
}

ucs_status_t uct_ib_mlx5_get_rxwq(struct ibv_qp *verbs_qp, uct_ib_mlx5_rxwq_t *rxwq)
{
    uct_ib_mlx5dv_qp_t qp_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;

    obj.dv.qp.in  = verbs_qp;
    obj.dv.qp.out = &qp_info.dv;

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    if (!ucs_is_pow2(qp_info.dv.rq.wqe_cnt) ||
        qp_info.dv.rq.stride != sizeof(struct mlx5_wqe_data_seg)) {
        ucs_error("mlx5 rx wq [count=%d stride=%d] has invalid parameters",
                  qp_info.dv.rq.wqe_cnt,
                  qp_info.dv.rq.stride);
        return UCS_ERR_IO_ERROR;
    }
    rxwq->wqes           = qp_info.dv.rq.buf;
    rxwq->rq_wqe_counter = 0;
    rxwq->cq_wqe_counter = 0;
    rxwq->mask           = qp_info.dv.rq.wqe_cnt - 1;
    /* cppcheck-suppress autoVariables */
    rxwq->dbrec          = &qp_info.dv.dbrec[MLX5_RCV_DBR];
    memset(rxwq->wqes, 0, qp_info.dv.rq.wqe_cnt * sizeof(struct mlx5_wqe_data_seg));

    return UCS_OK;
}

ucs_status_t
uct_ib_mlx5_verbs_srq_init(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs_srq,
                           size_t sg_byte_count, int sge_num)
{
    uct_ib_mlx5dv_srq_t srq_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;
    uint16_t stride;

    obj.dv.srq.in  = verbs_srq;
    obj.dv.srq.out = &srq_info.dv;
#if HAVE_DEVX
    srq_info.dv.comp_mask = MLX5DV_SRQ_MASK_SRQN;
#endif

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_SRQ);
    if (status != UCS_OK) {
        return status;
    }

#if HAVE_DEVX
    srq->srq_num = srq_info.dv.srqn;
#else
    srq->srq_num = 0;
#endif

    if (srq_info.dv.head != 0) {
        ucs_error("SRQ head is not 0 (%d)", srq_info.dv.head);
        return UCS_ERR_NO_DEVICE;
    }

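    /* The expected WQE stride is derived from the number of SGEs per receive
     * WQE; a mismatch means the driver laid out the SRQ with a geometry this
     * code does not support. */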
    stride = uct_ib_mlx5_srq_stride(sge_num);
    if (srq_info.dv.stride != stride) {
        ucs_error("SRQ stride is not %u (%d), sgenum %d",
                  stride, srq_info.dv.stride, sge_num);
        return UCS_ERR_NO_DEVICE;
    }

    if (!ucs_is_pow2(srq_info.dv.tail + 1)) {
        ucs_error("SRQ length is not power of 2 (%d)", srq_info.dv.tail + 1);
        return UCS_ERR_NO_DEVICE;
    }

    srq->buf = srq_info.dv.buf;
    srq->db  = srq_info.dv.dbrec;
    uct_ib_mlx5_srq_buff_init(srq, srq_info.dv.head, srq_info.dv.tail,
                              sg_byte_count, sge_num);

    return UCS_OK;
}

void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head,
                               uint32_t tail, size_t sg_byte_count, int sge_num)
{
    uct_ib_mlx5_srq_seg_t *seg;
    unsigned i, j;

    srq->free_idx  = tail;
    srq->ready_idx = UINT16_MAX;
    srq->sw_pi     = UINT16_MAX;
    srq->mask      = tail;
    srq->tail      = tail;
    srq->stride    = uct_ib_mlx5_srq_stride(sge_num);

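    /* Link all segments into a circular list and pre-fill the per-SGE byte
     * counts. The SRQ length (tail + 1) is a power of two, so masking the next
     * index with 'tail' wraps it around. */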
    for (i = head; i <= tail; ++i) {
        seg = uct_ib_mlx5_srq_get_wqe(srq, i);
        seg->srq.next_wqe_index = htons((i + 1) & tail);
        seg->srq.ptr_mask       = 0;
        seg->srq.free           = 0;
        seg->srq.desc           = NULL;
        seg->srq.strides        = sge_num;
        for (j = 0; j < sge_num; ++j) {
            seg->dptr[j].byte_count = htonl(sg_byte_count);
        }
    }
}

void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq,
                                   struct ibv_srq *verbs_srq)
{
    uct_ib_mlx5dv_srq_t srq_info = {};
    uct_ib_mlx5dv_t obj = {};
    ucs_status_t status;

    if (srq->type != UCT_IB_MLX5_OBJ_TYPE_VERBS) {
        return;
    }

    /* Check that the mlx5 driver did not modify the SRQ */
    obj.dv.srq.in  = verbs_srq;
    obj.dv.srq.out = &srq_info.dv;

    status = uct_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_SRQ);
    ucs_assert_always(status == UCS_OK);
    ucs_assertv_always(srq->tail == srq_info.dv.tail, "srq->tail=%d srq_info.tail=%d",
                       srq->tail, srq_info.dv.tail);
}

ucs_status_t uct_ib_mlx5_modify_qp_state(uct_ib_mlx5_md_t *md,
                                         uct_ib_mlx5_qp_t *qp,
                                         enum ibv_qp_state state)
{
    if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) {
        return uct_ib_mlx5_devx_modify_qp_state(qp, state);
    } else {
        return uct_ib_modify_qp(qp->verbs.qp, state);
    }
}

ucs_status_t uct_ib_mlx5_md_get_atomic_mr_id(uct_ib_md_t *ibmd, uint8_t *mr_id)
{
    uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);

#if HAVE_EXP_UMR
    if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) {
        goto unsupported;
    }
#else
    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_DEVX)) {
        goto unsupported;
    }
#endif

    /* Generate the atomic UMR id. We want UMRs for the same virtual address to
     * have different ids across processes.
     *
     * Usually parallel processes running on the same node as part of a single
     * job have consecutive PIDs, e.g. MPI ranks or slurm-spawned tasks.
     */
    *mr_id = getpid() % 256;
    return UCS_OK;

unsupported:
    *mr_id = 0;
    return UCS_ERR_UNSUPPORTED;
}