/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED.
 * Copyright (C) ARM Ltd. 2017. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "ud_mlx5.h"

#include <uct/api/uct.h>
#include <uct/ib/base/ib_iface.h>
#include <uct/base/uct_md.h>
#include <uct/base/uct_log.h>
#include <ucs/debug/log.h>
#include <ucs/debug/memtrack.h>
#include <ucs/type/class.h>
#include <string.h>
#include <arpa/inet.h> /* For htonl */

#include <uct/ib/mlx5/ib_mlx5_log.h>
#include <uct/ib/mlx5/ib_mlx5.inl>
#include <uct/ib/mlx5/dv/ib_mlx5_dv.h>

#include <uct/ib/ud/base/ud_iface.h>
#include <uct/ib/ud/base/ud_ep.h>
#include <uct/ib/ud/base/ud_def.h>
#include <uct/ib/ud/base/ud_inl.h>


static ucs_config_field_t uct_ud_mlx5_iface_config_table[] = {
    {"UD_", "", NULL,
     ucs_offsetof(uct_ud_mlx5_iface_config_t, super),
     UCS_CONFIG_TYPE_TABLE(uct_ud_iface_config_table)},

    {UCT_IB_CONFIG_PREFIX, "", NULL,
     ucs_offsetof(uct_ud_mlx5_iface_config_t, mlx5_common),
     UCS_CONFIG_TYPE_TABLE(uct_ib_mlx5_iface_config_table)},

    {"UD_", "", NULL,
     ucs_offsetof(uct_ud_mlx5_iface_config_t, ud_mlx5_common),
     UCS_CONFIG_TYPE_TABLE(uct_ud_mlx5_iface_common_config_table)},

    {NULL}
};

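/* Size of the mandatory first part of every WQE posted on the UD QP:
 * the control segment followed by this endpoint's address vector. */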
static UCS_F_ALWAYS_INLINE size_t
uct_ud_mlx5_ep_ctrl_av_size(uct_ud_mlx5_ep_t *ep)
{
    return sizeof(struct mlx5_wqe_ctrl_seg) + uct_ib_mlx5_wqe_av_size(&ep->av);
}

static UCS_F_ALWAYS_INLINE size_t uct_ud_mlx5_max_am_iov()
{
    return ucs_min(UCT_IB_MLX5_AM_ZCOPY_MAX_IOV, UCT_IB_MAX_IOV);
}

static UCS_F_ALWAYS_INLINE size_t uct_ud_mlx5_max_inline()
{
    return UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE);
}

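/* Fill the control and datagram segments of a prepared WQE, log it, and
 * ring the doorbell. TX credits are charged with the number of building
 * blocks (BBs) the WQE occupies. */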
static UCS_F_ALWAYS_INLINE void
uct_ud_mlx5_post_send(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep,
                      uint8_t ce_se, struct mlx5_wqe_ctrl_seg *ctrl,
                      size_t wqe_size, uct_ud_neth_t *neth, int max_log_sge)
{
    struct mlx5_wqe_datagram_seg *dgram = (void*)(ctrl + 1);

    ucs_assert(wqe_size <= UCT_IB_MLX5_MAX_SEND_WQE_SIZE);

    UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth);

    uct_ib_mlx5_set_ctrl_seg(ctrl, iface->tx.wq.sw_pi, MLX5_OPCODE_SEND, 0,
                             iface->super.qp->qp_num,
                             uct_ud_mlx5_tx_moderation(iface, ce_se), wqe_size);
    uct_ib_mlx5_set_dgram_seg(dgram, &ep->av, ep->is_global ? &ep->grh_av : NULL,
                              IBV_QPT_UD);

    uct_ib_mlx5_log_tx(&iface->super.super, ctrl, iface->tx.wq.qstart,
                       iface->tx.wq.qend, max_log_sge, NULL, uct_ud_dump_packet);
    iface->super.tx.available -= uct_ib_mlx5_post_send(&iface->tx.wq, ctrl,
                                                       wqe_size);
    ucs_assert((int16_t)iface->tx.wq.bb_max >= iface->super.tx.available);
}

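/* Return the current WQE slot and, through the out-arguments, the size of
 * the control+AV part and a pointer to the segment right after it (wrapped
 * around the queue end if needed). */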
static UCS_F_ALWAYS_INLINE struct mlx5_wqe_ctrl_seg *
uct_ud_mlx5_ep_get_next_wqe(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep,
                            size_t *wqe_size_p, void **next_seg_p)
{
    size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep);
    struct mlx5_wqe_ctrl_seg *ctrl;
    void *ptr;

    ucs_assert((ctrl_av_size % UCT_IB_MLX5_WQE_SEG_SIZE) == 0);

    ctrl = iface->tx.wq.curr;
    ptr  = UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size);

    *wqe_size_p = ctrl_av_size;
    *next_seg_p = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, ptr);

    return ctrl;
}

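/* Send a control skb (connection/ACK packets). The skb contents go inline
 * when they fit the inline limit, otherwise as a pointer data segment;
 * optional iov entries are appended as extra data segments. Returns the
 * producer index of the posted WQE. */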
static uint16_t uct_ud_mlx5_ep_send_ctl(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb,
                                        const uct_ud_iov_t *iov, uint16_t iovcnt,
                                        int flags, int max_log_sge)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ud_ep->super.super.iface,
                                                uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep       = ucs_derived_of(ud_ep, uct_ud_mlx5_ep_t);
    struct mlx5_wqe_inl_data_seg *inl;
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_data_seg *dptr;
    uint16_t iov_index;
    size_t wqe_size;
    void *next_seg;
    uint8_t ce_se;
    uint16_t sn;

    /* set WQE flags */
    sn    = iface->tx.wq.sw_pi;
    ce_se = 0;
    if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED) {
        ce_se |= MLX5_WQE_CTRL_SOLICITED;
    }
    if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SIGNALED) {
        ce_se |= MLX5_WQE_CTRL_CQ_UPDATE;
    }

    /* send skb contents inline (if they fit) or as a pointer data segment */
    ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg);
    if (skb->len <= uct_ud_mlx5_max_inline()) {
        inl             = next_seg;
        inl->byte_count = htonl(skb->len | MLX5_INLINE_SEG);
        wqe_size       += ucs_align_up_pow2(sizeof(*inl) + skb->len,
                                            UCT_IB_MLX5_WQE_SEG_SIZE);
        uct_ib_mlx5_inline_copy(inl + 1, skb->neth, skb->len, &iface->tx.wq);
    } else {
        ucs_assert(!(flags & UCT_UD_IFACE_SEND_CTL_FLAG_INLINE));
        dptr      = next_seg;
        wqe_size += sizeof(*dptr);
        uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey);
    }

    /* copy IOV from descriptor to WQE */
    dptr = UCS_PTR_BYTE_OFFSET(ctrl, wqe_size);
    for (iov_index = 0; iov_index < iovcnt; ++iov_index) {
        if (iov[iov_index].length == 0) {
            continue;
        }

        dptr = uct_ib_mlx5_txwq_wrap_any(&iface->tx.wq, dptr);
        uct_ib_mlx5_set_data_seg(dptr, iov[iov_index].buffer,
                                 iov[iov_index].length, iov[iov_index].lkey);
        wqe_size += sizeof(*dptr);
        ++dptr;
    }

    uct_ud_mlx5_post_send(iface, ep, ce_se, ctrl, wqe_size, skb->neth,
                          max_log_sge);
    return sn;
}

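/* Replenish the receive queue: grab up to rx_max_batch descriptors from
 * the RX memory pool, write their addresses into the RQ WQEs, and update
 * the doorbell record. */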
static UCS_F_NOINLINE void
uct_ud_mlx5_iface_post_recv(uct_ud_mlx5_iface_t *iface)
{
    unsigned batch = iface->super.super.config.rx_max_batch;
    struct mlx5_wqe_data_seg *rx_wqes;
    uint16_t pi, next_pi, count;
    uct_ib_iface_recv_desc_t *desc;

    rx_wqes = iface->rx.wq.wqes;
    pi      = iface->rx.wq.rq_wqe_counter & iface->rx.wq.mask;

    for (count = 0; count < batch; count++) {
        next_pi = (pi + 1) & iface->rx.wq.mask;
        ucs_prefetch(rx_wqes + next_pi);
        UCT_TL_IFACE_GET_RX_DESC(&iface->super.super.super, &iface->super.rx.mp,
                                 desc, break);
        rx_wqes[pi].lkey = htonl(desc->lkey);
        rx_wqes[pi].addr = htobe64((uintptr_t)uct_ib_iface_recv_desc_hdr(&iface->super.super, desc));
        pi = next_pi;
    }
    if (ucs_unlikely(count == 0)) {
        ucs_debug("iface(%p) failed to post receive wqes", iface);
        return;
    }
    pi = iface->rx.wq.rq_wqe_counter + count;
    iface->rx.wq.rq_wqe_counter = pi;
    iface->super.rx.available  -= count;
    ucs_memory_cpu_fence();
    *iface->rx.wq.dbrec = htonl(pi);
}

static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_ep_t, uct_iface_h tl_iface,
                           const uct_ep_params_t *params)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_mlx5_iface_t);
    ucs_trace_func("");
    UCS_CLASS_CALL_SUPER_INIT(uct_ud_ep_t, &iface->super, params);
    return UCS_OK;
}

static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_ep_t)
{
    ucs_trace_func("");
}

UCS_CLASS_DEFINE(uct_ud_mlx5_ep_t, uct_ud_ep_t);
static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_ep_t, uct_ep_t, uct_iface_h,
                                 const uct_ep_params_t*);
UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_ep_t, uct_ep_t);


/*
 * Generic inline+iov post-send function.
 * The caller must ensure that the inline header plus the SG list do not
 * exceed the maximum WQE size.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id,
                               /* inl. header */ const void *header, size_t header_size,
                               /* inl. data */ const void *data, size_t data_size,
                               /* iov data */ const uct_iov_t *iov, size_t iovcnt,
                               uint32_t packet_flags, uct_completion_t *comp,
                               unsigned stat_ops_counter, unsigned stat_bytes_counter,
                               const char *func_name)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    struct mlx5_wqe_inl_data_seg *inl;
    struct mlx5_wqe_ctrl_seg *ctrl;
    size_t inline_size, wqe_size;
    void *next_seg, *wqe_data;
    uct_ud_send_skb_t *skb;
    ucs_status_t status;
    uct_ud_neth_t *neth;

    UCT_CHECK_AM_ID(am_id);
    UCT_UD_CHECK_ZCOPY_LENGTH(&iface->super, header_size + data_size,
                              uct_iov_total_length(iov, iovcnt));
    UCT_CHECK_IOV_SIZE(iovcnt, uct_ud_mlx5_max_am_iov(), func_name);

    uct_ud_enter(&iface->super);

    skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super);
    if (!skb) {
        status = UCS_ERR_NO_RESOURCE;
        goto out;
    }

    ctrl            = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size,
                                                  &next_seg);
    inl             = next_seg;
    inline_size     = sizeof(*neth) + header_size + data_size;
    inl->byte_count = htonl(inline_size | MLX5_INLINE_SEG);
    wqe_size       += sizeof(*inl) + inline_size;
    skb->len        = inline_size;

    /* set network header */
    neth              = (void*)(inl + 1);
    neth->packet_type = (am_id << UCT_UD_PACKET_AM_ID_SHIFT) |
                        ep->super.dest_ep_id |
                        packet_flags;
    uct_ud_neth_init_data(&ep->super, neth);
    if (!(packet_flags & UCT_UD_PACKET_FLAG_ACK_REQ)) {
        /* request an ACK, if not already enabled by packet_flags */
        neth->packet_type |= uct_ud_ep_req_ack(&ep->super) << UCT_UD_PACKET_ACK_REQ_SHIFT;
    }

    /* copy inline "header"; assume it fits in one BB so we don't have to
     * check for QP wrap-around. This is either the "put" header or the
     * 64-bit am_short header, not the am_zcopy header.
     */
    wqe_data = UCS_PTR_BYTE_OFFSET(neth + 1, header_size);
    ucs_assert(wqe_data <= iface->tx.wq.qend);
    memcpy(neth + 1, header, header_size);

    /* copy inline "data" */
    uct_ib_mlx5_inline_copy(wqe_data, data, data_size, &iface->tx.wq);

    /* set iov as pointer data segments */
    if (iovcnt > 0) {
        wqe_size  = ucs_align_up_pow2(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE);
        wqe_size += uct_ib_mlx5_set_data_seg_iov(&iface->tx.wq,
                                                 UCS_PTR_BYTE_OFFSET(ctrl, wqe_size),
                                                 iov, iovcnt);
    }

    uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size, neth,
                          UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super));

    memcpy(skb->neth, neth, sizeof(*neth) + header_size);
    memcpy(UCS_PTR_BYTE_OFFSET(skb->neth + 1, header_size), data, data_size);

    if (iovcnt > 0) {
        uct_ud_skb_set_zcopy_desc(skb, iov, iovcnt, comp);
        status = UCS_INPROGRESS;
    } else {
        status = UCS_OK;
    }

    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    uct_ud_ep_ctl_op_del(&ep->super, UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ);

    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_ops_counter, 1);
    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_bytes_counter,
                             header_size + data_size +
                             uct_iov_total_length(iov, iovcnt));
out:
    uct_ud_leave(&iface->super);
    return status;
}

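/* Common implementation of am_short/put_short: check that the network
 * header plus the inline payload fit the maximum inline size, and post a
 * WQE with inline data only (no iov, no completion). */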
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ud_mlx5_ep_short_common(uct_ep_h tl_ep, uint8_t am_id,
                            /* inline header */ const void *header, size_t header_size,
                            /* inline data */ const void *data, size_t data_size,
                            uint32_t packet_flags, unsigned stat_ops_counter,
                            const char *func_name)
{
    UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_size + data_size, 0,
                     uct_ud_mlx5_max_inline(), func_name);
    return uct_ud_mlx5_ep_inline_iov_post(tl_ep, am_id,
                                          header, header_size,
                                          data, data_size,
                                          /* iov */ NULL, 0,
                                          packet_flags,
                                          /* completion */ NULL,
                                          stat_ops_counter,
                                          UCT_EP_STAT_BYTES_SHORT,
                                          func_name);
}

static ucs_status_t
uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr,
                        const void *buffer, unsigned length)
{
    return uct_ud_mlx5_ep_short_common(tl_ep, id,
                                       /* inline header */ &hdr, sizeof(hdr),
                                       /* inline data */ buffer, length,
                                       /* packet flags */ UCT_UD_PACKET_FLAG_AM,
                                       UCT_EP_STAT_AM,
                                       "uct_ud_mlx5_ep_am_short");
}

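/* Active-message bcopy: pack the payload into a tx skb with the user
 * callback and post the skb as a single pointer data segment. */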
static ssize_t uct_ud_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                       uct_pack_callback_t pack_cb, void *arg,
                                       unsigned flags)
{
    uct_ud_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_data_seg *dptr;
    uct_ud_send_skb_t *skb;
    ucs_status_t status;
    size_t wqe_size;
    void *next_seg;
    size_t length;

    uct_ud_enter(&iface->super);

    status = uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb);
    if (status != UCS_OK) {
        uct_ud_leave(&iface->super);
        return status;
    }

    length = uct_ud_skb_bcopy(skb, pack_cb, arg);
    UCT_UD_CHECK_BCOPY_LENGTH(&iface->super, length);

    ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg);
    dptr = next_seg;
    uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey);
    uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size + sizeof(*dptr),
                          skb->neth, INT_MAX);

    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    UCT_TL_EP_STAT_OP(&ep->super.super, AM, BCOPY, length);
    uct_ud_leave(&iface->super);
    return length;
}

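/* Active-message zcopy: the user header is sent inline (passed as the
 * inline "data" argument, with a dummy zero-length inline header), and
 * the payload iov is sent by pointer. ACK_REQ is always set so the remote
 * side acknowledges the packet. */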
static ucs_status_t
uct_ud_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header,
                        unsigned header_length, const uct_iov_t *iov,
                        size_t iovcnt, unsigned flags, uct_completion_t *comp)
{
    char dummy = 0; /* pass a dummy pointer as the 0-length header to avoid
                     * compiler warnings */

    UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_length, 0,
                     UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE),
                     "am_zcopy header");
    return uct_ud_mlx5_ep_inline_iov_post(tl_ep, id,
                                          /* inl. header */ &dummy, 0,
                                          /* inl. data */ header, header_length,
                                          /* iov */ iov, iovcnt,
                                          /* packet flags */ UCT_UD_PACKET_FLAG_AM |
                                                             UCT_UD_PACKET_FLAG_ACK_REQ,
                                          /* completion */ comp,
                                          UCT_EP_STAT_AM, UCT_EP_STAT_BYTES_ZCOPY,
                                          "uct_ud_mlx5_ep_am_zcopy");
}

static ucs_status_t
uct_ud_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length,
                         uint64_t remote_addr, uct_rkey_t rkey)
{
    uct_ud_put_hdr_t puth = { .rva = remote_addr };
    return uct_ud_mlx5_ep_short_common(tl_ep, 0,
                                       /* inl. header */ &puth, sizeof(puth),
                                       /* inl. data */ buffer, length,
                                       /* packet flags */ UCT_UD_PACKET_FLAG_PUT,
                                       UCT_EP_STAT_PUT,
                                       "uct_ud_mlx5_ep_put_short");
}

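/* Poll the RX CQ for at most one completion and hand the received packet
 * to the common UD processing. The receive queue is replenished once
 * enough buffers have been consumed. Returns 0 or 1. */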
static UCS_F_ALWAYS_INLINE unsigned
uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async)
{
    struct mlx5_cqe64 *cqe;
    uint16_t ci;
    uct_ib_iface_recv_desc_t *desc;
    uint32_t len;
    void *packet;
    unsigned count;
    ptrdiff_t rx_hdr_offset;

    ci            = iface->rx.wq.cq_wqe_counter & iface->rx.wq.mask;
    packet        = (void *)be64toh(iface->rx.wq.wqes[ci].addr);
    ucs_prefetch(UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN));
    rx_hdr_offset = iface->super.super.config.rx_hdr_offset;
    desc          = UCS_PTR_BYTE_OFFSET(packet, -rx_hdr_offset);

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_RX]);
    if (cqe == NULL) {
        count = 0;
        goto out;
    }

    ucs_memory_cpu_load_fence();

    ucs_assert(0 == (cqe->op_own &
                     (MLX5_INLINE_SCATTER_32|MLX5_INLINE_SCATTER_64)));
    ucs_assert(ntohs(cqe->wqe_counter) == iface->rx.wq.cq_wqe_counter);

    iface->super.rx.available++;
    iface->rx.wq.cq_wqe_counter++;
    count = 1;
    len   = ntohl(cqe->byte_cnt);
    VALGRIND_MAKE_MEM_DEFINED(packet, len);

    if (!uct_ud_iface_check_grh(&iface->super, packet,
                                uct_ib_mlx5_cqe_is_grh_present(cqe))) {
        ucs_mpool_put_inline(desc);
        goto out;
    }

    uct_ib_mlx5_log_rx(&iface->super.super, cqe, packet, uct_ud_dump_packet);
    /* coverity[tainted_data] */
    uct_ud_ep_process_rx(&iface->super,
                         (uct_ud_neth_t *)UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN),
                         len - UCT_IB_GRH_LEN,
                         (uct_ud_recv_skb_t *)ucs_unaligned_ptr(desc), is_async);
out:
    if (iface->super.rx.available >= iface->super.super.config.rx_max_batch) {
        /* We must always try to post receive buffers; otherwise, if the
         * receiver is slow and there are always CQEs to process, we could
         * run out of rx WQEs.
         */
        uct_ud_mlx5_iface_post_recv(iface);
    }
    return count;
}

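/* Poll the TX CQ for at most one completion and reclaim the send queue
 * building blocks up to the completed WQE index. Returns 0 or 1. */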
static UCS_F_ALWAYS_INLINE unsigned
uct_ud_mlx5_iface_poll_tx(uct_ud_mlx5_iface_t *iface, int is_async)
{
    struct mlx5_cqe64 *cqe;
    uint16_t hw_ci;

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_TX]);
    if (cqe == NULL) {
        return 0;
    }

    ucs_memory_cpu_load_fence();

    uct_ib_mlx5_log_cqe(cqe);
    hw_ci = ntohs(cqe->wqe_counter);
    iface->super.tx.available = uct_ib_mlx5_txwq_update_bb(&iface->tx.wq, hw_ci);

    uct_ud_iface_send_completion(&iface->super, hw_ci, is_async);

    return 1;
}

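/* Synchronous progress: dispatch deferred async completions and pending
 * rx first, then poll RX up to rx_max_poll completions, poll TX once, and
 * run pending control operations. */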
static unsigned uct_ud_mlx5_iface_progress(uct_iface_h tl_iface)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_mlx5_iface_t);
    ucs_status_t status;
    unsigned n, count = 0;

    uct_ud_enter(&iface->super);
    uct_ud_iface_dispatch_async_comps(&iface->super);

    status = uct_ud_iface_dispatch_pending_rx(&iface->super);
    if (ucs_likely(status == UCS_OK)) {
        do {
            n      = uct_ud_mlx5_iface_poll_rx(iface, 0);
            count += n;
        } while ((n > 0) && (count < iface->super.super.config.rx_max_poll));
    }

    count += uct_ud_mlx5_iface_poll_tx(iface, 0);
    uct_ud_iface_progress_pending(&iface->super, 0);
    uct_ud_leave(&iface->super);
    return count;
}

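/* Progress callback invoked from the async context; same polling loop as
 * the synchronous path, bounded by rx.async_max_poll. */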
static unsigned uct_ud_mlx5_iface_async_progress(uct_ud_iface_t *ud_iface)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ud_iface, uct_ud_mlx5_iface_t);
    unsigned n, count;

    count = 0;
    do {
        n      = uct_ud_mlx5_iface_poll_rx(iface, 1);
        count += n;
    } while ((n > 0) && (count < iface->super.rx.async_max_poll));

    count += uct_ud_mlx5_iface_poll_tx(iface, 1);

    uct_ud_iface_progress_pending(&iface->super, 1);

    return count;
}

static ucs_status_t
uct_ud_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr)
{
    uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t);
    ucs_status_t status;

    ucs_trace_func("");

    status = uct_ud_iface_query(iface, iface_attr, uct_ud_mlx5_max_am_iov(),
                                UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE) -
                                sizeof(uct_ud_neth_t));
    if (status != UCS_OK) {
        return status;
    }

    iface_attr->overhead = 80e-9; /* software overhead */

    return UCS_OK;
}

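/* Resolve the remote IB address into this endpoint's mlx5 address vector
 * (plus a GRH AV for global routes) and merge the remote QP number into
 * the AV. */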
static ucs_status_t
uct_ud_mlx5_ep_create_ah(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep,
                         const uct_ib_address_t *ib_addr, unsigned path_index,
                         const uct_ud_iface_addr_t *if_addr)
{
    ucs_status_t status;
    uint32_t remote_qpn;
    int is_global;

    status = uct_ud_mlx5_iface_get_av(&iface->super.super, &iface->ud_mlx5_common,
                                      ib_addr, path_index, &ep->av, &ep->grh_av,
                                      &is_global);
    if (status != UCS_OK) {
        return status;
    }

    remote_qpn      = uct_ib_unpack_uint24(if_addr->qp_num);
    ep->is_global   = is_global;
    ep->av.dqp_dct |= htonl(remote_qpn);
    return UCS_OK;
}

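/* Create an endpoint connected to a remote interface address. The common
 * UD code may hand back an already-existing endpoint; for a new one, an
 * address handle is created and a CREQ control packet is sent. */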
static ucs_status_t
uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h,
                                const uct_device_addr_t *dev_addr,
                                const uct_iface_addr_t *iface_addr,
                                unsigned path_index, uct_ep_h *new_ep_p)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(iface_h, uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep;
    uct_ud_ep_t *new_ud_ep;
    const uct_ud_iface_addr_t *if_addr = (const uct_ud_iface_addr_t *)iface_addr;
    const uct_ib_address_t *ib_addr    = (const uct_ib_address_t *)dev_addr;
    uct_ud_send_skb_t *skb;
    ucs_status_t status, status_ah;

    uct_ud_enter(&iface->super);
    status = uct_ud_ep_create_connected_common(&iface->super, ib_addr, if_addr,
                                               path_index, &new_ud_ep, &skb);
    if ((status != UCS_OK) &&
        (status != UCS_ERR_NO_RESOURCE) &&
        (status != UCS_ERR_ALREADY_EXISTS)) {
        uct_ud_leave(&iface->super);
        return status;
    }

    ep = ucs_derived_of(new_ud_ep, uct_ud_mlx5_ep_t);
    /* cppcheck-suppress autoVariables */
    *new_ep_p = &ep->super.super.super;
    if (status == UCS_ERR_ALREADY_EXISTS) {
        uct_ud_leave(&iface->super);
        return UCS_OK;
    }

    status_ah = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr,
                                         ep->super.path_index, if_addr);
    if (status_ah != UCS_OK) {
        uct_ud_ep_destroy_connected(&ep->super, ib_addr, if_addr);
        *new_ep_p = NULL;
        uct_ud_leave(&iface->super);
        return status_ah;
    }

    if (status == UCS_OK) {
        uct_ud_mlx5_ep_send_ctl(&ep->super, skb, NULL, 0, 1, 1);
        uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
        ep->super.flags |= UCT_UD_EP_FLAG_CREQ_SENT;
    }

    uct_ud_leave(&iface->super);
    return UCS_OK;
}

static ucs_status_t
uct_ud_mlx5_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p)
{
    if (ucs_test_all_flags(params->field_mask, UCT_EP_PARAM_FIELD_DEV_ADDR |
                                               UCT_EP_PARAM_FIELD_IFACE_ADDR)) {
        return uct_ud_mlx5_ep_create_connected(params->iface, params->dev_addr,
                                               params->iface_addr,
                                               UCT_EP_PARAMS_GET_PATH_INDEX(params),
                                               ep_p);
    }

    return uct_ud_mlx5_ep_t_new(params->iface, params, ep_p);
}


static ucs_status_t
uct_ud_mlx5_ep_connect_to_ep(uct_ep_h tl_ep,
                             const uct_device_addr_t *dev_addr,
                             const uct_ep_addr_t *uct_ep_addr)
{
    ucs_status_t status;
    uct_ud_mlx5_ep_t *ep            = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface      = ucs_derived_of(tl_ep->iface,
                                                     uct_ud_mlx5_iface_t);
    const uct_ud_ep_addr_t *ep_addr = (const uct_ud_ep_addr_t *)uct_ep_addr;
    const uct_ib_address_t *ib_addr = (const uct_ib_address_t *)dev_addr;

    ucs_trace_func("");
    status = uct_ud_ep_connect_to_ep(&ep->super, ib_addr, ep_addr);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr, ep->super.path_index,
                                      (const uct_ud_iface_addr_t *)ep_addr);
    if (status != UCS_OK) {
        return status;
    }

    return UCS_OK;
}

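/* Arm the CQ for the next event. When mlx5dv object support is available
 * the CQ is armed directly; otherwise sync the CQ consumer index and fall
 * back to the generic verbs arming path. */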
static ucs_status_t uct_ud_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface,
                                             uct_ib_dir_t dir,
                                             int solicited)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);
#if HAVE_DECL_MLX5DV_INIT_OBJ
    return uct_ib_mlx5dv_arm_cq(&iface->cq[dir], solicited);
#else
    uct_ib_mlx5_update_cq_ci(iface->super.super.cq[dir],
                             iface->cq[dir].cq_ci);
    return uct_ib_iface_arm_cq(ib_iface, dir, solicited);
#endif
}

static ucs_status_t uct_ud_mlx5_ep_set_failed(uct_ib_iface_t *iface,
                                              uct_ep_h ep, ucs_status_t status)
{
    return uct_set_ep_failed(&UCS_CLASS_NAME(uct_ud_mlx5_ep_t), ep,
                             &iface->super.super, status);
}

static void uct_ud_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface,
                                       uct_ib_dir_t dir)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);

    iface->cq[dir].cq_sn++;
}

static ucs_status_t uct_ud_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface,
                                                uct_ib_qp_attr_t *ib_attr,
                                                struct ibv_qp **qp_p)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);
    uct_ib_mlx5_qp_t *qp       = &iface->tx.wq.super;
    uct_ib_mlx5_qp_attr_t attr = {};
    ucs_status_t status;

    attr.super     = *ib_attr;
    attr.mmio_mode = UCT_IB_MLX5_MMIO_MODE_LAST;

    status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, &attr);
    if (status != UCS_OK) {
        return status;
    }

    *qp_p = qp->verbs.qp;
    return status;
}

static void UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t)(uct_iface_t*);

static void uct_ud_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg,
                                             ucs_status_t status)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);

    /* Local side failure - treat as fatal */
    uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.wq,
                                    UCS_LOG_LEVEL_FATAL);
}

static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = {
    {
        {
            .ep_put_short             = uct_ud_mlx5_ep_put_short,
            .ep_am_short              = uct_ud_mlx5_ep_am_short,
            .ep_am_bcopy              = uct_ud_mlx5_ep_am_bcopy,
            .ep_am_zcopy              = uct_ud_mlx5_ep_am_zcopy,
            .ep_pending_add           = uct_ud_ep_pending_add,
            .ep_pending_purge         = uct_ud_ep_pending_purge,
            .ep_flush                 = uct_ud_ep_flush,
            .ep_fence                 = uct_base_ep_fence,
            .ep_create                = uct_ud_mlx5_ep_create,
            .ep_destroy               = uct_ud_ep_disconnect,
            .ep_get_address           = uct_ud_ep_get_address,
            .ep_connect_to_ep         = uct_ud_mlx5_ep_connect_to_ep,
            .iface_flush              = uct_ud_iface_flush,
            .iface_fence              = uct_base_iface_fence,
            .iface_progress_enable    = uct_ud_iface_progress_enable,
            .iface_progress_disable   = uct_ud_iface_progress_disable,
            .iface_progress           = uct_ud_mlx5_iface_progress,
            .iface_event_fd_get       = (uct_iface_event_fd_get_func_t)
                                        ucs_empty_function_return_unsupported,
            .iface_event_arm          = uct_ud_iface_event_arm,
            .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t),
            .iface_query              = uct_ud_mlx5_iface_query,
            .iface_get_device_address = uct_ib_iface_get_device_address,
            .iface_get_address        = uct_ud_iface_get_address,
            .iface_is_reachable       = uct_ib_iface_is_reachable
        },
        .create_cq      = uct_ib_mlx5_create_cq,
        .arm_cq         = uct_ud_mlx5_iface_arm_cq,
        .event_cq       = uct_ud_mlx5_iface_event_cq,
        .handle_failure = uct_ud_mlx5_iface_handle_failure,
        .set_ep_failed  = uct_ud_mlx5_ep_set_failed,
    },
    .async_progress = uct_ud_mlx5_iface_async_progress,
    .send_ctl       = uct_ud_mlx5_ep_send_ctl,
    .ep_free        = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_ep_t),
    .create_qp      = uct_ud_mlx5_iface_create_qp,
};

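/* Interface constructor: initialize the UD base class, attach mlx5
 * direct-access views to the CQs, TX work queue and RX work queue, pre-set
 * the receive WQE buffer lengths, and post the initial receive batch. */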
static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t,
                           uct_md_h md, uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_ud_mlx5_iface_config_t *config = ucs_derived_of(tl_config,
                                                        uct_ud_mlx5_iface_config_t);
    uct_ib_iface_init_attr_t init_attr = {};
    ucs_status_t status;
    int i;

    ucs_trace_func("");

    init_attr.flags                 = UCT_IB_CQ_IGNORE_OVERRUN;
    init_attr.cq_len[UCT_IB_DIR_TX] = config->super.super.tx.queue_len * UCT_IB_MLX5_MAX_BB;
    init_attr.cq_len[UCT_IB_DIR_RX] = config->super.super.rx.queue_len;

    self->tx.wq.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST;

    UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops,
                              md, worker, params, &config->super, &init_attr);

    self->super.config.max_inline = uct_ud_mlx5_max_inline();

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX],
                                &self->cq[UCT_IB_DIR_TX]);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX],
                                &self->cq[UCT_IB_DIR_RX]);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ib_mlx5_txwq_init(self->super.super.super.worker,
                                   config->mlx5_common.mmio_mode, &self->tx.wq,
                                   self->super.qp);
    if (status != UCS_OK) {
        return status;
    }

    self->super.tx.available = self->tx.wq.bb_max;
    ucs_assert(init_attr.cq_len[UCT_IB_DIR_TX] >= self->tx.wq.bb_max);

    status = uct_ib_mlx5_get_rxwq(self->super.qp, &self->rx.wq);
    if (status != UCS_OK) {
        return status;
    }

    ucs_assert(init_attr.cq_len[UCT_IB_DIR_RX] > self->rx.wq.mask);

    status = uct_ud_mlx5_iface_common_init(&self->super.super,
                                           &self->ud_mlx5_common,
                                           &config->ud_mlx5_common);
    if (status != UCS_OK) {
        return status;
    }

    /* write buffer sizes */
    for (i = 0; i <= self->rx.wq.mask; i++) {
        self->rx.wq.wqes[i].byte_count =
            htonl(self->super.super.config.rx_payload_offset +
                  self->super.super.config.seg_size);
    }
    while (self->super.rx.available >= self->super.super.config.rx_max_batch) {
        uct_ud_mlx5_iface_post_recv(self);
    }

    status = uct_ud_iface_complete_init(&self->super);
    if (status != UCS_OK) {
        return status;
    }

    return UCS_OK;
}


static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_iface_t)
{
    ucs_trace_func("");
    uct_ud_iface_remove_async_handlers(&self->super);
    uct_ud_enter(&self->super);
    UCT_UD_IFACE_DELETE_EPS(&self->super, uct_ud_mlx5_ep_t);
    uct_ib_mlx5_txwq_cleanup(&self->tx.wq);
    uct_ud_leave(&self->super);
}

UCS_CLASS_DEFINE(uct_ud_mlx5_iface_t, uct_ud_iface_t);

static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_iface_t, uct_iface_t, uct_md_h,
                                 uct_worker_h, const uct_iface_params_t*,
                                 const uct_iface_config_t*);

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_iface_t, uct_iface_t);

static ucs_status_t
uct_ud_mlx5_query_tl_devices(uct_md_h md,
                             uct_tl_device_resource_t **tl_devices_p,
                             unsigned *num_tl_devices_p)
{
    uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t);
    return uct_ib_device_query_ports(&ib_md->dev, UCT_IB_DEVICE_FLAG_MLX5_PRM,
                                     tl_devices_p, num_tl_devices_p);
}

UCT_TL_DEFINE(&uct_ib_component, ud_mlx5, uct_ud_mlx5_query_tl_devices,
              uct_ud_mlx5_iface_t, "UD_MLX5_", uct_ud_mlx5_iface_config_table,
              uct_ud_mlx5_iface_config_t);