/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2019.  ALL RIGHTS RESERVED.
 * Copyright (C) ARM Ltd. 2017.  ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "ud_mlx5.h"

#include <uct/api/uct.h>
#include <uct/ib/base/ib_iface.h>
#include <uct/base/uct_md.h>
#include <uct/base/uct_log.h>
#include <ucs/debug/log.h>
#include <ucs/debug/memtrack.h>
#include <ucs/type/class.h>
#include <string.h>
#include <arpa/inet.h> /* For htonl */

#include <uct/ib/mlx5/ib_mlx5_log.h>
#include <uct/ib/mlx5/ib_mlx5.inl>
#include <uct/ib/mlx5/dv/ib_mlx5_dv.h>

#include <uct/ib/ud/base/ud_iface.h>
#include <uct/ib/ud/base/ud_ep.h>
#include <uct/ib/ud/base/ud_def.h>
#include <uct/ib/ud/base/ud_inl.h>


static ucs_config_field_t uct_ud_mlx5_iface_config_table[] = {
  {"UD_", "", NULL,
   ucs_offsetof(uct_ud_mlx5_iface_config_t, super),
   UCS_CONFIG_TYPE_TABLE(uct_ud_iface_config_table)},

  {UCT_IB_CONFIG_PREFIX, "", NULL,
   ucs_offsetof(uct_ud_mlx5_iface_config_t, mlx5_common),
   UCS_CONFIG_TYPE_TABLE(uct_ib_mlx5_iface_config_table)},

  {"UD_", "", NULL,
   ucs_offsetof(uct_ud_mlx5_iface_config_t, ud_mlx5_common),
   UCS_CONFIG_TYPE_TABLE(uct_ud_mlx5_iface_common_config_table)},

  {NULL}
};

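/* Size of the fixed prefix shared by every UD send WQE: the control segment
 * followed by the endpoint's address vector (whose size may vary per
 * destination). */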
static UCS_F_ALWAYS_INLINE size_t
uct_ud_mlx5_ep_ctrl_av_size(uct_ud_mlx5_ep_t *ep)
{
    return sizeof(struct mlx5_wqe_ctrl_seg) + uct_ib_mlx5_wqe_av_size(&ep->av);
}

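/* Maximal number of IOV entries supported by a zero-copy AM send */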
static UCS_F_ALWAYS_INLINE size_t uct_ud_mlx5_max_am_iov()
{
    return ucs_min(UCT_IB_MLX5_AM_ZCOPY_MAX_IOV, UCT_IB_MAX_IOV);
}

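/* Maximal amount of data which can be copied inline into a send WQE */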
static UCS_F_ALWAYS_INLINE size_t uct_ud_mlx5_max_inline()
{
    return UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE);
}

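/* Finalize and post a send WQE whose data segments are already in place:
 * fill the control and datagram segments, log the WQE, ring the doorbell,
 * and charge the consumed 64-byte building blocks (BBs) against the TX
 * availability counter. */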
static UCS_F_ALWAYS_INLINE void
uct_ud_mlx5_post_send(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep,
                      uint8_t ce_se, struct mlx5_wqe_ctrl_seg *ctrl,
                      size_t wqe_size, uct_ud_neth_t *neth, int max_log_sge)
{
    struct mlx5_wqe_datagram_seg *dgram = (void*)(ctrl + 1);

    ucs_assert(wqe_size <= UCT_IB_MLX5_MAX_SEND_WQE_SIZE);

    UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth);

    uct_ib_mlx5_set_ctrl_seg(ctrl, iface->tx.wq.sw_pi, MLX5_OPCODE_SEND, 0,
                             iface->super.qp->qp_num,
                             uct_ud_mlx5_tx_moderation(iface, ce_se), wqe_size);
    uct_ib_mlx5_set_dgram_seg(dgram, &ep->av, ep->is_global ? &ep->grh_av : NULL,
                              IBV_QPT_UD);

    uct_ib_mlx5_log_tx(&iface->super.super, ctrl, iface->tx.wq.qstart,
                       iface->tx.wq.qend, max_log_sge, NULL, uct_ud_dump_packet);
    iface->super.tx.available -= uct_ib_mlx5_post_send(&iface->tx.wq, ctrl,
                                                       wqe_size);
    ucs_assert((int16_t)iface->tx.wq.bb_max >= iface->super.tx.available);
}

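/* Grab the next free WQE on the TX work queue and account for the
 * control+AV prefix; return the control segment and set *next_seg_p to the
 * first data segment, wrapping around the queue end if needed. */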
static UCS_F_ALWAYS_INLINE struct mlx5_wqe_ctrl_seg *
uct_ud_mlx5_ep_get_next_wqe(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep,
                            size_t *wqe_size_p, void **next_seg_p)
{
    size_t ctrl_av_size = uct_ud_mlx5_ep_ctrl_av_size(ep);
    struct mlx5_wqe_ctrl_seg *ctrl;
    void *ptr;

    ucs_assert((ctrl_av_size % UCT_IB_MLX5_WQE_SEG_SIZE) == 0);

    ctrl        = iface->tx.wq.curr;
    ptr         = UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size);

    *wqe_size_p = ctrl_av_size;
    *next_seg_p = uct_ib_mlx5_txwq_wrap_exact(&iface->tx.wq, ptr);

    return ctrl;
}

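/* Send a UD control packet (e.g. CREQ/CREP) from a pre-built skb: place the
 * skb inline when it fits, otherwise point to it by lkey, and append any
 * extra IOV entries as data segments. Returns the send sequence number
 * (sw_pi) of the posted WQE. */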
static uint16_t uct_ud_mlx5_ep_send_ctl(uct_ud_ep_t *ud_ep, uct_ud_send_skb_t *skb,
                                        const uct_ud_iov_t *iov, uint16_t iovcnt,
                                        int flags, int max_log_sge)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ud_ep->super.super.iface,
                                                uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep       = ucs_derived_of(ud_ep, uct_ud_mlx5_ep_t);
    struct mlx5_wqe_inl_data_seg *inl;
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_data_seg *dptr;
    uint16_t iov_index;
    size_t wqe_size;
    void *next_seg;
    uint8_t ce_se;
    uint16_t sn;

    /* set WQE flags */
    sn    = iface->tx.wq.sw_pi;
    ce_se = 0;
    if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED) {
        ce_se |= MLX5_WQE_CTRL_SOLICITED;
    }
    if (flags & UCT_UD_IFACE_SEND_CTL_FLAG_SIGNALED) {
        ce_se |= MLX5_WQE_CTRL_CQ_UPDATE;
    }

    /* set the skb header as inline data (if it fits) or as a data pointer */
    ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg);
    if (skb->len <= uct_ud_mlx5_max_inline()) {
        inl             = next_seg;
        inl->byte_count = htonl(skb->len | MLX5_INLINE_SEG);
        wqe_size       += ucs_align_up_pow2(sizeof(*inl) + skb->len,
                                            UCT_IB_MLX5_WQE_SEG_SIZE);
        uct_ib_mlx5_inline_copy(inl + 1, skb->neth, skb->len, &iface->tx.wq);
    } else {
        ucs_assert(!(flags & UCT_UD_IFACE_SEND_CTL_FLAG_INLINE));
        dptr            = next_seg;
        wqe_size       += sizeof(*dptr);
        uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey);
    }

    /* copy the IOV from the descriptor to the WQE */
    dptr = UCS_PTR_BYTE_OFFSET(ctrl, wqe_size);
    for (iov_index = 0; iov_index < iovcnt; ++iov_index) {
        if (iov[iov_index].length == 0) {
            continue;
        }

        dptr = uct_ib_mlx5_txwq_wrap_any(&iface->tx.wq, dptr);
        uct_ib_mlx5_set_data_seg(dptr, iov[iov_index].buffer,
                                 iov[iov_index].length, iov[iov_index].lkey);
        wqe_size += sizeof(*dptr);
        ++dptr;
    }

    uct_ud_mlx5_post_send(iface, ep, ce_se, ctrl, wqe_size, skb->neth,
                          max_log_sge);
    return sn;
}

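/* Replenish the receive queue: take up to rx_max_batch descriptors from the
 * memory pool, write their address/lkey into consecutive RX WQEs, and
 * publish the new producer count through the doorbell record. */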
static UCS_F_NOINLINE void
uct_ud_mlx5_iface_post_recv(uct_ud_mlx5_iface_t *iface)
{
    unsigned batch = iface->super.super.config.rx_max_batch;
    struct mlx5_wqe_data_seg *rx_wqes;
    uint16_t pi, next_pi, count;
    uct_ib_iface_recv_desc_t *desc;

    rx_wqes = iface->rx.wq.wqes;
    pi      = iface->rx.wq.rq_wqe_counter & iface->rx.wq.mask;

    for (count = 0; count < batch; count++) {
        next_pi = (pi + 1) & iface->rx.wq.mask;
        ucs_prefetch(rx_wqes + next_pi);
        UCT_TL_IFACE_GET_RX_DESC(&iface->super.super.super, &iface->super.rx.mp,
                                 desc, break);
        rx_wqes[pi].lkey = htonl(desc->lkey);
        rx_wqes[pi].addr = htobe64((uintptr_t)uct_ib_iface_recv_desc_hdr(&iface->super.super, desc));
        pi = next_pi;
    }
    if (ucs_unlikely(count == 0)) {
        ucs_debug("iface(%p) failed to post receive wqes", iface);
        return;
    }
    pi = iface->rx.wq.rq_wqe_counter + count;
    iface->rx.wq.rq_wqe_counter = pi;
    iface->super.rx.available -= count;
    ucs_memory_cpu_fence();
    *iface->rx.wq.dbrec = htonl(pi);
}

static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_ep_t, uct_iface_h tl_iface,
                           const uct_ep_params_t *params)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_mlx5_iface_t);
    ucs_trace_func("");
    UCS_CLASS_CALL_SUPER_INIT(uct_ud_ep_t, &iface->super, params);
    return UCS_OK;
}

static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_ep_t)
{
    ucs_trace_func("");
}

UCS_CLASS_DEFINE(uct_ud_mlx5_ep_t, uct_ud_ep_t);
static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_ep_t, uct_ep_t, uct_iface_h,
                                 const uct_ep_params_t*);
UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_ep_t, uct_ep_t);


/*
 * Generic inline+iov post-send function.
 * The caller must check that the header size plus the SG list does not
 * exceed the maximal WQE size.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id,
                               /* inl. header */ const void *header, size_t header_size,
                               /* inl. data */   const void *data, size_t data_size,
                               /* iov data */    const uct_iov_t *iov, size_t iovcnt,
                               uint32_t packet_flags, uct_completion_t *comp,
                               unsigned stat_ops_counter, unsigned stat_bytes_counter,
                               const char *func_name)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    struct mlx5_wqe_inl_data_seg *inl;
    struct mlx5_wqe_ctrl_seg *ctrl;
    size_t inline_size, wqe_size;
    void *next_seg, *wqe_data;
    uct_ud_send_skb_t *skb;
    ucs_status_t status;
    uct_ud_neth_t *neth;

    UCT_CHECK_AM_ID(am_id);
    UCT_UD_CHECK_ZCOPY_LENGTH(&iface->super, header_size + data_size,
                              uct_iov_total_length(iov, iovcnt));
    UCT_CHECK_IOV_SIZE(iovcnt, uct_ud_mlx5_max_am_iov(), func_name);

    uct_ud_enter(&iface->super);

    skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super);
    if (!skb) {
        status = UCS_ERR_NO_RESOURCE;
        goto out;
    }

    ctrl            = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size,
                                                  &next_seg);
    inl             = next_seg;
    inline_size     = sizeof(*neth) + header_size + data_size;
    inl->byte_count = htonl(inline_size | MLX5_INLINE_SEG);
    wqe_size       += sizeof(*inl) + inline_size;
    skb->len        = inline_size;

    /* set network header */
    neth              = (void*)(inl + 1);
    neth->packet_type = (am_id << UCT_UD_PACKET_AM_ID_SHIFT) |
                        ep->super.dest_ep_id |
                        packet_flags;
    uct_ud_neth_init_data(&ep->super, neth);
    if (!(packet_flags & UCT_UD_PACKET_FLAG_ACK_REQ)) {
        /* check for ACK_REQ, if not already enabled by packet_flags */
        neth->packet_type |= uct_ud_ep_req_ack(&ep->super) << UCT_UD_PACKET_ACK_REQ_SHIFT;
    }

    /* copy the inline "header", assuming it fits in one BB so we don't have
     * to check for QP wrap-around. This is either the "put" header or the
     * 64-bit am_short header, not the am_zcopy header.
     */
    wqe_data = UCS_PTR_BYTE_OFFSET(neth + 1, header_size);
    ucs_assert(wqe_data <= iface->tx.wq.qend);
    memcpy(neth + 1, header, header_size);

    /* copy inline "data" */
    uct_ib_mlx5_inline_copy(wqe_data, data, data_size, &iface->tx.wq);

    /* set iov to dptr */
    if (iovcnt > 0) {
        wqe_size  = ucs_align_up_pow2(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE);
        wqe_size += uct_ib_mlx5_set_data_seg_iov(&iface->tx.wq,
                                                 UCS_PTR_BYTE_OFFSET(ctrl, wqe_size),
                                                 iov, iovcnt);
    }

    uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size, neth,
                          UCT_IB_MAX_ZCOPY_LOG_SGE(&iface->super.super));

    /* keep a copy of the inline part in the skb, for possible resend */
    memcpy(skb->neth, neth, sizeof(*neth) + header_size);
    memcpy(UCS_PTR_BYTE_OFFSET(skb->neth + 1, header_size), data, data_size);

    if (iovcnt > 0) {
        uct_ud_skb_set_zcopy_desc(skb, iov, iovcnt, comp);
        status = UCS_INPROGRESS;
    } else {
        status = UCS_OK;
    }

    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    uct_ud_ep_ctl_op_del(&ep->super, UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ);

    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_ops_counter, 1);
    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_bytes_counter,
                             header_size + data_size +
                             uct_iov_total_length(iov, iovcnt));
out:
    uct_ud_leave(&iface->super);
    return status;
}

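/* Common helper for am_short/put_short: validate the inline length and post
 * a WQE with no IOV entries. */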
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ud_mlx5_ep_short_common(uct_ep_h tl_ep, uint8_t am_id,
                            /* inline header */ const void *header, size_t header_size,
                            /* inline data */   const void *data, size_t data_size,
                            uint32_t packet_flags, unsigned stat_ops_counter,
                            const char *func_name)
{
    UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_size + data_size, 0,
                     uct_ud_mlx5_max_inline(), func_name);
    return uct_ud_mlx5_ep_inline_iov_post(tl_ep, am_id,
                                          header, header_size,
                                          data, data_size,
                                          /* iov */ NULL, 0,
                                          packet_flags,
                                          /* completion */ NULL,
                                          stat_ops_counter,
                                          UCT_EP_STAT_BYTES_SHORT,
                                          func_name);
}

static ucs_status_t
uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr,
                        const void *buffer, unsigned length)
{
    return uct_ud_mlx5_ep_short_common(tl_ep, id,
                                       /* inline header */ &hdr, sizeof(hdr),
                                       /* inline data */  buffer, length,
                                       /* packet flags */ UCT_UD_PACKET_FLAG_AM,
                                       UCT_EP_STAT_AM,
                                       "uct_ud_mlx5_ep_am_short");
}

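/* Buffered-copy active message: pack the payload into an skb with the user
 * callback, then post it by pointer as a single data segment. */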
static ssize_t uct_ud_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                       uct_pack_callback_t pack_cb, void *arg,
                                       unsigned flags)
{
    uct_ud_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_data_seg *dptr;
    uct_ud_send_skb_t *skb;
    ucs_status_t status;
    size_t wqe_size;
    void *next_seg;
    size_t length;

    uct_ud_enter(&iface->super);

    status = uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb);
    if (status != UCS_OK) {
        uct_ud_leave(&iface->super);
        return status;
    }

    length = uct_ud_skb_bcopy(skb, pack_cb, arg);
    UCT_UD_CHECK_BCOPY_LENGTH(&iface->super, length);

    ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg);
    dptr = next_seg;
    uct_ib_mlx5_set_data_seg(dptr, skb->neth, skb->len, skb->lkey);
    uct_ud_mlx5_post_send(iface, ep, 0, ctrl, wqe_size + sizeof(*dptr),
                          skb->neth, INT_MAX);

    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    UCT_TL_EP_STAT_OP(&ep->super.super, AM, BCOPY, length);
    uct_ud_leave(&iface->super);
    return length;
}

static ucs_status_t
uct_ud_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header,
                        unsigned header_length, const uct_iov_t *iov,
                        size_t iovcnt, unsigned flags, uct_completion_t *comp)
{
    char dummy = 0; /* pass a dummy pointer as the 0-length header to avoid
                       compiler warnings */

    UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_length, 0,
                     UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE),
                     "am_zcopy header");
    return uct_ud_mlx5_ep_inline_iov_post(tl_ep, id,
                                          /* inl. header */  &dummy, 0,
                                          /* inl. data */    header, header_length,
                                          /* iov */          iov, iovcnt,
                                          /* packet flags */ UCT_UD_PACKET_FLAG_AM |
                                                             UCT_UD_PACKET_FLAG_ACK_REQ,
                                          /* completion */   comp,
                                          UCT_EP_STAT_AM, UCT_EP_STAT_BYTES_ZCOPY,
                                          "uct_ud_mlx5_ep_am_zcopy");
}

static ucs_status_t
uct_ud_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length,
                         uint64_t remote_addr, uct_rkey_t rkey)
{
    uct_ud_put_hdr_t puth = { .rva = remote_addr };
    return uct_ud_mlx5_ep_short_common(tl_ep, 0,
                                       /* inl. header */  &puth, sizeof(puth),
                                       /* inl. data */    buffer, length,
                                       /* packet flags */ UCT_UD_PACKET_FLAG_PUT,
                                       UCT_EP_STAT_PUT,
                                       "uct_ud_mlx5_ep_put_short");
}

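/* Poll the RX CQ for one completion: verify the GRH, hand the packet to the
 * generic UD receive path, and replenish the receive queue when enough
 * buffers have been consumed. Returns the number of packets processed
 * (0 or 1). */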
static UCS_F_ALWAYS_INLINE unsigned
uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async)
{
    struct mlx5_cqe64 *cqe;
    uint16_t ci;
    uct_ib_iface_recv_desc_t *desc;
    uint32_t len;
    void *packet;
    unsigned count;
    ptrdiff_t rx_hdr_offset;

    ci            = iface->rx.wq.cq_wqe_counter & iface->rx.wq.mask;
    packet        = (void *)be64toh(iface->rx.wq.wqes[ci].addr);
    ucs_prefetch(UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN));
    rx_hdr_offset = iface->super.super.config.rx_hdr_offset;
    desc          = UCS_PTR_BYTE_OFFSET(packet, -rx_hdr_offset);

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_RX]);
    if (cqe == NULL) {
        count = 0;
        goto out;
    }

    ucs_memory_cpu_load_fence();

    ucs_assert(0 == (cqe->op_own &
               (MLX5_INLINE_SCATTER_32|MLX5_INLINE_SCATTER_64)));
    ucs_assert(ntohs(cqe->wqe_counter) == iface->rx.wq.cq_wqe_counter);

    iface->super.rx.available++;
    iface->rx.wq.cq_wqe_counter++;
    count = 1;
    len   = ntohl(cqe->byte_cnt);
    VALGRIND_MAKE_MEM_DEFINED(packet, len);

    if (!uct_ud_iface_check_grh(&iface->super, packet,
                                uct_ib_mlx5_cqe_is_grh_present(cqe))) {
        ucs_mpool_put_inline(desc);
        goto out;
    }

    uct_ib_mlx5_log_rx(&iface->super.super, cqe, packet, uct_ud_dump_packet);
    /* coverity[tainted_data] */
    uct_ud_ep_process_rx(&iface->super,
                         (uct_ud_neth_t *)UCS_PTR_BYTE_OFFSET(packet, UCT_IB_GRH_LEN),
                         len - UCT_IB_GRH_LEN,
                         (uct_ud_recv_skb_t *)ucs_unaligned_ptr(desc), is_async);
out:
    if (iface->super.rx.available >= iface->super.super.config.rx_max_batch) {
        /* we always need to try to post receive buffers; otherwise, if the
         * receiver is slow and there is always a CQE to process, we could
         * run out of RX WQEs
         */
        uct_ud_mlx5_iface_post_recv(iface);
    }
    return count;
}

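/* Poll the TX CQ for a send completion and release the building blocks it
 * covers back to the TX availability counter. Returns the number of
 * completions processed (0 or 1). */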
static UCS_F_ALWAYS_INLINE unsigned
uct_ud_mlx5_iface_poll_tx(uct_ud_mlx5_iface_t *iface, int is_async)
{
    struct mlx5_cqe64 *cqe;
    uint16_t hw_ci;

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super, &iface->cq[UCT_IB_DIR_TX]);
    if (cqe == NULL) {
        return 0;
    }

    ucs_memory_cpu_load_fence();

    uct_ib_mlx5_log_cqe(cqe);
    hw_ci                     = ntohs(cqe->wqe_counter);
    iface->super.tx.available = uct_ib_mlx5_txwq_update_bb(&iface->tx.wq, hw_ci);

    uct_ud_iface_send_completion(&iface->super, hw_ci, is_async);

    return 1;
}

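/* Synchronous progress callback: flush deferred async completions and
 * pending RX, then poll RX up to rx_max_poll packets and TX once, and
 * finally dispatch the pending-send queue. */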
static unsigned uct_ud_mlx5_iface_progress(uct_iface_h tl_iface)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_mlx5_iface_t);
    ucs_status_t status;
    unsigned n, count = 0;

    uct_ud_enter(&iface->super);
    uct_ud_iface_dispatch_async_comps(&iface->super);

    status = uct_ud_iface_dispatch_pending_rx(&iface->super);
    if (ucs_likely(status == UCS_OK)) {
        do {
            n = uct_ud_mlx5_iface_poll_rx(iface, 0);
            count += n;
        } while ((n > 0) && (count < iface->super.super.config.rx_max_poll));
    }

    count += uct_ud_mlx5_iface_poll_tx(iface, 0);
    uct_ud_iface_progress_pending(&iface->super, 0);
    uct_ud_leave(&iface->super);
    return count;
}

/* Progress callback invoked from the asynchronous context; same polling
 * loops as the synchronous path, bounded by async_max_poll */
static unsigned uct_ud_mlx5_iface_async_progress(uct_ud_iface_t *ud_iface)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ud_iface, uct_ud_mlx5_iface_t);
    unsigned n, count;

    count = 0;
    do {
        n = uct_ud_mlx5_iface_poll_rx(iface, 1);
        count += n;
    } while ((n > 0) && (count < iface->super.rx.async_max_poll));

    count += uct_ud_mlx5_iface_poll_tx(iface, 1);

    uct_ud_iface_progress_pending(&iface->super, 1);

    return count;
}

static ucs_status_t
uct_ud_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr)
{
    uct_ud_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_iface_t);
    ucs_status_t status;

    ucs_trace_func("");

    status = uct_ud_iface_query(iface, iface_attr, uct_ud_mlx5_max_am_iov(),
                                UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE)
                                - sizeof(uct_ud_neth_t));
    if (status != UCS_OK) {
        return status;
    }

    iface_attr->overhead = 80e-9; /* Software overhead */

    return UCS_OK;
}

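/* Resolve the destination address: build the mlx5 address vector (and GRH
 * for global routes) from the IB address, and merge the remote QP number
 * into the AV's dqp_dct field. */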
static ucs_status_t
uct_ud_mlx5_ep_create_ah(uct_ud_mlx5_iface_t *iface, uct_ud_mlx5_ep_t *ep,
                         const uct_ib_address_t *ib_addr, unsigned path_index,
                         const uct_ud_iface_addr_t *if_addr)
{
    ucs_status_t status;
    uint32_t remote_qpn;
    int is_global;

    status = uct_ud_mlx5_iface_get_av(&iface->super.super, &iface->ud_mlx5_common,
                                      ib_addr, path_index, &ep->av, &ep->grh_av,
                                      &is_global);
    if (status != UCS_OK) {
        return status;
    }

    remote_qpn      = uct_ib_unpack_uint24(if_addr->qp_num);
    ep->is_global   = is_global;
    ep->av.dqp_dct |= htonl(remote_qpn);
    return UCS_OK;
}

static ucs_status_t
uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h,
                                const uct_device_addr_t *dev_addr,
                                const uct_iface_addr_t *iface_addr,
                                unsigned path_index, uct_ep_h *new_ep_p)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(iface_h, uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep;
    uct_ud_ep_t *new_ud_ep;
    const uct_ud_iface_addr_t *if_addr = (const uct_ud_iface_addr_t *)iface_addr;
    const uct_ib_address_t *ib_addr = (const uct_ib_address_t *)dev_addr;
    uct_ud_send_skb_t *skb;
    ucs_status_t status, status_ah;

    uct_ud_enter(&iface->super);
    status = uct_ud_ep_create_connected_common(&iface->super, ib_addr, if_addr,
                                               path_index, &new_ud_ep, &skb);
    if (status != UCS_OK &&
        status != UCS_ERR_NO_RESOURCE &&
        status != UCS_ERR_ALREADY_EXISTS) {
        uct_ud_leave(&iface->super);
        return status;
    }

    ep = ucs_derived_of(new_ud_ep, uct_ud_mlx5_ep_t);
    /* cppcheck-suppress autoVariables */
    *new_ep_p = &ep->super.super.super;
    if (status == UCS_ERR_ALREADY_EXISTS) {
        uct_ud_leave(&iface->super);
        return UCS_OK;
    }

    status_ah = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr,
                                         ep->super.path_index, if_addr);
    if (status_ah != UCS_OK) {
        uct_ud_ep_destroy_connected(&ep->super, ib_addr, if_addr);
        *new_ep_p = NULL;
        uct_ud_leave(&iface->super);
        return status_ah;
    }

    if (status == UCS_OK) {
        /* send the connection request (CREQ) right away */
        uct_ud_mlx5_ep_send_ctl(&ep->super, skb, NULL, 0, 1, 1);
        uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
        ep->super.flags |= UCT_UD_EP_FLAG_CREQ_SENT;
    }

    uct_ud_leave(&iface->super);
    return UCS_OK;
}

static ucs_status_t
uct_ud_mlx5_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p)
{
    if (ucs_test_all_flags(params->field_mask, UCT_EP_PARAM_FIELD_DEV_ADDR |
                                               UCT_EP_PARAM_FIELD_IFACE_ADDR)) {
        return uct_ud_mlx5_ep_create_connected(params->iface, params->dev_addr,
                                               params->iface_addr,
                                               UCT_EP_PARAMS_GET_PATH_INDEX(params),
                                               ep_p);
    }

    return uct_ud_mlx5_ep_t_new(params->iface, params, ep_p);
}


static ucs_status_t
uct_ud_mlx5_ep_connect_to_ep(uct_ep_h tl_ep,
                             const uct_device_addr_t *dev_addr,
                             const uct_ep_addr_t *uct_ep_addr)
{
    ucs_status_t status;
    uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    const uct_ud_ep_addr_t *ep_addr = (const uct_ud_ep_addr_t *)uct_ep_addr;
    const uct_ib_address_t *ib_addr = (const uct_ib_address_t *)dev_addr;

    ucs_trace_func("");
    status = uct_ud_ep_connect_to_ep(&ep->super, ib_addr, ep_addr);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ud_mlx5_ep_create_ah(iface, ep, ib_addr, ep->super.path_index,
                                      (const uct_ud_iface_addr_t *)ep_addr);
    if (status != UCS_OK) {
        return status;
    }

    return UCS_OK;
}

static ucs_status_t uct_ud_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface,
                                             uct_ib_dir_t dir,
                                             int solicited)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);
#if HAVE_DECL_MLX5DV_INIT_OBJ
    return uct_ib_mlx5dv_arm_cq(&iface->cq[dir], solicited);
#else
    uct_ib_mlx5_update_cq_ci(iface->super.super.cq[dir],
                             iface->cq[dir].cq_ci);
    return uct_ib_iface_arm_cq(ib_iface, dir, solicited);
#endif
}

static ucs_status_t uct_ud_mlx5_ep_set_failed(uct_ib_iface_t *iface,
                                              uct_ep_h ep, ucs_status_t status)
{
    return uct_set_ep_failed(&UCS_CLASS_NAME(uct_ud_mlx5_ep_t), ep,
                             &iface->super.super, status);
}

static void uct_ud_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface,
                                       uct_ib_dir_t dir)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);

    iface->cq[dir].cq_sn++;
}

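/* Create the UD QP through the mlx5-specific path so that its work queues
 * can be driven directly, and return the underlying verbs QP to the base
 * IB interface. */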
static ucs_status_t uct_ud_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface,
                                                uct_ib_qp_attr_t *ib_attr,
                                                struct ibv_qp **qp_p)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);
    uct_ib_mlx5_qp_t *qp = &iface->tx.wq.super;
    uct_ib_mlx5_qp_attr_t attr = {};
    ucs_status_t status;

    attr.super     = *ib_attr;
    attr.mmio_mode = UCT_IB_MLX5_MMIO_MODE_LAST;

    status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, &attr);
    if (status != UCS_OK) {
        return status;
    }

    *qp_p = qp->verbs.qp;
    return status;
}

static void UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t)(uct_iface_t*);

static void uct_ud_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg,
                                             ucs_status_t status)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);

    /* Local side failure - treat as fatal */
    uct_ib_mlx5_completion_with_err(ib_iface, arg, &iface->tx.wq,
                                    UCS_LOG_LEVEL_FATAL);
}

static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = {
    {
    {
    .ep_put_short             = uct_ud_mlx5_ep_put_short,
    .ep_am_short              = uct_ud_mlx5_ep_am_short,
    .ep_am_bcopy              = uct_ud_mlx5_ep_am_bcopy,
    .ep_am_zcopy              = uct_ud_mlx5_ep_am_zcopy,
    .ep_pending_add           = uct_ud_ep_pending_add,
    .ep_pending_purge         = uct_ud_ep_pending_purge,
    .ep_flush                 = uct_ud_ep_flush,
    .ep_fence                 = uct_base_ep_fence,
    .ep_create                = uct_ud_mlx5_ep_create,
    .ep_destroy               = uct_ud_ep_disconnect,
    .ep_get_address           = uct_ud_ep_get_address,
    .ep_connect_to_ep         = uct_ud_mlx5_ep_connect_to_ep,
    .iface_flush              = uct_ud_iface_flush,
    .iface_fence              = uct_base_iface_fence,
    .iface_progress_enable    = uct_ud_iface_progress_enable,
    .iface_progress_disable   = uct_ud_iface_progress_disable,
    .iface_progress           = uct_ud_mlx5_iface_progress,
    .iface_event_fd_get       = (uct_iface_event_fd_get_func_t)
                                ucs_empty_function_return_unsupported,
    .iface_event_arm          = uct_ud_iface_event_arm,
    .iface_close              = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t),
    .iface_query              = uct_ud_mlx5_iface_query,
    .iface_get_device_address = uct_ib_iface_get_device_address,
    .iface_get_address        = uct_ud_iface_get_address,
    .iface_is_reachable       = uct_ib_iface_is_reachable
    },
    .create_cq                = uct_ib_mlx5_create_cq,
    .arm_cq                   = uct_ud_mlx5_iface_arm_cq,
    .event_cq                 = uct_ud_mlx5_iface_event_cq,
    .handle_failure           = uct_ud_mlx5_iface_handle_failure,
    .set_ep_failed            = uct_ud_mlx5_ep_set_failed,
    },
    .async_progress           = uct_ud_mlx5_iface_async_progress,
    .send_ctl                 = uct_ud_mlx5_ep_send_ctl,
    .ep_free                  = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_ep_t),
    .create_qp                = uct_ud_mlx5_iface_create_qp,
};

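/* Interface constructor: initialize the base UD interface, resolve the mlx5
 * views of the CQs and work queues, pre-fill the RX WQE buffer sizes, and
 * post the initial receive buffers. */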
static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t,
                           uct_md_h md, uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_ud_mlx5_iface_config_t *config = ucs_derived_of(tl_config,
                                                        uct_ud_mlx5_iface_config_t);
    uct_ib_iface_init_attr_t init_attr = {};
    ucs_status_t status;
    int i;

    ucs_trace_func("");

    init_attr.flags                 = UCT_IB_CQ_IGNORE_OVERRUN;
    init_attr.cq_len[UCT_IB_DIR_TX] = config->super.super.tx.queue_len * UCT_IB_MLX5_MAX_BB;
    init_attr.cq_len[UCT_IB_DIR_RX] = config->super.super.rx.queue_len;

    self->tx.wq.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST;

    UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops,
                              md, worker, params, &config->super, &init_attr);

    self->super.config.max_inline = uct_ud_mlx5_max_inline();

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], &self->cq[UCT_IB_DIR_TX]);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX], &self->cq[UCT_IB_DIR_RX]);
    if (status != UCS_OK) {
        return status;
    }

    status = uct_ib_mlx5_txwq_init(self->super.super.super.worker,
                                   config->mlx5_common.mmio_mode, &self->tx.wq,
                                   self->super.qp);
    if (status != UCS_OK) {
        return status;
    }

    self->super.tx.available = self->tx.wq.bb_max;
    ucs_assert(init_attr.cq_len[UCT_IB_DIR_TX] >= self->tx.wq.bb_max);

    status = uct_ib_mlx5_get_rxwq(self->super.qp, &self->rx.wq);
    if (status != UCS_OK) {
        return status;
    }

    ucs_assert(init_attr.cq_len[UCT_IB_DIR_RX] > self->rx.wq.mask);

    status = uct_ud_mlx5_iface_common_init(&self->super.super,
                                           &self->ud_mlx5_common,
                                           &config->ud_mlx5_common);
    if (status != UCS_OK) {
        return status;
    }

    /* write buffer sizes */
    for (i = 0; i <= self->rx.wq.mask; i++) {
        self->rx.wq.wqes[i].byte_count = htonl(self->super.super.config.rx_payload_offset +
                                               self->super.super.config.seg_size);
    }
    while (self->super.rx.available >= self->super.super.config.rx_max_batch) {
        uct_ud_mlx5_iface_post_recv(self);
    }

    status = uct_ud_iface_complete_init(&self->super);
    if (status != UCS_OK) {
        return status;
    }

    return UCS_OK;
}


static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_iface_t)
{
    ucs_trace_func("");
    uct_ud_iface_remove_async_handlers(&self->super);
    uct_ud_enter(&self->super);
    UCT_UD_IFACE_DELETE_EPS(&self->super, uct_ud_mlx5_ep_t);
    uct_ib_mlx5_txwq_cleanup(&self->tx.wq);
    uct_ud_leave(&self->super);
}

UCS_CLASS_DEFINE(uct_ud_mlx5_iface_t, uct_ud_iface_t);

static UCS_CLASS_DEFINE_NEW_FUNC(uct_ud_mlx5_iface_t, uct_iface_t, uct_md_h,
                                 uct_worker_h, const uct_iface_params_t*,
                                 const uct_iface_config_t*);

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_iface_t, uct_iface_t);

static ucs_status_t
uct_ud_mlx5_query_tl_devices(uct_md_h md,
                             uct_tl_device_resource_t **tl_devices_p,
                             unsigned *num_tl_devices_p)
{
    uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t);
    return uct_ib_device_query_ports(&ib_md->dev, UCT_IB_DEVICE_FLAG_MLX5_PRM,
                                     tl_devices_p, num_tl_devices_p);
}

UCT_TL_DEFINE(&uct_ib_component, ud_mlx5, uct_ud_mlx5_query_tl_devices,
              uct_ud_mlx5_iface_t, "UD_MLX5_", uct_ud_mlx5_iface_config_table,
              uct_ud_mlx5_iface_config_t);