/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2019.  ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCT_RC_MLX5_COMMON_H
#define UCT_RC_MLX5_COMMON_H

#include <uct/ib/base/ib_device.h>
#include <uct/ib/rc/base/rc_iface.h>
#include <uct/ib/rc/base/rc_ep.h>
#include <uct/ib/mlx5/ib_mlx5.h>


/*
 * HW tag matching
 */
#if IBV_HW_TM
#  define UCT_RC_RNDV_HDR_LEN         sizeof(struct ibv_rvh)
#else
#  define UCT_RC_RNDV_HDR_LEN         0
#endif

#if IBV_HW_TM
#  if HAVE_INFINIBAND_TM_TYPES_H
#    include <infiniband/tm_types.h>
#  else
#    define ibv_tmh                         ibv_exp_tmh
#    define ibv_rvh                         ibv_exp_tmh_rvh
#    define IBV_TM_CAP_RC                   IBV_EXP_TM_CAP_RC
#    define IBV_TMH_EAGER                   IBV_EXP_TMH_EAGER
#    define IBV_TMH_RNDV                    IBV_EXP_TMH_RNDV
#    define IBV_TMH_FIN                     IBV_EXP_TMH_FIN
#    define IBV_TMH_NO_TAG                  IBV_EXP_TMH_NO_TAG
#  endif
#  define IBV_DEVICE_TM_CAPS(_dev, _field)  ((_dev)->dev_attr.tm_caps._field)
#else
#  define IBV_TM_CAP_RC                     0
#  define IBV_DEVICE_TM_CAPS(_dev, _field)  0
#endif

#if HAVE_STRUCT_IBV_TM_CAPS_FLAGS
#  define IBV_DEVICE_TM_FLAGS(_dev)         IBV_DEVICE_TM_CAPS(_dev, flags)
#else
#  define IBV_DEVICE_TM_FLAGS(_dev)         IBV_DEVICE_TM_CAPS(_dev, capability_flags)
#endif

#define IBV_DEVICE_MAX_UNEXP_COUNT          UCS_BIT(14)

#if HAVE_DECL_IBV_EXP_CREATE_SRQ
#  define ibv_srq_init_attr_ex              ibv_exp_create_srq_attr
#endif

#define UCT_RC_MLX5_OPCODE_FLAG_RAW         0x100
#define UCT_RC_MLX5_OPCODE_FLAG_TM          0x200
#define UCT_RC_MLX5_OPCODE_MASK             0xff
#define UCT_RC_MLX5_SINGLE_FRAG_MSG(_flags) \
    (((_flags) & UCT_CB_PARAM_FLAG_FIRST) && !((_flags) & UCT_CB_PARAM_FLAG_MORE))

#define UCT_RC_MLX5_CHECK_AM_ZCOPY(_id, _header_length, _length, _seg_size, _av_size) \
    UCT_CHECK_AM_ID(_id); \
    UCT_RC_CHECK_ZCOPY_DATA(_header_length, _length, _seg_size) \
    UCT_CHECK_LENGTH(sizeof(uct_rc_mlx5_hdr_t) + _header_length, 0, \
                     UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(_av_size), "am_zcopy header");


#define UCT_RC_MLX5_CHECK_AM_SHORT(_id, _length, _av_size) \
    UCT_CHECK_AM_ID(_id); \
    UCT_CHECK_LENGTH(sizeof(uct_rc_mlx5_am_short_hdr_t) + _length, 0, \
        UCT_IB_MLX5_AM_MAX_SHORT(_av_size), "am_short");


/* There is no need for a special check for length == 0 because in that
 * case the WQE size is still valid: inl + raddr + dgram + ctrl fit in 2 WQ BBs.
 */
#define UCT_RC_MLX5_CHECK_PUT_SHORT(_length, _av_size) \
    UCT_CHECK_LENGTH(_length, 0, UCT_IB_MLX5_PUT_MAX_SHORT(_av_size), "put_short")
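
/*
 * Size illustration (not a definition; the real limit comes from ib_mlx5.h):
 * assuming 64-byte WQ basic blocks, a 16-byte ctrl segment, a 16-byte raddr
 * segment and a 4-byte inline header, a put_short WQE takes roughly
 * 16 + av_size + 16 + 4 + length bytes, so the short limit shrinks as the
 * address vector grows and is largest for RC, where _av_size is 0.
 */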

#define UCT_RC_MLX5_ATOMIC_OPS (UCS_BIT(UCT_ATOMIC_OP_ADD) | \
                                UCS_BIT(UCT_ATOMIC_OP_AND) | \
                                UCS_BIT(UCT_ATOMIC_OP_OR)  | \
                                UCS_BIT(UCT_ATOMIC_OP_XOR))

#define UCT_RC_MLX5_ATOMIC_FOPS (UCT_RC_MLX5_ATOMIC_OPS | UCS_BIT(UCT_ATOMIC_OP_SWAP))

#define UCT_RC_MLX5_CHECK_ATOMIC_OPS(_op, _size, _flags)                        \
    if (ucs_unlikely(!(UCS_BIT(_op) & (_flags)))) {                             \
        ucs_assertv(0, "incorrect opcode for atomic: %d", _op);                 \
        return UCS_ERR_UNSUPPORTED;                                             \
    } else {                                                                    \
        ucs_assert((_size == sizeof(uint64_t)) || (_size == sizeof(uint32_t))); \
    }

#define UCT_RC_MLX5_TO_BE(_val, _size) \
    ((_size) == sizeof(uint64_t) ? htobe64(_val) : htobe32(_val))
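
/*
 * Usage sketch (illustrative only; "opcode", "size", "value" and "swapped"
 * are caller-side variables, not part of this header): an atomic post path
 * first validates the opcode against the supported mask and then byte-swaps
 * the operand according to the operation size:
 *
 *     UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_FOPS);
 *     swapped = UCT_RC_MLX5_TO_BE(value, size);
 */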

#define UCT_RC_MLX5_DECLARE_ATOMIC_LE_HANDLER(_bits) \
    void \
    uct_rc_mlx5_common_atomic##_bits##_le_handler(uct_rc_iface_send_op_t *op, \
                                                  const void *resp);

UCT_RC_MLX5_DECLARE_ATOMIC_LE_HANDLER(32)
UCT_RC_MLX5_DECLARE_ATOMIC_LE_HANDLER(64)

enum {
    UCT_RC_MLX5_IFACE_STAT_RX_INL_32,
    UCT_RC_MLX5_IFACE_STAT_RX_INL_64,
    UCT_RC_MLX5_IFACE_STAT_LAST
};

enum {
    UCT_RC_MLX5_TM_OPCODE_NOP              = 0x00,
    UCT_RC_MLX5_TM_OPCODE_APPEND           = 0x01,
    UCT_RC_MLX5_TM_OPCODE_REMOVE           = 0x02
};

/* TODO: Remove/replace this enum when mlx5dv.h is included */
enum {
    UCT_RC_MLX5_OPCODE_TAG_MATCHING          = 0x28,
    UCT_RC_MLX5_CQE_APP_TAG_MATCHING         = 1,

    /* last packet flag for multi-packet RQs */
    UCT_RC_MLX5_MP_RQ_LAST_MSG_FIELD         = 0x40000000,

    /* byte count mask for multi-packet RQs */
    UCT_RC_MLX5_MP_RQ_BYTE_CNT_FIELD_MASK    = 0x0000FFFF,

    UCT_RC_MLX5_MP_RQ_NUM_STRIDES_FIELD_MASK = 0x3FFF0000,

    /* filler cqe indicator */
    UCT_RC_MLX5_MP_RQ_FILLER_CQE             = UCS_BIT(31),

    /* tag segment flags */
    UCT_RC_MLX5_SRQ_FLAG_TM_SW_CNT           = (1 << 6),
    UCT_RC_MLX5_SRQ_FLAG_TM_CQE_REQ          = (1 << 7),

    /* tag CQE codes */
    UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED       = 0x1,
    UCT_RC_MLX5_CQE_APP_OP_TM_EXPECTED       = 0x2,
    UCT_RC_MLX5_CQE_APP_OP_TM_UNEXPECTED     = 0x3,
    UCT_RC_MLX5_CQE_APP_OP_TM_NO_TAG         = 0x4,
    UCT_RC_MLX5_CQE_APP_OP_TM_APPEND         = 0x5,
    UCT_RC_MLX5_CQE_APP_OP_TM_REMOVE         = 0x6,
    UCT_RC_MLX5_CQE_APP_OP_TM_CONSUMED_MSG   = 0xA
};

enum {
    UCT_RC_MLX5_POLL_FLAG_TM                 = UCS_BIT(0),
    UCT_RC_MLX5_POLL_FLAG_HAS_EP             = UCS_BIT(1),
    UCT_RC_MLX5_POLL_FLAG_TAG_CQE            = UCS_BIT(2)
};


#define UCT_RC_MLX5_RMA_MAX_IOV(_av_size) \
    ((UCT_IB_MLX5_MAX_SEND_WQE_SIZE - ((_av_size) + \
     sizeof(struct mlx5_wqe_raddr_seg) + sizeof(struct mlx5_wqe_ctrl_seg))) / \
     sizeof(struct mlx5_wqe_data_seg))
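
/*
 * Worked example (illustrative, assuming UCT_IB_MLX5_MAX_SEND_WQE_SIZE is
 * 4 basic blocks, i.e. 256 bytes): for RC, where _av_size is 0, this gives
 * (256 - 16 - 16) / 16 = 14 data segments per RDMA WQE; a non-zero address
 * vector (e.g. DC) reduces the count accordingly.
 */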


#if IBV_HW_TM
#  define UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(_av_size) \
       (UCT_IB_MLX5_AM_MAX_SHORT(_av_size + sizeof(struct ibv_tmh))/ \
        sizeof(struct mlx5_wqe_data_seg))
#else
#  define UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(_av_size)   0
#endif /* IBV_HW_TM  */


#define UCT_RC_MLX5_TM_CQE_WITH_IMM(_cqe64) \
   (((_cqe64)->op_own >> 4) == MLX5_CQE_RESP_SEND_IMM)


#define UCT_RC_MLX5_TM_IS_SW_RNDV(_cqe64, _imm_data) \
   (ucs_unlikely(UCT_RC_MLX5_TM_CQE_WITH_IMM(_cqe64) && !(_imm_data)))


#define UCT_RC_MLX5_CHECK_TAG(_mlx5_common_iface) \
   if (ucs_unlikely((_mlx5_common_iface)->tm.head->next == NULL)) {  \
       return UCS_ERR_EXCEEDS_LIMIT; \
   }


typedef struct uct_rc_mlx5_hdr {
    uint8_t           tmh_opcode; /* TMH.opcode */
    uct_rc_hdr_t      rc_hdr;
} UCS_S_PACKED uct_rc_mlx5_hdr_t;

/*
 * Short active message header (active message header is always 64 bit).
 */
typedef struct uct_rc_mlx5_am_short_hdr {
    uct_rc_mlx5_hdr_t  rc_hdr;
    uint64_t           am_hdr;
} UCS_S_PACKED uct_rc_mlx5_am_short_hdr_t;


/* TODO: Remove this struct when mlx5dv.h is included! */
typedef struct uct_rc_mlx5_wqe_tm_seg {
    uint8_t                       opcode;
    uint8_t                       flags;
    uint16_t                      index;
    uint8_t                       rsvd0[2];
    uint16_t                      sw_cnt;
    uint8_t                       rsvd1[8];
    uint64_t                      append_tag;
    uint64_t                      append_mask;
} uct_rc_mlx5_wqe_tm_seg_t;


/* Tag matching list entry */
typedef struct uct_rc_mlx5_tag_entry {
    struct uct_rc_mlx5_tag_entry  *next;
    uct_tag_context_t             *ctx;     /* the corresponding UCT context */
    unsigned                      num_cqes; /* how many CQEs are expected for this entry */
} uct_rc_mlx5_tag_entry_t;


/* Pending operation on the command QP */
typedef struct uct_rc_mlx5_srq_op {
    uct_rc_mlx5_tag_entry_t       *tag;
} uct_rc_mlx5_srq_op_t;


/* Command QP work-queue. All tag matching list operations are posted on it. */
typedef struct uct_rc_mlx5_cmd_wq {
    uct_ib_mlx5_txwq_t            super;
    uct_rc_mlx5_srq_op_t          *ops;     /* array of operations on command QP */
    int                           ops_head; /* points to the next operation to be completed */
    int                           ops_tail; /* points to the last added operation */
    int                           ops_mask; /* mask which bounds head and tail by
                                               ops array size */
} uct_rc_mlx5_cmd_wq_t;
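
/*
 * Ring usage sketch (illustrative, not the exact driver code): the ops array
 * has a power-of-two size and is indexed through ops_mask, e.g.
 *
 *     // post: remember which tag entry the command WQE refers to
 *     wq->ops[wq->ops_tail++ & wq->ops_mask].tag = tag_entry;
 *     ...
 *     // completion: consume in order from the head
 *     op = &wq->ops[wq->ops_head++ & wq->ops_mask];
 *
 * where "wq" is a uct_rc_mlx5_cmd_wq_t* and "tag_entry" is a caller-side
 * variable.
 */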


/* Message context used with multi-packet XRQ */
typedef struct uct_rc_mlx5_mp_context {
    /* Storage for a per-message user-defined context. Must be passed unchanged
     * to the user in uct_tag_unexp_eager_cb_t. */
    void                          *context;

    /* The tag is saved when the first fragment (with TMH) arrives and is then
     * passed to the unexpected eager callback for subsequent fragments. */
    uct_tag_t                     tag;

    /* With MP XRQ the immediate value is delivered with the last fragment,
     * while the TMH is present in the first fragment only. The app_context
     * from the TMH is saved in this field and used to construct the immediate
     * data for the unexpected eager callback when the last message fragment
     * arrives. */
    uint32_t                      app_ctx;

    /* Used when the local EP can be found by the sender QP number (rc_mlx5 tl).
     * When 0, an unexpected multi-fragment tag eager message is being
     * processed (not all fragments have been delivered to the user via the
     * uct_tag_unexp_eager_cb_t callback yet). Otherwise, any incoming tag
     * eager message must be either a single-fragment message or the first
     * fragment of a multi-fragment message. */
    uint8_t                       free;
} uct_rc_mlx5_mp_context_t;


typedef struct uct_rc_mlx5_mp_hash_key {
    uint64_t                      guid;
    uint32_t                      qp_num;
} uct_rc_mlx5_mp_hash_key_t;


static UCS_F_ALWAYS_INLINE int
uct_rc_mlx5_mp_hash_equal(uct_rc_mlx5_mp_hash_key_t key1,
                          uct_rc_mlx5_mp_hash_key_t key2)
{
    return (key1.qp_num == key2.qp_num) && (key1.guid == key2.guid);
}


static UCS_F_ALWAYS_INLINE khint32_t
uct_rc_mlx5_mp_hash_func(uct_rc_mlx5_mp_hash_key_t key)
{
    return kh_int64_hash_func(key.guid ^ key.qp_num);
}


KHASH_MAP_INIT_INT64(uct_rc_mlx5_mp_hash_lid, uct_rc_mlx5_mp_context_t);


KHASH_INIT(uct_rc_mlx5_mp_hash_gid, uct_rc_mlx5_mp_hash_key_t,
           uct_rc_mlx5_mp_context_t, 1, uct_rc_mlx5_mp_hash_func,
           uct_rc_mlx5_mp_hash_equal);
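
/*
 * Lookup sketch (illustrative): a multi-packet message context is found by
 * the sender's {GUID, QP number} pair via the standard khash API, e.g.
 *
 *     uct_rc_mlx5_mp_hash_key_t key = { .guid = guid, .qp_num = qp_num };
 *     khiter_t iter                 = kh_get(uct_rc_mlx5_mp_hash_gid, hash, key);
 *     if (iter != kh_end(hash)) {
 *         uct_rc_mlx5_mp_context_t *mp_ctx = &kh_value(hash, iter);
 *         ...
 *     }
 *
 * where "hash", "guid" and "qp_num" are caller-side variables.
 */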


#if IBV_HW_TM
#  define UCT_RC_MLX5_IFACE_GET_TM_BCOPY_DESC(_iface, _mp, _desc, _tag, _app_ctx, \
                                              _pack_cb, _arg, _length) \
       { \
           void *hdr; \
           UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
           (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \
           hdr = (_desc) + 1; \
           uct_rc_mlx5_fill_tmh(hdr, _tag, _app_ctx, IBV_TMH_EAGER); \
           hdr = UCS_PTR_BYTE_OFFSET(hdr, sizeof(struct ibv_tmh)); \
           _length = _pack_cb(hdr, _arg); \
       }
#endif

enum {
    UCT_RC_MLX5_STAT_TAG_RX_EXP,
    UCT_RC_MLX5_STAT_TAG_RX_EAGER_UNEXP,
    UCT_RC_MLX5_STAT_TAG_RX_RNDV_UNEXP,
    UCT_RC_MLX5_STAT_TAG_RX_RNDV_REQ_EXP,
    UCT_RC_MLX5_STAT_TAG_RX_RNDV_REQ_UNEXP,
    UCT_RC_MLX5_STAT_TAG_RX_RNDV_FIN,
    UCT_RC_MLX5_STAT_TAG_LIST_ADD,
    UCT_RC_MLX5_STAT_TAG_LIST_DEL,
    UCT_RC_MLX5_STAT_TAG_LIST_SYNC,
    UCT_RC_MLX5_STAT_TAG_LAST
};

typedef struct uct_rc_mlx5_tmh_priv_data {
    uint8_t                     length;
    uint16_t                    data;
} UCS_S_PACKED uct_rc_mlx5_tmh_priv_data_t;

void uct_rc_mlx5_release_desc(uct_recv_desc_t *self, void *desc);

typedef struct uct_rc_mlx5_release_desc {
    uct_recv_desc_t             super;
    unsigned                    offset;
} uct_rc_mlx5_release_desc_t;


typedef struct uct_rc_mlx5_ctx_priv {
    uint64_t                    tag;
    void                        *buffer;
    uint32_t                    app_ctx;
    uint32_t                    length;
    uint32_t                    tag_handle;
} uct_rc_mlx5_ctx_priv_t;

#if HAVE_IBV_DM
typedef struct uct_mlx5_dm_data {
    uct_worker_tl_data_t super;
    ucs_mpool_t          mp;
    struct ibv_mr        *mr;
    struct ibv_dm        *dm;
    void                 *start_va;
    size_t               seg_len;
    unsigned             seg_count;
    unsigned             seg_attached;
    uct_ib_device_t      *device;
} uct_mlx5_dm_data_t;

typedef union uct_rc_mlx5_dm_copy_data {
    uct_rc_mlx5_am_short_hdr_t am_hdr;
    struct ibv_tmh             tm_hdr;
    char                       bytes[sizeof(uint64_t) * 2];
} UCS_S_PACKED uct_rc_mlx5_dm_copy_data_t;
#endif

#define uct_rc_mlx5_tag_addr_hash(_ptr) kh_int64_hash_func((uintptr_t)(_ptr))
KHASH_INIT(uct_rc_mlx5_tag_addrs, void*, char, 0, uct_rc_mlx5_tag_addr_hash,
           kh_int64_hash_equal)

typedef struct uct_rc_mlx5_iface_common {
    uct_rc_iface_t                     super;
    struct {
        ucs_mpool_t                    atomic_desc_mp;
        uct_ib_mlx5_mmio_mode_t        mmio_mode;
        uint16_t                       bb_max;     /* limit number of outstanding WQE BBs */
    } tx;
    struct {
        uct_ib_mlx5_srq_t              srq;
        void                           *pref_ptr;
    } rx;
    uct_ib_mlx5_cq_t                   cq[UCT_IB_DIR_NUM];
    struct {
        uct_rc_mlx5_cmd_wq_t           cmd_wq;
        uct_rc_mlx5_tag_entry_t        *head;
        uct_rc_mlx5_tag_entry_t        *tail;
        uct_rc_mlx5_tag_entry_t        *list;
        ucs_mpool_t                    *bcopy_mp;
        khash_t(uct_rc_mlx5_tag_addrs) tag_addrs;

        ucs_ptr_array_t                rndv_comps;
        size_t                         max_bcopy;
        size_t                         max_zcopy;
        unsigned                       num_tags;
        unsigned                       num_outstanding;
        unsigned                       max_rndv_data;
        uint16_t                       unexpected_cnt;
        uint16_t                       cmd_qp_len;
        uint8_t                        enabled;
        struct {
            uint8_t                    num_strides;
            ucs_mpool_t                tx_mp;
            uct_rc_mlx5_mp_context_t   last_frag_ctx;
            khash_t(uct_rc_mlx5_mp_hash_lid) hash_lid;
            khash_t(uct_rc_mlx5_mp_hash_gid) hash_gid;
        } mp;
        struct {
            void                       *arg; /* User defined arg */
            uct_tag_unexp_eager_cb_t   cb;   /* Callback for unexpected eager messages */
        } eager_unexp;

        struct {
            void                       *arg; /* User defined arg */
            uct_tag_unexp_rndv_cb_t    cb;   /* Callback for unexpected rndv messages */
        } rndv_unexp;
        uct_rc_mlx5_release_desc_t     eager_desc;
        uct_rc_mlx5_release_desc_t     rndv_desc;
        uct_rc_mlx5_release_desc_t     am_desc;
        UCS_STATS_NODE_DECLARE(stats)
    } tm;
#if HAVE_IBV_DM
    struct {
        uct_mlx5_dm_data_t             *dm;
        size_t                         seg_len; /* cached value to avoid double-pointer access */
        ucs_status_t                   (*am_short)(uct_ep_h tl_ep, uint8_t id, uint64_t hdr,
                                                   const void *payload, unsigned length);
        ucs_status_t                   (*tag_short)(uct_ep_h tl_ep, uct_tag_t tag,
                                                    const void *data, size_t length);
    } dm;
#endif
#if HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT
    struct mlx5dv_devx_event_channel   *event_channel;
#endif
    struct {
        uint8_t                        atomic_fence_flag;
        ucs_ternary_value_t            cyclic_srq_enable;
    } config;
    UCS_STATS_NODE_DECLARE(stats)
} uct_rc_mlx5_iface_common_t;

/**
 * Common RC/DC mlx5 interface configuration
 */
typedef struct uct_rc_mlx5_iface_common_config {
    uct_ib_mlx5_iface_config_t       super;
    unsigned                         tx_max_bb;
    struct {
        int                          enable;
        unsigned                     list_size;
        size_t                       seg_size;
        ucs_ternary_value_t          mp_enable;
        size_t                       mp_num_strides;
    } tm;
    unsigned                         exp_backoff;
    ucs_ternary_value_t              cyclic_srq_enable;
} uct_rc_mlx5_iface_common_config_t;


UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t,
                  uct_rc_iface_ops_t*,
                  uct_md_h, uct_worker_h,
                  const uct_iface_params_t*,
                  uct_rc_iface_common_config_t*,
                  uct_rc_mlx5_iface_common_config_t*,
                  uct_ib_iface_init_attr_t*);


#define UCT_RC_MLX5_TM_STAT(_iface, _op) \
    UCS_STATS_UPDATE_COUNTER((_iface)->tm.stats, UCT_RC_MLX5_STAT_TAG_##_op, 1)

#define UCT_RC_MLX5_TM_ENABLED(_iface) (_iface)->tm.enabled

#define UCT_RC_MLX5_MP_ENABLED(_iface) ((_iface)->tm.mp.num_strides > 1)

/* TMH can carry 2 bytes of data in its reserved field */
#define UCT_RC_MLX5_TMH_PRIV_LEN       ucs_field_sizeof(uct_rc_mlx5_tmh_priv_data_t, \
                                                        data)

#define UCT_RC_MLX5_CHECK_RNDV_PARAMS(_iovcnt, _header_len, _tm_len, \
                                      _max_inline, _max_rndv_hdr) \
   { \
       UCT_CHECK_PARAM_PTR(_iovcnt <= 1ul, "Wrong iovcnt %lu", _iovcnt); \
       UCT_CHECK_PARAM_PTR(_header_len <= _max_rndv_hdr, \
                           "Invalid header len %u", _header_len); \
       UCT_CHECK_PARAM_PTR((_header_len + _tm_len) <= _max_inline, \
                           "Invalid RTS length %u", \
                           _header_len + _tm_len); \
   }

#define UCT_RC_MLX5_FILL_TM_IMM(_imm_data, _app_ctx, _ib_imm, _res_op, \
                                _op, _imm_suffix) \
   if (_imm_data == 0) { \
       _res_op  = _op; \
       _app_ctx = 0; \
       _ib_imm  = 0; \
   } else { \
       _res_op = UCS_PP_TOKENPASTE(_op, _imm_suffix); \
       uct_rc_mlx5_tag_imm_data_pack(&(_ib_imm), &(_app_ctx), _imm_data); \
   }
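
/*
 * Example (illustrative): with _op == MLX5_OPCODE_SEND and _imm_suffix == _IMM,
 * a zero user immediate keeps the plain MLX5_OPCODE_SEND opcode, while a
 * non-zero immediate selects MLX5_OPCODE_SEND_IMM and splits the 64-bit value
 * into the 32-bit IB immediate and the 32-bit app_ctx carried in the TMH.
 */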

#define UCT_RC_MLX5_GET_TX_TM_DESC(_iface, _mp, _desc, _tag, _app_ctx, _hdr) \
   { \
       UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
       _hdr = _desc + 1; \
       uct_rc_mlx5_fill_tmh(_hdr, _tag, _app_ctx, IBV_TMH_EAGER); \
       _hdr += sizeof(struct ibv_tmh); \
   }

#define UCT_RC_MLX5_GET_TM_BCOPY_DESC(_iface, _mp, _desc, _tag, _app_ctx, \
                                      _pack_cb, _arg, _length) \
   { \
       void *hdr; \
       UCT_RC_MLX5_GET_TX_TM_DESC(_iface, _mp, _desc, _tag, _app_ctx, hdr) \
       (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \
       _length = _pack_cb(hdr, _arg); \
   }

#if IBV_HW_TM
void uct_rc_mlx5_handle_unexp_rndv(uct_rc_mlx5_iface_common_t *iface,
                                   struct ibv_tmh *tmh, uct_tag_t tag,
                                   struct mlx5_cqe64 *cqe, unsigned flags,
                                   unsigned byte_len);


static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_fill_tmh(struct ibv_tmh *tmh, uct_tag_t tag,
                     uint32_t app_ctx, unsigned op)
{
    tmh->opcode  = op;
    tmh->app_ctx = app_ctx;
    tmh->tag     = tag;
}

static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_fill_rvh(struct ibv_rvh *rvh, const void *vaddr,
                     uint32_t rkey, uint32_t len)
{
    rvh->va   = htobe64((uint64_t)vaddr);
    rvh->rkey = htonl(rkey);
    rvh->len  = htonl(len);
}

static UCS_F_ALWAYS_INLINE unsigned
uct_rc_mlx5_tag_get_op_id(uct_rc_mlx5_iface_common_t *iface, uct_completion_t *comp)
{
    return ucs_ptr_array_insert(&iface->tm.rndv_comps, comp);
}


static UCS_F_ALWAYS_INLINE unsigned
uct_rc_mlx5_fill_tmh_priv_data(struct ibv_tmh *tmh, const void *hdr,
                               unsigned hdr_len, unsigned max_rndv_priv_data)
{
    uct_rc_mlx5_tmh_priv_data_t *priv = (uct_rc_mlx5_tmh_priv_data_t*)tmh->reserved;

    /* If the header length is bigger than max_rndv_priv_data size, the rest
     * needs to be added to the TMH reserved field. */
    if (hdr_len > max_rndv_priv_data) {
        priv->length = hdr_len - max_rndv_priv_data;
        ucs_assert(priv->length <= UCT_RC_MLX5_TMH_PRIV_LEN);
        memcpy(&priv->data, (char*)hdr, priv->length);
    } else {
        priv->length = 0;
    }

    return priv->length;
}
#endif

static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_tag_imm_data_pack(uint32_t *ib_imm, uint32_t *app_ctx,
                              uint64_t imm_val)
{
    *ib_imm  = (uint32_t)(imm_val & 0xFFFFFFFF);
    *app_ctx = (uint32_t)(imm_val >> 32);
}

static UCS_F_ALWAYS_INLINE uint64_t
uct_rc_mlx5_tag_imm_data_unpack(uint32_t ib_imm, uint32_t app_ctx, int is_imm)
{
    return is_imm ? (((uint64_t)app_ctx << 32) | ib_imm) : 0ul;
}
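
/*
 * Round-trip example: packing imm_val == 0x1122334455667788 yields
 * ib_imm == 0x55667788 and app_ctx == 0x11223344; unpacking them with a
 * non-zero is_imm restores the original 64-bit value, while is_imm == 0
 * (no immediate present in the CQE) yields 0.
 */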

static UCS_F_ALWAYS_INLINE uct_rc_mlx5_ctx_priv_t*
uct_rc_mlx5_ctx_priv(uct_tag_context_t *ctx)
{
    return (uct_rc_mlx5_ctx_priv_t*)ctx->priv;
}

static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_handle_rndv_fin(uct_rc_mlx5_iface_common_t *iface, uint32_t app_ctx)
{
    void *rndv_comp = NULL;
    int found;

    found = ucs_ptr_array_lookup(&iface->tm.rndv_comps, app_ctx, rndv_comp);
    ucs_assert_always(found > 0);
    uct_invoke_completion((uct_completion_t*)rndv_comp, UCS_OK);
    ucs_ptr_array_remove(&iface->tm.rndv_comps, app_ctx);
}

extern ucs_config_field_t uct_rc_mlx5_common_config_table[];

unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface);

void uct_rc_mlx5_iface_common_prepost_recvs(uct_rc_mlx5_iface_common_t *iface);

ucs_status_t uct_rc_mlx5_iface_common_init(uct_rc_mlx5_iface_common_t *iface,
                                           uct_rc_iface_t *rc_iface,
                                           const uct_rc_iface_config_t *config,
                                           const uct_ib_mlx5_iface_config_t *mlx5_config);

void uct_rc_mlx5_iface_common_cleanup(uct_rc_mlx5_iface_common_t *iface);

ucs_status_t uct_rc_mlx5_iface_common_dm_init(uct_rc_mlx5_iface_common_t *iface,
                                              uct_rc_iface_t *rc_iface,
                                              const uct_ib_mlx5_iface_config_t *mlx5_config);

void uct_rc_mlx5_iface_common_dm_cleanup(uct_rc_mlx5_iface_common_t *iface);

void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface,
                                    uct_iface_attr_t *iface_attr,
                                    size_t max_inline, size_t max_tag_eager_iov);

void uct_rc_mlx5_iface_common_update_cqs_ci(uct_rc_mlx5_iface_common_t *iface,
                                            uct_ib_iface_t *ib_iface);

void uct_rc_mlx5_iface_common_sync_cqs_ci(uct_rc_mlx5_iface_common_t *iface,
                                          uct_ib_iface_t *ib_iface);

int uct_rc_mlx5_iface_commom_clean(uct_ib_mlx5_cq_t *mlx5_cq,
                                   uct_ib_mlx5_srq_t *srq, uint32_t qpn);

static UCS_F_MAYBE_UNUSED void
uct_rc_mlx5_iface_tm_set_cmd_qp_len(uct_rc_mlx5_iface_common_t *iface)
{
    /* 2 ops for each tag (ADD + DEL) and extra ops for SYNC. */
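    /* For example, with num_tags == 64 the command QP is sized for
     * 2 * 64 + 2 == 130 outstanding tag-list operations. */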
    iface->tm.cmd_qp_len = (2 * iface->tm.num_tags) + 2;
}

#if IBV_HW_TM
void uct_rc_mlx5_init_rx_tm_common(uct_rc_mlx5_iface_common_t *iface,
                                   const uct_rc_iface_common_config_t *config,
                                   unsigned rndv_hdr_len);

ucs_status_t uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface,
                                    const uct_rc_iface_common_config_t *config,
                                    struct ibv_srq_init_attr_ex *srq_init_attr,
                                    unsigned rndv_hdr_len);
#else
static UCS_F_MAYBE_UNUSED ucs_status_t
uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface,
                       const uct_rc_iface_common_config_t *config,
                       struct ibv_srq_init_attr_ex *srq_init_attr,
                       unsigned rndv_hdr_len)
{
    return UCS_ERR_UNSUPPORTED;
}
#endif

#if IBV_HW_TM && HAVE_DEVX
ucs_status_t uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface,
                                         const uct_rc_iface_common_config_t *config,
                                         int dc, unsigned rndv_hdr_len);
#else
static UCS_F_MAYBE_UNUSED ucs_status_t
uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface,
                            const uct_rc_iface_common_config_t *config,
                            int dc, unsigned rndv_hdr_len)
{
    return UCS_ERR_UNSUPPORTED;
}
#endif

#if HAVE_DEVX
ucs_status_t uct_rc_mlx5_devx_init_rx(uct_rc_mlx5_iface_common_t *iface,
                                      const uct_rc_iface_common_config_t *config);

void uct_rc_mlx5_devx_cleanup_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq);
#else
static UCS_F_MAYBE_UNUSED ucs_status_t
uct_rc_mlx5_devx_init_rx(uct_rc_mlx5_iface_common_t *iface,
                         const uct_rc_iface_common_config_t *config)
{
    return UCS_ERR_UNSUPPORTED;
}

static UCS_F_MAYBE_UNUSED void
uct_rc_mlx5_devx_cleanup_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq)
{
    ucs_bug("DevX SRQ cleanup has to be done only if DevX support is enabled");
}
#endif

void uct_rc_mlx5_tag_cleanup(uct_rc_mlx5_iface_common_t *iface);

ucs_status_t uct_rc_mlx5_iface_common_tag_init(uct_rc_mlx5_iface_common_t *iface);

void uct_rc_mlx5_iface_common_tag_cleanup(uct_rc_mlx5_iface_common_t *iface);

ucs_status_t uct_rc_mlx5_ep_tag_rndv_cancel(uct_ep_h tl_ep, void *op);

void uct_rc_mlx5_common_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t type,
                                    void *data, size_t length, size_t valid_length,
                                    char *buffer, size_t max);

static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_am_hdr_fill(uct_rc_mlx5_hdr_t *rch, uint8_t id)
{
#if IBV_HW_TM
    rch->tmh_opcode   = IBV_TMH_NO_TAG;
#endif
    rch->rc_hdr.am_id = id;
}

#if HAVE_DEVX
ucs_status_t
uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uint32_t dest_qp_num,
                                         struct ibv_ah_attr *ah_attr,
                                         enum ibv_mtu path_mtu);

#else
static UCS_F_MAYBE_UNUSED ucs_status_t
uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface,
                                         uct_ib_mlx5_qp_t *qp,
                                         uint32_t dest_qp_num,
                                         struct ibv_ah_attr *ah_attr,
                                         enum ibv_mtu path_mtu)
{
    return UCS_ERR_UNSUPPORTED;
}
#endif

ucs_status_t uct_rc_mlx5_devx_iface_init_events(uct_rc_mlx5_iface_common_t *iface);

void uct_rc_mlx5_devx_iface_free_events(uct_rc_mlx5_iface_common_t *iface);

ucs_status_t uct_rc_mlx5_devx_iface_subscribe_event(uct_rc_mlx5_iface_common_t *iface,
                                                    uct_ib_mlx5_qp_t *qp,
                                                    unsigned event_num,
                                                    enum ibv_event_type event_type,
                                                    unsigned event_data);

void uct_rc_mlx5_iface_fill_attr(uct_rc_mlx5_iface_common_t *iface,
                                 uct_ib_mlx5_qp_attr_t *qp_attr,
                                 unsigned max_send_wr,
                                 uct_ib_mlx5_srq_t *srq);

ucs_status_t
uct_rc_mlx5_common_iface_init_rx(uct_rc_mlx5_iface_common_t *iface,
                                 const uct_rc_iface_common_config_t *rc_config);

void uct_rc_mlx5_destroy_srq(uct_ib_mlx5_md_t *md, uct_ib_mlx5_srq_t *srq);

#endif