/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCT_RC_IFACE_H
#define UCT_RC_IFACE_H

#include "rc_def.h"

#include <uct/base/uct_iface.h>
#include <uct/ib/base/ib_log.h>
#include <uct/ib/base/ib_iface.h>
#include <ucs/datastruct/arbiter.h>
#include <ucs/datastruct/queue.h>
#include <ucs/datastruct/ptr_array.h>
#include <ucs/debug/log.h>


#define UCT_RC_QP_TABLE_ORDER       12
#define UCT_RC_QP_TABLE_SIZE        UCS_BIT(UCT_RC_QP_TABLE_ORDER)
#define UCT_RC_QP_TABLE_MEMB_ORDER  (UCT_IB_QPN_ORDER - UCT_RC_QP_TABLE_ORDER)
#define UCT_RC_QP_MAX_RETRY_COUNT   7
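
/*
 * Sketch of how these constants are used (assuming UCT_IB_QPN_ORDER is 24,
 * i.e. IB QP numbers are 24 bits wide): iface->eps[] is a two-level lookup
 * table with UCT_RC_QP_TABLE_SIZE (2^12) first-level slots. A QP number is
 * resolved by its upper bits at the first level and by its low
 * UCT_RC_QP_TABLE_MEMB_ORDER bits at the second level; see
 * uct_rc_iface_lookup_ep() below. UCT_RC_QP_MAX_RETRY_COUNT is the largest
 * value that fits the 3-bit IB retry_cnt/rnr_retry QP attributes.
 */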

#define UCT_RC_CHECK_AM_SHORT(_am_id, _length, _max_inline) \
     UCT_CHECK_AM_ID(_am_id); \
     UCT_CHECK_LENGTH(sizeof(uct_rc_am_short_hdr_t) + _length, 0, _max_inline, "am_short");
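
/*
 * Usage sketch (hypothetical am_short sender; "max_inline" and the posting
 * step are illustrative, not defined in this header):
 *
 *     ucs_status_t my_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr64,
 *                                 const void *payload, unsigned length)
 *     {
 *         UCT_RC_CHECK_AM_SHORT(id, length, max_inline);
 *         ... build uct_rc_am_short_hdr_t {id, hdr64} and post it inline
 *             together with the payload ...
 *     }
 *
 * The macro validates the AM id and checks that the short header plus payload
 * fit within the given inline-send limit, making the calling function return
 * an error when parameter checking is enabled and the check fails.
 */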

#define UCT_RC_CHECK_ZCOPY_DATA(_header_length, _length, _seg_size) \
    UCT_CHECK_LENGTH(_header_length + _length, 0, _seg_size, "am_zcopy payload"); \
    UCT_CHECK_LENGTH(_header_length + _length, 0, UCT_IB_MAX_MESSAGE_SIZE, "am_zcopy ib max message");

#define UCT_RC_CHECK_AM_ZCOPY(_id, _header_length, _length, _desc_size, _seg_size) \
    UCT_CHECK_AM_ID(_id); \
    UCT_RC_CHECK_ZCOPY_DATA(_header_length, _length, _seg_size) \
    UCT_CHECK_LENGTH(sizeof(uct_rc_hdr_t) + _header_length, 0, _desc_size, "am_zcopy header");


#define UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    UCT_TL_IFACE_GET_TX_DESC(&(_iface)->super.super, _mp, _desc, \
                             return UCS_ERR_NO_RESOURCE);

#define UCT_RC_IFACE_GET_TX_AM_BCOPY_DESC(_iface, _mp, _desc, _id, _pk_hdr_cb, \
                                          _hdr, _pack_cb, _arg, _length) ({ \
    _hdr *rch; \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \
    rch = (_hdr *)(_desc + 1); \
    _pk_hdr_cb(rch, _id); \
    *(_length) = _pack_cb(rch + 1, _arg); \
})
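
/*
 * Usage sketch for the bcopy-AM descriptor macro (hypothetical caller; "iface",
 * "pack_cb" and "arg" are illustrative):
 *
 *     uct_rc_iface_send_desc_t *desc;
 *     size_t length;
 *
 *     UCT_RC_IFACE_GET_TX_AM_BCOPY_DESC(iface, &iface->tx.mp, desc, id,
 *                                       uct_rc_am_hdr_fill, uct_rc_hdr_t,
 *                                       pack_cb, arg, &length);
 *     ... post the buffer at (desc + 1), i.e. header + packed payload ...
 *
 * If the descriptor pool is empty, the macro makes the calling function return
 * UCS_ERR_NO_RESOURCE; otherwise it fills the header via _pk_hdr_cb, packs the
 * payload right after it, stores the packed size in *_length, and arranges for
 * the descriptor to be returned to the mpool on send completion.
 */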

#define UCT_RC_IFACE_GET_TX_AM_ZCOPY_DESC(_iface, _mp, _desc, \
                                          _id, _header, _header_length, _comp, _send_flags) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc); \
    uct_rc_zcopy_desc_set_comp(_desc, _comp, _send_flags); \
    uct_rc_zcopy_desc_set_header((uct_rc_hdr_t*)(_desc + 1), _id, _header, _header_length);

#define UCT_RC_IFACE_GET_TX_PUT_BCOPY_DESC(_iface, _mp, _desc, _pack_cb, _arg, _length) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \
    _length = _pack_cb(_desc + 1, _arg); \
    UCT_SKIP_ZERO_LENGTH(_length, _desc);

#define UCT_RC_IFACE_GET_TX_GET_BCOPY_DESC(_iface, _mp, _desc, _unpack_cb, _comp, _arg, _length) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    ucs_assert(_length <= (_iface)->super.config.seg_size); \
    _desc->super.handler     = (_comp == NULL) ? \
                                uct_rc_ep_get_bcopy_handler_no_completion : \
                                uct_rc_ep_get_bcopy_handler; \
    _desc->super.unpack_arg  = _arg; \
    _desc->super.user_comp   = _comp; \
    _desc->super.length      = _length; \
    _desc->unpack_cb         = _unpack_cb;


#define UCT_RC_IFACE_GET_TX_ATOMIC_DESC(_iface, _mp, _desc) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    _desc->super.handler = (uct_rc_send_handler_t)ucs_mpool_put;

#define UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(_iface, _mp, _desc, _handler, _result, _comp) \
    UCT_CHECK_PARAM(_comp != NULL, "completion must be non-NULL"); \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    _desc->super.handler   = _handler; \
    _desc->super.buffer    = _result; \
    _desc->super.user_comp = _comp;


enum {
    UCT_RC_IFACE_STAT_RX_COMPLETION,
    UCT_RC_IFACE_STAT_TX_COMPLETION,
    UCT_RC_IFACE_STAT_NO_CQE,
    UCT_RC_IFACE_STAT_NO_READS,
    UCT_RC_IFACE_STAT_LAST
};


/* flags for uct_rc_iface_send_op_t */
enum {
#if UCS_ENABLE_ASSERT
    UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY = UCS_BIT(13), /* zcopy */
    UCT_RC_IFACE_SEND_OP_FLAG_IFACE = UCS_BIT(14), /* belongs to iface ops buffer */
    UCT_RC_IFACE_SEND_OP_FLAG_INUSE = UCS_BIT(15)  /* queued on a txqp */
#else
    UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY = 0,
    UCT_RC_IFACE_SEND_OP_FLAG_IFACE = 0,
    UCT_RC_IFACE_SEND_OP_FLAG_INUSE = 0
#endif
};


typedef void (*uct_rc_send_handler_t)(uct_rc_iface_send_op_t *op, const void *resp);


/**
 * RC network header.
 */
typedef struct uct_rc_hdr {
    uint8_t           am_id;     /* Active message ID */
} UCS_S_PACKED uct_rc_hdr_t;


typedef struct uct_rc_fc_request {
    uct_pending_req_t super;
    uct_ep_t          *ep;
} uct_rc_fc_request_t;


/**
 * RC fence type.
 */
typedef enum uct_rc_fence_mode {
    UCT_RC_FENCE_MODE_NONE,
    UCT_RC_FENCE_MODE_WEAK,
    UCT_RC_FENCE_MODE_AUTO,
    UCT_RC_FENCE_MODE_LAST
} uct_rc_fence_mode_t;


/* Common configuration used for rc verbs, rcx and dc transports */
typedef struct uct_rc_iface_common_config {
    uct_ib_iface_config_t    super;
    unsigned                 max_rd_atomic;
    int                      ooo_rw; /* Enable out-of-order RDMA data placement */
    int                      fence_mode;

    struct {
        double               timeout;
        unsigned             retry_count;
        double               rnr_timeout;
        unsigned             rnr_retry_count;
        size_t               max_get_zcopy;
        size_t               max_get_bytes;
    } tx;

    struct {
        int                  enable;
        double               hard_thresh;
        unsigned             wnd_size;
    } fc;
} uct_rc_iface_common_config_t;


/* RC specific configuration used for rc verbs and rcx transports only */
struct uct_rc_iface_config {
    uct_rc_iface_common_config_t   super;
    double                         soft_thresh;
    unsigned                       tx_cq_moderation; /* How many TX messages are
                                                        batched to one CQE */
    unsigned                       tx_cq_len;
};


typedef struct uct_rc_iface_ops {
    uct_ib_iface_ops_t   super;
    ucs_status_t         (*init_rx)(uct_rc_iface_t *iface,
                                    const uct_rc_iface_common_config_t *config);
    void                 (*cleanup_rx)(uct_rc_iface_t *iface);
    ucs_status_t         (*fc_ctrl)(uct_ep_t *ep, unsigned op,
                                    uct_rc_fc_request_t *req);
    ucs_status_t         (*fc_handler)(uct_rc_iface_t *iface, unsigned qp_num,
                                       uct_rc_hdr_t *hdr, unsigned length,
                                       uint32_t imm_data, uint16_t lid,
                                       unsigned flags);
} uct_rc_iface_ops_t;


typedef struct uct_rc_srq {
    unsigned                 available;
    unsigned                 quota;
} uct_rc_srq_t;


struct uct_rc_iface {
    uct_ib_iface_t              super;

    struct {
        ucs_mpool_t             mp;       /* pool for send descriptors */
        ucs_mpool_t             fc_mp;    /* pool for FC grant pending requests */
        ucs_mpool_t             flush_mp; /* pool for flush completions */
        /* Credits for completions.
         * May be negative in the mlx5 case because we take "num_bb" credits
         * per post, so that the credits of outstanding ops can be calculated
         * on failure. In the verbs TL we count WQEs, so one post always takes
         * one credit. */
        signed                  cq_available;
        ssize_t                 reads_available;
        uct_rc_iface_send_op_t  *free_ops; /* stack of free send operations */
        ucs_arbiter_t           arbiter;
        uct_rc_iface_send_op_t  *ops_buffer;
        uct_ib_fence_info_t     fi;
    } tx;

    struct {
        ucs_mpool_t          mp;
        uct_rc_srq_t         srq;
    } rx;

    struct {
        unsigned             tx_qp_len;
        unsigned             tx_min_sge;
        unsigned             tx_min_inline;
        unsigned             tx_ops_count;
        uint16_t             tx_moderation;

        /* Threshold to send a "soft" FC credit request. The peer will try to
         * piggy-back the credit grant onto a counter AM, if there is one. */
        int16_t              fc_soft_thresh;

        /* Threshold to send a "hard" credit request. The peer will grant
         * credits in a separate AM as soon as it handles this request. */
        int16_t              fc_hard_thresh;

        uint16_t             fc_wnd_size;
        uint8_t              fc_enabled;

        uint8_t              min_rnr_timer;
        uint8_t              timeout;
        uint8_t              rnr_retry;
        uint8_t              retry_cnt;
        uint8_t              max_rd_atomic;
        /* Enable out-of-order RDMA data placement */
        uint8_t              ooo_rw;
#if UCS_ENABLE_ASSERT
        int                  tx_cq_len;
#endif
        uct_rc_fence_mode_t  fence_mode;
        unsigned             exp_backoff;
        size_t               max_get_zcopy;

        /* Atomic callbacks */
        uct_rc_send_handler_t  atomic64_handler;      /* 64bit ib-spec */
        uct_rc_send_handler_t  atomic32_ext_handler;  /* 32bit extended */
        uct_rc_send_handler_t  atomic64_ext_handler;  /* 64bit extended */
    } config;

    UCS_STATS_NODE_DECLARE(stats)

    uct_rc_ep_t              **eps[UCT_RC_QP_TABLE_SIZE];
    ucs_list_link_t          ep_list;

    /* Progress function (either regular or TM aware) */
    ucs_callback_t           progress;
};
UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_md_h, uct_worker_h,
                  const uct_iface_params_t*, const uct_rc_iface_common_config_t*,
                  uct_ib_iface_init_attr_t*);


struct uct_rc_iface_send_op {
    union {
        ucs_queue_elem_t          queue;  /* used when enqueued on a txqp */
        uct_rc_iface_send_op_t    *next;  /* used when on free list */
    };
    uct_rc_send_handler_t         handler;
    uint16_t                      sn;
    uint16_t                      flags;
    unsigned                      length;
    union {
        void                      *buffer;     /* atomics / desc */
        void                      *unpack_arg; /* get_bcopy / desc */
        uct_rc_iface_t            *iface;      /* should not be used with
                                                  get_bcopy completions */
    };
    uct_completion_t              *user_comp;
};


struct uct_rc_iface_send_desc {
    uct_rc_iface_send_op_t        super;
    uct_unpack_callback_t         unpack_cb;
    uint32_t                      lkey;
};


/*
 * Short active message header (the active message header is always 64 bits).
 */
typedef struct uct_rc_am_short_hdr {
    uct_rc_hdr_t      rc_hdr;
    uint64_t          am_hdr;
} UCS_S_PACKED uct_rc_am_short_hdr_t;
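
/*
 * Resulting wire layout of a short active message (payload length is the
 * caller's "length" argument):
 *
 *     | am_id: 1 byte | am_hdr: 8 bytes | payload: length bytes |
 *     |<----- uct_rc_am_short_hdr_t --->|
 *
 * UCT_RC_CHECK_AM_SHORT() above verifies that this whole sequence fits the
 * transport's inline-send limit.
 */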


extern ucs_config_field_t uct_rc_iface_config_table[];
extern ucs_config_field_t uct_rc_iface_common_config_table[];

unsigned uct_rc_iface_do_progress(uct_iface_h tl_iface);

ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface,
                                uct_iface_attr_t *iface_attr,
                                size_t put_max_short, size_t max_inline,
                                size_t am_max_hdr, size_t am_max_iov,
                                size_t am_min_hdr, size_t rma_max_iov);

void uct_rc_iface_add_qp(uct_rc_iface_t *iface, uct_rc_ep_t *ep,
                         unsigned qp_num);

void uct_rc_iface_remove_qp(uct_rc_iface_t *iface, unsigned qp_num);

ucs_status_t uct_rc_iface_flush(uct_iface_h tl_iface, unsigned flags,
                                uct_completion_t *comp);

void uct_rc_iface_send_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh);

void uct_rc_ep_am_zcopy_handler(uct_rc_iface_send_op_t *op, const void *resp);

/**
 * Creates an RC or DCI QP
 */
ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, struct ibv_qp **qp_p,
                                    uct_ib_qp_attr_t *attr, unsigned max_send_wr,
                                    struct ibv_srq *srq);

void uct_rc_iface_fill_attr(uct_rc_iface_t *iface,
                            uct_ib_qp_attr_t *qp_init_attr,
                            unsigned max_send_wr,
                            struct ibv_srq *srq);

ucs_status_t uct_rc_iface_qp_init(uct_rc_iface_t *iface, struct ibv_qp *qp);

ucs_status_t uct_rc_iface_qp_connect(uct_rc_iface_t *iface, struct ibv_qp *qp,
                                     const uint32_t qp_num,
                                     struct ibv_ah_attr *ah_attr,
                                     enum ibv_mtu path_mtu);

ucs_status_t uct_rc_iface_fc_handler(uct_rc_iface_t *iface, unsigned qp_num,
                                     uct_rc_hdr_t *hdr, unsigned length,
                                     uint32_t imm_data, uint16_t lid, unsigned flags);

ucs_status_t uct_rc_init_fc_thresh(uct_rc_iface_config_t *rc_cfg,
                                   uct_rc_iface_t *iface);

ucs_status_t uct_rc_iface_event_arm(uct_iface_h tl_iface, unsigned events);

ucs_status_t uct_rc_iface_common_event_arm(uct_iface_h tl_iface,
                                           unsigned events, int force_rx_all);

ucs_status_t uct_rc_iface_init_rx(uct_rc_iface_t *iface,
                                  const uct_rc_iface_common_config_t *config,
                                  struct ibv_srq **p_srq);

ucs_status_t uct_rc_iface_fence(uct_iface_h tl_iface, unsigned flags);

static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_fc_ctrl(uct_ep_t *ep, unsigned op, uct_rc_fc_request_t *req)
{
    uct_rc_iface_t *iface   = ucs_derived_of(ep->iface, uct_rc_iface_t);
    uct_rc_iface_ops_t *ops = ucs_derived_of(iface->super.ops,
                                             uct_rc_iface_ops_t);
    return ops->fc_ctrl(ep, op, req);
}

static inline uct_rc_ep_t *uct_rc_iface_lookup_ep(uct_rc_iface_t *iface,
                                                  unsigned qp_num)
{
    ucs_assert(qp_num < UCS_BIT(UCT_IB_QPN_ORDER));
    return iface->eps[qp_num >> UCT_RC_QP_TABLE_ORDER]
                     [qp_num &  UCS_MASK(UCT_RC_QP_TABLE_MEMB_ORDER)];
}
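
/*
 * Typical pairing (sketch; "qp" is illustrative): an endpoint's QP is first
 * registered with uct_rc_iface_add_qp(iface, ep, qp->qp_num), after which an
 * RX completion carrying that QP number can be resolved back to the endpoint:
 *
 *     uct_rc_ep_t *ep = uct_rc_iface_lookup_ep(iface, qp_num);
 *
 * uct_rc_iface_remove_qp() drops the mapping again when the QP goes away.
 */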


static UCS_F_ALWAYS_INLINE int
uct_rc_iface_have_tx_cqe_avail(uct_rc_iface_t* iface)
{
    return iface->tx.cq_available > 0;
}

static UCS_F_ALWAYS_INLINE uct_rc_iface_send_op_t*
uct_rc_iface_get_send_op(uct_rc_iface_t *iface)
{
    uct_rc_iface_send_op_t *op;
    op = iface->tx.free_ops;
    iface->tx.free_ops = op->next;
    return op;
}

static UCS_F_ALWAYS_INLINE void
uct_rc_iface_put_send_op(uct_rc_iface_send_op_t *op)
{
    uct_rc_iface_t *iface = op->iface;
    ucs_assert(op->flags == UCT_RC_IFACE_SEND_OP_FLAG_IFACE);
    op->next = iface->tx.free_ops;
    iface->tx.free_ops = op;
}
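
/*
 * The two helpers above maintain iface->tx.free_ops as a LIFO free list of
 * iface-owned send operations (the ones allocated in iface->tx.ops_buffer and
 * flagged UCT_RC_IFACE_SEND_OP_FLAG_IFACE). A usage sketch (illustrative;
 * "handler_cb" and "comp" are not defined here):
 *
 *     uct_rc_iface_send_op_t *op = uct_rc_iface_get_send_op(iface);
 *     op->handler   = handler_cb;   // runs when the send completes
 *     op->user_comp = comp;
 *     ... enqueue op on the txqp via op->queue ...
 *     // once the operation has completed (e.g. from its handler):
 *     uct_rc_iface_put_send_op(op);
 */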

static UCS_F_ALWAYS_INLINE void
uct_rc_am_hdr_fill(uct_rc_hdr_t *rch, uint8_t id)
{
    rch->am_id = id;
}

static inline void uct_rc_zcopy_desc_set_comp(uct_rc_iface_send_desc_t *desc,
                                              uct_completion_t *comp,
                                              int *send_flags)
{
    if (comp == NULL) {
        desc->super.handler   = (uct_rc_send_handler_t)ucs_mpool_put;
        *send_flags           = 0;
    } else {
        desc->super.handler   = uct_rc_ep_am_zcopy_handler;
        desc->super.user_comp = comp;
        *send_flags           = IBV_SEND_SIGNALED;
    }
}
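
/*
 * Design note, as implied by the code above: without a user completion the
 * descriptor's handler merely returns it to its mpool and the send is posted
 * unsignaled (*send_flags = 0), which allows several sends to share one CQE
 * (cf. tx_cq_moderation). With a completion, IBV_SEND_SIGNALED is requested
 * so that uct_rc_ep_am_zcopy_handler() can complete the user's
 * uct_completion_t.
 */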

static inline void uct_rc_zcopy_desc_set_header(uct_rc_hdr_t *rch,
                                                uint8_t id, const void *header,
                                                unsigned header_length)
{
    uct_rc_am_hdr_fill(rch, id);
    memcpy(rch + 1, header, header_length);
}

static inline int uct_rc_iface_has_tx_resources(uct_rc_iface_t *iface)
{
    return uct_rc_iface_have_tx_cqe_avail(iface) &&
           !ucs_mpool_is_empty(&iface->tx.mp) &&
           (iface->tx.reads_available > 0);
}

static UCS_F_ALWAYS_INLINE uct_rc_send_handler_t
uct_rc_iface_atomic_handler(uct_rc_iface_t *iface, int ext, unsigned length)
{
    ucs_assert((length == sizeof(uint32_t)) || (length == sizeof(uint64_t)));
    switch (length) {
    case sizeof(uint32_t):
        return iface->config.atomic32_ext_handler;
    case sizeof(uint64_t):
        return ext ? iface->config.atomic64_ext_handler :
                     iface->config.atomic64_handler;
    }
    return NULL;
}
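
/*
 * Selection example (follows directly from the switch above): a 32-bit operand
 * always uses the extended-atomics handler, while a 64-bit operand uses the
 * IB-spec handler unless "ext" is nonzero:
 *
 *     handler = uct_rc_iface_atomic_handler(iface, 0, sizeof(uint64_t));
 *     // handler == iface->config.atomic64_handler (IB-spec 64-bit atomics)
 */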

static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_iface_fence_relaxed_order(uct_iface_h tl_iface)
{
    uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t);
    uct_ib_md_t *md         = ucs_derived_of(iface->md, uct_ib_md_t);

    ucs_assert(tl_iface->ops.iface_fence == uct_rc_iface_fence);

    if (!md->relaxed_order) {
        return UCS_OK;
    }

    return uct_rc_iface_fence(tl_iface, 0);
}

#endif