1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6
7 #ifndef UCT_RC_IFACE_H
8 #define UCT_RC_IFACE_H
9
10 #include "rc_def.h"
11
12 #include <uct/base/uct_iface.h>
13 #include <uct/ib/base/ib_log.h>
14 #include <uct/ib/base/ib_iface.h>
15 #include <ucs/datastruct/arbiter.h>
16 #include <ucs/datastruct/queue.h>
17 #include <ucs/datastruct/ptr_array.h>
18 #include <ucs/debug/log.h>
19
20
/* The qp_num -> endpoint lookup table (uct_rc_iface_t::eps) is a two-level
 * array: the top level has UCT_RC_QP_TABLE_SIZE entries, each covering the
 * remaining UCT_RC_QP_TABLE_MEMB_ORDER bits of the QP number space. */
#define UCT_RC_QP_TABLE_ORDER       12
#define UCT_RC_QP_TABLE_SIZE        UCS_BIT(UCT_RC_QP_TABLE_ORDER)
#define UCT_RC_QP_TABLE_MEMB_ORDER  (UCT_IB_QPN_ORDER - UCT_RC_QP_TABLE_ORDER)
/* Upper bound for the configured QP retry counters (3-bit field in the IB
 * QP attributes — TODO confirm against ibv_modify_qp) */
#define UCT_RC_QP_MAX_RETRY_COUNT   7
25
/* Validate an AM short send: the AM id must be in range, and the 8-byte AM
 * header plus payload must fit within the inline send limit. Expands to
 * checks which return an error status from the calling function on failure.
 * Arguments used in arithmetic are parenthesized since callers may pass
 * compound expressions. */
#define UCT_RC_CHECK_AM_SHORT(_am_id, _length, _max_inline) \
    UCT_CHECK_AM_ID(_am_id); \
    UCT_CHECK_LENGTH(sizeof(uct_rc_am_short_hdr_t) + (_length), 0, _max_inline, "am_short");

/* Validate zcopy header + payload against the segment size and against the
 * IB maximum message size. */
#define UCT_RC_CHECK_ZCOPY_DATA(_header_length, _length, _seg_size) \
    UCT_CHECK_LENGTH((_header_length) + (_length), 0, _seg_size, "am_zcopy payload"); \
    UCT_CHECK_LENGTH((_header_length) + (_length), 0, UCT_IB_MAX_MESSAGE_SIZE, "am_zcopy ib max message");

/* Validate an AM zcopy send: id range, data sizes, and that the RC network
 * header plus the user header fit inside the send descriptor. */
#define UCT_RC_CHECK_AM_ZCOPY(_id, _header_length, _length, _desc_size, _seg_size) \
    UCT_CHECK_AM_ID(_id); \
    UCT_RC_CHECK_ZCOPY_DATA(_header_length, _length, _seg_size) \
    UCT_CHECK_LENGTH(sizeof(uct_rc_hdr_t) + (_header_length), 0, _desc_size, "am_zcopy header");
38
39
/* Fetch a TX descriptor from the memory pool _mp; bails out of the calling
 * function with UCS_ERR_NO_RESOURCE when the pool is exhausted. */
#define UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    UCT_TL_IFACE_GET_TX_DESC(&(_iface)->super.super, _mp, _desc, \
                             return UCS_ERR_NO_RESOURCE);

/* Grab a TX descriptor and pack an AM bcopy message into it: the header is
 * written by _pk_hdr_cb directly after the descriptor, the payload by
 * _pack_cb after the header; the packed payload size is stored into
 * *(_length). The completion handler just returns the descriptor to its
 * pool. */
#define UCT_RC_IFACE_GET_TX_AM_BCOPY_DESC(_iface, _mp, _desc, _id, _pk_hdr_cb, \
                                          _hdr, _pack_cb, _arg, _length) ({ \
    _hdr *rch; \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \
    rch = (_hdr *)(_desc + 1); \
    _pk_hdr_cb(rch, _id); \
    *(_length) = _pack_cb(rch + 1, _arg); \
})
53
/* Grab a TX descriptor for an AM zcopy send, choose its completion handler
 * and signaling flags based on _comp, and write the RC header (the user
 * header is copied right after the AM id). */
#define UCT_RC_IFACE_GET_TX_AM_ZCOPY_DESC(_iface, _mp, _desc, \
                                          _id, _header, _header_length, _comp, _send_flags) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc); \
    uct_rc_zcopy_desc_set_comp(_desc, _comp, _send_flags); \
    uct_rc_zcopy_desc_set_header((uct_rc_hdr_t*)(_desc + 1), _id, _header, _header_length);

/* Grab a TX descriptor and pack a PUT bcopy payload into it; zero-length
 * operations are short-circuited via UCT_SKIP_ZERO_LENGTH. */
#define UCT_RC_IFACE_GET_TX_PUT_BCOPY_DESC(_iface, _mp, _desc, _pack_cb, _arg, _length) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    (_desc)->super.handler = (uct_rc_send_handler_t)ucs_mpool_put; \
    _length = _pack_cb(_desc + 1, _arg); \
    UCT_SKIP_ZERO_LENGTH(_length, _desc);
65
/* Grab a TX descriptor for a GET bcopy and set up unpacking of the fetched
 * data; when no user completion is given, a lighter handler that skips
 * completion notification is installed. */
#define UCT_RC_IFACE_GET_TX_GET_BCOPY_DESC(_iface, _mp, _desc, _unpack_cb, _comp, _arg, _length) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    ucs_assert(_length <= (_iface)->super.config.seg_size); \
    _desc->super.handler     = (_comp == NULL) ? \
                               uct_rc_ep_get_bcopy_handler_no_completion : \
                               uct_rc_ep_get_bcopy_handler; \
    _desc->super.unpack_arg  = _arg; \
    _desc->super.user_comp   = _comp; \
    _desc->super.length      = _length; \
    _desc->unpack_cb         = _unpack_cb;


/* Grab a TX descriptor for a posted (no fetched result) atomic operation;
 * on completion the descriptor is simply returned to its pool. */
#define UCT_RC_IFACE_GET_TX_ATOMIC_DESC(_iface, _mp, _desc) \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    _desc->super.handler = (uct_rc_send_handler_t)ucs_mpool_put;

/* Grab a TX descriptor for a fetching atomic: _handler unpacks the fetched
 * value into _result, and _comp (required to be non-NULL) is the user
 * completion invoked afterwards. */
#define UCT_RC_IFACE_GET_TX_ATOMIC_FETCH_DESC(_iface, _mp, _desc, _handler, _result, _comp) \
    UCT_CHECK_PARAM(_comp != NULL, "completion must be non-NULL"); \
    UCT_RC_IFACE_GET_TX_DESC(_iface, _mp, _desc) \
    _desc->super.handler   = _handler; \
    _desc->super.buffer    = _result; \
    _desc->super.user_comp = _comp;
88
89
/* Statistics counters kept per RC interface (cf. UCS_STATS_NODE_DECLARE in
 * uct_rc_iface_t) */
enum {
    UCT_RC_IFACE_STAT_RX_COMPLETION,
    UCT_RC_IFACE_STAT_TX_COMPLETION,
    UCT_RC_IFACE_STAT_NO_CQE,   /* no CQ credits available (cf. tx.cq_available) */
    UCT_RC_IFACE_STAT_NO_READS, /* no read credits available (cf. tx.reads_available) */
    UCT_RC_IFACE_STAT_LAST      /* number of counters */
};
97
98
/* flags for uct_rc_iface_send_op_t; real bit values only in assert-enabled
 * builds — in release builds all flags are zero so the checks compile away */
enum {
#if UCS_ENABLE_ASSERT
    UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY = UCS_BIT(13), /* zcopy */
    UCT_RC_IFACE_SEND_OP_FLAG_IFACE = UCS_BIT(14), /* belongs to iface ops buffer */
    UCT_RC_IFACE_SEND_OP_FLAG_INUSE = UCS_BIT(15)  /* queued on a txqp */
#else
    UCT_RC_IFACE_SEND_OP_FLAG_ZCOPY = 0,
    UCT_RC_IFACE_SEND_OP_FLAG_IFACE = 0,
    UCT_RC_IFACE_SEND_OP_FLAG_INUSE = 0
#endif
};
111
112
113 typedef void (*uct_rc_send_handler_t)(uct_rc_iface_send_op_t *op, const void *resp);
114
115
/**
 * RC network header.
 */
typedef struct uct_rc_hdr {
    uint8_t           am_id;  /* Active message ID */
} UCS_S_PACKED uct_rc_hdr_t;


/* Pending request carrying a flow-control control message for 'ep';
 * dispatched through uct_rc_iface_ops_t::fc_ctrl */
typedef struct uct_rc_fc_request {
    uct_pending_req_t super;
    uct_ep_t          *ep;
} uct_rc_fc_request_t;
128
129
/**
 * RC fence type.
 */
typedef enum uct_rc_fence_mode {
    UCT_RC_FENCE_MODE_NONE, /* no fencing */
    UCT_RC_FENCE_MODE_WEAK, /* weak ordering guarantee — TODO confirm semantics */
    UCT_RC_FENCE_MODE_AUTO, /* select mode automatically — TODO confirm */
    UCT_RC_FENCE_MODE_LAST  /* number of modes */
} uct_rc_fence_mode_t;
139
140
/* Common configuration used for rc verbs, rcx and dc transports */
typedef struct uct_rc_iface_common_config {
    uct_ib_iface_config_t  super;
    unsigned               max_rd_atomic; /* outstanding RDMA reads/atomics per QP
                                             (QP max_rd_atomic — TODO confirm) */
    int                    ooo_rw;        /* Enable out-of-order RDMA data placement */
    int                    fence_mode;    /* holds a uct_rc_fence_mode_t value */

    struct {
        double             timeout;         /* transport retransmission timeout */
        unsigned           retry_count;     /* transport retry counter */
        double             rnr_timeout;     /* receiver-not-ready timeout */
        unsigned           rnr_retry_count; /* RNR retry counter */
        size_t             max_get_zcopy;   /* maximal size of a get_zcopy operation */
        size_t             max_get_bytes;   /* budget for outstanding get bytes —
                                               TODO confirm */
    } tx;

    struct {
        int                enable;      /* enable flow control */
        double             hard_thresh; /* threshold for sending "hard" credit request */
        unsigned           wnd_size;    /* flow-control window size */
    } fc;
} uct_rc_iface_common_config_t;
163
164
/* RC specific configuration used for rc verbs and rcx transports only */
struct uct_rc_iface_config {
    uct_rc_iface_common_config_t super;
    double                       soft_thresh;      /* threshold for "soft" FC credit
                                                      request (cf. fc_soft_thresh) */
    unsigned                     tx_cq_moderation; /* How many TX messages are
                                                      batched to one CQE */
    unsigned                     tx_cq_len;        /* TX completion queue length */
};
173
174
/* Virtual function table extending the IB interface ops with RC-specific
 * hooks supplied by each concrete transport */
typedef struct uct_rc_iface_ops {
    uct_ib_iface_ops_t   super;
    /* Set up receive resources according to the common configuration */
    ucs_status_t         (*init_rx)(uct_rc_iface_t *iface,
                                    const uct_rc_iface_common_config_t *config);
    /* Release resources created by init_rx */
    void                 (*cleanup_rx)(uct_rc_iface_t *iface);
    /* Send a flow-control control message on 'ep' */
    ucs_status_t         (*fc_ctrl)(uct_ep_t *ep, unsigned op,
                                    uct_rc_fc_request_t *req);
    /* Handle an incoming flow-control message arriving on 'qp_num' */
    ucs_status_t         (*fc_handler)(uct_rc_iface_t *iface, unsigned qp_num,
                                       uct_rc_hdr_t *hdr, unsigned length,
                                       uint32_t imm_data, uint16_t lid,
                                       unsigned flags);
} uct_rc_iface_ops_t;
187
188
/* Shared receive queue bookkeeping */
typedef struct uct_rc_srq {
    unsigned             available; /* receive buffers that can still be posted */
    unsigned             quota;     /* posting quota — TODO confirm exact meaning */
} uct_rc_srq_t;
193
194
/* Base state shared by all RC-style interfaces */
struct uct_rc_iface {
    uct_ib_iface_t              super;

    struct {
        ucs_mpool_t             mp;       /* pool for send descriptors */
        ucs_mpool_t             fc_mp;    /* pool for FC grant pending requests */
        ucs_mpool_t             flush_mp; /* pool for flush completions */
        /* Credits for completions.
         * May be negative in case mlx5 because we take "num_bb" credits per
         * post to be able to calculate credits of outstanding ops on failure.
         * In case of verbs TL we use QWE number, so 1 post always takes 1
         * credit */
        signed                  cq_available;
        /* remaining budget for outstanding reads
         * (checked by uct_rc_iface_has_tx_resources) */
        ssize_t                 reads_available;
        uct_rc_iface_send_op_t  *free_ops; /* stack of free send operations */
        ucs_arbiter_t           arbiter;   /* arbiter for pending operations */
        uct_rc_iface_send_op_t  *ops_buffer; /* backing storage for the free-ops
                                                stack — TODO confirm */
        uct_ib_fence_info_t     fi;        /* fence tracking state */
    } tx;

    struct {
        ucs_mpool_t             mp;  /* pool of receive buffers */
        uct_rc_srq_t            srq; /* shared receive queue accounting */
    } rx;

    /* Values resolved from the configuration at interface creation time */
    struct {
        unsigned                tx_qp_len;
        unsigned                tx_min_sge;
        unsigned                tx_min_inline;
        unsigned                tx_ops_count;
        uint16_t                tx_moderation;

        /* Threshold to send "soft" FC credit request. The peer will try to
         * piggy-back credits grant to the counter AM, if any. */
        int16_t                 fc_soft_thresh;

        /* Threshold to sent "hard" credits request. The peer will grant
         * credits in a separate AM as soon as it handles this request. */
        int16_t                 fc_hard_thresh;

        uint16_t                fc_wnd_size;
        uint8_t                 fc_enabled;

        /* QP attribute values: RNR timer, transport timeout, retry counters
         * and outstanding reads/atomics limit */
        uint8_t                 min_rnr_timer;
        uint8_t                 timeout;
        uint8_t                 rnr_retry;
        uint8_t                 retry_cnt;
        uint8_t                 max_rd_atomic;
        /* Enable out-of-order RDMA data placement */
        uint8_t                 ooo_rw;
#if UCS_ENABLE_ASSERT
        int                     tx_cq_len;  /* kept for assertions only */
#endif
        uct_rc_fence_mode_t     fence_mode;
        unsigned                exp_backoff;
        size_t                  max_get_zcopy;

        /* Atomic callbacks */
        uct_rc_send_handler_t   atomic64_handler;     /* 64bit ib-spec */
        uct_rc_send_handler_t   atomic32_ext_handler; /* 32bit extended */
        uct_rc_send_handler_t   atomic64_ext_handler; /* 64bit extended */
    } config;

    UCS_STATS_NODE_DECLARE(stats)

    /* Two-level qp_num -> endpoint lookup table
     * (see uct_rc_iface_lookup_ep) */
    uct_rc_ep_t                 **eps[UCT_RC_QP_TABLE_SIZE];
    ucs_list_link_t             ep_list;  /* list of endpoints — TODO confirm scope */

    /* Progress function (either regular or TM aware) */
    ucs_callback_t              progress;
};
UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_md_h, uct_worker_h,
                  const uct_iface_params_t*, const uct_rc_iface_common_config_t*,
                  uct_ib_iface_init_attr_t*);
269
270
/* State tracked for a single outstanding send operation */
struct uct_rc_iface_send_op {
    union {
        ucs_queue_elem_t       queue;  /* used when enqueued on a txqp */
        uct_rc_iface_send_op_t *next;  /* used when on free list */
    };
    uct_rc_send_handler_t      handler;    /* invoked when the send completes */
    uint16_t                   sn;         /* send sequence number — TODO confirm
                                              matching rules */
    uint16_t                   flags;      /* UCT_RC_IFACE_SEND_OP_FLAG_* (debug) */
    unsigned                   length;     /* data length of the operation */
    union {
        void                   *buffer;     /* atomics / desc */
        void                   *unpack_arg; /* get_bcopy / desc */
        uct_rc_iface_t         *iface;      /* should not be used with
                                               get_bcopy completions */
    };
    uct_completion_t           *user_comp; /* user completion callback */
};


/* Send operation whose data buffer immediately follows it in memory
 * ((desc + 1) is the payload area used by the GET_TX_*_DESC macros) */
struct uct_rc_iface_send_desc {
    uct_rc_iface_send_op_t     super;
    uct_unpack_callback_t      unpack_cb; /* unpacks fetched get_bcopy data */
    uint32_t                   lkey;      /* local memory key of the buffer */
};
295
296
/*
 * Short active message header (active message header is always 64 bit).
 */
typedef struct uct_rc_am_short_hdr {
    uct_rc_hdr_t      rc_hdr; /* RC network header carrying the AM id */
    uint64_t          am_hdr; /* user-supplied 64-bit AM header */
} UCS_S_PACKED uct_rc_am_short_hdr_t;
304
305
extern ucs_config_field_t uct_rc_iface_config_table[];
extern ucs_config_field_t uct_rc_iface_common_config_table[];

/* Interface progress routine (non tag-matching variant) */
unsigned uct_rc_iface_do_progress(uct_iface_h tl_iface);

/* Fill iface_attr with RC capabilities derived from the given size limits */
ucs_status_t uct_rc_iface_query(uct_rc_iface_t *iface,
                                uct_iface_attr_t *iface_attr,
                                size_t put_max_short, size_t max_inline,
                                size_t am_max_hdr, size_t am_max_iov,
                                size_t am_min_hdr, size_t rma_max_iov);

/* Register/unregister an endpoint in the qp_num -> ep lookup table */
void uct_rc_iface_add_qp(uct_rc_iface_t *iface, uct_rc_ep_t *ep,
                         unsigned qp_num);

void uct_rc_iface_remove_qp(uct_rc_iface_t *iface, unsigned qp_num);

ucs_status_t uct_rc_iface_flush(uct_iface_h tl_iface, unsigned flags,
                                uct_completion_t *comp);

/* Memory-pool object initializer for send descriptors */
void uct_rc_iface_send_desc_init(uct_iface_h tl_iface, void *obj, uct_mem_h memh);

void uct_rc_ep_am_zcopy_handler(uct_rc_iface_send_op_t *op, const void *resp);

/**
 * Creates an RC or DCI QP
 */
ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, struct ibv_qp **qp_p,
                                    uct_ib_qp_attr_t *attr, unsigned max_send_wr,
                                    struct ibv_srq *srq);

/* Populate qp_init_attr from the interface configuration */
void uct_rc_iface_fill_attr(uct_rc_iface_t *iface,
                            uct_ib_qp_attr_t *qp_init_attr,
                            unsigned max_send_wr,
                            struct ibv_srq *srq);

ucs_status_t uct_rc_iface_qp_init(uct_rc_iface_t *iface, struct ibv_qp *qp);

/* Connect a QP to remote qp_num/ah_attr — presumably transitions it to
 * RTR/RTS; confirm against the implementation */
ucs_status_t uct_rc_iface_qp_connect(uct_rc_iface_t *iface, struct ibv_qp *qp,
                                     const uint32_t qp_num,
                                     struct ibv_ah_attr *ah_attr,
                                     enum ibv_mtu path_mtu);

/* Default implementation of uct_rc_iface_ops_t::fc_handler */
ucs_status_t uct_rc_iface_fc_handler(uct_rc_iface_t *iface, unsigned qp_num,
                                     uct_rc_hdr_t *hdr, unsigned length,
                                     uint32_t imm_data, uint16_t lid, unsigned flags);

/* Derive fc_soft_thresh/fc_hard_thresh from the RC configuration —
 * TODO confirm */
ucs_status_t uct_rc_init_fc_thresh(uct_rc_iface_config_t *rc_cfg,
                                   uct_rc_iface_t *iface);

ucs_status_t uct_rc_iface_event_arm(uct_iface_h tl_iface, unsigned events);

ucs_status_t uct_rc_iface_common_event_arm(uct_iface_h tl_iface,
                                           unsigned events, int force_rx_all);

/* Create the shared receive queue used by the interface */
ucs_status_t uct_rc_iface_init_rx(uct_rc_iface_t *iface,
                                  const uct_rc_iface_common_config_t *config,
                                  struct ibv_srq **p_srq);

ucs_status_t uct_rc_iface_fence(uct_iface_h tl_iface, unsigned flags);
365
366 static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_fc_ctrl(uct_ep_t * ep,unsigned op,uct_rc_fc_request_t * req)367 uct_rc_fc_ctrl(uct_ep_t *ep, unsigned op, uct_rc_fc_request_t *req)
368 {
369 uct_rc_iface_t *iface = ucs_derived_of(ep->iface, uct_rc_iface_t);
370 uct_rc_iface_ops_t *ops = ucs_derived_of(iface->super.ops,
371 uct_rc_iface_ops_t);
372 return ops->fc_ctrl(ep, op, req);
373 }
374
uct_rc_iface_lookup_ep(uct_rc_iface_t * iface,unsigned qp_num)375 static inline uct_rc_ep_t *uct_rc_iface_lookup_ep(uct_rc_iface_t *iface,
376 unsigned qp_num)
377 {
378 ucs_assert(qp_num < UCS_BIT(UCT_IB_QPN_ORDER));
379 return iface->eps[qp_num >> UCT_RC_QP_TABLE_ORDER]
380 [qp_num & UCS_MASK(UCT_RC_QP_TABLE_MEMB_ORDER)];
381 }
382
383
384 static UCS_F_ALWAYS_INLINE int
uct_rc_iface_have_tx_cqe_avail(uct_rc_iface_t * iface)385 uct_rc_iface_have_tx_cqe_avail(uct_rc_iface_t* iface)
386 {
387 return iface->tx.cq_available > 0;
388 }
389
390 static UCS_F_ALWAYS_INLINE uct_rc_iface_send_op_t*
uct_rc_iface_get_send_op(uct_rc_iface_t * iface)391 uct_rc_iface_get_send_op(uct_rc_iface_t *iface)
392 {
393 uct_rc_iface_send_op_t *op;
394 op = iface->tx.free_ops;
395 iface->tx.free_ops = op->next;
396 return op;
397 }
398
399 static UCS_F_ALWAYS_INLINE void
uct_rc_iface_put_send_op(uct_rc_iface_send_op_t * op)400 uct_rc_iface_put_send_op(uct_rc_iface_send_op_t *op)
401 {
402 uct_rc_iface_t *iface = op->iface;
403 ucs_assert(op->flags == UCT_RC_IFACE_SEND_OP_FLAG_IFACE);
404 op->next = iface->tx.free_ops;
405 iface->tx.free_ops = op;
406 }
407
408 static UCS_F_ALWAYS_INLINE void
uct_rc_am_hdr_fill(uct_rc_hdr_t * rch,uint8_t id)409 uct_rc_am_hdr_fill(uct_rc_hdr_t *rch, uint8_t id)
410 {
411 rch->am_id = id;
412 }
413
/* Choose the completion handler and WQE signaling flags for a zcopy send:
 * with a user completion the send must be signaled and routed through the
 * zcopy handler; otherwise the descriptor is simply returned to its pool. */
static inline void uct_rc_zcopy_desc_set_comp(uct_rc_iface_send_desc_t *desc,
                                              uct_completion_t *comp,
                                              int *send_flags)
{
    if (comp != NULL) {
        desc->super.handler   = uct_rc_ep_am_zcopy_handler;
        desc->super.user_comp = comp;
        *send_flags           = IBV_SEND_SIGNALED;
    } else {
        desc->super.handler = (uct_rc_send_handler_t)ucs_mpool_put;
        *send_flags         = 0;
    }
}
427
/* Write the AM id into the RC header and copy the user header directly
 * after it */
static inline void uct_rc_zcopy_desc_set_header(uct_rc_hdr_t *rch,
                                                uint8_t id, const void *header,
                                                unsigned header_length)
{
    rch->am_id = id;
    memcpy(rch + 1, header, header_length);
}
435
/* A send can proceed only with a CQ credit, a free send descriptor, and a
 * positive outstanding-reads budget */
static inline int uct_rc_iface_has_tx_resources(uct_rc_iface_t *iface)
{
    if (!uct_rc_iface_have_tx_cqe_avail(iface)) {
        return 0;
    }

    if (ucs_mpool_is_empty(&iface->tx.mp)) {
        return 0;
    }

    return (iface->tx.reads_available > 0);
}
442
443 static UCS_F_ALWAYS_INLINE uct_rc_send_handler_t
uct_rc_iface_atomic_handler(uct_rc_iface_t * iface,int ext,unsigned length)444 uct_rc_iface_atomic_handler(uct_rc_iface_t *iface, int ext, unsigned length)
445 {
446 ucs_assert((length == sizeof(uint32_t)) || (length == sizeof(uint64_t)));
447 switch (length) {
448 case sizeof(uint32_t):
449 return iface->config.atomic32_ext_handler;
450 case sizeof(uint64_t):
451 return ext ? iface->config.atomic64_ext_handler :
452 iface->config.atomic64_handler;
453 }
454 return NULL;
455 }
456
457 static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_iface_fence_relaxed_order(uct_iface_h tl_iface)458 uct_rc_iface_fence_relaxed_order(uct_iface_h tl_iface)
459 {
460 uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t);
461 uct_ib_md_t *md = ucs_derived_of(iface->md, uct_ib_md_t);
462
463 ucs_assert(tl_iface->ops.iface_fence == uct_rc_iface_fence);
464
465 if (!md->relaxed_order) {
466 return UCS_OK;
467 }
468
469 return uct_rc_iface_fence(tl_iface, 0);
470 }
471 #endif
472