/**
 * Copyright (C) Mellanox Technologies Ltd. 2016-2020.  ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCT_DC_EP_H
#define UCT_DC_EP_H

#include <uct/api/uct.h>
#include <ucs/datastruct/arbiter.h>
#include <ucs/sys/compiler_def.h>

#include "dc_mlx5.h"

#define UCT_DC_MLX5_EP_NO_DCI ((uint8_t)-1)


enum {
    /* Indicates that an FC grant has been requested but not received yet.
     * Flush will not complete until the outgoing grant request is acked.
     * This is needed to avoid the following cases:
     * 1) A grant arrives for a recently deleted ep.
     * 2) QP resources are available, but there are still pending requests. */
    UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT = UCS_BIT(0)
};
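
/*
 * Illustrative sketch (disabled; an assumption, not code from this file):
 * how the flag above is expected to gate a flush. While a grant request is
 * in flight, flush must not report completion, since the grant could arrive
 * for an ep that has already been destroyed. The helper name
 * example_fc_flush_check() is hypothetical.
 */
#if 0
static ucs_status_t example_fc_flush_check(uct_dc_mlx5_ep_t *ep)
{
    if (ep->fc.flags & UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT) {
        return UCS_INPROGRESS; /* keep the ep alive until the grant is acked */
    }
    return UCS_OK;
}
#endif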

struct uct_dc_mlx5_ep {
    /*
     * Depending on 'flags':
     * INVALID   - 'list' is added to iface->tx.gc_list.
     * Otherwise - 'super' and 'arb_group' are used.
     */
    union {
        struct {
            uct_base_ep_t         super;
            ucs_arbiter_group_t   arb_group;
        };
        ucs_list_link_t           list;
    };

    uint8_t                       dci;
    uint8_t                       flags;
    uint16_t                      atomic_mr_offset;
    uct_rc_fc_t                   fc;
    uct_ib_mlx5_base_av_t         av;
};
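
/*
 * Illustrative sketch (disabled; an assumption, not actual iface code): the
 * union above overlays two lifetimes. While the ep is valid, 'super' and
 * 'arb_group' are live; once it is invalidated, the same storage is reused
 * as the 'list' link on iface->tx.gc_list. The two views must never be used
 * at the same time.
 */
#if 0
static void example_ep_defer_destroy(uct_dc_mlx5_iface_t *iface,
                                     uct_dc_mlx5_ep_t *ep)
{
    ep->flags &= ~UCT_DC_MLX5_EP_FLAG_VALID;          /* 'super' view ends  */
    ucs_list_add_tail(&iface->tx.gc_list, &ep->list); /* 'list' view begins */
}
#endif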

typedef struct {
    uct_dc_mlx5_ep_t                    super;
    struct mlx5_grh_av                  grh_av;
} uct_dc_mlx5_grh_ep_t;

typedef struct {
    uct_pending_req_priv_arb_t arb;
    uct_dc_mlx5_ep_t           *ep;
} uct_dc_mlx5_pending_req_priv_t;


UCS_CLASS_DECLARE(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *, const uct_dc_mlx5_iface_addr_t *,
                  uct_ib_mlx5_base_av_t *);

UCS_CLASS_DECLARE(uct_dc_mlx5_grh_ep_t, uct_dc_mlx5_iface_t *,
                  const uct_dc_mlx5_iface_addr_t *,
                  uct_ib_mlx5_base_av_t *, struct mlx5_grh_av *);


ucs_status_t uct_dc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *payload,
                                      unsigned length, uint64_t remote_addr,
                                      uct_rkey_t rkey);

ssize_t uct_dc_mlx5_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb,
                                 void *arg, uint64_t remote_addr, uct_rkey_t rkey);

ucs_status_t uct_dc_mlx5_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt,
                                      uint64_t remote_addr, uct_rkey_t rkey,
                                      uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_get_bcopy(uct_ep_h tl_ep,
                                      uct_unpack_callback_t unpack_cb,
                                      void *arg, size_t length,
                                      uint64_t remote_addr, uct_rkey_t rkey,
                                      uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt,
                                      uint64_t remote_addr, uct_rkey_t rkey,
                                      uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr,
                                     const void *buffer, unsigned length);

ssize_t uct_dc_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                uct_pack_callback_t pack_cb, void *arg,
                                unsigned flags);

ucs_status_t uct_dc_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header,
                                     unsigned header_length, const uct_iov_t *iov,
                                     size_t iovcnt, unsigned flags,
                                     uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, uint64_t swap,
                                           uint64_t remote_addr, uct_rkey_t rkey,
                                           uint64_t *result, uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, uint32_t swap,
                                           uint64_t remote_addr, uct_rkey_t rkey,
                                           uint32_t *result, uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_atomic64_post(uct_ep_h ep, unsigned opcode, uint64_t value,
                                          uint64_t remote_addr, uct_rkey_t rkey);

ucs_status_t uct_dc_mlx5_ep_atomic32_post(uct_ep_h ep, unsigned opcode, uint32_t value,
                                          uint64_t remote_addr, uct_rkey_t rkey);

ucs_status_t uct_dc_mlx5_ep_atomic64_fetch(uct_ep_h ep, uct_atomic_op_t opcode,
                                           uint64_t value, uint64_t *result,
                                           uint64_t remote_addr, uct_rkey_t rkey,
                                           uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode,
                                           uint32_t value, uint32_t *result,
                                           uint64_t remote_addr, uct_rkey_t rkey,
                                           uct_completion_t *comp);

#if IBV_HW_TM
ucs_status_t uct_dc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag,
                                            const void *data, size_t length);

ssize_t uct_dc_mlx5_ep_tag_eager_bcopy(uct_ep_h tl_ep, uct_tag_t tag,
                                       uint64_t imm,
                                       uct_pack_callback_t pack_cb,
                                       void *arg, unsigned flags);

ucs_status_t uct_dc_mlx5_ep_tag_eager_zcopy(uct_ep_h tl_ep, uct_tag_t tag,
                                            uint64_t imm, const uct_iov_t *iov,
                                            size_t iovcnt, unsigned flags,
                                            uct_completion_t *comp);

ucs_status_ptr_t uct_dc_mlx5_ep_tag_rndv_zcopy(uct_ep_h tl_ep, uct_tag_t tag,
                                               const void *header,
                                               unsigned header_length,
                                               const uct_iov_t *iov,
                                               size_t iovcnt, unsigned flags,
                                               uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag,
                                             const void* header,
                                             unsigned header_length,
                                             unsigned flags);

ucs_status_t uct_dc_mlx5_iface_tag_recv_zcopy(uct_iface_h tl_iface,
                                              uct_tag_t tag,
                                              uct_tag_t tag_mask,
                                              const uct_iov_t *iov,
                                              size_t iovcnt,
                                              uct_tag_context_t *ctx);

ucs_status_t uct_dc_mlx5_iface_tag_recv_cancel(uct_iface_h tl_iface,
                                               uct_tag_context_t *ctx,
                                               int force);
#endif

ucs_status_t uct_dc_mlx5_ep_fence(uct_ep_h tl_ep, unsigned flags);

ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp);

ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op,
                                    uct_rc_fc_request_t *req);

ucs_arbiter_cb_result_t
uct_dc_mlx5_iface_dci_do_pending_wait(ucs_arbiter_t *arbiter,
                                      ucs_arbiter_group_t *group,
                                      ucs_arbiter_elem_t *elem,
                                      void *arg);

ucs_arbiter_cb_result_t
uct_dc_mlx5_iface_dci_do_dcs_pending_tx(ucs_arbiter_t *arbiter,
                                        ucs_arbiter_group_t *group,
                                        ucs_arbiter_elem_t *elem,
                                        void *arg);

ucs_arbiter_cb_result_t
uct_dc_mlx5_iface_dci_do_rand_pending_tx(ucs_arbiter_t *arbiter,
                                         ucs_arbiter_group_t *group,
                                         ucs_arbiter_elem_t *elem,
                                         void *arg);

ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r,
                                        unsigned flags);
void uct_dc_mlx5_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, void *arg);

void uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface,
                                   uct_dc_mlx5_ep_t *ep, uct_pending_req_t *r,
                                   unsigned flags, int push_to_head);

void uct_dc_mlx5_ep_cleanup(uct_ep_h tl_ep, ucs_class_t *cls);

void uct_dc_mlx5_ep_release(uct_dc_mlx5_ep_t *ep);

static UCS_F_ALWAYS_INLINE uct_dc_mlx5_pending_req_priv_t *
uct_dc_mlx5_pending_req_priv(uct_pending_req_t *req)
{
    return (uct_dc_mlx5_pending_req_priv_t *)&(req)->priv;
}

static UCS_F_ALWAYS_INLINE int uct_dc_mlx5_iface_is_dci_rand(uct_dc_mlx5_iface_t *iface)
{
    return iface->tx.policy == UCT_DC_TX_POLICY_RAND;
}

static UCS_F_ALWAYS_INLINE ucs_arbiter_group_t*
uct_dc_mlx5_ep_rand_arb_group(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    ucs_assert(uct_dc_mlx5_iface_is_dci_rand(iface) &&
               (ep->dci != UCT_DC_MLX5_EP_NO_DCI));
    /* If the random DCI policy is used, a DCI is always assigned to the ep */
    return &iface->tx.dcis[ep->dci].arb_group;
}

static UCS_F_ALWAYS_INLINE ucs_arbiter_group_t*
uct_dc_mlx5_ep_arb_group(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    return (uct_dc_mlx5_iface_is_dci_rand(iface)) ?
            uct_dc_mlx5_ep_rand_arb_group(iface, ep) : &ep->arb_group;
}

static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_dci_sched_tx(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    if (uct_dc_mlx5_iface_is_dci_rand(iface)) {
        ucs_arbiter_group_schedule(uct_dc_mlx5_iface_tx_waitq(iface),
                                   uct_dc_mlx5_ep_rand_arb_group(iface, ep));
    } else if (uct_dc_mlx5_iface_dci_has_tx_resources(iface, ep->dci)) {
        ucs_arbiter_group_schedule(uct_dc_mlx5_iface_tx_waitq(iface),
                                   &ep->arb_group);
    }
}

static UCS_F_ALWAYS_INLINE uct_dc_mlx5_ep_t *
uct_dc_mlx5_ep_from_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci)
{
    /* Can be used with dcs* policies only; with the rand policy, every dci
     * may be used by many eps */
    ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface));
    return iface->tx.dcis[dci].ep;
}

static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_ep_clear_fc_grant_flag(uct_dc_mlx5_iface_t *iface,
                                   uct_dc_mlx5_ep_t *ep)
{
    ucs_assert((ep->fc.flags & UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT) &&
               iface->tx.fc_grants);
    ep->fc.flags &= ~UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT;
    --iface->tx.fc_grants;
}

enum uct_dc_mlx5_ep_flags {
    UCT_DC_MLX5_EP_FLAG_TX_WAIT  = UCS_BIT(0), /* ep is in the tx_wait state. See
                                                  description of the dcs+quota dci
                                                  selection policy above */
    UCT_DC_MLX5_EP_FLAG_GRH      = UCS_BIT(1), /* ep has a GRH address. Used by
                                                  dc_mlx5 endpoint */
    UCT_DC_MLX5_EP_FLAG_VALID    = UCS_BIT(2)  /* ep is a valid endpoint */
};


void uct_dc_mlx5_ep_handle_failure(uct_dc_mlx5_ep_t *ep, void *arg,
                                   ucs_status_t status);

static UCS_F_ALWAYS_INLINE ucs_status_t
uct_dc_mlx5_ep_basic_init(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    ucs_arbiter_group_init(&ep->arb_group);

    if (uct_dc_mlx5_iface_is_dci_rand(iface)) {
        /* coverity[dont_call] */
        ep->dci = rand_r(&iface->tx.rand_seed) % iface->tx.ndci;
    } else {
        ep->dci = UCT_DC_MLX5_EP_NO_DCI;
    }

    /* valid = 1, global = 0, tx_wait = 0 */
    ep->flags = UCT_DC_MLX5_EP_FLAG_VALID;

    return uct_rc_fc_init(&ep->fc, iface->super.super.config.fc_wnd_size
                          UCS_STATS_ARG(ep->super.stats));
}

static UCS_F_ALWAYS_INLINE int
uct_dc_mlx5_iface_dci_can_alloc(uct_dc_mlx5_iface_t *iface)
{
    return iface->tx.stack_top < iface->tx.ndci;
}

static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface)
{
    do {
        /**
         * A pending op on the tx_waitq can complete with the UCS_OK
         * status without actually sending anything on the dci.
         * In this case pending ops on the waitq may never be
         * scheduled.
         *
         * So we keep progressing pending ops while the dci_waitq is not
         * empty and it is possible to allocate a dci.
         * NOTE: with the rand dci allocation policy, the dci_waitq is
         * always empty.
         */
        if (uct_dc_mlx5_iface_dci_can_alloc(iface) &&
            !uct_dc_mlx5_iface_is_dci_rand(iface)) {
            ucs_arbiter_dispatch(uct_dc_mlx5_iface_dci_waitq(iface), 1,
                                 uct_dc_mlx5_iface_dci_do_pending_wait, NULL);
        }
        ucs_arbiter_dispatch(uct_dc_mlx5_iface_tx_waitq(iface), 1,
                             iface->tx.pend_cb, NULL);

    } while (ucs_unlikely(!ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface)) &&
                           uct_dc_mlx5_iface_dci_can_alloc(iface)));
}
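
/*
 * Illustrative sketch (disabled; an assumption based on the comment above,
 * not actual transport code): a pending callback may return UCS_OK without
 * posting anything on the dci, e.g. when it only completes a software-level
 * operation. The dci it would have consumed stays free, so the dispatch loop
 * above must re-check the dci_waitq instead of assuming one pass is enough.
 */
#if 0
static ucs_status_t example_noop_pending_cb(uct_pending_req_t *req)
{
    /* Completes without consuming any dci TX resources */
    return UCS_OK;
}
#endif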

static inline int uct_dc_mlx5_iface_dci_ep_can_send(uct_dc_mlx5_ep_t *ep)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t);
    return (!(ep->flags & UCT_DC_MLX5_EP_FLAG_TX_WAIT)) &&
           uct_rc_fc_has_resources(&iface->super.super, &ep->fc) &&
           uct_dc_mlx5_iface_dci_has_tx_resources(iface, ep->dci);
}

static UCS_F_ALWAYS_INLINE
void uct_dc_mlx5_iface_schedule_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    /* If the FC window is empty, the group will be scheduled when a
     * grant is received */
    if (uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) {
        ucs_arbiter_group_schedule(uct_dc_mlx5_iface_dci_waitq(iface), &ep->arb_group);
    }
}

static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_dci_release(uct_dc_mlx5_iface_t *iface, uint8_t dci)
{
    iface->tx.stack_top--;
    iface->tx.dcis_stack[iface->tx.stack_top] = dci;
#if UCS_ENABLE_ASSERT
    iface->tx.dcis[dci].flags = 0;
#endif
}

static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t dci)
{
    uct_dc_mlx5_ep_t *ep;

    if (uct_dc_mlx5_iface_is_dci_rand(iface)) {
        return;
    }

    ep = uct_dc_mlx5_ep_from_dci(iface, dci);

    ucs_assert(iface->tx.stack_top > 0);

    if (ucs_unlikely(ep == NULL)) {
        if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) {
            uct_dc_mlx5_iface_dci_release(iface, dci);
        }
        return;
    }

    if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) {
        if (iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) {
            /* In the tx_wait state: if no eps are waiting for dci
             * allocation, the ep goes back to the normal state.
             */
            if (ep->flags & UCT_DC_MLX5_EP_FLAG_TX_WAIT) {
                if (!ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface))) {
                    return;
                }
                ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT;
            }
        }
        ucs_arbiter_group_schedule(uct_dc_mlx5_iface_tx_waitq(iface), &ep->arb_group);
        return;
    }

    uct_dc_mlx5_iface_dci_release(iface, dci);

    ucs_assert(uct_dc_mlx5_ep_from_dci(iface, dci)->dci != UCT_DC_MLX5_EP_NO_DCI);
    ep->dci    = UCT_DC_MLX5_EP_NO_DCI;
    ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT;
    iface->tx.dcis[dci].ep = NULL;

    /* It is possible that the dci is released while the ep still has
     * scheduled pending ops; move the group to the 'wait for dci alloc'
     * state.
     */
    ucs_arbiter_group_desched(uct_dc_mlx5_iface_tx_waitq(iface), &ep->arb_group);
    uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep);
}

static inline void uct_dc_mlx5_iface_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    /* Take the first available dci from the stack.
     * There is no need to check the txqp, because a free
     * dci must have resources to transmit.
     */
    ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface));
    ep->dci = iface->tx.dcis_stack[iface->tx.stack_top];
    ucs_assert(ep->dci < iface->tx.ndci);
    ucs_assert(uct_dc_mlx5_ep_from_dci(iface, ep->dci) == NULL);
    ucs_assert(iface->tx.dcis[ep->dci].flags == 0);
    iface->tx.dcis[ep->dci].ep = ep;
    iface->tx.stack_top++;
}
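
/*
 * Illustrative sketch (disabled; an assumption drawn from the alloc/release
 * pair above): dcis_stack behaves as a LIFO free list indexed by stack_top,
 * so an alloc pops the next free dci and a release pushes it back.
 */
#if 0
static void example_dci_pool_roundtrip(uct_dc_mlx5_iface_t *iface,
                                       uct_dc_mlx5_ep_t *ep)
{
    uint8_t dci;

    uct_dc_mlx5_iface_dci_alloc(iface, ep);    /* pop:  stack_top++ */
    dci = ep->dci;
    uct_dc_mlx5_iface_dci_release(iface, dci); /* push: stack_top-- */
    ucs_assert(iface->tx.dcis_stack[iface->tx.stack_top] == dci);
}
#endif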

static inline void uct_dc_mlx5_iface_dci_free(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    uint8_t dci;

    if (uct_dc_mlx5_iface_is_dci_rand(iface)) {
        return;
    }

    dci = ep->dci;

    ucs_assert(dci != UCT_DC_MLX5_EP_NO_DCI);
    ucs_assert(iface->tx.stack_top > 0);

    if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) {
        return;
    }

    uct_dc_mlx5_iface_dci_release(iface, dci);

    iface->tx.dcis[dci].ep = NULL;
    ep->dci                = UCT_DC_MLX5_EP_NO_DCI;
    ep->flags             &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT;
}

static UCS_F_ALWAYS_INLINE ucs_status_t
uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
{
    uct_rc_txqp_t *txqp;
    int16_t available;

    ucs_assert(!iface->super.super.config.tx_moderation);

    if (uct_dc_mlx5_iface_is_dci_rand(iface)) {
        if (uct_dc_mlx5_iface_dci_has_tx_resources(iface, ep->dci)) {
            return UCS_OK;
        } else {
            UCS_STATS_UPDATE_COUNTER(iface->tx.dcis[ep->dci].txqp.stats,
                                     UCT_RC_TXQP_STAT_QP_FULL, 1);
            goto out_no_res;
        }
    }

    if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) {
        /* dci is already assigned - keep using it */
        if ((iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) &&
            (ep->flags & UCT_DC_MLX5_EP_FLAG_TX_WAIT)) {
            goto out_no_res;
        }

        /* If the dci has sent more than its quota and there are eps waiting
         * for dci allocation, the ep goes into the tx_wait state.
         */
        txqp      = &iface->tx.dcis[ep->dci].txqp;
        available = uct_rc_txqp_available(txqp);
        if ((iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) &&
            (available <= iface->tx.available_quota) &&
            !ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface)))
        {
            ep->flags |= UCT_DC_MLX5_EP_FLAG_TX_WAIT;
            goto out_no_res;
        }

        if (available <= 0) {
            UCS_STATS_UPDATE_COUNTER(txqp->stats, UCT_RC_TXQP_STAT_QP_FULL, 1);
            goto out_no_res;
        }

        return UCS_OK;
    }

    /* Do not allocate a dci if there are no TX descriptor resources;
     * otherwise this dci may never be released. */
    if (uct_dc_mlx5_iface_dci_can_alloc(iface) &&
        uct_dc_mlx5_iface_has_tx_resources(iface)) {
        uct_dc_mlx5_iface_dci_alloc(iface, ep);
        return UCS_OK;
    }

out_no_res:
    /* We will have to wait until someone releases a dci */
    UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
    return UCS_ERR_NO_RESOURCE;
}

static UCS_F_ALWAYS_INLINE int uct_dc_mlx5_ep_fc_wait_for_grant(uct_dc_mlx5_ep_t *ep)
{
    return ep->fc.flags & UCT_DC_MLX5_EP_FC_FLAG_WAIT_FOR_GRANT;
}

ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep);

static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep)
{
    return (ep->flags & UCT_DC_MLX5_EP_FLAG_GRH) ?
           &(ucs_derived_of(ep, uct_dc_mlx5_grh_ep_t)->grh_av) : NULL;
}


#define UCT_DC_MLX5_TXQP_DECL(_txqp, _txwq) \
    uct_rc_txqp_t UCS_V_UNUSED *_txqp; \
    uct_ib_mlx5_txwq_t UCS_V_UNUSED *_txwq;


#define UCT_DC_MLX5_CHECK_RES(_iface, _ep) \
    { \
        ucs_status_t _status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \
        if (ucs_unlikely(_status != UCS_OK)) { \
            return _status; \
        } \
    }
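
/*
 * Illustrative sketch (disabled; a hypothetical send path, not one of the
 * functions declared above): how the two macros are expected to combine.
 * UCT_DC_MLX5_CHECK_RES() returns UCS_ERR_NO_RESOURCE from the caller when
 * no dci or TX resources are available, after which the operation is
 * typically queued via uct_dc_mlx5_ep_pending_add().
 */
#if 0
static ucs_status_t example_send(uct_ep_h tl_ep)
{
    uct_dc_mlx5_ep_t    *ep    = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_dc_mlx5_iface_t);
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    UCT_DC_MLX5_CHECK_RES(iface, ep); /* may return UCS_ERR_NO_RESOURCE */
    /* ... post the work request on iface->tx.dcis[ep->dci] ... */
    return UCS_OK;
}
#endif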


#define UCT_DC_CHECK_RES_PTR(_iface, _ep) \
    { \
        ucs_status_t status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \
        if (ucs_unlikely(status != UCS_OK)) { \
            return UCS_STATUS_PTR(status); \
        } \
    }


/**
 * RMA and AMO operations are not allowed if there are no RDMA_READ credits.
 * Otherwise, operation ordering (which the fence operation relies on) can
 * be broken.
 */
#define UCT_DC_MLX5_CHECK_RMA_RES(_iface, _ep) \
    { \
        UCT_RC_CHECK_NUM_RDMA_READ(&(_iface)->super.super) \
        UCT_DC_MLX5_CHECK_RES(_iface, _ep) \
    }


/* First, check whether we have an FC window. If the hard threshold is
 * reached, a credit request will be sent by "fc_ctrl" as a separate message.
 * TX resources are checked after FC, because the FC credit request may
 * consume the last available TX resources. */
#define UCT_DC_CHECK_RES_AND_FC(_iface, _ep) \
    { \
        if (ucs_unlikely((_ep)->fc.fc_wnd <= \
                         (_iface)->super.super.config.fc_hard_thresh)) { \
            ucs_status_t _status = uct_dc_mlx5_ep_check_fc(_iface, _ep); \
            if (ucs_unlikely(_status != UCS_OK)) { \
                if (((_ep)->dci != UCT_DC_MLX5_EP_NO_DCI) && \
                    !uct_dc_mlx5_iface_is_dci_rand(_iface)) { \
                    ucs_assertv_always(uct_dc_mlx5_iface_dci_has_outstanding(_iface, \
                                                                             (_ep)->dci), \
                                       "iface (%p) ep (%p) dci leak detected: dci=%d", \
                                       _iface, _ep, (_ep)->dci); \
                } \
                return _status; \
            } \
        } \
        UCT_DC_MLX5_CHECK_RES(_iface, _ep) \
    }
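
/*
 * Illustrative sketch (disabled; an assumption, not actual AM code): the FC
 * check above must run before the TX-resource check, because
 * uct_dc_mlx5_ep_check_fc() may itself send a credit request and consume the
 * last available TX resource.
 */
#if 0
static ucs_status_t example_am_send(uct_dc_mlx5_iface_t *iface,
                                    uct_dc_mlx5_ep_t *ep)
{
    UCT_DC_CHECK_RES_AND_FC(iface, ep); /* FC first, then dci/TX resources */
    /* ... build and post the active message ... */
    return UCS_OK;
}
#endif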


#endif