1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6 
7 #ifndef UCT_UD_EP_H
8 #define UCT_UD_EP_H
9 
10 #include "ud_def.h"
11 
12 #include <uct/api/uct.h>
13 #include <ucs/datastruct/frag_list.h>
14 #include <ucs/datastruct/queue.h>
15 #include <ucs/datastruct/arbiter.h>
16 #include <ucs/datastruct/sglib.h>
17 #include <ucs/time/timer_wheel.h>
18 
/* Endpoint ids are 24-bit values; the all-ones value is reserved as the
 * invalid/unset id, and also serves as the maximum for ep and conn ids. */
#define UCT_UD_EP_NULL_ID     ((1<<24)-1)
#define UCT_UD_EP_ID_MAX      UCT_UD_EP_NULL_ID
#define UCT_UD_EP_CONN_ID_MAX UCT_UD_EP_ID_MAX
22 
23 #if UCT_UD_EP_DEBUG_HOOKS
24 /*
25    Hooks that allow packet header inspection and rewriting. UCT user can
26    set functions that will be called just before packet is put on wire
27    and when packet is received. Packet will be discarded if RX function
28    returns status different from UCS_OK.
29 
30    Example:
31 
32   static ucs_status_t clear_ack_req(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
33   {
34      neth->packet_type &= ~UCT_UD_PACKET_FLAG_ACK_REQ;
35      return UCS_OK;
36   }
37 
38   uct_ep_t ep;
39   ....
  // clear ack request bit on all outgoing packets
41   ucs_derived_of(ep, uct_ud_ep_t)->tx.tx_hook = clear_ack_req;
42 
43 */
44 
/* Packet inspection/rewrite hook. Called with the ep and the network header;
 * an RX hook returning a status other than UCS_OK causes the packet to be
 * dropped. */
typedef ucs_status_t (*uct_ud_ep_hook_t)(uct_ud_ep_t *ep, uct_ud_neth_t *neth);

/* Declare a hook field inside an ep sub-structure (expands to nothing when
 * hooks are compiled out). */
#define UCT_UD_EP_HOOK_DECLARE(name) uct_ud_ep_hook_t name;

/* Invoke the RX hook; drop the packet (early return from the CALLER) if the
 * hook rejects it. NOTE: contains a bare 'return;' - only usable inside a
 * void function, as a full statement. */
#define UCT_UD_EP_HOOK_CALL_RX(ep, neth, len) \
    if ((ep)->rx.rx_hook(ep, neth) != UCS_OK) { \
        ucs_trace_data("RX: dropping packet"); \
        return; \
    }

/* Invoke the TX hook just before the packet goes on the wire; the return
 * value is ignored for TX. */
#define UCT_UD_EP_HOOK_CALL_TX(ep, neth) (ep)->tx.tx_hook(ep, neth);

/* Invoke the timer hook on timer expiration; no network header here. */
#define UCT_UD_EP_HOOK_CALL_TIMER(ep)    (ep)->timer_hook(ep, NULL);
57 
uct_ud_ep_null_hook(uct_ud_ep_t * ep,uct_ud_neth_t * neth)58 static inline ucs_status_t uct_ud_ep_null_hook(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
59 {
60     return UCS_OK;
61 }
62 
/* Install the no-op hooks on a freshly created endpoint.
 * Fix: no semicolon after while(0) - a trailing semicolon defeats the
 * do/while(0) idiom and breaks usage such as
 * 'if (cond) UCT_UD_EP_HOOK_INIT(ep); else ...'. */
#define UCT_UD_EP_HOOK_INIT(ep) \
    do { \
        (ep)->tx.tx_hook = uct_ud_ep_null_hook; \
        (ep)->rx.rx_hook = uct_ud_ep_null_hook; \
        (ep)->timer_hook = uct_ud_ep_null_hook; \
    } while (0)
69 
70 #else
71 
/* Hooks compiled out: every declaration and call site expands to nothing,
 * so production builds pay zero overhead. */
#define UCT_UD_EP_HOOK_DECLARE(name)
#define UCT_UD_EP_HOOK_CALL_RX(ep, neth, len)
#define UCT_UD_EP_HOOK_CALL_TX(ep, neth)
#define UCT_UD_EP_HOOK_CALL_TIMER(ep)
#define UCT_UD_EP_HOOK_INIT(ep)
77 
78 #endif
79 
80 
81 /**
82  * Slow ep timer
83  * The purpose of the slow timer is to schedule resends and ack replies.
84  * The timer is a wheel timer. Timer wheel sweep is done on every async
85  * progress invocation. One tick usually happens once in 0.1 seconds.
 * It is best to avoid taking time measurements in the fast path.
87  *
88  * wheel_time is the time of last timer wheel sweep.
89  * on send:
90  *   - try to start wheel timer.
91  *   - send_time = wheel_time. That is sending a packet resets retransmission
92  *   timeout. This does not allow infinite number of resets because number of
93  *   outstanding packets is bound by the TX window size.
94  * on ack recv:
95  *   - send_time = wheel_time. (advance last send time)
96  * on timer expiration:
97  *   - if wheel_time - saved_time > 3*one_tick_time
98  *        schedule resend
99  *        send_time = wheel_time
 *        congestion avoidance decreases tx window
101  *   - if window is not empty resched timer
102  *   3x is needed to avoid false resends because of errors in timekeeping
103  *
104  * Fast ep timer (Not implemented)
105  *
106  * The purpose of the fast timer is to detect packet loss as early as
107  * possible. The timer is a wheel timer. Fast timer sweep is done on
108  * CQ polling which happens either in explicit polling or in async
109  * progress. As a result fast timer resolution may vary.
110  *
111  * TODO: adaptive CHK algo description
112  *
 * The fast timer is relatively expensive. It is best to disable it if packet loss
114  * is not expected. Usual reasons for packet loss are: slow receiver,
115  * many to one traffic pattern.
116  */
117 
118 /* Congestion avoidance and retransmits
119  *
 * UD uses the additive increase/multiplicative decrease algorithm.
121  * See https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease
122  *
123  * tx window is increased when ack is received and decreased when
124  * resend is scheduled. Ack must be a 'new' one that is it must
 * acknowledge packets on window. Increasing window on ack does not cause
126  * exponential window increase because, unlike tcp, only two acks
127  * per window are sent.
128  *
129  * Todo:
130  *
 * Consider triggering window decrease before resend timeout:
 * - on ECN (explicit congestion notification) from receiver. ECN can
133  *   be based on some heuristic. For example on number of rx completions
134  *   that receiver picked from CQ.
135  * - upon receiving a 'duplicate ack' packet
136  *
137  * Consider using other algorithm (ex BIC/CUBIC)
138  */
139 
140 /*
141  * Handling retransmits
142  *
143  * On slow timer timeout schedule a retransmit operation for
144  * [acked_psn+1, psn-1]. These values are saved as 'resend window'
145  *
 * Resend operation will resend no more than the current cwnd
147  * If ack arrives when resend window is active it means that
148  *  - something new in the resend window was acked. As a
 *  result a new resend operation will be scheduled.
150  *  - either resend window or something beyond it was
 *  acked. It means that no more retransmissions are needed.
152  *  Current 'resend window' is deactivated
153  *
154  * When retransmitting, ack is requested if:
155  * psn == acked_psn + 1 or
156  * psn % UCT_UD_RESENDS_PER_ACK = 0
157  */
158 
/*
 * Endpoint pending control operations. The operations
 * are executed at progress time along with
 * pending requests added by the uct user.
 */
enum {
    UCT_UD_EP_OP_NONE       = 0,
    UCT_UD_EP_OP_ACK        = UCS_BIT(0),  /* ack data */
    UCT_UD_EP_OP_ACK_REQ    = UCS_BIT(1),  /* request ack of sent packets */
    UCT_UD_EP_OP_RESEND     = UCS_BIT(2),  /* resend unacked packets */
    UCT_UD_EP_OP_CREP       = UCS_BIT(3),  /* send connection reply */
    UCT_UD_EP_OP_CREQ       = UCS_BIT(4),  /* send connection request */
    UCT_UD_EP_OP_NACK       = UCS_BIT(5),  /* send NACK */
};

/* Op groupings: low-priority ack ops, high-priority connection/resend ops,
 * and the set of ack-carrying ops. */
#define UCT_UD_EP_OP_CTL_LOW_PRIO (UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_ACK)
#define UCT_UD_EP_OP_CTL_HI_PRIO  (UCT_UD_EP_OP_CREQ|UCT_UD_EP_OP_CREP|UCT_UD_EP_OP_RESEND)
#define UCT_UD_EP_OP_CTL_ACK      (UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_NACK)
177 
/* Per-endpoint pending state: the arbiter group that holds user pending
 * requests, plus one arbiter element used to schedule internal control ops. */
typedef struct uct_ud_ep_pending_op {
    ucs_arbiter_group_t   group;  /* arbiter group of this ep's pending requests */
    uint32_t              ops;    /* bitmask that describes what control ops are scheduled (UCT_UD_EP_OP_*) */
    ucs_arbiter_elem_t    elem;   /* arbiter element used to run scheduled control ops */
} uct_ud_ep_pending_op_t;
183 
/* Endpoint statistics counters (placeholder - none defined yet). */
enum {
    UCT_UD_EP_STAT_TODO
};

/* Endpoint state flags, kept in uct_ud_ep_t::flags.
 * TODO: optimize endpoint memory footprint */
enum {
    UCT_UD_EP_FLAG_DISCONNECTED      = UCS_BIT(0), /* EP was disconnected */
    UCT_UD_EP_FLAG_PRIVATE           = UCS_BIT(1), /* EP was created as internal */
    UCT_UD_EP_FLAG_HAS_PENDING       = UCS_BIT(2), /* EP has some pending requests */
    UCT_UD_EP_FLAG_CONNECTED         = UCS_BIT(3), /* EP was connected to the peer */

    /* debug flags - track connection-establishment message exchange */
    UCT_UD_EP_FLAG_CREQ_RCVD         = UCS_BIT(4), /* CREQ message was received */
    UCT_UD_EP_FLAG_CREP_RCVD         = UCS_BIT(5), /* CREP message was received */
    UCT_UD_EP_FLAG_CREQ_SENT         = UCS_BIT(6), /* CREQ message was sent */
    UCT_UD_EP_FLAG_CREP_SENT         = UCS_BIT(7), /* CREP message was sent */
    UCT_UD_EP_FLAG_CREQ_NOTSENT      = UCS_BIT(8), /* CREQ message is NOT sent, because
                                                      connection establishment process
                                                      is driven by remote side. */
    UCT_UD_EP_FLAG_TX_NACKED         = UCS_BIT(9), /* Last psn was acked with NAK */

    /* Endpoint is currently executing the pending queue;
     * the flag exists only in assert-enabled builds */
#if UCS_ENABLE_ASSERT
    UCT_UD_EP_FLAG_IN_PENDING        = UCS_BIT(10)
#else
    UCT_UD_EP_FLAG_IN_PENDING        = 0
#endif
};
212 
/* Identity of the remote peer, kept for debugging only (see ENABLE_DEBUG_DATA
 * member of uct_ud_ep). */
typedef struct uct_ud_peer_name {
    char name[16]; /* peer name, NUL-terminated (presumably hostname - truncated to 15 chars) */
    int  pid;      /* peer process id */
} uct_ud_peer_name_t;
217 
/* UD endpoint: reliability state (PSN window, resends, congestion
 * avoidance) layered on top of an unreliable datagram QP. */
struct uct_ud_ep {
    uct_base_ep_t           super;
    uint32_t                ep_id;      /* local endpoint id */
    uint32_t                dest_ep_id; /* remote endpoint id; UCT_UD_EP_NULL_ID
                                           while not connected (see
                                           uct_ud_ep_is_connected assertion) */
    struct {
        uct_ud_psn_t           psn;          /* Next PSN to send */
        uct_ud_psn_t           max_psn;      /* Largest PSN that can be sent */
        uct_ud_psn_t           acked_psn;    /* last psn that was acked by remote side */
        uint16_t               resend_count; /* number of in-flight resends on the ep */
        ucs_queue_head_t       window;       /* send window: [acked_psn+1, psn-1] */
        uct_ud_ep_pending_op_t pending;      /* pending ops */
        ucs_time_t             send_time;    /* tx time of last packet */
        ucs_time_t             resend_time;  /* tx time of last resent packet */
        ucs_time_t             tick;         /* timeout to trigger timer */
        UCS_STATS_NODE_DECLARE(stats)
        UCT_UD_EP_HOOK_DECLARE(tx_hook)      /* called before a packet is sent */
    } tx;
    struct {
        uct_ud_psn_t        acked_psn;    /* Last psn we acked */
        ucs_frag_list_t     ooo_pkts;     /* Out of order packets that can not be processed yet,
                                             also keeps last psn we successfully received and processed */
        UCS_STATS_NODE_DECLARE(stats)
        UCT_UD_EP_HOOK_DECLARE(rx_hook)   /* called on packet receive; may drop */
    } rx;
    struct {
        uct_ud_psn_t  wmax;   /* window size limit for congestion avoidance */
        uct_ud_psn_t  cwnd;   /* current congestion window */
    } ca;
    struct UCS_S_PACKED {
         ucs_queue_iter_t       pos;       /* points to the part of tx window that needs to be resent */
         uct_ud_psn_t           psn;       /* last psn that was retransmitted */
         uct_ud_psn_t           max_psn;   /* max psn that should be retransmitted */
    } resend;
    ucs_list_link_t  cep_list;      /* list linkage (NOTE(review): presumably the
                                       iface's connected-ep list - confirm) */
    uint32_t         conn_id;      /* connection id. assigned in connect_to_iface() */
    uint16_t         flags;        /* UCT_UD_EP_FLAG_* bits */
    uint8_t          rx_creq_count; /* TODO: remove when reason for DUP/OOO CREQ is found */
    uint8_t          path_index;   /* path index used when connecting (see
                                      uct_ud_ep_create_connected_common) */
    ucs_wtimer_t     timer;        /* slow (wheel) timer handle */
    ucs_time_t       close_time;   /* timestamp of closure */
    UCS_STATS_NODE_DECLARE(stats)
    UCT_UD_EP_HOOK_DECLARE(timer_hook)  /* called on timer expiration (neth == NULL) */
#if ENABLE_DEBUG_DATA
    uct_ud_peer_name_t  peer;      /* remote peer name/pid, debug builds only */
#endif
};
264 
/* printf-style helpers for logging the peer identity; degrade to a fixed
 * placeholder string when debug data is compiled out. */
#if ENABLE_DEBUG_DATA
#  define UCT_UD_EP_PEER_NAME_FMT        "%s:%d"
#  define UCT_UD_EP_PEER_NAME_ARG(_ep)   (_ep)->peer.name, (_ep)->peer.pid
#else
#  define UCT_UD_EP_PEER_NAME_FMT        "%s"
#  define UCT_UD_EP_PEER_NAME_ARG(_ep)   "<no debug data>"
#endif


UCS_CLASS_DECLARE(uct_ud_ep_t, uct_ud_iface_t*, const uct_ep_params_t*)
275 
/**
 * UD pending request private data, overlaid on uct_pending_req_t::priv
 * (see uct_ud_pending_req_priv below).
 */
typedef struct {
    uct_pending_req_priv_arb_t arb;   /* arbiter linkage of the pending request */
    unsigned                   flags; /* request flags (NOTE(review): semantics
                                         defined at the pending-add call site) */
} uct_ud_pending_req_priv_t;
283 
284 
285 static UCS_F_ALWAYS_INLINE uct_ud_pending_req_priv_t *
uct_ud_pending_req_priv(uct_pending_req_t * req)286 uct_ud_pending_req_priv(uct_pending_req_t *req)
287 {
288     return (uct_ud_pending_req_priv_t *)&(req)->priv;
289 }
290 
291 
/* Purge the tx window, completing outstanding sends with 'status';
 * is_async indicates whether we run from the async (timer) context. */
void uct_ud_tx_wnd_purge_outstanding(uct_ud_iface_t *iface, uct_ud_ep_t *ud_ep,
                                     ucs_status_t status, int is_async);

/* uct_ep_flush implementation for UD */
ucs_status_t uct_ud_ep_flush(uct_ep_h ep, unsigned flags,
                             uct_completion_t *comp);
/* internal flush, caller is expected to hold the iface lock */
ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep,
                                    uct_completion_t *comp);

ucs_status_t uct_ud_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr);

ucs_status_t uct_ud_ep_connect_to_ep(uct_ud_ep_t *ep,
                                     const uct_ib_address_t *ib_addr,
                                     const uct_ud_ep_addr_t *ep_addr);

/* Add a user pending request to the ep's arbiter group */
ucs_status_t uct_ud_ep_pending_add(uct_ep_h ep, uct_pending_req_t *n,
                                   unsigned flags);

/* Remove all user pending requests, invoking cb(req, arg) for each */
void   uct_ud_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb,
                               void *arg);

void   uct_ud_ep_disconnect(uct_ep_h ep);

/* Release skbs of the tx window that were already acknowledged */
void uct_ud_ep_window_release_completed(uct_ud_ep_t *ep, int is_async);


/* helper functions to create/destroy a new connected ep */
ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface,
                                               const uct_ib_address_t *ib_addr,
                                               const uct_ud_iface_addr_t *if_addr,
                                               unsigned path_index,
                                               uct_ud_ep_t **new_ep_p,
                                               uct_ud_send_skb_t **skb_p);

void uct_ud_ep_destroy_connected(uct_ud_ep_t *ep,
                                 const uct_ib_address_t *ib_addr,
                                 const uct_ud_iface_addr_t *if_addr);

/* Build the CREQ (connection request) skb for this ep */
uct_ud_send_skb_t *uct_ud_ep_prepare_creq(uct_ud_ep_t *ep);

/* Arbiter callback dispatching pending user requests and control ops */
ucs_arbiter_cb_result_t
uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group,
                     ucs_arbiter_elem_t *elem, void *arg);

/* Copy reliability state from old_ep into new_ep */
void uct_ud_ep_clone(uct_ud_ep_t *old_ep, uct_ud_ep_t *new_ep);
337 
338 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_set_type_am(uct_ud_ep_t * ep,uct_ud_neth_t * neth,uint8_t id)339 uct_ud_neth_set_type_am(uct_ud_ep_t *ep, uct_ud_neth_t *neth, uint8_t id)
340 {
341     neth->packet_type = (id << UCT_UD_PACKET_AM_ID_SHIFT) |
342                         ep->dest_ep_id |
343                         UCT_UD_PACKET_FLAG_AM;
344 }
345 
346 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_set_type_put(uct_ud_ep_t * ep,uct_ud_neth_t * neth)347 uct_ud_neth_set_type_put(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
348 {
349     neth->packet_type = ep->dest_ep_id | UCT_UD_PACKET_FLAG_PUT;
350 }
351 
/* Process one received packet: runs the RX hook, updates ack/psn state and
 * dispatches the payload. is_async indicates the async (timer) context. */
void uct_ud_ep_process_rx(uct_ud_iface_t *iface,
                          uct_ud_neth_t *neth, unsigned byte_len,
                          uct_ud_recv_skb_t *skb, int is_async);
355 
356 
357 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_init_data(uct_ud_ep_t * ep,uct_ud_neth_t * neth)358 uct_ud_neth_init_data(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
359 {
360     neth->psn = ep->tx.psn;
361     neth->ack_psn = ep->rx.acked_psn = ucs_frag_list_sn(&ep->rx.ooo_pkts);
362 }
363 
uct_ud_ep_compare(uct_ud_ep_t * a,uct_ud_ep_t * b)364 static inline int uct_ud_ep_compare(uct_ud_ep_t *a, uct_ud_ep_t *b)
365 {
366     return a->conn_id - b->conn_id;
367 }
368 
/* Bucket index for the iface ep hash table, keyed by connection id. */
static inline int uct_ud_ep_hash(uct_ud_ep_t *ep)
{
    const uint32_t conn_id = ep->conn_id;

    return conn_id % UCT_UD_HASH_SIZE;
}
373 
/* sglib-generated prototypes: intrusive ep list (linked via 'next') and the
 * conn_id-keyed hashed container used for ep lookup. */
SGLIB_DEFINE_LIST_PROTOTYPES(uct_ud_ep_t, uct_ud_ep_compare, next)
SGLIB_DEFINE_HASHED_CONTAINER_PROTOTYPES(uct_ud_ep_t, UCT_UD_HASH_SIZE, uct_ud_ep_hash)
376 
377 
378 static UCS_F_ALWAYS_INLINE void
379 uct_ud_ep_ctl_op_del(uct_ud_ep_t *ep, uint32_t ops)
380 {
381     ep->tx.pending.ops &= ~ops;
382 }
383 
384 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_ctl_op_check(uct_ud_ep_t * ep,uint32_t op)385 uct_ud_ep_ctl_op_check(uct_ud_ep_t *ep, uint32_t op)
386 {
387     return ep->tx.pending.ops & op;
388 }
389 
390 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_ctl_op_isany(uct_ud_ep_t * ep)391 uct_ud_ep_ctl_op_isany(uct_ud_ep_t *ep)
392 {
393     return ep->tx.pending.ops;
394 }
395 
396 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_ctl_op_check_ex(uct_ud_ep_t * ep,uint32_t ops)397 uct_ud_ep_ctl_op_check_ex(uct_ud_ep_t *ep, uint32_t ops)
398 {
399     /* check that at least one the given ops is set and
400      * all ops not given are not set */
401     return (ep->tx.pending.ops & ops) &&
402            ((ep->tx.pending.ops & ~ops) == 0);
403 }
404 
405 /* TODO: rely on window check instead. max_psn = psn  */
uct_ud_ep_is_connected(uct_ud_ep_t * ep)406 static UCS_F_ALWAYS_INLINE int uct_ud_ep_is_connected(uct_ud_ep_t *ep)
407 {
408     ucs_assert((ep->dest_ep_id == UCT_UD_EP_NULL_ID) ==
409                !(ep->flags & UCT_UD_EP_FLAG_CONNECTED));
410     return ep->flags & UCT_UD_EP_FLAG_CONNECTED;
411 }
412 
413 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_is_connected_and_no_pending(uct_ud_ep_t * ep)414 uct_ud_ep_is_connected_and_no_pending(uct_ud_ep_t *ep)
415 {
416     return (ep->flags & (UCT_UD_EP_FLAG_CONNECTED |
417                          UCT_UD_EP_FLAG_HAS_PENDING))
418            == UCT_UD_EP_FLAG_CONNECTED;
419 }
420 
uct_ud_ep_no_window(uct_ud_ep_t * ep)421 static UCS_F_ALWAYS_INLINE int uct_ud_ep_no_window(uct_ud_ep_t *ep)
422 {
423     /* max_psn can be decreased by CA, so check >= */
424     return UCT_UD_PSN_COMPARE(ep->tx.psn, >=, ep->tx.max_psn);
425 }
426 
427 /*
428  * Request ACK once we sent 1/4 of the window or once we got to the window end
429  * or there is a pending ack request operation
430  */
uct_ud_ep_req_ack(uct_ud_ep_t * ep)431 static UCS_F_ALWAYS_INLINE int uct_ud_ep_req_ack(uct_ud_ep_t *ep)
432 {
433     uct_ud_psn_t acked_psn, max_psn, psn;
434 
435     max_psn   = ep->tx.max_psn;
436     acked_psn = ep->tx.acked_psn;
437     psn       = ep->tx.psn;
438 
439     return UCT_UD_PSN_COMPARE(psn, ==, ((acked_psn * 3 + max_psn) >> 2)) ||
440            UCT_UD_PSN_COMPARE(psn + 1, ==, max_psn) ||
441            uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK_REQ);
442 
443 }
444 
445 
446 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_ack_req(uct_ud_ep_t * ep,uct_ud_neth_t * neth)447 uct_ud_neth_ack_req(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
448 {
449     neth->packet_type |= uct_ud_ep_req_ack(ep) << UCT_UD_PACKET_ACK_REQ_SHIFT;
450     uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ);
451 }
452 
453 #endif
454