1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6
7 #ifndef UCT_UD_EP_H
8 #define UCT_UD_EP_H
9
10 #include "ud_def.h"
11
12 #include <uct/api/uct.h>
13 #include <ucs/datastruct/frag_list.h>
14 #include <ucs/datastruct/queue.h>
15 #include <ucs/datastruct/arbiter.h>
16 #include <ucs/datastruct/sglib.h>
17 #include <ucs/time/timer_wheel.h>
18
/* Endpoint/connection ids are 24-bit values; the all-ones value is reserved
 * as the "null" id, meaning "not assigned / not connected yet" */
#define UCT_UD_EP_NULL_ID     ((1<<24)-1)
#define UCT_UD_EP_ID_MAX      UCT_UD_EP_NULL_ID
#define UCT_UD_EP_CONN_ID_MAX UCT_UD_EP_ID_MAX
22
23 #if UCT_UD_EP_DEBUG_HOOKS
24 /*
25 Hooks that allow packet header inspection and rewriting. UCT user can
26 set functions that will be called just before packet is put on wire
27 and when packet is received. Packet will be discarded if RX function
28 returns status different from UCS_OK.
29
30 Example:
31
32 static ucs_status_t clear_ack_req(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
33 {
34 neth->packet_type &= ~UCT_UD_PACKET_FLAG_ACK_REQ;
35 return UCS_OK;
36 }
37
38 uct_ep_t ep;
39 ....
  // clear ack request bit on all outgoing packets
41 ucs_derived_of(ep, uct_ud_ep_t)->tx.tx_hook = clear_ack_req;
42
43 */
44
/* Hook function type: may inspect and/or rewrite the network header.
 * For RX hooks, returning anything other than UCS_OK drops the packet. */
typedef ucs_status_t (*uct_ud_ep_hook_t)(uct_ud_ep_t *ep, uct_ud_neth_t *neth);

/* Declares a hook field inside the endpoint structure */
#define UCT_UD_EP_HOOK_DECLARE(name) uct_ud_ep_hook_t name;

/* Invoke the RX hook. NOTE: expands to a 'return' from the enclosing
 * (void) function when the hook rejects the packet. */
#define UCT_UD_EP_HOOK_CALL_RX(ep, neth, len) \
    if ((ep)->rx.rx_hook(ep, neth) != UCS_OK) { \
        ucs_trace_data("RX: dropping packet"); \
        return; \
    }

/* Invoke the TX / timer hooks; the return value is ignored */
#define UCT_UD_EP_HOOK_CALL_TX(ep, neth) (ep)->tx.tx_hook(ep, neth);
#define UCT_UD_EP_HOOK_CALL_TIMER(ep)    (ep)->timer_hook(ep, NULL);
57
uct_ud_ep_null_hook(uct_ud_ep_t * ep,uct_ud_neth_t * neth)58 static inline ucs_status_t uct_ud_ep_null_hook(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
59 {
60 return UCS_OK;
61 }
62
/* Install the no-op hooks on an endpoint.
 * Fix: no trailing ';' inside the macro - the caller supplies it (as the
 * empty variant in the #else branch requires), so the original '} while(0);'
 * produced a spurious extra statement and broke brace-less if/else usage. */
#define UCT_UD_EP_HOOK_INIT(ep) \
    do { \
        (ep)->tx.tx_hook = uct_ud_ep_null_hook; \
        (ep)->rx.rx_hook = uct_ud_ep_null_hook; \
        (ep)->timer_hook = uct_ud_ep_null_hook; \
    } while (0)
69
70 #else
71
/* Hooks compiled out: all hook macros expand to nothing */
#define UCT_UD_EP_HOOK_DECLARE(name)
#define UCT_UD_EP_HOOK_CALL_RX(ep, neth, len)
#define UCT_UD_EP_HOOK_CALL_TX(ep, neth)
#define UCT_UD_EP_HOOK_CALL_TIMER(ep)
#define UCT_UD_EP_HOOK_INIT(ep)
77
78 #endif
79
80
81 /**
82 * Slow ep timer
83 * The purpose of the slow timer is to schedule resends and ack replies.
84 * The timer is a wheel timer. Timer wheel sweep is done on every async
85 * progress invocation. One tick usually happens once in 0.1 seconds.
 * It is best to avoid taking the current time in the fast path.
87 *
88 * wheel_time is the time of last timer wheel sweep.
89 * on send:
90 * - try to start wheel timer.
91 * - send_time = wheel_time. That is sending a packet resets retransmission
92 * timeout. This does not allow infinite number of resets because number of
93 * outstanding packets is bound by the TX window size.
94 * on ack recv:
95 * - send_time = wheel_time. (advance last send time)
96 * on timer expiration:
97 * - if wheel_time - saved_time > 3*one_tick_time
98 * schedule resend
99 * send_time = wheel_time
 *      congestion avoidance decreases tx window
101 * - if window is not empty resched timer
102 * 3x is needed to avoid false resends because of errors in timekeeping
103 *
104 * Fast ep timer (Not implemented)
105 *
106 * The purpose of the fast timer is to detect packet loss as early as
107 * possible. The timer is a wheel timer. Fast timer sweep is done on
108 * CQ polling which happens either in explicit polling or in async
109 * progress. As a result fast timer resolution may vary.
110 *
111 * TODO: adaptive CHK algo description
112 *
 * The fast timer is relatively expensive. It is best to disable it if packet loss
114 * is not expected. Usual reasons for packet loss are: slow receiver,
115 * many to one traffic pattern.
116 */
117
118 /* Congestion avoidance and retransmits
119 *
 * UD uses the additive increase/multiplicative decrease algorithm
121 * See https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease
122 *
123 * tx window is increased when ack is received and decreased when
124 * resend is scheduled. Ack must be a 'new' one that is it must
 * acknowledge packets in the window. Increasing window on ack does not cause
126 * exponential window increase because, unlike tcp, only two acks
127 * per window are sent.
128 *
129 * Todo:
130 *
 * Consider triggering window decrease before resend timeout:
 * - on ECN (explicit congestion notification) from receiver. ECN can
133 * be based on some heuristic. For example on number of rx completions
134 * that receiver picked from CQ.
135 * - upon receiving a 'duplicate ack' packet
136 *
137 * Consider using other algorithm (ex BIC/CUBIC)
138 */
139
140 /*
141 * Handling retransmits
142 *
143 * On slow timer timeout schedule a retransmit operation for
144 * [acked_psn+1, psn-1]. These values are saved as 'resend window'
145 *
 * Resend operation will resend no more than the current cwnd
147 * If ack arrives when resend window is active it means that
148 * - something new in the resend window was acked. As a
 *   result a new resend operation will be scheduled.
150 * - either resend window or something beyond it was
 *   acked. It means that no more retransmissions are needed.
152 * Current 'resend window' is deactivated
153 *
154 * When retransmitting, ack is requested if:
155 * psn == acked_psn + 1 or
 *    psn % UCT_UD_RESENDS_PER_ACK == 0
157 */
158
159 /*
160 * Endpoint pending control operations. The operations
161 * are executed in time of progress along with
162 * pending requests added by uct user.
163 */
/* Bit values for uct_ud_ep_pending_op_t::ops */
enum {
    UCT_UD_EP_OP_NONE       = 0,
    UCT_UD_EP_OP_ACK        = UCS_BIT(0),  /* ack received data */
    UCT_UD_EP_OP_ACK_REQ    = UCS_BIT(1),  /* request ack of sent packets */
    UCT_UD_EP_OP_RESEND     = UCS_BIT(2),  /* resend unacked packets */
    UCT_UD_EP_OP_CREP       = UCS_BIT(3),  /* send connection reply */
    UCT_UD_EP_OP_CREQ       = UCS_BIT(4),  /* send connection request */
    UCT_UD_EP_OP_NACK       = UCS_BIT(5),  /* send NACK */
};
173
/* Groupings of the control ops above: by dispatch priority, and the set of
 * ack-related ops */
#define UCT_UD_EP_OP_CTL_LOW_PRIO (UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_ACK)
#define UCT_UD_EP_OP_CTL_HI_PRIO  (UCT_UD_EP_OP_CREQ|UCT_UD_EP_OP_CREP|UCT_UD_EP_OP_RESEND)
#define UCT_UD_EP_OP_CTL_ACK      (UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ|UCT_UD_EP_OP_NACK)
177
/* Per-endpoint pending state: an arbiter group for user pending requests
 * plus a bitmask of internal control operations awaiting transmission */
typedef struct uct_ud_ep_pending_op {
    ucs_arbiter_group_t   group;
    uint32_t              ops;   /* bitmask of scheduled control ops (UCT_UD_EP_OP_*) */
    ucs_arbiter_elem_t    elem;
} uct_ud_ep_pending_op_t;
183
/* Endpoint statistics counters (placeholder - none defined yet) */
enum {
    UCT_UD_EP_STAT_TODO
};
187
/* State and debug flags kept in uct_ud_ep_t::flags.
 * TODO: optimize endpoint memory footprint */
enum {
    UCT_UD_EP_FLAG_DISCONNECTED      = UCS_BIT(0),  /* EP was disconnected */
    UCT_UD_EP_FLAG_PRIVATE           = UCS_BIT(1),  /* EP was created as internal */
    UCT_UD_EP_FLAG_HAS_PENDING       = UCS_BIT(2),  /* EP has some pending requests */
    UCT_UD_EP_FLAG_CONNECTED         = UCS_BIT(3),  /* EP was connected to the peer */

    /* debug flags */
    UCT_UD_EP_FLAG_CREQ_RCVD         = UCS_BIT(4),  /* CREQ message was received */
    UCT_UD_EP_FLAG_CREP_RCVD         = UCS_BIT(5),  /* CREP message was received */
    UCT_UD_EP_FLAG_CREQ_SENT         = UCS_BIT(6),  /* CREQ message was sent */
    UCT_UD_EP_FLAG_CREP_SENT         = UCS_BIT(7),  /* CREP message was sent */
    UCT_UD_EP_FLAG_CREQ_NOTSENT      = UCS_BIT(8),  /* CREQ message is NOT sent, because
                                                       connection establishment process
                                                       is driven by remote side. */
    UCT_UD_EP_FLAG_TX_NACKED         = UCS_BIT(9),  /* Last psn was acked with NAK */

    /* Endpoint is currently executing the pending queue */
#if UCS_ENABLE_ASSERT
    UCT_UD_EP_FLAG_IN_PENDING        = UCS_BIT(10)
#else
    UCT_UD_EP_FLAG_IN_PENDING        = 0
#endif
};
212
/* Identity of the remote peer, kept for debugging only
 * (name is presumably the remote host name, truncated - see
 * UCT_UD_EP_PEER_NAME_FMT usage) */
typedef struct uct_ud_peer_name {
    char name[16];  /* peer name string (NUL-terminated) */
    int  pid;       /* peer process id */
} uct_ud_peer_name_t;
217
/*
 * UD endpoint: implements reliable delivery on top of an unreliable
 * datagram QP - a TX window with acks and resends (tx), out-of-order RX
 * reassembly (rx), congestion-avoidance state (ca), and the currently
 * active resend window (resend).
 */
struct uct_ud_ep {
    uct_base_ep_t           super;
    uint32_t                ep_id;        /* local endpoint id */
    uint32_t                dest_ep_id;   /* remote endpoint id
                                             (UCT_UD_EP_NULL_ID while unconnected) */
    struct {
        uct_ud_psn_t        psn;          /* Next PSN to send */
        uct_ud_psn_t        max_psn;      /* Largest PSN that can be sent */
        uct_ud_psn_t        acked_psn;    /* last psn that was acked by remote side */
        uint16_t            resend_count; /* number of in-flight resends on the ep */
        ucs_queue_head_t    window;       /* send window: [acked_psn+1, psn-1] */
        uct_ud_ep_pending_op_t pending;   /* pending control ops and user requests */
        ucs_time_t          send_time;    /* tx time of last packet */
        ucs_time_t          resend_time;  /* tx time of last resent packet */
        ucs_time_t          tick;         /* timeout to trigger timer */
        UCS_STATS_NODE_DECLARE(stats)
        UCT_UD_EP_HOOK_DECLARE(tx_hook)
    } tx;
    struct {
        uct_ud_psn_t        acked_psn;    /* Last psn we acked */
        ucs_frag_list_t     ooo_pkts;     /* Out of order packets that can not be
                                             processed yet; also keeps the last psn
                                             we successfully received and processed */
        UCS_STATS_NODE_DECLARE(stats)
        UCT_UD_EP_HOOK_DECLARE(rx_hook)
    } rx;
    struct {
        uct_ud_psn_t        wmax;         /* max congestion window */
        uct_ud_psn_t        cwnd;         /* current congestion window */
    } ca;
    struct UCS_S_PACKED {
        ucs_queue_iter_t    pos;          /* points to the part of tx window that needs to be resent */
        uct_ud_psn_t        psn;          /* last psn that was retransmitted */
        uct_ud_psn_t        max_psn;      /* max psn that should be retransmitted */
    } resend;
    ucs_list_link_t         cep_list;     /* NOTE(review): list linkage owned by the
                                             iface - confirm exact list in iface code */
    uint32_t                conn_id;      /* connection id. assigned in connect_to_iface() */
    uint16_t                flags;        /* UCT_UD_EP_FLAG_* bits */
    uint8_t                 rx_creq_count; /* TODO: remove when reason for DUP/OOO CREQ is found */
    uint8_t                 path_index;   /* path index used by this ep */
    ucs_wtimer_t            timer;        /* slow (resend/ack) wheel timer */
    ucs_time_t              close_time;   /* timestamp of closure */
    UCS_STATS_NODE_DECLARE(stats)
    UCT_UD_EP_HOOK_DECLARE(timer_hook)
#if ENABLE_DEBUG_DATA
    uct_ud_peer_name_t      peer;         /* remote peer identity, for logs */
#endif
};
264
/* printf format/arguments for logging the peer identity of an ep */
#if ENABLE_DEBUG_DATA
#  define UCT_UD_EP_PEER_NAME_FMT        "%s:%d"
#  define UCT_UD_EP_PEER_NAME_ARG(_ep)   (_ep)->peer.name, (_ep)->peer.pid
#else
#  define UCT_UD_EP_PEER_NAME_FMT        "%s"
#  define UCT_UD_EP_PEER_NAME_ARG(_ep)   "<no debug data>"
#endif
272
273
274 UCS_CLASS_DECLARE(uct_ud_ep_t, uct_ud_iface_t*, const uct_ep_params_t*)
275
/**
 * UD pending request private data, stored in uct_pending_req_t::priv
 * (accessed via uct_ud_pending_req_priv())
 */
typedef struct {
    uct_pending_req_priv_arb_t arb;    /* arbiter element linkage */
    unsigned                   flags;  /* request flags */
} uct_ud_pending_req_priv_t;
283
284
285 static UCS_F_ALWAYS_INLINE uct_ud_pending_req_priv_t *
uct_ud_pending_req_priv(uct_pending_req_t * req)286 uct_ud_pending_req_priv(uct_pending_req_t *req)
287 {
288 return (uct_ud_pending_req_priv_t *)&(req)->priv;
289 }
290
291
/* Complete all packets outstanding in the TX window with the given status;
 * 'is_async' indicates invocation from the async context */
void uct_ud_tx_wnd_purge_outstanding(uct_ud_iface_t *iface, uct_ud_ep_t *ud_ep,
                                     ucs_status_t status, int is_async);

/* uct_ep_flush() implementation for UD endpoints */
ucs_status_t uct_ud_ep_flush(uct_ep_h ep, unsigned flags,
                             uct_completion_t *comp);
/* internal flush variant taking the iface explicitly */
ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep,
                                    uct_completion_t *comp);

/* uct_ep_get_address() implementation */
ucs_status_t uct_ud_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr);

/* uct_ep_connect_to_ep() implementation: connect to a remote ep address */
ucs_status_t uct_ud_ep_connect_to_ep(uct_ud_ep_t *ep,
                                     const uct_ib_address_t *ib_addr,
                                     const uct_ud_ep_addr_t *ep_addr);

/* uct_ep_pending_add() implementation */
ucs_status_t uct_ud_ep_pending_add(uct_ep_h ep, uct_pending_req_t *n,
                                   unsigned flags);

/* uct_ep_pending_purge() implementation */
void uct_ud_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb,
                             void *arg);

void uct_ud_ep_disconnect(uct_ep_h ep);

/* Release completed sends from the TX window */
void uct_ud_ep_window_release_completed(uct_ud_ep_t *ep, int is_async);


/* helper functions to create/destroy a new connected ep */
ucs_status_t uct_ud_ep_create_connected_common(uct_ud_iface_t *iface,
                                               const uct_ib_address_t *ib_addr,
                                               const uct_ud_iface_addr_t *if_addr,
                                               unsigned path_index,
                                               uct_ud_ep_t **new_ep_p,
                                               uct_ud_send_skb_t **skb_p);

void uct_ud_ep_destroy_connected(uct_ud_ep_t *ep,
                                 const uct_ib_address_t *ib_addr,
                                 const uct_ud_iface_addr_t *if_addr);

/* Prepare a CREQ (connection request) skb for this ep */
uct_ud_send_skb_t *uct_ud_ep_prepare_creq(uct_ud_ep_t *ep);

/* Arbiter callback dispatching pending control ops and user requests */
ucs_arbiter_cb_result_t
uct_ud_ep_do_pending(ucs_arbiter_t *arbiter, ucs_arbiter_group_t *group,
                     ucs_arbiter_elem_t *elem, void *arg);

/* Copy endpoint state from old_ep to new_ep */
void uct_ud_ep_clone(uct_ud_ep_t *old_ep, uct_ud_ep_t *new_ep);
337
338 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_set_type_am(uct_ud_ep_t * ep,uct_ud_neth_t * neth,uint8_t id)339 uct_ud_neth_set_type_am(uct_ud_ep_t *ep, uct_ud_neth_t *neth, uint8_t id)
340 {
341 neth->packet_type = (id << UCT_UD_PACKET_AM_ID_SHIFT) |
342 ep->dest_ep_id |
343 UCT_UD_PACKET_FLAG_AM;
344 }
345
346 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_set_type_put(uct_ud_ep_t * ep,uct_ud_neth_t * neth)347 uct_ud_neth_set_type_put(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
348 {
349 neth->packet_type = ep->dest_ep_id | UCT_UD_PACKET_FLAG_PUT;
350 }
351
/* Process one received UD packet of 'byte_len' bytes; 'is_async' indicates
 * invocation from the async context */
void uct_ud_ep_process_rx(uct_ud_iface_t *iface,
                          uct_ud_neth_t *neth, unsigned byte_len,
                          uct_ud_recv_skb_t *skb, int is_async);
355
356
357 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_init_data(uct_ud_ep_t * ep,uct_ud_neth_t * neth)358 uct_ud_neth_init_data(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
359 {
360 neth->psn = ep->tx.psn;
361 neth->ack_psn = ep->rx.acked_psn = ucs_frag_list_sn(&ep->rx.ooo_pkts);
362 }
363
/* sglib comparator keyed by conn_id; conn_id <= UCT_UD_EP_CONN_ID_MAX
 * (24 bits), so the difference is representable */
static inline int uct_ud_ep_compare(uct_ud_ep_t *a, uct_ud_ep_t *b)
{
    return (int)(a->conn_id - b->conn_id);
}
368
/* sglib hash function: bucket index derived from conn_id */
static inline int uct_ud_ep_hash(uct_ud_ep_t *ep)
{
    return (int)(ep->conn_id % UCT_UD_HASH_SIZE);
}
373
/* sglib-generated prototypes for the conn_id -> endpoint hash container */
SGLIB_DEFINE_LIST_PROTOTYPES(uct_ud_ep_t, uct_ud_ep_compare, next)
SGLIB_DEFINE_HASHED_CONTAINER_PROTOTYPES(uct_ud_ep_t, UCT_UD_HASH_SIZE, uct_ud_ep_hash)
376
377
378 static UCS_F_ALWAYS_INLINE void
379 uct_ud_ep_ctl_op_del(uct_ud_ep_t *ep, uint32_t ops)
380 {
381 ep->tx.pending.ops &= ~ops;
382 }
383
384 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_ctl_op_check(uct_ud_ep_t * ep,uint32_t op)385 uct_ud_ep_ctl_op_check(uct_ud_ep_t *ep, uint32_t op)
386 {
387 return ep->tx.pending.ops & op;
388 }
389
390 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_ctl_op_isany(uct_ud_ep_t * ep)391 uct_ud_ep_ctl_op_isany(uct_ud_ep_t *ep)
392 {
393 return ep->tx.pending.ops;
394 }
395
396 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_ctl_op_check_ex(uct_ud_ep_t * ep,uint32_t ops)397 uct_ud_ep_ctl_op_check_ex(uct_ud_ep_t *ep, uint32_t ops)
398 {
399 /* check that at least one the given ops is set and
400 * all ops not given are not set */
401 return (ep->tx.pending.ops & ops) &&
402 ((ep->tx.pending.ops & ~ops) == 0);
403 }
404
405 /* TODO: rely on window check instead. max_psn = psn */
uct_ud_ep_is_connected(uct_ud_ep_t * ep)406 static UCS_F_ALWAYS_INLINE int uct_ud_ep_is_connected(uct_ud_ep_t *ep)
407 {
408 ucs_assert((ep->dest_ep_id == UCT_UD_EP_NULL_ID) ==
409 !(ep->flags & UCT_UD_EP_FLAG_CONNECTED));
410 return ep->flags & UCT_UD_EP_FLAG_CONNECTED;
411 }
412
413 static UCS_F_ALWAYS_INLINE int
uct_ud_ep_is_connected_and_no_pending(uct_ud_ep_t * ep)414 uct_ud_ep_is_connected_and_no_pending(uct_ud_ep_t *ep)
415 {
416 return (ep->flags & (UCT_UD_EP_FLAG_CONNECTED |
417 UCT_UD_EP_FLAG_HAS_PENDING))
418 == UCT_UD_EP_FLAG_CONNECTED;
419 }
420
uct_ud_ep_no_window(uct_ud_ep_t * ep)421 static UCS_F_ALWAYS_INLINE int uct_ud_ep_no_window(uct_ud_ep_t *ep)
422 {
423 /* max_psn can be decreased by CA, so check >= */
424 return UCT_UD_PSN_COMPARE(ep->tx.psn, >=, ep->tx.max_psn);
425 }
426
427 /*
428 * Request ACK once we sent 1/4 of the window or once we got to the window end
429 * or there is a pending ack request operation
430 */
uct_ud_ep_req_ack(uct_ud_ep_t * ep)431 static UCS_F_ALWAYS_INLINE int uct_ud_ep_req_ack(uct_ud_ep_t *ep)
432 {
433 uct_ud_psn_t acked_psn, max_psn, psn;
434
435 max_psn = ep->tx.max_psn;
436 acked_psn = ep->tx.acked_psn;
437 psn = ep->tx.psn;
438
439 return UCT_UD_PSN_COMPARE(psn, ==, ((acked_psn * 3 + max_psn) >> 2)) ||
440 UCT_UD_PSN_COMPARE(psn + 1, ==, max_psn) ||
441 uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK_REQ);
442
443 }
444
445
446 static UCS_F_ALWAYS_INLINE void
uct_ud_neth_ack_req(uct_ud_ep_t * ep,uct_ud_neth_t * neth)447 uct_ud_neth_ack_req(uct_ud_ep_t *ep, uct_ud_neth_t *neth)
448 {
449 neth->packet_type |= uct_ud_ep_req_ack(ep) << UCT_UD_PACKET_ACK_REQ_SHIFT;
450 uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK|UCT_UD_EP_OP_ACK_REQ);
451 }
452
453 #endif
454