/*	$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Rui Paulo.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 *      This product includes software developed at the Information
 *      Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $");

#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * TODO:
 *   consider moving the actual implementations into a separate file.
 */

static void tcp_common_congestion_exp(struct tcpcb *, int, int);

static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);

static int  tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);

static int  tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);

static void tcp_congctl_fillnames(void);

extern int tcprexmtthresh;

MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");

/* Name of the currently selected global congestion control algorithm. */
char tcp_congctl_global_name[TCPCC_MAXLEN];

/* Space-separated list of the available congestion control algorithms. */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];

/*
 * List of the registered congestion control algorithms.
 */
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
    TAILQ_HEAD_INITIALIZER(tcp_congctlhd);

static struct tcp_congctlent *tcp_congctl_global;

static kmutex_t tcp_congctl_mtx;

void
tcp_congctl_init(void)
{
	int r __diagused;

	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);

	/* Base algorithms. */
	r = tcp_congctl_register("reno", &tcp_reno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
	KASSERT(r == 0);

	/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif

	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
	KASSERT(r == 0);
}

/*
 * Register a congestion algorithm and select it if we have none.
 */
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *ntcc, *tccp;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (!strcmp(name, tccp->congctl_name)) {
			/* name already registered */
			return EEXIST;
		}

	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);

	/* strlcpy() takes the full buffer size and NUL-terminates. */
	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name));
	ntcc->congctl_ctl = tcc;

	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
	tcp_congctl_fillnames();

	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
		tcp_congctl_select(NULL, name);

	return 0;
}

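/*
 * Usage sketch (illustrative only): a hypothetical third-party algorithm
 * "foo" would be wired in by filling a tcp_congctl with its hooks and
 * registering it.  Neither "foo" nor tcp_foo_ctl exists in this file.
 *
 *	static const struct tcp_congctl tcp_foo_ctl = {
 *		.fast_retransmit = tcp_foo_fast_retransmit,
 *		.slow_retransmit = tcp_foo_slow_retransmit,
 *		.fast_retransmit_newack = tcp_foo_fast_retransmit_newack,
 *		.newack = tcp_foo_newack,
 *		.cong_exp = tcp_foo_congestion_exp,
 *	};
 *
 *	error = tcp_congctl_register("foo", &tcp_foo_ctl);
 *	...
 *	error = tcp_congctl_unregister("foo");
 *
 * Unregistration fails with EBUSY while any tcpcb still references the
 * algorithm or while it is the global default.
 */
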
int
tcp_congctl_unregister(const char *name)
{
	struct tcp_congctlent *tccp, *rtccp;
	unsigned int size;

	rtccp = NULL;
	size = 0;
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!strcmp(name, tccp->congctl_name))
			rtccp = tccp;
		size++;
	}

	if (!rtccp)
		return ENOENT;

	if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
		return EBUSY;

	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
	free(rtccp, M_TCPCONGCTL);
	tcp_congctl_fillnames();

	return 0;
}

/*
 * Select a congestion algorithm by name.
 */
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
	struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
	bool old_found, new_found;

	KASSERT(name);

	old_found = (tp == NULL || tp->t_congctl == NULL);
	old_tccp = NULL;
	new_found = false;
	new_tccp = NULL;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
			old_tccp = tccp;
			old_found = true;
		}

		if (!new_found && !strcmp(name, tccp->congctl_name)) {
			new_tccp = tccp;
			new_found = true;
		}

		if (new_found && old_found) {
			if (tp) {
				mutex_enter(&tcp_congctl_mtx);
				if (old_tccp)
					old_tccp->congctl_refcnt--;
				tp->t_congctl = new_tccp->congctl_ctl;
				new_tccp->congctl_refcnt++;
				mutex_exit(&tcp_congctl_mtx);
			} else {
				tcp_congctl_global = new_tccp;
				strlcpy(tcp_congctl_global_name,
				    new_tccp->congctl_name,
				    sizeof(tcp_congctl_global_name));
			}
			return 0;
		}
	}

	return EINVAL;
}

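/*
 * Usage note: a NULL tp switches the global default (as done from
 * tcp_congctl_init() above); a non-NULL tp moves a single connection to
 * the named algorithm, adjusting the refcounts under tcp_congctl_mtx.
 * tcp_congctl_global_name and tcp_congctl_avail are the strings reported
 * through sysctl, so (assuming the usual NetBSD node names) the default
 * can be switched from userland with:
 *
 *	sysctl -w net.inet.tcp.congctl.selected=cubic
 */
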
void
tcp_congctl_release(struct tcpcb *tp)
{
	struct tcp_congctlent *tccp;

	KASSERT(tp->t_congctl);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (tccp->congctl_ctl == tp->t_congctl) {
			tccp->congctl_refcnt--;
			return;
		}
	}
}

/*
 * Returns the name of a congestion algorithm.
 */
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *tccp;

	KASSERT(tcc);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (tccp->congctl_ctl == tcc)
			return tccp->congctl_name;

	return NULL;
}

static void
tcp_congctl_fillnames(void)
{
	struct tcp_congctlent *tccp;
	const char *delim = " ";

	tcp_congctl_avail[0] = '\0';
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		strlcat(tcp_congctl_avail, tccp->congctl_name,
		    sizeof(tcp_congctl_avail));
		if (TAILQ_NEXT(tccp, congctl_ent))
			strlcat(tcp_congctl_avail, delim,
			    sizeof(tcp_congctl_avail));
	}
}

/* ------------------------------------------------------------------------ */

/*
 * Common stuff
 */

/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Scaling constant C (Draft Rhee Section 4.1): 0.4 */
#define CUBIC_CA 4
#define CUBIC_CB 10

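/*
 * These pairs encode fractions in integer arithmetic: the window is scaled
 * by BETAA/BETAB on congestion (1/2 for [New]Reno, 4/5 for Cubic), and the
 * Cubic scaling constant is CA/CB = 0.4.  Worked example: a 64240-byte
 * window is cut to 32120 bytes under Reno but only to 51392 bytes under
 * Cubic.
 */
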
static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
	u_long win;

	/*
	 * Reduce the congestion window and the slow start threshold.
	 */
	win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
	if (win < 2)
		win = 2;

	tp->snd_ssthresh = win * tp->t_segsz;
	tp->snd_recover = tp->snd_max;
	tp->snd_cwnd = tp->snd_ssthresh;

	/*
	 * When using TCP ECN, notify the peer that
	 * we reduced the cwnd.
	 */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

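/*
 * Worked example: with t_segsz = 1460 and min(snd_wnd, snd_cwnd) = 64240,
 * Reno's 1/2 gives win = 64240 / 2 / 1460 = 22 segments, so ssthresh and
 * cwnd both become 22 * 1460 = 32120 bytes.  The floor of 2 segments
 * preserves the minimum window needed for exponential growth (see the
 * comment in tcp_reno_slow_retransmit below).
 */
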
/* ------------------------------------------------------------------------ */

/*
 * TCP/Reno congestion control.
 */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}

static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}

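/*
 * Worked example for the non-SACK path above: pinning snd_cwnd to one
 * segment around tcp_output() retransmits exactly the segment at th_ack;
 * the window is then inflated to ssthresh plus one segment per dup ack
 * (RFC 2581 fast recovery), e.g. ssthresh = 32120, t_segsz = 1460 and
 * t_dupacks = 3 give cwnd = 32120 + 4380 = 36500 bytes.
 */
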
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size so do congestion avoidance
	 * (set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh).
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_long win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */

	win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

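/*
 * Worked example: after a timeout with min(snd_wnd, snd_cwnd) = 64240 and
 * t_segsz = 1460, ssthresh becomes 22 * 1460 = 32120 while cwnd restarts
 * at one segment; cwnd then roughly doubles per RTT (1460, 2920, 5840,
 * ...) until it crosses ssthresh, after which it grows by about one
 * segment per window.
 */
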
static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * Use 2*SMSS or 1*SMSS for the "L" parameter,
			 * depending on the sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = uimin(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
}

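/*
 * Worked example with t_segsz = 1460 in congestion avoidance: without ABC,
 * each ack adds segsz^2/cw, e.g. 1460*1460/29200 = 73 bytes, so a full
 * window of 20 acks adds about one segment.  With ABC, the same growth
 * happens in one step once t_bytes_acked reaches cw, which also stops
 * "ack division" from inflating the window faster than one segment per
 * window of data.
 */
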
const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * TCP/NewReno congestion control.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery.
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is the same as in Reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}

/*
 * Implement the NewReno response to a new ack, checking for partial acks in
 * fast recovery.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now.  When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * Send one or two segments based on how much
			 * new data was acked.
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			tp->snd_nxt = th->th_ack;
			/*
			 * Set snd_cwnd to one segment beyond the ACK'd
			 * offset; snd_una is not yet updated when we're
			 * called.
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on the fact that
			 * tp->snd_una is not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

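/*
 * Worked example for the non-SACK partial-ack path: with ocwnd = 32120,
 * t_segsz = 1460 and a partial ack covering 2920 new bytes, cwnd is pinned
 * to 1460 + 2920 = 4380 so tcp_output() retransmits just past the hole,
 * then restored to 32120 and deflated by 2920 - 1460 = 1460, leaving
 * cwnd = 30660.
 */
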
static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * If we are still in fast recovery (meaning we are using
	 * NewReno and we have only received partial acks), do not
	 * inflate the window yet.
	 */
	if (tp->t_partialacks < 0)
		tcp_reno_newack(tp, th);
}

const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
 */

/* Cubic prototypes */
static void	tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t	tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t	tcp_cubic_cbrt(uint32_t);
static ulong	tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);

/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
 * Set the congestion timer to now.
 */
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
	    now_timeval.tv_usec / 1000;
}

/*
 * Milliseconds since the last congestion event.
 */
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
	    tp->snd_cubic_ctime;
}

/*
 * Approximate cube root
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	int i, rounds = CBRT_ROUNDS;
	uint64_t x = v / 3;

	/* The iteration does not converge correctly for small numbers. */
	if (v == 0)
		return 0;
	else if (v < 4)
		return 1;

	/*
	 * 2097151 is the largest x for which 2*x^3 still fits in 64 bits.
	 * Above it, avoid overflow at the cost of extra rounds.
	 */
	if (x > 2097151)
		rounds += 10;

	for (i = 0; i < rounds; i++)
		if (rounds == CBRT_ROUNDS)
			x = (v + 2 * x * x * x) / (3 * x * x);
		else
			/* Avoid overflow */
			x = v / (3 * x * x) + 2 * x / 3;

	return (uint32_t)x;
}

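/*
 * The fast form of the loop is Newton's method for x^3 = v written as
 * x' = (v + 2*x^3) / (3*x^2).  E.g. for v = 27 with seed x = v/3 = 9,
 * the integer iterates go 9, 6, 4, 3, 3, ... and settle on 3.  The slow
 * form v/(3*x^2) + 2*x/3 is algebraically the same but never forms the
 * 2*x^3 product, trading precision (hence the 10 extra rounds) for
 * overflow safety.
 */
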
/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - it is unclear why the unit of measure matters */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}

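/*
 * In the draft's notation this evaluates
 *
 *	W(t) = C*(t - K)^3 + Wmax,	K = cbrt(Wmax * beta / C)
 *
 * with C = CUBIC_CA/CUBIC_CB = 0.4 and beta = 1/CUBIC_BETAB = 0.2 (the
 * fraction of the window lost on congestion), so K here reduces to
 * cbrt(Wmax / 2): the time at which the cubic curve climbs back to Wmax.
 */
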
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}

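/*
 * Fast convergence example: when the window maxima are trending downward
 * (snd_cubic_wmax < snd_cubic_wmax_last, i.e. another flow is likely
 * competing for the bottleneck), Wmax is lowered further to
 * wmax/2 + wmax*(4/5)/2 = 0.9 * wmax, releasing bandwidth earlier than
 * simply remembering the last maximum would.
 */
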
static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * Mark WMax.
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* Congestion avoidance, not in fast recovery, and a usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is in slow ticks as a fixed-point value with 5
	     * fraction bits; converting it to ms (1000 ~= 1024) gives
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/* Compute W_tcp(t) */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP-friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* TCP-friendly mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use NewReno */
		tcp_newreno_newack(tp, th);
	}
}

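/*
 * The w_tcp value above follows the draft's TCP-friendly estimate,
 * W_tcp(t) = Wmax*(1-beta) + 3*beta/(2-beta) * t/RTT, whose slope is
 * t/RTT/3 for beta = 0.2; note the draft states that slope in segments
 * per RTT, while here the term is added to a byte count.  While cwnd
 * exceeds the estimate, growth follows the cubic curve at a rate of
 * (W(t+rtt) - cwnd)/cwnd per ack; otherwise cwnd snaps up to w_tcp.
 */
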
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};