1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2007-2008
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * All rights reserved.
9 *
10 * This software was developed at the Centre for Advanced Internet
11 * Architectures, Swinburne University of Technology, by Lawrence Stewart and
12 * James Healy, made possible in part by a grant from the Cisco University
13 * Research Program Fund at Community Foundation Silicon Valley.
14 *
15 * Portions of this software were developed at the Centre for Advanced
16 * Internet Architectures, Swinburne University of Technology, Melbourne,
17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 /*
42 * This software was first released in 2007 by James Healy and Lawrence Stewart
43 * whilst working on the NewTCP research project at Swinburne University of
44 * Technology's Centre for Advanced Internet Architectures, Melbourne,
45 * Australia, which was made possible in part by a grant from the Cisco
46 * University Research Program Fund at Community Foundation Silicon Valley.
47 * More details are available at:
48 * http://caia.swin.edu.au/urp/newtcp/
49 */
50
51 #include <sys/cdefs.h>
52 #include <opt_cc.h>
53 #include <sys/param.h>
54 #include <sys/kernel.h>
55 #include <sys/libkern.h>
56 #include <sys/lock.h>
57 #include <sys/malloc.h>
58 #include <sys/module.h>
59 #include <sys/mutex.h>
60 #include <sys/queue.h>
61 #include <sys/rwlock.h>
62 #include <sys/sbuf.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sysctl.h>
66
67 #include <net/vnet.h>
68
69 #include <netinet/in.h>
70 #include <netinet/in_pcb.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcp_log_buf.h>
75 #include <netinet/tcp_hpts.h>
76 #include <netinet/cc/cc.h>
77 #include <netinet/cc/cc_module.h>
78
/*
 * Have a sane default if no CC_DEFAULT is specified in the kernel config file.
 * The name is compared against each algorithm's name at registration time.
 */
#ifndef CC_DEFAULT
#define CC_DEFAULT "cubic"
#endif

/*
 * HyStart++ tunables.  Each one is exported read/write through the
 * net.inet.tcp.cc.hystartplusplus sysctl node declared later in this file;
 * the values here are the boot-time defaults.  Per the sysctl descriptions,
 * the two RTT thresholds are expressed in microseconds.
 */
uint32_t hystart_minrtt_thresh = 4000;
uint32_t hystart_maxrtt_thresh = 16000;
uint32_t hystart_n_rttsamples = 8;
uint32_t hystart_css_growth_div = 4;
uint32_t hystart_css_rounds = 5;
uint32_t hystart_bblogs = 0;

/* Malloc type for congestion control state allocations. */
MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");

/*
 * List of available cc algorithms on the current system.  New algorithms
 * are appended by cc_register_algo().  The default algorithm is not implied
 * by list position; it is tracked per-vnet in default_cc_ptr below.
 */
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);

/* Protects the cc_list STAILQ. */
struct rwlock cc_list_lock;

/* Per-vnet pointer to the default algorithm; NULL until one registers. */
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;

/*
 * Per-vnet NewReno multiplicative decrease factor, used as a percentage
 * (it is divided by 100 in newreno_cc_cwnd_on_multiplicative_decrease()).
 */
VNET_DEFINE(uint32_t, newreno_beta) = 50;
#define V_newreno_beta VNET(newreno_beta)
/* ECN counterpart of newreno_beta; defined here but not read in this file. */
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
109
110 void
cc_refer(struct cc_algo * algo)111 cc_refer(struct cc_algo *algo)
112 {
113 CC_LIST_LOCK_ASSERT();
114 refcount_acquire(&algo->cc_refcount);
115 }
116
117 void
cc_release(struct cc_algo * algo)118 cc_release(struct cc_algo *algo)
119 {
120 CC_LIST_LOCK_ASSERT();
121 refcount_release(&algo->cc_refcount);
122 }
123
124
125 void
cc_attach(struct tcpcb * tp,struct cc_algo * algo)126 cc_attach(struct tcpcb *tp, struct cc_algo *algo)
127 {
128 /*
129 * Attach the tcpcb to the algorithm.
130 */
131 CC_LIST_RLOCK();
132 CC_ALGO(tp) = algo;
133 cc_refer(algo);
134 CC_LIST_RUNLOCK();
135 }
136
137 void
cc_detach(struct tcpcb * tp)138 cc_detach(struct tcpcb *tp)
139 {
140 struct cc_algo *algo;
141
142 CC_LIST_RLOCK();
143 algo = CC_ALGO(tp);
144 CC_ALGO(tp) = NULL;
145 cc_release(algo);
146 CC_LIST_RUNLOCK();
147 }
148
149 /*
150 * Sysctl handler to show and change the default CC algorithm.
151 */
152 static int
cc_default_algo(SYSCTL_HANDLER_ARGS)153 cc_default_algo(SYSCTL_HANDLER_ARGS)
154 {
155 char default_cc[TCP_CA_NAME_MAX];
156 struct cc_algo *funcs;
157 int error;
158
159 /* Get the current default: */
160 CC_LIST_RLOCK();
161 if (CC_DEFAULT_ALGO() != NULL)
162 strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
163 else
164 memset(default_cc, 0, TCP_CA_NAME_MAX);
165 CC_LIST_RUNLOCK();
166
167 error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
168
169 /* Check for error or no change */
170 if (error != 0 || req->newptr == NULL)
171 goto done;
172
173 error = ESRCH;
174 /* Find algo with specified name and set it to default. */
175 CC_LIST_RLOCK();
176 STAILQ_FOREACH(funcs, &cc_list, entries) {
177 if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
178 continue;
179 if (funcs->flags & CC_MODULE_BEING_REMOVED) {
180 /* Its being removed, its not eligible */
181 continue;
182 }
183 V_default_cc_ptr = funcs;
184 error = 0;
185 break;
186 }
187 CC_LIST_RUNLOCK();
188 done:
189 return (error);
190 }
191
192 /*
193 * Sysctl handler to display the list of available CC algorithms.
194 */
195 static int
cc_list_available(SYSCTL_HANDLER_ARGS)196 cc_list_available(SYSCTL_HANDLER_ARGS)
197 {
198 struct cc_algo *algo;
199 int error, nalgos;
200 int linesz;
201 char *buffer, *cp;
202 size_t bufsz, outsz;
203
204 error = nalgos = 0;
205 CC_LIST_RLOCK();
206 STAILQ_FOREACH(algo, &cc_list, entries) {
207 nalgos++;
208 }
209 CC_LIST_RUNLOCK();
210 if (nalgos == 0) {
211 return (ENOENT);
212 }
213 bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
214 buffer = malloc(bufsz, M_TEMP, M_WAITOK);
215 cp = buffer;
216
217 linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
218 "PCB count");
219 cp += linesz;
220 bufsz -= linesz;
221 outsz = linesz;
222 CC_LIST_RLOCK();
223 STAILQ_FOREACH(algo, &cc_list, entries) {
224 linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
225 algo->name,
226 (algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
227 algo->cc_refcount);
228 if (linesz >= bufsz) {
229 error = EOVERFLOW;
230 break;
231 }
232 cp += linesz;
233 bufsz -= linesz;
234 outsz += linesz;
235 }
236 CC_LIST_RUNLOCK();
237 if (error == 0)
238 error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
239 free(buffer, M_TEMP);
240 return (error);
241 }
242
243 /*
244 * Return the number of times a proposed removal_cc is
245 * being used as the default.
246 */
247 static int
cc_check_default(struct cc_algo * remove_cc)248 cc_check_default(struct cc_algo *remove_cc)
249 {
250 int cnt = 0;
251 VNET_ITERATOR_DECL(vnet_iter);
252
253 CC_LIST_LOCK_ASSERT();
254
255 VNET_LIST_RLOCK_NOSLEEP();
256 VNET_FOREACH(vnet_iter) {
257 CURVNET_SET(vnet_iter);
258 if ((CC_DEFAULT_ALGO() != NULL) &&
259 strncmp(CC_DEFAULT_ALGO()->name,
260 remove_cc->name,
261 TCP_CA_NAME_MAX) == 0) {
262 cnt++;
263 }
264 CURVNET_RESTORE();
265 }
266 VNET_LIST_RUNLOCK_NOSLEEP();
267 return (cnt);
268 }
269
270 /*
271 * Initialise CC subsystem on system boot.
272 */
273 static void
cc_init(void)274 cc_init(void)
275 {
276 CC_LIST_LOCK_INIT();
277 STAILQ_INIT(&cc_list);
278 }
279
280 /*
281 * Returns non-zero on success, 0 on failure.
282 */
283 static int
cc_deregister_algo_locked(struct cc_algo * remove_cc)284 cc_deregister_algo_locked(struct cc_algo *remove_cc)
285 {
286 struct cc_algo *funcs;
287 int found = 0;
288
289 /* This is unlikely to fail */
290 STAILQ_FOREACH(funcs, &cc_list, entries) {
291 if (funcs == remove_cc)
292 found = 1;
293 }
294 if (found == 0) {
295 /* Nothing to remove? */
296 return (ENOENT);
297 }
298 /* We assert it should have been MOD_QUIESCE'd */
299 KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
300 ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
301 if (cc_check_default(remove_cc)) {
302 return(EBUSY);
303 }
304 if (remove_cc->cc_refcount != 0) {
305 return (EBUSY);
306 }
307 /* Remove algo from cc_list so that new connections can't use it. */
308 STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
309 return (0);
310 }
311
/*
 * Locked wrapper around cc_deregister_algo_locked(): takes the CC list
 * write lock, attempts the removal and releases the lock again.
 *
 * Returns 0 on success, non-zero (an errno value from
 * cc_deregister_algo_locked()) on failure.
 */
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
	int error;

	CC_LIST_WLOCK();
	error = cc_deregister_algo_locked(remove_cc);
	CC_LIST_WUNLOCK();

	return (error);
}
325
326 /*
327 * Returns 0 on success, non-zero on failure.
328 */
329 int
cc_register_algo(struct cc_algo * add_cc)330 cc_register_algo(struct cc_algo *add_cc)
331 {
332 struct cc_algo *funcs;
333 int err;
334
335 err = 0;
336
337 /*
338 * Iterate over list of registered CC algorithms and make sure
339 * we're not trying to add a duplicate.
340 */
341 CC_LIST_WLOCK();
342 STAILQ_FOREACH(funcs, &cc_list, entries) {
343 if (funcs == add_cc ||
344 strncmp(funcs->name, add_cc->name,
345 TCP_CA_NAME_MAX) == 0) {
346 err = EEXIST;
347 break;
348 }
349 }
350 /* Init its reference count */
351 if (err == 0)
352 refcount_init(&add_cc->cc_refcount, 0);
353 /*
354 * The first loaded congestion control module will become
355 * the default until we find the "CC_DEFAULT" defined in
356 * the config (if we do).
357 */
358 if (!err) {
359 STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
360 if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
361 V_default_cc_ptr = add_cc;
362 } else if (V_default_cc_ptr == NULL) {
363 V_default_cc_ptr = add_cc;
364 }
365 }
366 CC_LIST_WUNLOCK();
367
368 return (err);
369 }
370
371 static void
vnet_cc_sysinit(void * arg)372 vnet_cc_sysinit(void *arg)
373 {
374 struct cc_algo *cc;
375
376 if (IS_DEFAULT_VNET(curvnet))
377 return;
378
379 CURVNET_SET(vnet0);
380 cc = V_default_cc_ptr;
381 CURVNET_RESTORE();
382
383 V_default_cc_ptr = cc;
384 }
385 VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
386 vnet_cc_sysinit, NULL);
387
388 /*
389 * Perform any necessary tasks before we exit congestion recovery.
390 */
391 void
newreno_cc_post_recovery(struct cc_var * ccv)392 newreno_cc_post_recovery(struct cc_var *ccv)
393 {
394 int pipe;
395
396 if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
397 /*
398 * Fast recovery will conclude after returning from this
399 * function. Window inflation should have left us with
400 * approximately snd_ssthresh outstanding data. But in case we
401 * would be inclined to send a burst, better to do it via the
402 * slow start mechanism.
403 *
404 * XXXLAS: Find a way to do this without needing curack
405 */
406 if (V_tcp_do_newsack)
407 pipe = tcp_compute_pipe(ccv->tp);
408 else
409 pipe = CCV(ccv, snd_max) - ccv->curack;
410 if (pipe < CCV(ccv, snd_ssthresh))
411 /*
412 * Ensure that cwnd does not collapse to 1 MSS under
413 * adverse conditions. Implements RFC6582
414 */
415 CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
416 CCV(ccv, t_maxseg);
417 else
418 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
419 }
420 }
421
422 void
newreno_cc_after_idle(struct cc_var * ccv)423 newreno_cc_after_idle(struct cc_var *ccv)
424 {
425 uint32_t rw;
426 /*
427 * If we've been idle for more than one retransmit timeout the old
428 * congestion window is no longer current and we have to reduce it to
429 * the restart window before we can transmit again.
430 *
431 * The restart window is the initial window or the last CWND, whichever
432 * is smaller.
433 *
434 * This is done to prevent us from flooding the path with a full CWND at
435 * wirespeed, overloading router and switch buffers along the way.
436 *
437 * See RFC5681 Section 4.1. "Restarting Idle Connections".
438 *
439 * In addition, per RFC2861 Section 2, the ssthresh is set to the
440 * maximum of the former ssthresh or 3/4 of the old cwnd, to
441 * not exit slow-start prematurely.
442 */
443 rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp));
444
445 CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
446 CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
447
448 CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
449 }
450
451 /*
452 * Get a new congestion window size on a multiplicative decrease event.
453 * */
454 u_int
newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var * ccv,uint32_t mss)455 newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss)
456 {
457 uint32_t cwin, factor;
458
459 cwin = CCV(ccv, snd_cwnd);
460 /*
461 * Other TCP congestion controls use newreno_cong_signal(), but
462 * with their own private cc_data. Make sure the cc_data is used
463 * correctly.
464 */
465 factor = V_newreno_beta;
466
467 return max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss;
468 }
469
470 /*
471 * Perform any necessary tasks before we enter congestion recovery.
472 */
473 void
newreno_cc_cong_signal(struct cc_var * ccv,ccsignal_t type)474 newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type)
475 {
476 uint32_t cwin, mss, pipe;
477
478 mss = tcp_fixed_maxseg(ccv->tp);
479
480 /* Catch algos which mistakenly leak private signal types. */
481 KASSERT((type & CC_SIGPRIVMASK) == 0,
482 ("%s: congestion signal type 0x%08x is private\n", __func__, type));
483
484 cwin = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss);
485
486 switch (type) {
487 case CC_NDUPACK:
488 if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
489 if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
490 CCV(ccv, snd_ssthresh) = cwin;
491 ENTER_RECOVERY(CCV(ccv, t_flags));
492 }
493 break;
494 case CC_ECN:
495 if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
496 CCV(ccv, snd_ssthresh) = cwin;
497 CCV(ccv, snd_cwnd) = cwin;
498 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
499 }
500 break;
501 case CC_RTO:
502 if (CCV(ccv, t_rxtshift) == 1) {
503 if (V_tcp_do_newsack) {
504 pipe = tcp_compute_pipe(ccv->tp);
505 } else {
506 pipe = CCV(ccv, snd_max) -
507 CCV(ccv, snd_fack) +
508 CCV(ccv, sackhint.sack_bytes_rexmit);
509 }
510 CCV(ccv, snd_ssthresh) = max(2,
511 min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss;
512 }
513 CCV(ccv, snd_cwnd) = mss;
514 break;
515 default:
516 break;
517 }
518 }
519
520 u_int
newreno_cc_cwnd_in_cong_avoid(struct cc_var * ccv)521 newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv)
522 {
523 u_int cw = CCV(ccv, snd_cwnd);
524 u_int incr = CCV(ccv, t_maxseg);
525
526 KASSERT(cw > CCV(ccv, snd_ssthresh),
527 ("congestion control state not in congestion avoidance\n"));
528
529 /*
530 * Regular in-order ACK, open the congestion window.
531 * The congestion control state we're in is congestion avoidance.
532 *
533 * Check if ABC (RFC 3465) is enabled.
534 * cong avoid: cwnd > ssthresh
535 *
536 * cong avoid and ABC (RFC 3465):
537 * Grow cwnd linearly by maxseg per RTT for each
538 * cwnd worth of ACKed data.
539 *
540 * cong avoid without ABC (RFC 5681):
541 * Grow cwnd linearly by approximately maxseg per RTT using
542 * maxseg^2 / cwnd per ACK as the increment.
543 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
544 * avoid capping cwnd.
545 */
546 if (V_tcp_do_rfc3465) {
547 if (ccv->flags & CCF_ABC_SENTAWND)
548 ccv->flags &= ~CCF_ABC_SENTAWND;
549 else
550 incr = 0;
551 } else
552 incr = max((incr * incr / cw), 1);
553 /* ABC is on by default, so incr equals 0 frequently. */
554 if (incr > 0)
555 return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale));
556 else
557 return cw;
558 }
559
560 u_int
newreno_cc_cwnd_in_slow_start(struct cc_var * ccv)561 newreno_cc_cwnd_in_slow_start(struct cc_var *ccv)
562 {
563 u_int cw = CCV(ccv, snd_cwnd);
564 u_int incr = CCV(ccv, t_maxseg);
565
566 KASSERT(cw <= CCV(ccv, snd_ssthresh),
567 ("congestion control state not in slow start\n"));
568
569 /*
570 * Regular in-order ACK, open the congestion window.
571 * The congestion control state we're in is slow start.
572 *
573 * slow start: cwnd <= ssthresh
574 *
575 * slow start and ABC (RFC 3465):
576 * Grow cwnd exponentially by the amount of data
577 * ACKed capping the max increment per ACK to
578 * (abc_l_var * maxseg) bytes.
579 *
580 * slow start without ABC (RFC 5681):
581 * Grow cwnd exponentially by maxseg per ACK.
582 */
583 if (V_tcp_do_rfc3465) {
584 /*
585 * In slow-start with ABC enabled and no RTO in sight?
586 * (Must not use abc_l_var > 1 if slow starting after
587 * an RTO. On RTO, snd_nxt = snd_una, so the
588 * snd_nxt == snd_max check is sufficient to
589 * handle this).
590 *
591 * XXXLAS: Find a way to signal SS after RTO that
592 * doesn't rely on tcpcb vars.
593 */
594 uint16_t abc_val;
595
596 if (ccv->flags & CCF_USE_LOCAL_ABC)
597 abc_val = ccv->labc;
598 else
599 abc_val = V_tcp_abc_l_var;
600 if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
601 incr = min(ccv->bytes_this_ack,
602 ccv->nsegs * abc_val * CCV(ccv, t_maxseg));
603 else
604 incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
605 }
606 /* ABC is on by default, so incr equals 0 frequently. */
607 if (incr > 0)
608 return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale));
609 else
610 return cw;
611 }
612
613 void
newreno_cc_ack_received(struct cc_var * ccv,ccsignal_t type)614 newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type)
615 {
616 if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
617 (ccv->flags & CCF_CWND_LIMITED)) {
618 if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) {
619 CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv);
620 } else {
621 CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv);
622 }
623 }
624 }
625
626 static int
cc_stop_new_assignments(struct cc_algo * algo)627 cc_stop_new_assignments(struct cc_algo *algo)
628 {
629 CC_LIST_WLOCK();
630 if (cc_check_default(algo)) {
631 /* A default cannot be removed */
632 CC_LIST_WUNLOCK();
633 return (EBUSY);
634 }
635 algo->flags |= CC_MODULE_BEING_REMOVED;
636 CC_LIST_WUNLOCK();
637 return (0);
638 }
639
640 /*
641 * Handles kld related events. Returns 0 on success, non-zero on failure.
642 */
643 int
cc_modevent(module_t mod,int event_type,void * data)644 cc_modevent(module_t mod, int event_type, void *data)
645 {
646 struct cc_algo *algo;
647 int err;
648
649 err = 0;
650 algo = (struct cc_algo *)data;
651
652 switch(event_type) {
653 case MOD_LOAD:
654 if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
655 /*
656 * A module must have a cc_data_sz function
657 * even if it has no data it should return 0.
658 */
659 printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
660 err = EINVAL;
661 break;
662 }
663 if (algo->mod_init != NULL)
664 err = algo->mod_init();
665 if (!err)
666 err = cc_register_algo(algo);
667 break;
668
669 case MOD_SHUTDOWN:
670 break;
671 case MOD_QUIESCE:
672 /* Stop any new assigments */
673 err = cc_stop_new_assignments(algo);
674 break;
675 case MOD_UNLOAD:
676 /*
677 * Deregister and remove the module from the list
678 */
679 CC_LIST_WLOCK();
680 /* Even with -f we can't unload if its the default */
681 if (cc_check_default(algo)) {
682 /* A default cannot be removed */
683 CC_LIST_WUNLOCK();
684 return (EBUSY);
685 }
686 /*
687 * If -f was used and users are still attached to
688 * the algorithm things are going to go boom.
689 */
690 err = cc_deregister_algo_locked(algo);
691 CC_LIST_WUNLOCK();
692 if ((err == 0) && (algo->mod_destroy != NULL)) {
693 algo->mod_destroy();
694 }
695 break;
696 default:
697 err = EINVAL;
698 break;
699 }
700
701 return (err);
702 }
703
/* Run cc_init() during boot to set up the algorithm list and its lock. */
SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);

/* Declare sysctl tree and populate it. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "Congestion control related settings");

/* net.inet.tcp.cc.algorithm: per-vnet, read/write name of the default. */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, 0, cc_default_algo, "A",
    "Default congestion control algorithm");

/* net.inet.tcp.cc.available: read-only table of registered algorithms. */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, cc_list_available, "A",
    "List available congestion control algorithms");

/*
 * net.inet.tcp.cc.hystartplusplus.*: read/write knobs backed by the
 * hystart_* globals defined at the top of this file.
 */
SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "New Reno related HyStart++ settings");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh,
    CTLFLAG_RW,
    &hystart_minrtt_thresh, 4000,
    "HyStarts++ minimum RTT thresh used in clamp (in microseconds)");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh,
    CTLFLAG_RW,
    &hystart_maxrtt_thresh, 16000,
    "HyStarts++ maximum RTT thresh used in clamp (in microseconds)");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples,
    CTLFLAG_RW,
    &hystart_n_rttsamples, 8,
    "The number of RTT samples that must be seen to consider HyStart++");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div,
    CTLFLAG_RW,
    &hystart_css_growth_div, 4,
    "The divisor to the growth when in Hystart++ CSS");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds,
    CTLFLAG_RW,
    &hystart_css_rounds, 5,
    "The number of rounds HyStart++ lasts in CSS before falling to CA");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs,
    CTLFLAG_RW,
    &hystart_bblogs, 0,
    "Do we enable HyStart++ Black Box logs to be generated if BB logging is on");

/* net.inet.tcp.cc.abe: per-vnet switch for ABE, off (0) by default. */
VNET_DEFINE(int, cc_do_abe) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(cc_do_abe), 0,
    "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");

/* net.inet.tcp.cc.abe_frlossreduce: per-vnet, off (0) by default. */
VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(cc_abe_frlossreduce), 0,
    "Apply standard beta instead of ABE-beta during ECN-signalled congestion "
    "recovery episodes if loss also needs to be repaired");
764