xref: /freebsd/sys/netinet/tcp_ratelimit.h (revision 1f628be8)
1 /*-
2  *
3  * SPDX-License-Identifier: BSD-3-Clause
4  *
5  * Copyright (c) 2018-2020
6  *	Netflix Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 /**
31  * Author: Randall Stewart <rrs@netflix.com>
32  */
33 #ifndef __tcp_ratelimit_h__
34 #define __tcp_ratelimit_h__
35 
36 struct m_snd_tag;
37 
/*
 * Divisors applied to the bandwidth when sizing a pacing burst.  The
 * inline tcp_get_pacing_burst_size_w_divisor() in this header falls back
 * to RL_DEFAULT_DIVISOR whenever the caller's divisor is below
 * RL_MIN_DIVISOR (which includes zero).
 */
38 #define RL_MIN_DIVISOR 50
39 #define RL_DEFAULT_DIVISOR 1000
40 
41 /* Flags on an individual rate */
42 #define HDWRPACE_INITED 	0x0001
43 #define HDWRPACE_TAGPRESENT	0x0002
44 #define HDWRPACE_IFPDEPARTED	0x0004
/*
 * One hardware pacing rate.  Each entry points back at the rate set
 * (struct tcp_rate_set) it belongs to.
 */
45 struct tcp_hwrate_limit_table {
46 	const struct tcp_rate_set *ptbl;	/* Pointer to parent table */
47 	struct m_snd_tag *tag;	/* Send tag if needed (chelsio) */
48 	long	 rate;		/* Rate we get in Bytes per second (Bps) */
49 	long	 using;		/* How many flows are using this hdwr rate. */
50 	long	 rs_num_enobufs;	/* NOTE(review): presumably an ENOBUFS counter, see tcp_rl_log_enobuf() - confirm */
51 	uint32_t time_between;	/* Time-Gap between packets at this rate */
52 	uint32_t flags;		/* NOTE(review): presumably the HDWRPACE_* bits above - confirm */
53 };
54 
55 /* Rateset flags */
56 #define RS_IS_DEFF      0x0001	/* It's a lagg, do a double lookup */
57 #define RS_IS_INTF      0x0002	/* It's a plain interface */
58 #define RS_NO_PRE       0x0004	/* The interface has set rates */
59 #define RS_INT_TBL      0x0010	/*
60 				 * The table is the internal version
61 				 * which has special setup requirements.
62 				 */
63 #define RS_IS_DEAD      0x0020	/* The RS is on the dead list */
64 #define RS_FUNERAL_SCHD 0x0040  /* Is an epoch call scheduled to bury this rate set? */
65 #define RS_INTF_NO_SUP  0x0100 	/* The interface does not support rate limiting */
66 
/*
 * A set of pacing rates tied to one interface (rs_ifp).  The individual
 * rates live in the rs_rlt array; each of those entries points back here
 * through its ptbl member.
 */
67 struct tcp_rate_set {
68 	struct sysctl_ctx_list sysctl_ctx;
69 	CK_LIST_ENTRY(tcp_rate_set) next;	/* Linkage for a head_tcp_rate_set list */
70 	struct ifnet *rs_ifp;
71 	struct tcp_hwrate_limit_table *rs_rlt;	/* Rate entries; indexed by rs_highest_valid in tcp_hw_highest_rate() */
72 	uint64_t rs_flows_using;
73 	uint64_t rs_flow_limit;
74 	uint32_t rs_if_dunit;
75 	int rs_rate_cnt;	/* NOTE(review): presumably the number of entries in rs_rlt - confirm */
76 	int rs_min_seg;
77 	int rs_highest_valid;	/* Index into rs_rlt read by tcp_hw_highest_rate() */
78 	int rs_lowest_valid;	/* NOTE(review): presumably the matching lowest usable index - confirm */
79 	int rs_disable;
80 	int rs_flags;		/* NOTE(review): presumably the RS_* rateset flags - confirm */
81 	struct epoch_context rs_epoch_ctx;
82 };
83 
/* List head type for a CK list of struct tcp_rate_set. */
84 CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
85 
86 /* Request flags */
87 #define RS_PACING_EXACT_MATCH	0x0001	/* Need an exact match for rate */
88 #define RS_PACING_GT		0x0002	/* Greater than requested */
89 #define RS_PACING_GEQ		0x0004	/* Greater than or equal to */
90 #define RS_PACING_LT		0x0008	/* Less than requested rate */
91 #define RS_PACING_SUB_OK	0x0010	/* If a rate can't be found, get the
92 					 * next best rate (highest or lowest). */
93 #ifdef _KERNEL
/* Fallback value, used only when ETHERNET_SEGMENT_SIZE is not already defined. */
94 #ifndef ETHERNET_SEGMENT_SIZE
95 #define ETHERNET_SEGMENT_SIZE 1514
96 #endif
97 struct tcpcb;
98 
99 #ifdef RATELIMIT
100 #define DETAILED_RATELIMIT_SYSCTL 1	/*
101 					 * Undefine this if you don't want
102 					 * detailed rates to appear in
103 					 * net.inet.tcp.rl.
104 					 * With the definition, each rate
105 					 * shows up in your sysctl tree;
106 					 * this can be big.
107 					 */
108 uint64_t inline
tcp_hw_highest_rate(const struct tcp_hwrate_limit_table * rle)109 tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle)
110 {
111 	return (rle->ptbl->rs_rlt[rle->ptbl->rs_highest_valid].rate);
112 }
113 
/*
 * Prototypes for the RATELIMIT build.  The definitions are not in this
 * header.  The #else branch below supplies static inline stand-ins with
 * the same names and parameter lists for builds without RATELIMIT.
 */
114 uint64_t
115 tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp);
116 
117 const struct tcp_hwrate_limit_table *
118 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
119     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate);
120 
121 const struct tcp_hwrate_limit_table *
122 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
123     struct tcpcb *tp, struct ifnet *ifp,
124     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate);
125 void
126 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
127     struct tcpcb *tp);
128 
129 uint32_t
130 tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
131     const struct tcp_hwrate_limit_table *te, int *err, int divisor);
132 
133 void
134 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte);
135 
136 void
137 tcp_rl_release_ifnet(struct ifnet *ifp);
138 
139 #else
/*
 * Stand-in for kernels built without RATELIMIT.  No pacing rate can be
 * set, so EOPNOTSUPP is stored through error (when the caller passed a
 * non-NULL pointer) and no rate entry is returned.  Every other
 * argument is ignored.
 */
static inline const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *no_entry = NULL;

	if (error != NULL) {
		*error = EOPNOTSUPP;
	}
	return (no_entry);
}
148 
/*
 * Stand-in for kernels built without RATELIMIT.  A pacing rate cannot
 * be changed, so EOPNOTSUPP is stored through error (when non-NULL) and
 * no rate entry is returned.  crte and the remaining arguments are
 * ignored.
 */
static inline const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *no_entry = NULL;

	if (error != NULL) {
		*error = EOPNOTSUPP;
	}
	return (no_entry);
}
158 
/*
 * Stand-in for kernels built without RATELIMIT: there is nothing to
 * release, so both arguments are ignored.
 */
static inline void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp)
{
	/* Intentionally empty. */
}
165 
/*
 * Stand-in for kernels built without RATELIMIT: always reports a
 * highest rate of 0.  rle is not examined.
 */
static inline uint64_t
tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle)
{
	const uint64_t no_rate = 0;

	return (no_rate);
}
171 
/*
 * Stand-in for kernels built without RATELIMIT: always reports a
 * highest rate of 0.  Neither ifp nor inp is examined.
 */
static inline uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	const uint64_t no_rate = 0;

	return (no_rate);
}
177 
178 static inline uint32_t
tcp_get_pacing_burst_size_w_divisor(struct tcpcb * tp,uint64_t bw,uint32_t segsiz,int can_use_1mss,const struct tcp_hwrate_limit_table * te,int * err,int divisor)179 tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
180    const struct tcp_hwrate_limit_table *te, int *err, int divisor)
181 {
182 	/*
183 	 * We use the google formula to calculate the
184 	 * TSO size. I.E.
185 	 * bw < 24Meg
186 	 *   tso = 2mss
187 	 * else
188 	 *   tso = min(bw/(div=1000), 64k)
189 	 *
190 	 * Note for these calculations we ignore the
191 	 * packet overhead (enet hdr, ip hdr and tcp hdr).
192 	 * We only get the google formula when we have
193 	 * divisor = 1000, which is the default for now.
194 	 */
195 	uint64_t bytes;
196 	uint32_t new_tso, min_tso_segs;
197 
198 	/* It can't be zero */
199 	if ((divisor == 0) ||
200 	    (divisor < RL_MIN_DIVISOR)) {
201 		bytes = bw / RL_DEFAULT_DIVISOR;
202 	} else
203 		bytes = bw / divisor;
204 	/* We can't ever send more than 65k in a TSO */
205 	if (bytes > 0xffff) {
206 		bytes = 0xffff;
207 	}
208 	/* Round up */
209 	new_tso = (bytes + segsiz - 1) / segsiz;
210 	if (can_use_1mss)
211 		min_tso_segs = 1;
212 	else
213 		min_tso_segs = 2;
214 	if (new_tso < min_tso_segs)
215 		new_tso = min_tso_segs;
216 	new_tso *= segsiz;
217 	return (new_tso);
218 }
219 
/*
 * Without RATELIMIT there is nothing to record, so this is a no-op and
 * rte is ignored.
 */
static inline void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
	/* Intentionally empty. */
}
225 
/*
 * Without RATELIMIT there is nothing to release for an interface, so
 * this is a no-op and ifp is ignored.
 */
static inline void
tcp_rl_release_ifnet(struct ifnet *ifp)
{
	/* Intentionally empty. */
}
230 #endif
231 
/*
 * Given a bandwidth, a segment size and an optional hardware rate
 * entry, return the ideal number of bytes to burst out at once.
 *
 * can_use_1mss says whether the transport tolerates a one-segment
 * floor; if not, the result bottoms out at two segments (think delayed
 * ack).
 *
 * This is tcp_get_pacing_burst_size_w_divisor() called with a divisor
 * of 0; the inline non-RATELIMIT version in this header treats that as
 * a request for RL_DEFAULT_DIVISOR.
 */
static inline uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
{
	uint32_t burst;

	burst = tcp_get_pacing_burst_size_w_divisor(tp, bw, segsiz,
	    can_use_1mss, te, err, 0);
	return (burst);
}
249 
250 #endif
251 #endif
252