xref: /dragonfly/sys/net/dummynet3/ip_dummynet3.h (revision 0ca59c34)
1 /*
2  * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
3  * Portions Copyright (c) 2000 Akamba Corp.
4  * All rights reserved
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.10.2.9 2003/05/13 09:31:06 maxim Exp $
28  * $DragonFly: src/sys/net/dummynet/ip_dummynet.h,v 1.19 2008/09/20 04:36:51 sephe Exp $
29  */
30 
31 #ifndef _IP_DUMMYNET3_H_
32 #define _IP_DUMMYNET3_H_
33 
34 #ifndef _IP_DUMMYNET_H
35 
36 #define MODULE_DUMMYNET_ID	2
37 #define MODULE_DUMMYNET_NAME	"dummynet"
38 
39 
40 #ifdef _KERNEL
41 //placeholder for kernel
42 #endif
43 
44 enum ipfw_dummynet_opcodes {
45 	O_DUMMYNET_PIPE,
46 	O_DUMMYNET_QUEUE,
47 };
48 
49 /*
50  * We start with a heap, which is used in the scheduler to decide when to
51  * transmit packets etc.
52  *
53  * The key for the heap is used for two different values:
54  *
55  * 1. Timer ticks- max 10K/second, so 32 bits are enough;
56  *
57  * 2. Virtual times.  These increase in steps of len/x, where len is the
58  *	packet length, and x is either the weight of the flow, or the sum
59  *	of all weights.
60  *	If we limit to max 1000 flows and a max weight of 100, then x needs
61  *	17 bits.  The packet size is 16 bits, so we can easily overflow if
62  *	we do not allow errors.
63  *
64  * So we use a key "dn_key" which is 64 bits.
65  *
66  * MY_M is used as a shift count when doing fixed point arithmetic
67  * (a better name would be useful...).
68  */
69 typedef uint64_t	dn_key;	/* sorting key */
70 
71 /*
72  * Number of left shift to obtain a larger precision
73  *
74  * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
75  * virtual time wraps every 15 days.
76  */
77 #define MY_M		16
78 
79 #ifdef _KERNEL
80 
81 /*
82  * A heap entry is made of a key and a pointer to the actual object stored
83  * in the heap.
84  *
85  * The heap is an array of dn_heap_entry entries, dynamically allocated.
86  * Current size is "size", with "elements" actually in use.
87  *
88  * The heap normally supports only ordered insert and extract from the top.
89  * If we want to extract an object from the middle of the heap, we have to
90  * know where the object itself is located in the heap (or we need to scan
91  * the whole array).  To this purpose, an object has a field (int) which
92  * contains the index of the object itself into the heap.  When the object
93  * is moved, the field must also be updated.  The offset of the index in the
94  * object is stored in the 'offset' field in the heap descriptor.  The
95  * assumption is that this offset is non-zero if we want to support extract
96  * from the middle.
97  */
98 struct dn_heap_entry {
99 	dn_key key;	/* sorting key.  Topmost element is smallest one */
100 	void *object;	/* object pointer */
101 };
102 
103 struct dn_heap {
104 	int size;
105 	int elements;
106 	int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
107 	struct dn_heap_entry *p;	/* really an array of "size" entries */
108 };
109 
110 struct dn_flow_id {
111 	uint16_t fid_type;	/* ETHERTYPE_ */
112 	uint16_t pad;
113 	union {
114 		struct {
115 			uint32_t dst_ip;
116 			uint32_t src_ip;
117 			uint16_t dst_port;
118 			uint16_t src_port;
119 			uint8_t proto;
120 			uint8_t flags;
121 		} inet;
122 	} fid_u;
123 #define fid_dst_ip	fid_u.inet.dst_ip
124 #define fid_src_ip	fid_u.inet.src_ip
125 #define fid_dst_port	fid_u.inet.dst_port
126 #define fid_src_port	fid_u.inet.src_port
127 #define fid_proto	fid_u.inet.proto
128 #define fid_flags	fid_u.inet.flags
129 };
130 
131 typedef void	(*ip_dn_unref_priv_t)(void *);
132 struct lwkt_port;
133 
134 /*
135  * struct dn_pkt identifies a packet in the dummynet queue, but is also used
136  * to tag packets passed back to the various destinations (ip_input(),
137  * ip_output() and so on).
138  *
139  * It is a tag (PACKET_TAG_DUMMYNET) associated with the actual mbuf.
140  */
141 struct dn_pkt {
142 	struct mbuf *dn_m;
143 	TAILQ_ENTRY(dn_pkt) dn_next;
144 
145 	void *dn_priv;
146 	ip_dn_unref_priv_t dn_unref_priv;
147 
148 	uint32_t dn_flags;		/* action when packet comes out. */
149 #define DN_FLAGS_IS_PIPE	0x10
150 #define DN_FLAGS_DIR_MASK	0x0f
151 #define DN_TO_IP_OUT		1
152 #define DN_TO_IP_IN		2
153 #define DN_TO_ETH_DEMUX		4
154 #define DN_TO_ETH_OUT		5
155 #define DN_TO_MAX		6
156 
157 	dn_key output_time;		/* when the pkt is due for delivery */
158 	struct ifnet *ifp;		/* interface, for ip_output */
159 	struct sockaddr_in *dn_dst;
160 	struct route ro;		/* route, for ip_output. MUST COPY */
161 	int flags;			/* flags, for ip_output (IPv6 ?) */
162 
163 	u_short pipe_nr;		/* pipe/flow_set number */
164 	u_short pad;
165 
166 	struct dn_flow_id id;	/* flow id */
167 	int cpuid;			/* target cpuid, for assertion */
168 	struct lwkt_port *msgport;	/* target msgport */
169 };
170 TAILQ_HEAD(dn_pkt_queue, dn_pkt);
171 
172 /*
173  * Overall structure of dummynet (with WF2Q+):
174  *
175  * In dummynet, packets are selected with the firewall rules, and passed to
176  * two different objects: PIPE or QUEUE.
177  *
178  * A QUEUE is just a queue with configurable size and queue management policy.
179  * It is also associated with a mask (to discriminate among different flows),
180  * a weight (used to give different shares of the bandwidth to different flows)
181  * and a "pipe", which essentially supplies the transmit clock for all queues
182  * associated with that pipe.
183  *
184  * A PIPE emulates a fixed-bandwidth link, whose bandwidth is configurable.
185  * The "clock" for a pipe comes from an internal timer.  A pipe is also
186  * associated with one (or more, if masks are used) queue, where all packets
187  * for that pipe are stored.
188  *
189  * The bandwidth available on the pipe is shared by the queues associated with
190  * that pipe (only one in case the packet is sent to a PIPE) according to the
191  * WF2Q+ scheduling algorithm and the configured weights.
192  *
193  * In general, incoming packets are stored in the appropriate queue, which is
194  * then placed into one of a few heaps managed by a scheduler to decide when
195  * the packet should be extracted.  The scheduler (a function called dummynet())
196  * is run at every timer tick, and grabs queues from the head of the heaps when
197  * they are ready for processing.
198  *
199  * There are three data structures definining a pipe and associated queues:
200  *
201  *  + dn_pipe, which contains the main configuration parameters related to
202  *	delay and bandwidth;
203  *  + dn_flow_set, which contains WF2Q+ configuration, flow masks, plr and
204  *	RED configuration;
205  *  + dn_flow_queue, which is the per-flow queue (containing the packets)
206  *
207  * Multiple dn_flow_set can be linked to the same pipe, and multiple
208  * dn_flow_queue can be linked to the same dn_flow_set.
209  * All data structures are linked in a linear list which is used for
210  * housekeeping purposes.
211  *
212  * During configuration, we create and initialize the dn_flow_set and dn_pipe
213  * structures (a dn_pipe also contains a dn_flow_set).
214  *
215  * At runtime: packets are sent to the appropriate dn_flow_set (either WFQ
216  * ones, or the one embedded in the dn_pipe for fixed-rate flows), which in
217  * turn dispatches them to the appropriate dn_flow_queue (created dynamically
218  * according to the masks).
219  *
220  * The transmit clock for fixed rate flows (ready_event()) selects the
221  * dn_flow_queue to be used to transmit the next packet. For WF2Q,
222  * wfq_ready_event() extract a pipe which in turn selects the right flow using
223  * a number of heaps defined into the pipe itself.
224  */
225 
226 /*
227  * Per flow queue.  This contains the flow identifier, the queue of packets,
228  * counters, and parameters used to support both RED and WF2Q+.
229  *
230  * A dn_flow_queue is created and initialized whenever a packet for a new
231  * flow arrives.
232  */
233 struct dn_flow_queue {
234 	struct dn_flow_id id;
235 	LIST_ENTRY(dn_flow_queue) q_link;
236 
237 	struct dn_pkt_queue queue;	/* queue of packets */
238 	u_int len;
239 	u_int len_bytes;
240 	u_long numbytes;		/* credit for transmission (dynamic queues) */
241 
242 	uint64_t tot_pkts;		/* statistics counters */
243 	uint64_t tot_bytes;
244 	uint32_t drops;
245 
246 	int hash_slot;		/* debugging/diagnostic */
247 
248 	/* RED parameters */
249 	int avg;			/* average queue length est. (scaled) */
250 	int count;			/* arrivals since last RED drop */
251 	int random;			/* random value (scaled) */
252 	uint32_t q_time;		/* start of queue idle time */
253 
254 	/* WF2Q+ support */
255 	struct dn_flow_set *fs;	/* parent flow set */
256 	int heap_pos;		/* position (index) of struct in heap */
257 	dn_key sched_time;		/* current time when queue enters ready_heap */
258 
259 	dn_key S, F;		/* start time, finish time */
260 	/*
261 	 * Setting F < S means the timestamp is invalid. We only need
262 	 * to test this when the queue is empty.
263 	 */
264 };
265 LIST_HEAD(dn_flowqueue_head, dn_flow_queue);
266 
267 /*
268  * flow_set descriptor.  Contains the "template" parameters for the queue
269  * configuration, and pointers to the hash table of dn_flow_queue's.
270  *
271  * The hash table is an array of lists -- we identify the slot by hashing
272  * the flow-id, then scan the list looking for a match.
273  * The size of the hash table (buckets) is configurable on a per-queue basis.
274  *
275  * A dn_flow_set is created whenever a new queue or pipe is created (in the
276  * latter case, the structure is located inside the struct dn_pipe).
277  */
278 struct dn_flow_set {
279 	u_short fs_nr;		/* flow_set number */
280 	u_short flags_fs;		/* see 'Flow set flags' */
281 
282 	LIST_ENTRY(dn_flow_set) fs_link;
283 
284 	struct dn_pipe *pipe;	/* pointer to parent pipe */
285 	u_short parent_nr;		/* parent pipe#, 0 if local to a pipe */
286 
287 	int weight;			/* WFQ queue weight */
288 	int qsize;			/* queue size in slots or bytes */
289 	int plr;			/* pkt loss rate (2^31-1 means 100%) */
290 
291 	struct dn_flow_id flow_mask;
292 
293 	/* hash table of queues onto this flow_set */
294 	int rq_size;		/* number of slots */
295 	int rq_elements;		/* active elements */
296 	struct dn_flowqueue_head *rq;/* array of rq_size entries */
297 
298 	uint32_t last_expired;	/* do not expire too frequently */
299 	int backlogged;		/* #active queues for this flowset */
300 
301 	/* RED parameters */
302 	int w_q;			/* queue weight (scaled) */
303 	int max_th;			/* maximum threshold for queue (scaled) */
304 	int min_th;			/* minimum threshold for queue (scaled) */
305 	int max_p;			/* maximum value for p_b (scaled) */
306 	u_int c_1;			/* max_p/(max_th-min_th) (scaled) */
307 	u_int c_2;			/* max_p*min_th/(max_th-min_th) (scaled) */
308 	u_int c_3;			/* for GRED, (1-max_p)/max_th (scaled) */
309 	u_int c_4;			/* for GRED, 1 - 2*max_p (scaled) */
310 	u_int *w_q_lookup;		/* lookup table for computing (1-w_q)^t */
311 	u_int lookup_depth;		/* depth of lookup table */
312 	int lookup_step;		/* granularity inside the lookup table */
313 	int lookup_weight;		/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
314 	int avg_pkt_size;		/* medium packet size */
315 	int max_pkt_size;		/* max packet size */
316 };
317 LIST_HEAD(dn_flowset_head, dn_flow_set);
318 
319 /*
320  * Pipe descriptor. Contains global parameters, delay-line queue, and the
321  * flow_set used for fixed-rate queues.
322  *
323  * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
324  *  + not_eligible_heap, for queues whose start time is higher than the
325  *	virtual time. Sorted by start time.
326  *  + scheduler_heap, for queues eligible for scheduling.  Sorted by finish
327  *	time.
328  *  + idle_heap, all flows that are idle and can be removed.  We do that on
329  *	each tick so we do not slow down too much operations during forwarding.
330  */
331 struct dn_pipe {		/* a pipe */
332 	int pipe_nr;		/* number */
333 	int bandwidth;		/* really, bytes/tick. */
334 	int delay;			/* really, ticks */
335 
336 	struct dn_pkt_queue p_queue;/* packets in delay line */
337 	LIST_ENTRY(dn_pipe) p_link;
338 
339 	/* WF2Q+ */
340 	struct dn_heap scheduler_heap; /* top extract - key Finish time*/
341 	struct dn_heap not_eligible_heap; /* top extract- key Start time */
342 	struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
343 
344 	dn_key V;			/* virtual time */
345 	int sum;			/* sum of weights of all active sessions */
346 	int numbytes;		/* bits I can transmit (more or less). */
347 
348 	dn_key sched_time;		/* time pipe was scheduled in ready_heap */
349 
350 	struct dn_flow_set fs;	/* used with fixed-rate flows */
351 };
352 LIST_HEAD(dn_pipe_head, dn_pipe);
353 
354 struct dn_sopt {
355 	int	dn_sopt_name;
356 	void	*dn_sopt_arg;
357 	size_t	dn_sopt_arglen;
358 };
359 
360 typedef int	ip_dn_ctl_t(struct dn_sopt *);
361 typedef int	ip_dn_io_t(struct mbuf *);
362 
363 extern ip_dn_ctl_t	*ip_dn_ctl_ptr;
364 extern ip_dn_io_t	*ip_dn_io_ptr;
365 
366 void	ip_dn_queue(struct mbuf *);
367 void	ip_dn_packet_free(struct dn_pkt *);
368 void	ip_dn_packet_redispatch(struct dn_pkt *);
369 int	ip_dn_sockopt(struct sockopt *);
370 
371 #define	DUMMYNET_LOADED	(ip_dn_io_ptr != NULL)
372 
373 #endif	/* _KERNEL */
374 
375 struct dn_ioc_flowid {
376 	uint16_t type;	/* ETHERTYPE_ */
377 	uint16_t pad;
378 	union {
379 		struct {
380 			uint32_t dst_ip;
381 			uint32_t src_ip;
382 			uint16_t dst_port;
383 			uint16_t src_port;
384 			uint8_t proto;
385 			uint8_t flags;
386 		} ip;
387 		uint8_t pad[64];
388 	} u;
389 };
390 
391 struct dn_ioc_flowqueue {
392 	u_int len;
393 	u_int len_bytes;
394 
395 	uint64_t tot_pkts;
396 	uint64_t tot_bytes;
397 	uint32_t drops;
398 
399 	int hash_slot;		/* debugging/diagnostic */
400 	dn_key S;			/* virtual start time */
401 	dn_key F;			/* virtual finish time */
402 
403 	struct dn_ioc_flowid id;
404 	uint8_t reserved[16];
405 };
406 
407 struct dn_ioc_flowset {
408 	u_short fs_type;		/* DN_IS_{QUEUE,PIPE}, MUST be first */
409 
410 	u_short fs_nr;		/* flow_set number */
411 	u_short flags_fs;		/* see 'Flow set flags' */
412 	u_short parent_nr;		/* parent pipe#, 0 if local to a pipe */
413 
414 	int weight;			/* WFQ queue weight */
415 	int qsize;			/* queue size in slots or bytes */
416 	int plr;			/* pkt loss rate (2^31-1 means 100%) */
417 
418 	/* Hash table information */
419 	int rq_size;		/* number of slots */
420 	int rq_elements;		/* active elements */
421 
422 	/* RED parameters */
423 	int w_q;			/* queue weight (scaled) */
424 	int max_th;			/* maximum threshold for queue (scaled) */
425 	int min_th;			/* minimum threshold for queue (scaled) */
426 	int max_p;			/* maximum value for p_b (scaled) */
427 	int lookup_step;		/* granularity inside the lookup table */
428 	int lookup_weight;		/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
429 
430 	struct dn_ioc_flowid flow_mask;
431 	uint8_t reserved[16];
432 };
433 
434 struct dn_ioc_pipe {
435 	struct dn_ioc_flowset fs;	/* MUST be first */
436 
437 	int pipe_nr;		/* pipe number */
438 	int bandwidth;		/* bit/second */
439 	int delay;			/* milliseconds */
440 
441 	dn_key V;			/* virtual time */
442 
443 	uint8_t reserved[16];
444 };
445 
446 /*
447  * Flow set flags
448  */
449 #define DN_HAVE_FLOW_MASK	0x0001
450 #define DN_IS_RED		0x0002
451 #define DN_IS_GENTLE_RED	0x0004
452 #define DN_QSIZE_IS_BYTES	0x0008	/* queue size is measured in bytes */
453 #define DN_NOERROR		0x0010	/* do not report ENOBUFS on drops */
454 #define DN_IS_PIPE		0x4000
455 #define DN_IS_QUEUE		0x8000
456 
457 /*
458  * Macros for RED
459  */
460 #define SCALE_RED		16
461 #define SCALE(x)		((x) << SCALE_RED)
462 #define SCALE_VAL(x)		((x) >> SCALE_RED)
463 #define SCALE_MUL(x, y)		(((x) * (y)) >> SCALE_RED)
464 
465 /*
466  * Maximum pipe number
467  */
468 #define DN_PIPE_NR_MAX		65536
469 
470 #endif
471 #endif /* !_IP_DUMMYNET_H */
472