1 /*	$OpenBSD: optimize.c,v 1.23 2024/04/08 02:51:14 jsg Exp $	*/
2 
3 /*
4  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that: (1) source code distributions
9  * retain the above copyright notice and this paragraph in its entirety, (2)
10  * distributions including binary code include the above copyright notice and
11  * this paragraph in its entirety in the documentation or other materials
12  * provided with the distribution, and (3) all advertising materials mentioning
13  * features or use of this software display the following acknowledgement:
14  * ``This product includes software developed by the University of California,
15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
16  * the University nor the names of its contributors may be used to endorse
17  * or promote products derived from this software without specific prior
18  * written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  *
23  *  Optimization module for tcpdump intermediate representation.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/time.h>
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <stdint.h>
32 #include <string.h>
33 
34 #include "pcap-int.h"
35 
36 #include "gencode.h"
37 
38 #ifdef HAVE_OS_PROTO_H
39 #include "os-proto.h"
40 #endif
41 
42 #ifdef BDEBUG
43 extern int dflag;
44 #endif
45 
46 #define A_ATOM BPF_MEMWORDS
47 #define X_ATOM (BPF_MEMWORDS+1)
48 
49 #define NOP -1
50 
51 /*
52  * This define is used to represent *both* the accumulator and
53  * x register in use-def computations.
54  * Currently, the use-def code assumes only one definition per instruction.
55  */
56 #define AX_ATOM N_ATOMS
57 
58 /*
59  * A flag to indicate that further optimization is needed.
60  * Iterative passes are continued until a given pass yields no
61  * branch movement.
62  */
63 static int done;
64 
65 /*
66  * A block is marked if and only if its mark equals the current mark.
67  * Rather than traverse the code array, marking each item, 'cur_mark' is
68  * incremented.  This automatically makes each element unmarked.
69  */
70 static int cur_mark;
71 #define isMarked(p) ((p)->mark == cur_mark)
72 #define unMarkAll() cur_mark += 1
73 #define Mark(p) ((p)->mark = cur_mark)
74 
75 static void opt_init(struct block *);
76 static void opt_cleanup(void);
77 
78 static void make_marks(struct block *);
79 static void mark_code(struct block *);
80 
81 static void intern_blocks(struct block *);
82 
83 static int eq_slist(struct slist *, struct slist *);
84 
85 static void find_levels_r(struct block *);
86 
87 static void find_levels(struct block *);
88 static void find_dom(struct block *);
89 static void propedom(struct edge *);
90 static void find_edom(struct block *);
91 static void find_closure(struct block *);
92 static int atomuse(struct stmt *);
93 static int atomdef(struct stmt *);
94 static void compute_local_ud(struct block *);
95 static void find_ud(struct block *);
96 static void init_val(void);
97 static int F(int, int, int);
98 static __inline void vstore(struct stmt *, int *, int, int);
99 static void opt_blk(struct block *, int);
100 static int use_conflict(struct block *, struct block *);
101 static void opt_j(struct edge *);
102 static void or_pullup(struct block *);
103 static void and_pullup(struct block *);
104 static void opt_blks(struct block *, int);
105 static __inline void link_inedge(struct edge *, struct block *);
106 static void find_inedges(struct block *);
107 static void opt_root(struct block **);
108 static void opt_loop(struct block *, int);
109 static void fold_op(struct stmt *, int, int);
110 static __inline struct slist *this_op(struct slist *);
111 static void opt_not(struct block *);
112 static void opt_peep(struct block *);
113 static void opt_stmt(struct stmt *, int[], int);
114 static void deadstmt(struct stmt *, struct stmt *[]);
115 static void opt_deadstores(struct block *);
116 static void opt_blk(struct block *, int);
117 static int use_conflict(struct block *, struct block *);
118 static void opt_j(struct edge *);
119 static struct block *fold_edge(struct block *, struct edge *);
120 static __inline int eq_blk(struct block *, struct block *);
121 static int slength(struct slist *);
122 static int count_blocks(struct block *);
123 static void number_blks_r(struct block *);
124 static int count_stmts(struct block *);
125 static int convert_code_r(struct block *);
126 #ifdef BDEBUG
127 static void opt_dump(struct block *);
128 #endif
129 
130 static int n_blocks;
131 struct block **blocks;
132 static int n_edges;
133 struct edge **edges;
134 
135 /*
136  * A bit vector set representation of the dominators.
137  * We round up the set size to the next power of two.
138  */
139 static int nodewords;
140 static int edgewords;
141 struct block **levels;
142 bpf_u_int32 *space1;
143 bpf_u_int32 *space2;
144 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
145 /*
146  * True if 'a' is in uset 'p'.
147  */
148 #define SET_MEMBER(p, a) \
149 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
150 
151 /*
152  * Add 'a' to uset p.
153  */
154 #define SET_INSERT(p, a) \
155 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
156 
157 /*
158  * Delete 'a' from uset p.
159  */
160 #define SET_DELETE(p, a) \
161 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
162 
163 /*
164  * a := a intersect b
165  */
166 #define SET_INTERSECT(a, b, n)\
167 {\
168 	bpf_u_int32 *_x = a, *_y = b;\
169 	int _n = n;\
170 	while (--_n >= 0) *_x++ &= *_y++;\
171 }
172 
173 /*
174  * a := a - b
175  */
176 #define SET_SUBTRACT(a, b, n)\
177 {\
178 	bpf_u_int32 *_x = a, *_y = b;\
179 	int _n = n;\
180 	while (--_n >= 0) *_x++ &=~ *_y++;\
181 }
182 
183 /*
184  * a := a union b
185  */
186 #define SET_UNION(a, b, n)\
187 {\
188 	bpf_u_int32 *_x = a, *_y = b;\
189 	int _n = n;\
190 	while (--_n >= 0) *_x++ |= *_y++;\
191 }
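
/*
 * Illustrative note (added commentary, not in the original source):
 * a uset is a plain array of bpf_u_int32 words used as a bit vector.
 * With 32-bit words, node id 37 lives in word 37/32 = 1, bit 37%32 = 5,
 * so SET_INSERT(p, 37) sets bit 5 of p[1] and SET_MEMBER(p, 37) tests it.
 * SET_INTERSECT, SET_SUBTRACT and SET_UNION simply combine the two
 * arrays word by word over 'n' words.
 */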
192 
193 static uset all_dom_sets;
194 static uset all_closure_sets;
195 static uset all_edge_sets;
196 
197 #ifndef MAX
198 #define MAX(a,b) ((a)>(b)?(a):(b))
199 #endif
200 
201 static void
202 find_levels_r(struct block *b)
203 {
204 	int level;
205 
206 	if (isMarked(b))
207 		return;
208 
209 	Mark(b);
210 	b->link = 0;
211 
212 	if (JT(b)) {
213 		find_levels_r(JT(b));
214 		find_levels_r(JF(b));
215 		level = MAX(JT(b)->level, JF(b)->level) + 1;
216 	} else
217 		level = 0;
218 	b->level = level;
219 	b->link = levels[level];
220 	levels[level] = b;
221 }
222 
223 /*
224  * Level graph.  The levels go from 0 at the leaves to
225  * N_LEVELS at the root.  The levels[] array points to the
226  * first node of the level list, whose elements are linked
227  * with the 'link' field of the struct block.
228  */
229 static void
230 find_levels(struct block *root)
231 {
232 	memset((char *)levels, 0, n_blocks * sizeof(*levels));
233 	unMarkAll();
234 	find_levels_r(root);
235 }
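
/*
 * Illustrative example (added commentary, not in the original source):
 * for a graph  b0 -> {b1, b2},  b1 -> {ret0, ret1},  b2 -> {ret0, ret1},
 * the return leaves get level 0, b1 and b2 get level 1, and the root b0
 * gets level 2.  levels[2] then points to b0, levels[1] holds b1 and b2
 * (chained through the 'link' field), and levels[0] holds the leaves.
 */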
236 
237 /*
238  * Find dominator relationships.
239  * Assumes graph has been leveled.
240  */
241 static void
242 find_dom(struct block *root)
243 {
244 	int i;
245 	struct block *b;
246 	bpf_u_int32 *x;
247 
248 	/*
249 	 * Initialize sets to contain all nodes.
250 	 */
251 	x = all_dom_sets;
252 	i = n_blocks * nodewords;
253 	while (--i >= 0)
254 		*x++ = ~0;
255 	/* Root starts off empty. */
256 	for (i = nodewords; --i >= 0;)
257 		root->dom[i] = 0;
258 
259 	/* root->level is the highest level number found. */
260 	for (i = root->level; i >= 0; --i) {
261 		for (b = levels[i]; b; b = b->link) {
262 			SET_INSERT(b->dom, b->id);
263 			if (JT(b) == 0)
264 				continue;
265 			SET_INTERSECT(JT(b)->dom, b->dom, nodewords);
266 			SET_INTERSECT(JF(b)->dom, b->dom, nodewords);
267 		}
268 	}
269 }
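
/*
 * Illustrative note (added commentary, not in the original source):
 * because the flow graph is a DAG processed from the root's level down
 * toward the leaves, this is the classic iterative dominator computation:
 * every set starts as "all blocks", and each block intersects its own
 * dominator set (plus itself) into both successors, so a successor ends
 * up with the intersection taken over all of its predecessors.
 */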
270 
271 static void
272 propedom(struct edge *ep)
273 {
274 	SET_INSERT(ep->edom, ep->id);
275 	if (ep->succ) {
276 		SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords);
277 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords);
278 	}
279 }
280 
281 /*
282  * Compute edge dominators.
283  * Assumes graph has been leveled and predecessors established.
284  */
285 static void
286 find_edom(struct block *root)
287 {
288 	int i;
289 	uset x;
290 	struct block *b;
291 
292 	x = all_edge_sets;
293 	for (i = n_edges * edgewords; --i >= 0; )
294 		x[i] = ~0;
295 
296 	/* root->level is the highest level number found. */
297 	memset(root->et.edom, 0, edgewords * sizeof(*(uset)0));
298 	memset(root->ef.edom, 0, edgewords * sizeof(*(uset)0));
299 	for (i = root->level; i >= 0; --i) {
300 		for (b = levels[i]; b != 0; b = b->link) {
301 			propedom(&b->et);
302 			propedom(&b->ef);
303 		}
304 	}
305 }
306 
307 /*
308  * Find the backwards transitive closure of the flow graph.  These sets
309  * are backwards in the sense that we find the set of nodes that reach
310  * a given node, not the set of nodes that can be reached by a node.
311  *
312  * Assumes graph has been leveled.
313  */
314 static void
315 find_closure(struct block *root)
316 {
317 	int i;
318 	struct block *b;
319 
320 	/*
321 	 * Initialize sets to contain no nodes.
322 	 */
323 	memset((char *)all_closure_sets, 0,
324 	      n_blocks * nodewords * sizeof(*all_closure_sets));
325 
326 	/* root->level is the highest level number found. */
327 	for (i = root->level; i >= 0; --i) {
328 		for (b = levels[i]; b; b = b->link) {
329 			SET_INSERT(b->closure, b->id);
330 			if (JT(b) == 0)
331 				continue;
332 			SET_UNION(JT(b)->closure, b->closure, nodewords);
333 			SET_UNION(JF(b)->closure, b->closure, nodewords);
334 		}
335 	}
336 }
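
/*
 * Illustrative note (added commentary, not in the original source):
 * this is the dual of find_dom(): the sets start empty and are combined
 * with SET_UNION instead of SET_INTERSECT, so b->closure ends up holding
 * every block that can reach b (i.e. lies on some root-to-b path),
 * including b itself.
 */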
337 
338 /*
339  * Return the register number that is used by s.  If A and X are both
340  * used, return AX_ATOM.  If no register is used, return -1.
341  *
342  * The implementation should probably change to an array access.
343  */
344 static int
345 atomuse(struct stmt *s)
346 {
347 	int c = s->code;
348 
349 	if (c == NOP)
350 		return -1;
351 
352 	switch (BPF_CLASS(c)) {
353 
354 	case BPF_RET:
355 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
356 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
357 
358 	case BPF_LD:
359 	case BPF_LDX:
360 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
361 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
362 
363 	case BPF_ST:
364 		return A_ATOM;
365 
366 	case BPF_STX:
367 		return X_ATOM;
368 
369 	case BPF_JMP:
370 	case BPF_ALU:
371 		if (BPF_SRC(c) == BPF_X)
372 			return AX_ATOM;
373 		return A_ATOM;
374 
375 	case BPF_MISC:
376 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
377 	}
378 	abort();
379 	/* NOTREACHED */
380 }
381 
382 /*
383  * Return the register number that is defined by 's'.  We assume that
384  * a single stmt cannot define more than one register.  If no register
385  * is defined, return -1.
386  *
387  * The implementation should probably change to an array access.
388  */
389 static int
390 atomdef(struct stmt *s)
391 {
392 	if (s->code == NOP)
393 		return -1;
394 
395 	switch (BPF_CLASS(s->code)) {
396 
397 	case BPF_LD:
398 	case BPF_ALU:
399 		return A_ATOM;
400 
401 	case BPF_LDX:
402 		return X_ATOM;
403 
404 	case BPF_ST:
405 	case BPF_STX:
406 		return s->k;
407 
408 	case BPF_MISC:
409 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
410 	}
411 	return -1;
412 }
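
/*
 * Illustrative examples (added commentary, not in the original source):
 * for "ldh [14]" (BPF_LD|BPF_ABS|BPF_H), atomuse() returns -1 and
 * atomdef() returns A_ATOM; for "st M[3]" (BPF_ST), atomuse() returns
 * A_ATOM and atomdef() returns 3, the scratch memory cell; for "add x"
 * (BPF_ALU|BPF_ADD|BPF_X), atomuse() returns AX_ATOM because both
 * registers are read, while atomdef() returns A_ATOM.
 */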
413 
414 static void
415 compute_local_ud(struct block *b)
416 {
417 	struct slist *s;
418 	atomset def = 0, use = 0, kill = 0;
419 	int atom;
420 
421 	for (s = b->stmts; s; s = s->next) {
422 		if (s->s.code == NOP)
423 			continue;
424 		atom = atomuse(&s->s);
425 		if (atom >= 0) {
426 			if (atom == AX_ATOM) {
427 				if (!ATOMELEM(def, X_ATOM))
428 					use |= ATOMMASK(X_ATOM);
429 				if (!ATOMELEM(def, A_ATOM))
430 					use |= ATOMMASK(A_ATOM);
431 			}
432 			else if (atom < N_ATOMS) {
433 				if (!ATOMELEM(def, atom))
434 					use |= ATOMMASK(atom);
435 			}
436 			else
437 				abort();
438 		}
439 		atom = atomdef(&s->s);
440 		if (atom >= 0) {
441 			if (!ATOMELEM(use, atom))
442 				kill |= ATOMMASK(atom);
443 			def |= ATOMMASK(atom);
444 		}
445 	}
446 	if (!ATOMELEM(def, A_ATOM) && BPF_CLASS(b->s.code) == BPF_JMP)
447 		use |= ATOMMASK(A_ATOM);
448 
449 	b->def = def;
450 	b->kill = kill;
451 	b->in_use = use;
452 }
453 
454 /*
455  * Assume graph is already leveled.
456  */
457 static void
458 find_ud(struct block *root)
459 {
460 	int i, maxlevel;
461 	struct block *p;
462 
463 	/*
464 	 * root->level is the highest level number found;
465 	 * count down from there.
466 	 */
467 	maxlevel = root->level;
468 	for (i = maxlevel; i >= 0; --i)
469 		for (p = levels[i]; p; p = p->link) {
470 			compute_local_ud(p);
471 			p->out_use = 0;
472 		}
473 
474 	for (i = 1; i <= maxlevel; ++i) {
475 		for (p = levels[i]; p; p = p->link) {
476 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
477 			p->in_use |= p->out_use &~ p->kill;
478 		}
479 	}
480 }
481 
482 /*
483  * These data structures are used in a Cocke and Schwartz style
484  * value numbering scheme.  Since the flowgraph is acyclic,
485  * exit values can be propagated from a node's predecessors
486  * provided it is uniquely defined.
487  */
488 struct valnode {
489 	int code;
490 	int v0, v1;
491 	int val;
492 	struct valnode *next;
493 };
494 
495 #define MODULUS 213
496 static struct valnode *hashtbl[MODULUS];
497 static int curval;
498 static int maxval;
499 
500 /* Integer constants mapped with the load immediate opcode. */
501 #define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L)
502 
503 struct vmapinfo {
504 	int is_const;
505 	bpf_int32 const_val;
506 };
507 
508 struct vmapinfo *vmap;
509 struct valnode *vnode_base;
510 struct valnode *next_vnode;
511 
512 static void
513 init_val(void)
514 {
515 	curval = 0;
516 	next_vnode = vnode_base;
517 	memset((char *)vmap, 0, maxval * sizeof(*vmap));
518 	memset((char *)hashtbl, 0, sizeof hashtbl);
519 }
520 
521 /* Because we really don't have an IR, this stuff is a little messy. */
522 static int
523 F(int code, int v0, int v1)
524 {
525 	u_int hash;
526 	int val;
527 	struct valnode *p;
528 
529 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
530 	hash %= MODULUS;
531 
532 	for (p = hashtbl[hash]; p; p = p->next)
533 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
534 			return p->val;
535 
536 	val = ++curval;
537 	if (BPF_MODE(code) == BPF_IMM &&
538 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
539 		vmap[val].const_val = v0;
540 		vmap[val].is_const = 1;
541 	}
542 	p = next_vnode++;
543 	p->val = val;
544 	p->code = code;
545 	p->v0 = v0;
546 	p->v1 = v1;
547 	p->next = hashtbl[hash];
548 	hashtbl[hash] = p;
549 
550 	return val;
551 }
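
/*
 * Illustrative note (added commentary, not in the original source):
 * F() hands out one value number per distinct (opcode, v0, v1) triple,
 * so two occurrences of "ldh [12]" map to the same number even in
 * different blocks; that is what later lets vstore() turn a reload of a
 * value the accumulator already holds into a NOP.  K(i) is the special
 * case of a constant: it records is_const/const_val in vmap[] so that
 * the folding code can recover the actual constant from a value number.
 */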
552 
553 static __inline void
554 vstore(struct stmt *s, int *valp, int newval, int alter)
555 {
556 	if (alter && *valp == newval)
557 		s->code = NOP;
558 	else
559 		*valp = newval;
560 }
561 
562 static void
563 fold_op(struct stmt *s, int v0, int v1)
564 {
565 	bpf_int32 a, b;
566 
567 	a = vmap[v0].const_val;
568 	b = vmap[v1].const_val;
569 
570 	switch (BPF_OP(s->code)) {
571 	case BPF_ADD:
572 		a += b;
573 		break;
574 
575 	case BPF_SUB:
576 		a -= b;
577 		break;
578 
579 	case BPF_MUL:
580 		a *= b;
581 		break;
582 
583 	case BPF_DIV:
584 		if (b == 0)
585 			bpf_error("division by zero");
586 		a /= b;
587 		break;
588 
589 	case BPF_AND:
590 		a &= b;
591 		break;
592 
593 	case BPF_OR:
594 		a |= b;
595 		break;
596 
597 	case BPF_LSH:
598 		a <<= b;
599 		break;
600 
601 	case BPF_RSH:
602 		a >>= b;
603 		break;
604 
605 	case BPF_NEG:
606 		a = -a;
607 		break;
608 
609 	default:
610 		abort();
611 	}
612 	s->k = a;
613 	s->code = BPF_LD|BPF_IMM;
614 	done = 0;
615 }
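
/*
 * Illustrative example (added commentary, not in the original source):
 * if the accumulator's value number says A == 2 and the statement is
 * "add #12" (or "add x" with X known to be 12), fold_op() rewrites the
 * statement in place as "ld #14" (BPF_LD|BPF_IMM with k = 14) and clears
 * 'done' so that another optimization pass runs over the result.
 */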
616 
617 static __inline struct slist *
618 this_op(struct slist *s)
619 {
620 	while (s != 0 && s->s.code == NOP)
621 		s = s->next;
622 	return s;
623 }
624 
625 static void
626 opt_not(struct block *b)
627 {
628 	struct block *tmp = JT(b);
629 
630 	JT(b) = JF(b);
631 	JF(b) = tmp;
632 }
633 
634 static void
635 opt_peep(struct block *b)
636 {
637 	struct slist *s;
638 	struct slist *next, *last;
639 	int val;
640 
641 	s = b->stmts;
642 	if (s == 0)
643 		return;
644 
645 	last = s;
646 	while (1) {
647 		s = this_op(s);
648 		if (s == 0)
649 			break;
650 		next = this_op(s->next);
651 		if (next == 0)
652 			break;
653 		last = next;
654 
655 		/*
656 		 * st  M[k]	-->	st  M[k]
657 		 * ldx M[k]		tax
658 		 */
659 		if (s->s.code == BPF_ST &&
660 		    next->s.code == (BPF_LDX|BPF_MEM) &&
661 		    s->s.k == next->s.k) {
662 			done = 0;
663 			next->s.code = BPF_MISC|BPF_TAX;
664 		}
665 		/*
666 		 * ld  #k	-->	ldx  #k
667 		 * tax			txa
668 		 */
669 		if (s->s.code == (BPF_LD|BPF_IMM) &&
670 		    next->s.code == (BPF_MISC|BPF_TAX)) {
671 			s->s.code = BPF_LDX|BPF_IMM;
672 			next->s.code = BPF_MISC|BPF_TXA;
673 			done = 0;
674 		}
675 		/*
676 		 * This is an ugly special case, but it happens
677 		 * when you say tcp[k] or udp[k] where k is a constant.
678 		 */
679 		if (s->s.code == (BPF_LD|BPF_IMM)) {
680 			struct slist *add, *tax, *ild;
681 
682 			/*
683 			 * Check that X isn't used on exit from this
684 			 * block (which the optimizer might cause).
685 			 * We know the code generator won't generate
686 			 * any local dependencies.
687 			 */
688 			if (ATOMELEM(b->out_use, X_ATOM))
689 				break;
690 
691 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
692 				add = next;
693 			else
694 				add = this_op(next->next);
695 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
696 				break;
697 
698 			tax = this_op(add->next);
699 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
700 				break;
701 
702 			ild = this_op(tax->next);
703 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
704 			    BPF_MODE(ild->s.code) != BPF_IND)
705 				break;
706 			/*
707 			 * XXX We need to check that X is not
708 			 * subsequently used.  We know we can eliminate the
709 			 * accumulator modifications since it is defined
710 			 * by the last stmt of this sequence.
711 			 *
712 			 * We want to turn this sequence:
713 			 *
714 			 * (004) ldi     #0x2		{s}
715 			 * (005) ldxms   [14]		{next}  -- optional
716 			 * (006) addx			{add}
717 			 * (007) tax			{tax}
718 			 * (008) ild     [x+0]		{ild}
719 			 *
720 			 * into this sequence:
721 			 *
722 			 * (004) nop
723 			 * (005) ldxms   [14]
724 			 * (006) nop
725 			 * (007) nop
726 			 * (008) ild     [x+2]
727 			 *
728 			 */
729 			ild->s.k += s->s.k;
730 			s->s.code = NOP;
731 			add->s.code = NOP;
732 			tax->s.code = NOP;
733 			done = 0;
734 		}
735 		s = next;
736 	}
737 	/*
738 	 * If we have a subtract to do a comparison, and the X register
739 	 * is a known constant, we can merge this value into the
740 	 * comparison.
741 	 */
742 	if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X) &&
743 	    !ATOMELEM(b->out_use, A_ATOM)) {
744 		val = b->val[X_ATOM];
745 		if (vmap[val].is_const) {
746 			int op;
747 
748 			b->s.k += vmap[val].const_val;
749 			op = BPF_OP(b->s.code);
750 			if (op == BPF_JGT || op == BPF_JGE) {
751 				struct block *t = JT(b);
752 				JT(b) = JF(b);
753 				JF(b) = t;
754 				b->s.k += 0x80000000;
755 			}
756 			last->s.code = NOP;
757 			done = 0;
758 		} else if (b->s.k == 0) {
759 			/*
760 			 * sub x  ->	nop
761 			 * j  #0	j  x
762 			 */
763 			last->s.code = NOP;
764 			b->s.code = BPF_CLASS(b->s.code) | BPF_OP(b->s.code) |
765 				BPF_X;
766 			done = 0;
767 		}
768 	}
769 	/*
770 	 * Likewise, a constant subtract can be simplified.
771 	 */
772 	else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K) &&
773 		 !ATOMELEM(b->out_use, A_ATOM)) {
774 		int op;
775 
776 		b->s.k += last->s.k;
777 		last->s.code = NOP;
778 		op = BPF_OP(b->s.code);
779 		if (op == BPF_JGT || op == BPF_JGE) {
780 			struct block *t = JT(b);
781 			JT(b) = JF(b);
782 			JF(b) = t;
783 			b->s.k += 0x80000000;
784 		}
785 		done = 0;
786 	}
787 	/*
788 	 * and #k	nop
789 	 * jeq #0  ->	jset #k
790 	 */
791 	if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
792 	    !ATOMELEM(b->out_use, A_ATOM) && b->s.k == 0) {
793 		b->s.k = last->s.k;
794 		b->s.code = BPF_JMP|BPF_K|BPF_JSET;
795 		last->s.code = NOP;
796 		done = 0;
797 		opt_not(b);
798 	}
799 	/*
800 	 * If the accumulator is a known constant, we can compute the
801 	 * comparison result.
802 	 */
803 	val = b->val[A_ATOM];
804 	if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
805 		bpf_int32 v = vmap[val].const_val;
806 		switch (BPF_OP(b->s.code)) {
807 
808 		case BPF_JEQ:
809 			v = v == b->s.k;
810 			break;
811 
812 		case BPF_JGT:
813 			v = (unsigned)v > b->s.k;
814 			break;
815 
816 		case BPF_JGE:
817 			v = (unsigned)v >= b->s.k;
818 			break;
819 
820 		case BPF_JSET:
821 			v &= b->s.k;
822 			break;
823 
824 		default:
825 			abort();
826 		}
827 		if (JF(b) != JT(b))
828 			done = 0;
829 		if (v)
830 			JF(b) = JT(b);
831 		else
832 			JT(b) = JF(b);
833 	}
834 }
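
/*
 * Illustrative note (added commentary, not in the original source):
 * the final step above makes a conditional branch unconditional when the
 * accumulator is a known constant compared against #k: the result of the
 * JEQ/JGT/JGE/JSET test is evaluated here and both branch pointers end up
 * naming the same successor, which later lets opt_j() bypass this block
 * entirely.
 */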
835 
836 /*
837  * Compute the symbolic value of the expression 's', and update
838  * anything it defines in the value table 'val'.  If 'alter' is true,
839  * do various optimizations.  This code would be cleaner if symbolic
840  * evaluation and code transformations weren't folded together.
841  */
842 static void
843 opt_stmt(struct stmt *s, int val[], int alter)
844 {
845 	int op;
846 	int v;
847 
848 	switch (s->code) {
849 
850 	case BPF_LD|BPF_ABS|BPF_W:
851 	case BPF_LD|BPF_ABS|BPF_H:
852 	case BPF_LD|BPF_ABS|BPF_B:
853 		v = F(s->code, s->k, 0L);
854 		vstore(s, &val[A_ATOM], v, alter);
855 		break;
856 
857 	case BPF_LD|BPF_IND|BPF_W:
858 	case BPF_LD|BPF_IND|BPF_H:
859 	case BPF_LD|BPF_IND|BPF_B:
860 		v = val[X_ATOM];
861 		if (alter && vmap[v].is_const) {
862 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
863 			s->k += vmap[v].const_val;
864 			v = F(s->code, s->k, 0L);
865 			done = 0;
866 		}
867 		else
868 			v = F(s->code, s->k, v);
869 		vstore(s, &val[A_ATOM], v, alter);
870 		break;
871 
872 	case BPF_LD|BPF_LEN:
873 	case BPF_LD|BPF_RND:
874 		v = F(s->code, 0L, 0L);
875 		vstore(s, &val[A_ATOM], v, alter);
876 		break;
877 
878 	case BPF_LD|BPF_IMM:
879 		v = K(s->k);
880 		vstore(s, &val[A_ATOM], v, alter);
881 		break;
882 
883 	case BPF_LDX|BPF_IMM:
884 		v = K(s->k);
885 		vstore(s, &val[X_ATOM], v, alter);
886 		break;
887 
888 	case BPF_LDX|BPF_MSH|BPF_B:
889 		v = F(s->code, s->k, 0L);
890 		vstore(s, &val[X_ATOM], v, alter);
891 		break;
892 
893 	case BPF_ALU|BPF_NEG:
894 		if (alter && vmap[val[A_ATOM]].is_const) {
895 			s->code = BPF_LD|BPF_IMM;
896 			s->k = -vmap[val[A_ATOM]].const_val;
897 			val[A_ATOM] = K(s->k);
898 		}
899 		else
900 			val[A_ATOM] = F(s->code, val[A_ATOM], 0L);
901 		break;
902 
903 	case BPF_ALU|BPF_ADD|BPF_K:
904 	case BPF_ALU|BPF_SUB|BPF_K:
905 	case BPF_ALU|BPF_MUL|BPF_K:
906 	case BPF_ALU|BPF_DIV|BPF_K:
907 	case BPF_ALU|BPF_AND|BPF_K:
908 	case BPF_ALU|BPF_OR|BPF_K:
909 	case BPF_ALU|BPF_LSH|BPF_K:
910 	case BPF_ALU|BPF_RSH|BPF_K:
911 		op = BPF_OP(s->code);
912 		if (alter) {
913 			if (s->k == 0) {
914 				if (op == BPF_ADD || op == BPF_SUB ||
915 				    op == BPF_LSH || op == BPF_RSH ||
916 				    op == BPF_OR) {
917 					s->code = NOP;
918 					break;
919 				}
920 				if (op == BPF_MUL || op == BPF_AND) {
921 					s->code = BPF_LD|BPF_IMM;
922 					val[A_ATOM] = K(s->k);
923 					break;
924 				}
925 			}
926 			if (vmap[val[A_ATOM]].is_const) {
927 				fold_op(s, val[A_ATOM], K(s->k));
928 				val[A_ATOM] = K(s->k);
929 				break;
930 			}
931 		}
932 		val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
933 		break;
934 
935 	case BPF_ALU|BPF_ADD|BPF_X:
936 	case BPF_ALU|BPF_SUB|BPF_X:
937 	case BPF_ALU|BPF_MUL|BPF_X:
938 	case BPF_ALU|BPF_DIV|BPF_X:
939 	case BPF_ALU|BPF_AND|BPF_X:
940 	case BPF_ALU|BPF_OR|BPF_X:
941 	case BPF_ALU|BPF_LSH|BPF_X:
942 	case BPF_ALU|BPF_RSH|BPF_X:
943 		op = BPF_OP(s->code);
944 		if (alter && vmap[val[X_ATOM]].is_const) {
945 			if (vmap[val[A_ATOM]].is_const) {
946 				fold_op(s, val[A_ATOM], val[X_ATOM]);
947 				val[A_ATOM] = K(s->k);
948 			}
949 			else {
950 				s->code = BPF_ALU|BPF_K|op;
951 				s->k = vmap[val[X_ATOM]].const_val;
952 				done = 0;
953 				val[A_ATOM] =
954 					F(s->code, val[A_ATOM], K(s->k));
955 			}
956 			break;
957 		}
958 		/*
959 		 * Check if we're doing something to an accumulator
960 		 * that is 0, and simplify.  This may not seem like
961 		 * much of a simplification but it could open up further
962 		 * optimizations.
963 		 * XXX We could also check for mul by 1, and -1, etc.
964 		 */
965 		if (alter && vmap[val[A_ATOM]].is_const
966 		    && vmap[val[A_ATOM]].const_val == 0) {
967 			if (op == BPF_ADD || op == BPF_OR ||
968 			    op == BPF_LSH || op == BPF_RSH || op == BPF_SUB) {
969 				s->code = BPF_MISC|BPF_TXA;
970 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
971 				break;
972 			}
973 			else if (op == BPF_MUL || op == BPF_DIV ||
974 				 op == BPF_AND) {
975 				s->code = BPF_LD|BPF_IMM;
976 				s->k = 0;
977 				vstore(s, &val[A_ATOM], K(s->k), alter);
978 				break;
979 			}
980 			else if (op == BPF_NEG) {
981 				s->code = NOP;
982 				break;
983 			}
984 		}
985 		val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
986 		break;
987 
988 	case BPF_MISC|BPF_TXA:
989 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
990 		break;
991 
992 	case BPF_LD|BPF_MEM:
993 		v = val[s->k];
994 		if (alter && vmap[v].is_const) {
995 			s->code = BPF_LD|BPF_IMM;
996 			s->k = vmap[v].const_val;
997 			done = 0;
998 		}
999 		vstore(s, &val[A_ATOM], v, alter);
1000 		break;
1001 
1002 	case BPF_MISC|BPF_TAX:
1003 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1004 		break;
1005 
1006 	case BPF_LDX|BPF_MEM:
1007 		v = val[s->k];
1008 		if (alter && vmap[v].is_const) {
1009 			s->code = BPF_LDX|BPF_IMM;
1010 			s->k = vmap[v].const_val;
1011 			done = 0;
1012 		}
1013 		vstore(s, &val[X_ATOM], v, alter);
1014 		break;
1015 
1016 	case BPF_ST:
1017 		vstore(s, &val[s->k], val[A_ATOM], alter);
1018 		break;
1019 
1020 	case BPF_STX:
1021 		vstore(s, &val[s->k], val[X_ATOM], alter);
1022 		break;
1023 	}
1024 }
1025 
1026 static void
1027 deadstmt(struct stmt *s, struct stmt *last[])
1028 {
1029 	int atom;
1030 
1031 	atom = atomuse(s);
1032 	if (atom >= 0) {
1033 		if (atom == AX_ATOM) {
1034 			last[X_ATOM] = 0;
1035 			last[A_ATOM] = 0;
1036 		}
1037 		else
1038 			last[atom] = 0;
1039 	}
1040 	atom = atomdef(s);
1041 	if (atom >= 0) {
1042 		if (last[atom]) {
1043 			done = 0;
1044 			last[atom]->code = NOP;
1045 		}
1046 		last[atom] = s;
1047 	}
1048 }
1049 
1050 static void
1051 opt_deadstores(struct block *b)
1052 {
1053 	struct slist *s;
1054 	int atom;
1055 	struct stmt *last[N_ATOMS];
1056 
1057 	memset((char *)last, 0, sizeof last);
1058 
1059 	for (s = b->stmts; s != 0; s = s->next)
1060 		deadstmt(&s->s, last);
1061 	deadstmt(&b->s, last);
1062 
1063 	for (atom = 0; atom < N_ATOMS; ++atom)
1064 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1065 			last[atom]->code = NOP;
1066 			done = 0;
1067 		}
1068 }
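
/*
 * Illustrative example (added commentary, not in the original source):
 * given "st M[1]; ldh [14]; st M[1]", the first store is overwritten
 * before M[1] is ever read, so deadstmt() NOPs it; and if atom 1 is not
 * in b->out_use, the trailing store is dead as well and is NOPed by the
 * loop above.
 */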
1069 
1070 static void
1071 opt_blk(struct block *b, int do_stmts)
1072 {
1073 	struct slist *s;
1074 	struct edge *p;
1075 	int i;
1076 	bpf_int32 aval;
1077 
1078 #if 0
1079 	for (s = b->stmts; s && s->next; s = s->next)
1080 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
1081 			do_stmts = 0;
1082 			break;
1083 		}
1084 #endif
1085 
1086 	/*
1087 	 * Initialize the atom values.
1088 	 * If we have no predecessors, everything is undefined.
1089 	 * Otherwise, we inherit our values from our predecessors.
1090 	 * If any register has an ambiguous value (i.e. control paths are
1091 	 * merging) give it the undefined value of 0.
1092 	 */
1093 	p = b->in_edges;
1094 	if (p == 0)
1095 		memset((char *)b->val, 0, sizeof(b->val));
1096 	else {
1097 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1098 		while ((p = p->next) != NULL) {
1099 			for (i = 0; i < N_ATOMS; ++i)
1100 				if (b->val[i] != p->pred->val[i])
1101 					b->val[i] = 0;
1102 		}
1103 	}
1104 	aval = b->val[A_ATOM];
1105 	for (s = b->stmts; s; s = s->next)
1106 		opt_stmt(&s->s, b->val, do_stmts);
1107 
1108 	/*
1109 	 * This is a special case: if we don't use anything from this
1110 	 * block, and we load the accumulator with a value that is
1111 	 * already there, or if this block is a return,
1112 	 * eliminate all the statements.
1113 	 */
1114 	if (do_stmts &&
1115 	    ((b->out_use == 0 && aval != 0 && b->val[A_ATOM] == aval) ||
1116 	     BPF_CLASS(b->s.code) == BPF_RET)) {
1117 		if (b->stmts != 0) {
1118 			b->stmts = 0;
1119 			done = 0;
1120 		}
1121 	} else {
1122 		opt_peep(b);
1123 		opt_deadstores(b);
1124 	}
1125 	/*
1126 	 * Set up values for branch optimizer.
1127 	 */
1128 	if (BPF_SRC(b->s.code) == BPF_K)
1129 		b->oval = K(b->s.k);
1130 	else
1131 		b->oval = b->val[X_ATOM];
1132 	b->et.code = b->s.code;
1133 	b->ef.code = -b->s.code;
1134 }
1135 
1136 /*
1137  * Return true if any register that is used on exit from 'succ', has
1138  * an exit value that is different from the corresponding exit value
1139  * from 'b'.
1140  */
1141 static int
1142 use_conflict(struct block *b, struct block *succ)
1143 {
1144 	int atom;
1145 	atomset use = succ->out_use;
1146 
1147 	if (use == 0)
1148 		return 0;
1149 
1150 	for (atom = 0; atom < N_ATOMS; ++atom)
1151 		if (ATOMELEM(use, atom))
1152 			if (b->val[atom] != succ->val[atom])
1153 				return 1;
1154 	return 0;
1155 }
1156 
1157 static struct block *
1158 fold_edge(struct block *child, struct edge *ep)
1159 {
1160 	int sense;
1161 	int aval0, aval1, oval0, oval1;
1162 	int code = ep->code;
1163 
1164 	if (code < 0) {
1165 		code = -code;
1166 		sense = 0;
1167 	} else
1168 		sense = 1;
1169 
1170 	if (child->s.code != code)
1171 		return 0;
1172 
1173 	aval0 = child->val[A_ATOM];
1174 	oval0 = child->oval;
1175 	aval1 = ep->pred->val[A_ATOM];
1176 	oval1 = ep->pred->oval;
1177 
1178 	if (aval0 != aval1)
1179 		return 0;
1180 
1181 	if (oval0 == oval1)
1182 		/*
1183 		 * The operands are identical, so the
1184 		 * result is true if a true branch was
1185 		 * taken to get here, otherwise false.
1186 		 */
1187 		return sense ? JT(child) : JF(child);
1188 
1189 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1190 		/*
1191 		 * At this point, we only know the comparison if we
1192 		 * came down the true branch, and it was an equality
1193 		 * comparison with a constant.  We rely on the fact that
1194 		 * distinct constants have distinct value numbers.
1195 		 */
1196 		return JF(child);
1197 
1198 	return 0;
1199 }
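
/*
 * Illustrative note (added commentary, not in the original source):
 * fold_edge() is what lets chained tests such as "A or B or C" avoid
 * re-evaluating a comparison: if a dominating edge already performed the
 * same test on the same value numbers, the outcome along this edge is
 * known, so opt_j() can retarget the edge straight at the grandchild.
 */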
1200 
1201 static void
1202 opt_j(struct edge *ep)
1203 {
1204 	int i, k;
1205 	struct block *target;
1206 
1207 	if (JT(ep->succ) == 0)
1208 		return;
1209 
1210 	if (JT(ep->succ) == JF(ep->succ)) {
1211 		/*
1212 		 * Common branch targets can be eliminated, provided
1213 		 * there is no data dependency.
1214 		 */
1215 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1216 			done = 0;
1217 			ep->succ = JT(ep->succ);
1218 		}
1219 	}
1220 	/*
1221 	 * For each edge dominator that matches the successor of this
1222 	 * edge, promote the edge successor to its grandchild.
1223 	 *
1224 	 * XXX We violate the set abstraction here in favor of a reasonably
1225 	 * efficient loop.
1226 	 */
1227  top:
1228 	for (i = 0; i < edgewords; ++i) {
1229 		bpf_u_int32 x = ep->edom[i];
1230 
1231 		while (x != 0) {
1232 			k = ffs(x) - 1;
1233 			x &=~ (1 << k);
1234 			k += i * BITS_PER_WORD;
1235 
1236 			target = fold_edge(ep->succ, edges[k]);
1237 			/*
1238 			 * Check that there is no data dependency between
1239 			 * nodes that will be violated if we move the edge.
1240 			 */
1241 			if (target != 0 && !use_conflict(ep->pred, target)) {
1242 				done = 0;
1243 				ep->succ = target;
1244 				if (JT(target) != 0)
1245 					/*
1246 					 * Start over unless we hit a leaf.
1247 					 */
1248 					goto top;
1249 				return;
1250 			}
1251 		}
1252 	}
1253 }
1254 
1255 
1256 static void
1257 or_pullup(struct block *b)
1258 {
1259 	int val, at_top;
1260 	struct block *pull;
1261 	struct block **diffp, **samep;
1262 	struct edge *ep;
1263 
1264 	ep = b->in_edges;
1265 	if (ep == 0)
1266 		return;
1267 
1268 	/*
1269 	 * Make sure each predecessor loads the same value.
1270 	 * XXX why?
1271 	 */
1272 	val = ep->pred->val[A_ATOM];
1273 	for (ep = ep->next; ep != 0; ep = ep->next)
1274 		if (val != ep->pred->val[A_ATOM])
1275 			return;
1276 
1277 	if (JT(b->in_edges->pred) == b)
1278 		diffp = &JT(b->in_edges->pred);
1279 	else
1280 		diffp = &JF(b->in_edges->pred);
1281 
1282 	at_top = 1;
1283 	while (1) {
1284 		if (*diffp == 0)
1285 			return;
1286 
1287 		if (JT(*diffp) != JT(b))
1288 			return;
1289 
1290 		if (!SET_MEMBER((*diffp)->dom, b->id))
1291 			return;
1292 
1293 		if ((*diffp)->val[A_ATOM] != val)
1294 			break;
1295 
1296 		diffp = &JF(*diffp);
1297 		at_top = 0;
1298 	}
1299 	samep = &JF(*diffp);
1300 	while (1) {
1301 		if (*samep == 0)
1302 			return;
1303 
1304 		if (JT(*samep) != JT(b))
1305 			return;
1306 
1307 		if (!SET_MEMBER((*samep)->dom, b->id))
1308 			return;
1309 
1310 		if ((*samep)->val[A_ATOM] == val)
1311 			break;
1312 
1313 		/* XXX Need to check that there are no data dependencies
1314 		   between diffp and samep.  Currently, the code generator
1315 		   will not produce such dependencies. */
1316 		samep = &JF(*samep);
1317 	}
1318 #ifdef notdef
1319 	/* XXX This doesn't cover everything. */
1320 	for (i = 0; i < N_ATOMS; ++i)
1321 		if ((*samep)->val[i] != pred->val[i])
1322 			return;
1323 #endif
1324 	/* Pull up the node. */
1325 	pull = *samep;
1326 	*samep = JF(pull);
1327 	JF(pull) = *diffp;
1328 
1329 	/*
1330 	 * At the top of the chain, each predecessor needs to point at the
1331 	 * pulled up node.  Inside the chain, there is only one predecessor
1332 	 * to worry about.
1333 	 */
1334 	if (at_top) {
1335 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1336 			if (JT(ep->pred) == b)
1337 				JT(ep->pred) = pull;
1338 			else
1339 				JF(ep->pred) = pull;
1340 		}
1341 	}
1342 	else
1343 		*diffp = pull;
1344 
1345 	done = 0;
1346 }
1347 
1348 static void
1349 and_pullup(struct block *b)
1350 {
1351 	int val, at_top;
1352 	struct block *pull;
1353 	struct block **diffp, **samep;
1354 	struct edge *ep;
1355 
1356 	ep = b->in_edges;
1357 	if (ep == 0)
1358 		return;
1359 
1360 	/*
1361 	 * Make sure each predecessor loads the same value.
1362 	 */
1363 	val = ep->pred->val[A_ATOM];
1364 	for (ep = ep->next; ep != 0; ep = ep->next)
1365 		if (val != ep->pred->val[A_ATOM])
1366 			return;
1367 
1368 	if (JT(b->in_edges->pred) == b)
1369 		diffp = &JT(b->in_edges->pred);
1370 	else
1371 		diffp = &JF(b->in_edges->pred);
1372 
1373 	at_top = 1;
1374 	while (1) {
1375 		if (*diffp == 0)
1376 			return;
1377 
1378 		if (JF(*diffp) != JF(b))
1379 			return;
1380 
1381 		if (!SET_MEMBER((*diffp)->dom, b->id))
1382 			return;
1383 
1384 		if ((*diffp)->val[A_ATOM] != val)
1385 			break;
1386 
1387 		diffp = &JT(*diffp);
1388 		at_top = 0;
1389 	}
1390 	samep = &JT(*diffp);
1391 	while (1) {
1392 		if (*samep == 0)
1393 			return;
1394 
1395 		if (JF(*samep) != JF(b))
1396 			return;
1397 
1398 		if (!SET_MEMBER((*samep)->dom, b->id))
1399 			return;
1400 
1401 		if ((*samep)->val[A_ATOM] == val)
1402 			break;
1403 
1404 		/* XXX Need to check that there are no data dependencies
1405 		   between diffp and samep.  Currently, the code generator
1406 		   will not produce such dependencies. */
1407 		samep = &JT(*samep);
1408 	}
1409 #ifdef notdef
1410 	/* XXX This doesn't cover everything. */
1411 	for (i = 0; i < N_ATOMS; ++i)
1412 		if ((*samep)->val[i] != pred->val[i])
1413 			return;
1414 #endif
1415 	/* Pull up the node. */
1416 	pull = *samep;
1417 	*samep = JT(pull);
1418 	JT(pull) = *diffp;
1419 
1420 	/*
1421 	 * At the top of the chain, each predecessor needs to point at the
1422 	 * pulled up node.  Inside the chain, there is only one predecessor
1423 	 * to worry about.
1424 	 */
1425 	if (at_top) {
1426 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1427 			if (JT(ep->pred) == b)
1428 				JT(ep->pred) = pull;
1429 			else
1430 				JF(ep->pred) = pull;
1431 		}
1432 	}
1433 	else
1434 		*diffp = pull;
1435 
1436 	done = 0;
1437 }
1438 
1439 static void
1440 opt_blks(struct block *root, int do_stmts)
1441 {
1442 	int i, maxlevel;
1443 	struct block *p;
1444 
1445 	init_val();
1446 	maxlevel = root->level;
1447 	for (i = maxlevel; i >= 0; --i)
1448 		for (p = levels[i]; p; p = p->link)
1449 			opt_blk(p, do_stmts);
1450 
1451 	if (do_stmts)
1452 		/*
1453 		 * No point trying to move branches; it can't possibly
1454 		 * make a difference at this point.
1455 		 */
1456 		return;
1457 
1458 	for (i = 1; i <= maxlevel; ++i) {
1459 		for (p = levels[i]; p; p = p->link) {
1460 			opt_j(&p->et);
1461 			opt_j(&p->ef);
1462 		}
1463 	}
1464 	for (i = 1; i <= maxlevel; ++i) {
1465 		for (p = levels[i]; p; p = p->link) {
1466 			or_pullup(p);
1467 			and_pullup(p);
1468 		}
1469 	}
1470 }
1471 
1472 static __inline void
1473 link_inedge(struct edge *parent, struct block *child)
1474 {
1475 	parent->next = child->in_edges;
1476 	child->in_edges = parent;
1477 }
1478 
1479 static void
1480 find_inedges(struct block *root)
1481 {
1482 	int i;
1483 	struct block *b;
1484 
1485 	for (i = 0; i < n_blocks; ++i)
1486 		blocks[i]->in_edges = 0;
1487 
1488 	/*
1489 	 * Traverse the graph, adding each edge to the predecessor
1490 	 * list of its successors.  Skip the leaves (i.e. level 0).
1491 	 */
1492 	for (i = root->level; i > 0; --i) {
1493 		for (b = levels[i]; b != 0; b = b->link) {
1494 			link_inedge(&b->et, JT(b));
1495 			link_inedge(&b->ef, JF(b));
1496 		}
1497 	}
1498 }
1499 
1500 static void
1501 opt_root(struct block **b)
1502 {
1503 	struct slist *tmp, *s;
1504 
1505 	s = (*b)->stmts;
1506 	(*b)->stmts = 0;
1507 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1508 		*b = JT(*b);
1509 
1510 	tmp = (*b)->stmts;
1511 	if (tmp != 0)
1512 		sappend(s, tmp);
1513 	(*b)->stmts = s;
1514 
1515 	/*
1516 	 * If the root node is a return, then there is no
1517 	 * point executing any statements (since the bpf machine
1518 	 * has no side effects).
1519 	 */
1520 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
1521 		(*b)->stmts = 0;
1522 }
1523 
1524 static void
1525 opt_loop(struct block *root, int do_stmts)
1526 {
1527 
1528 #ifdef BDEBUG
1529 	if (dflag > 1)
1530 		opt_dump(root);
1531 #endif
1532 	do {
1533 		done = 1;
1534 		find_levels(root);
1535 		find_dom(root);
1536 		find_closure(root);
1537 		find_inedges(root);
1538 		find_ud(root);
1539 		find_edom(root);
1540 		opt_blks(root, do_stmts);
1541 #ifdef BDEBUG
1542 		if (dflag > 1)
1543 			opt_dump(root);
1544 #endif
1545 	} while (!done);
1546 }
1547 
1548 /*
1549  * Optimize the filter code in its dag representation.
1550  */
1551 void
1552 bpf_optimize(struct block **rootp)
1553 {
1554 	struct block *root;
1555 
1556 	root = *rootp;
1557 
1558 	opt_init(root);
1559 	opt_loop(root, 0);
1560 	opt_loop(root, 1);
1561 	intern_blocks(root);
1562 	opt_root(rootp);
1563 	opt_cleanup();
1564 }
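
/*
 * Usage sketch (added commentary, not in the original source; the real
 * call site lives in gencode.c and the names below are illustrative).
 * pcap_compile() builds the block DAG with the code generator and then,
 * roughly:
 *
 *	int len;
 *
 *	if (optimize)
 *		bpf_optimize(&root);
 *	fp->bf_insns = icode_to_fcode(root, &len);
 *	fp->bf_len = len;
 *
 * i.e. optimization rewrites the DAG in place through 'rootp', and the
 * flattening to a bpf_insn array is a separate step.
 */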
1565 
1566 static void
1567 make_marks(struct block *p)
1568 {
1569 	if (!isMarked(p)) {
1570 		Mark(p);
1571 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1572 			make_marks(JT(p));
1573 			make_marks(JF(p));
1574 		}
1575 	}
1576 }
1577 
1578 /*
1579  * Mark code array such that isMarked(i) is true
1580  * only for nodes that are alive.
1581  */
1582 static void
1583 mark_code(struct block *p)
1584 {
1585 	cur_mark += 1;
1586 	make_marks(p);
1587 }
1588 
1589 /*
1590  * True iff the two stmt lists load the same value from the packet into
1591  * the accumulator.
1592  */
1593 static int
1594 eq_slist(struct slist *x, struct slist *y)
1595 {
1596 	while (1) {
1597 		while (x && x->s.code == NOP)
1598 			x = x->next;
1599 		while (y && y->s.code == NOP)
1600 			y = y->next;
1601 		if (x == 0)
1602 			return y == 0;
1603 		if (y == 0)
1604 			return x == 0;
1605 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1606 			return 0;
1607 		x = x->next;
1608 		y = y->next;
1609 	}
1610 }
1611 
1612 static __inline int
1613 eq_blk(struct block *b0, struct block *b1)
1614 {
1615 	if (b0->s.code == b1->s.code &&
1616 	    b0->s.k == b1->s.k &&
1617 	    b0->et.succ == b1->et.succ &&
1618 	    b0->ef.succ == b1->ef.succ)
1619 		return eq_slist(b0->stmts, b1->stmts);
1620 	return 0;
1621 }
1622 
1623 static void
1624 intern_blocks(struct block *root)
1625 {
1626 	struct block *p;
1627 	int i, j;
1628 	int done;
1629  top:
1630 	done = 1;
1631 	for (i = 0; i < n_blocks; ++i)
1632 		blocks[i]->link = 0;
1633 
1634 	mark_code(root);
1635 
1636 	for (i = n_blocks - 1; --i >= 0; ) {
1637 		if (!isMarked(blocks[i]))
1638 			continue;
1639 		for (j = i + 1; j < n_blocks; ++j) {
1640 			if (!isMarked(blocks[j]))
1641 				continue;
1642 			if (eq_blk(blocks[i], blocks[j])) {
1643 				blocks[i]->link = blocks[j]->link ?
1644 					blocks[j]->link : blocks[j];
1645 				break;
1646 			}
1647 		}
1648 	}
1649 	for (i = 0; i < n_blocks; ++i) {
1650 		p = blocks[i];
1651 		if (JT(p) == 0)
1652 			continue;
1653 		if (JT(p)->link) {
1654 			done = 0;
1655 			JT(p) = JT(p)->link;
1656 		}
1657 		if (JF(p)->link) {
1658 			done = 0;
1659 			JF(p) = JF(p)->link;
1660 		}
1661 	}
1662 	if (!done)
1663 		goto top;
1664 }
1665 
1666 static void
1667 opt_cleanup(void)
1668 {
1669 	free((void *)vnode_base);
1670 	free((void *)vmap);
1671 	free((void *)edges);
1672 	free((void *)space1);
1673 	free((void *)space2);
1674 	free((void *)levels);
1675 	free((void *)blocks);
1676 }
1677 
1678 /*
1679  * Return the number of stmts in 's'.
1680  */
1681 static int
1682 slength(struct slist *s)
1683 {
1684 	int n = 0;
1685 
1686 	for (; s; s = s->next)
1687 		if (s->s.code != NOP)
1688 			++n;
1689 	return n;
1690 }
1691 
1692 /*
1693  * Return the number of nodes reachable by 'p'.
1694  * All nodes should be initially unmarked.
1695  */
1696 static int
1697 count_blocks(struct block *p)
1698 {
1699 	if (p == 0 || isMarked(p))
1700 		return 0;
1701 	Mark(p);
1702 	return count_blocks(JT(p)) + count_blocks(JF(p)) + 1;
1703 }
1704 
1705 /*
1706  * Do a depth first search on the flow graph, numbering the
1707  * basic blocks, and entering them into the 'blocks' array.
1708  */
1709 static void
1710 number_blks_r(struct block *p)
1711 {
1712 	int n;
1713 
1714 	if (p == 0 || isMarked(p))
1715 		return;
1716 
1717 	Mark(p);
1718 	n = n_blocks++;
1719 	p->id = n;
1720 	blocks[n] = p;
1721 
1722 	number_blks_r(JT(p));
1723 	number_blks_r(JF(p));
1724 }
1725 
1726 /*
1727  * Return the number of stmts in the flowgraph reachable by 'p'.
1728  * The nodes should be unmarked before calling.
1729  */
1730 static int
1731 count_stmts(struct block *p)
1732 {
1733 	int n;
1734 
1735 	if (p == 0 || isMarked(p))
1736 		return 0;
1737 	Mark(p);
1738 	n = count_stmts(JT(p)) + count_stmts(JF(p));
1739 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
1740 }
1741 
1742 /*
1743  * Allocate memory.  All allocation is done before optimization
1744  * is begun.  A linear bound on the size of all data structures is computed
1745  * from the total number of blocks and/or statements.
1746  */
1747 static void
1748 opt_init(struct block *root)
1749 {
1750 	bpf_u_int32 *p;
1751 	int i, n, max_stmts;
1752 	size_t size1, size2;
1753 
1754 	/*
1755 	 * First, count the blocks, so we can malloc an array to map
1756 	 * block number to block.  Then, put the blocks into the array.
1757 	 */
1758 	unMarkAll();
1759 	n = count_blocks(root);
1760 	blocks = reallocarray(NULL, n, sizeof(*blocks));
1761 	if (blocks == NULL)
1762 		bpf_error("malloc");
1763 
1764 	unMarkAll();
1765 	n_blocks = 0;
1766 	number_blks_r(root);
1767 
1768 	n_edges = 2 * n_blocks;
1769 	edges = reallocarray(NULL, n_edges, sizeof(*edges));
1770 	if (edges == NULL)
1771 		bpf_error("malloc");
1772 
1773 	/*
1774 	 * The number of levels is bounded by the number of nodes.
1775 	 */
1776 	levels = reallocarray(NULL, n_blocks, sizeof(*levels));
1777 	if (levels == NULL)
1778 		bpf_error("malloc");
1779 
1780 	edgewords = n_edges / (8 * sizeof(bpf_u_int32)) + 1;
1781 	nodewords = n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
1782 
1783 	size1 = 2;
1784 	if (n_blocks > SIZE_MAX / size1)
1785 		goto fail1;
1786 	size1 *= n_blocks;
1787 	if (nodewords > SIZE_MAX / size1)
1788 		goto fail1;
1789 	size1 *= nodewords;
1790 	if (sizeof(*space1) > SIZE_MAX / size1)
1791 		goto fail1;
1792 	size1 *= sizeof(*space1);
1793 
1794 	space1 = (bpf_u_int32 *)malloc(size1);
1795 	if (space1 == NULL) {
1796 fail1:
1797 		bpf_error("malloc");
1798 	}
1799 
1800 	size2 = n_edges;
1801 	if (edgewords > SIZE_MAX / size2)
1802 		goto fail2;
1803 	size2 *= edgewords;
1804 	if (sizeof(*space2) > SIZE_MAX / size2)
1805 		goto fail2;
1806 	size2 *= sizeof(*space2);
1807 
1808 	space2 = (bpf_u_int32 *)malloc(size2);
1809 	if (space2 == NULL) {
1810 fail2:
1811 		free(space1);
1812 		bpf_error("malloc");
1813 	}
1814 
1815 	p = space1;
1816 	all_dom_sets = p;
1817 	for (i = 0; i < n; ++i) {
1818 		blocks[i]->dom = p;
1819 		p += nodewords;
1820 	}
1821 	all_closure_sets = p;
1822 	for (i = 0; i < n; ++i) {
1823 		blocks[i]->closure = p;
1824 		p += nodewords;
1825 	}
1826 	p = space2;
1827 	all_edge_sets = p;
1828 	for (i = 0; i < n; ++i) {
1829 		struct block *b = blocks[i];
1830 
1831 		b->et.edom = p;
1832 		p += edgewords;
1833 		b->ef.edom = p;
1834 		p += edgewords;
1835 		b->et.id = i;
1836 		edges[i] = &b->et;
1837 		b->ef.id = n_blocks + i;
1838 		edges[n_blocks + i] = &b->ef;
1839 		b->et.pred = b;
1840 		b->ef.pred = b;
1841 	}
1842 	max_stmts = 0;
1843 	for (i = 0; i < n; ++i)
1844 		max_stmts += slength(blocks[i]->stmts) + 1;
1845 	/*
1846 	 * We allocate at most 3 value numbers per statement,
1847 	 * so this is an upper bound on the number of valnodes
1848 	 * we'll need.
1849 	 */
1850 	maxval = 3 * max_stmts;
1851 	vmap = reallocarray(NULL, maxval, sizeof(*vmap));
1852 	vnode_base = reallocarray(NULL, maxval, sizeof(*vnode_base));
1853 	if (vmap == NULL || vnode_base == NULL)
1854 		bpf_error("malloc");
1855 }
1856 
1857 /*
1858  * Some pointers used to convert the basic block form of the code,
1859  * into the array form that BPF requires.  'fstart' will point to
1860  * the malloc'd array while 'ftail' is used during the recursive traversal.
1861  */
1862 static struct bpf_insn *fstart;
1863 static struct bpf_insn *ftail;
1864 
1865 #ifdef BDEBUG
1866 int bids[1000];
1867 #endif
1868 
1869 /*
1870  * Returns true if successful.  Returns false if a branch has
1871  * an offset that is too large.  If so, we have marked that
1872  * branch so that on a subsequent iteration, it will be treated
1873  * properly.
1874  */
1875 static int
1876 convert_code_r(struct block *p)
1877 {
1878 	struct bpf_insn *dst;
1879 	struct slist *src;
1880 	int slen;
1881 	u_int off;
1882 	int extrajmps;		/* number of extra jumps inserted */
1883 	struct slist **offset = NULL;
1884 
1885 	if (p == 0 || isMarked(p))
1886 		return (1);
1887 	Mark(p);
1888 
1889 	if (convert_code_r(JF(p)) == 0)
1890 		return (0);
1891 	if (convert_code_r(JT(p)) == 0)
1892 		return (0);
1893 
1894 	slen = slength(p->stmts);
1895 	dst = ftail -= (slen + 1 + p->longjt + p->longjf);
1896 		/* inflate length by any extra jumps */
1897 
1898 	p->offset = dst - fstart;
1899 
1900 	/* generate offset[] for convenience  */
1901 	if (slen) {
1902 		offset = calloc(slen, sizeof(struct slist *));
1903 		if (!offset) {
1904 			bpf_error("not enough core");
1905 			/*NOTREACHED*/
1906 		}
1907 	}
1908 	src = p->stmts;
1909 	for (off = 0; off < slen && src; off++) {
1910 #if 0
1911 		printf("off=%d src=%x\n", off, src);
1912 #endif
1913 		offset[off] = src;
1914 		src = src->next;
1915 	}
1916 
1917 	off = 0;
1918 	for (src = p->stmts; src; src = src->next) {
1919 		if (src->s.code == NOP)
1920 			continue;
1921 		dst->code = (u_short)src->s.code;
1922 		dst->k = src->s.k;
1923 
1924 		/* fill block-local relative jump */
1925 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
1926 #if 0
1927 			if (src->s.jt || src->s.jf) {
1928 				bpf_error("illegal jmp destination");
1929 				/*NOTREACHED*/
1930 			}
1931 #endif
1932 			goto filled;
1933 		}
1934 		if (off == slen - 2)	/*???*/
1935 			goto filled;
1936 
1937 	    {
1938 		int i;
1939 		int jt, jf;
1940 		static const char ljerr[] =
1941 		    "%s for block-local relative jump: off=%d";
1942 
1943 #if 0
1944 		printf("code=%x off=%d %x %x\n", src->s.code,
1945 			off, src->s.jt, src->s.jf);
1946 #endif
1947 
1948 		if (!src->s.jt || !src->s.jf) {
1949 			bpf_error(ljerr, "no jmp destination", off);
1950 			/*NOTREACHED*/
1951 		}
1952 
1953 		jt = jf = 0;
1954 		for (i = 0; i < slen; i++) {
1955 			if (offset[i] == src->s.jt) {
1956 				if (jt) {
1957 					bpf_error(ljerr, "multiple matches", off);
1958 					/*NOTREACHED*/
1959 				}
1960 
1961 				dst->jt = i - off - 1;
1962 				jt++;
1963 			}
1964 			if (offset[i] == src->s.jf) {
1965 				if (jf) {
1966 					bpf_error(ljerr, "multiple matches", off);
1967 					/*NOTREACHED*/
1968 				}
1969 				dst->jf = i - off - 1;
1970 				jf++;
1971 			}
1972 		}
1973 		if (!jt || !jf) {
1974 			bpf_error(ljerr, "no destination found", off);
1975 			/*NOTREACHED*/
1976 		}
1977 	    }
1978 filled:
1979 		++dst;
1980 		++off;
1981 	}
1982 	free(offset);
1983 
1984 #ifdef BDEBUG
1985 	bids[dst - fstart] = p->id + 1;
1986 #endif
1987 	dst->code = (u_short)p->s.code;
1988 	dst->k = p->s.k;
1989 	if (JT(p)) {
1990 		extrajmps = 0;
1991 		off = JT(p)->offset - (p->offset + slen) - 1;
1992 		if (off >= 256) {
1993 		    /* offset too large for branch, must add a jump */
1994 		    if (p->longjt == 0) {
1995 		    	/* mark this instruction and retry */
1996 			p->longjt++;
1997 			return(0);
1998 		    }
1999 		    /* branch if T to following jump */
2000 		    dst->jt = extrajmps;
2001 		    extrajmps++;
2002 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2003 		    dst[extrajmps].k = off - extrajmps;
2004 		}
2005 		else
2006 		    dst->jt = off;
2007 		off = JF(p)->offset - (p->offset + slen) - 1;
2008 		if (off >= 256) {
2009 		    /* offset too large for branch, must add a jump */
2010 		    if (p->longjf == 0) {
2011 		    	/* mark this instruction and retry */
2012 			p->longjf++;
2013 			return(0);
2014 		    }
2015 		    /* branch if F to following jump */
2016 		    /* if two jumps are inserted, F goes to second one */
2017 		    dst->jf = extrajmps;
2018 		    extrajmps++;
2019 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2020 		    dst[extrajmps].k = off - extrajmps;
2021 		}
2022 		else
2023 		    dst->jf = off;
2024 	}
2025 	return (1);
2026 }
2027 
2028 
2029 /*
2030  * Convert flowgraph intermediate representation to the
2031  * BPF array representation.  Set *lenp to the number of instructions.
2032  */
2033 struct bpf_insn *
2034 icode_to_fcode(struct block *root, int *lenp)
2035 {
2036 	int n;
2037 	struct bpf_insn *fp;
2038 
2039 	/*
2040 	 * Loop doing convert_code_r() until no branches remain
2041 	 * with too-large offsets.
2042 	 */
2043 	while (1) {
2044 	    unMarkAll();
2045 	    n = *lenp = count_stmts(root);
2046 
2047 	    fp = calloc(n, sizeof(*fp));
2048 	    if (fp == NULL)
2049 		    bpf_error("calloc");
2050 
2051 	    fstart = fp;
2052 	    ftail = fp + n;
2053 
2054 	    unMarkAll();
2055 	    if (convert_code_r(root))
2056 		break;
2057 	    free(fp);
2058 	}
2059 
2060 	return fp;
2061 }
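
/*
 * Illustrative note (added commentary, not in the original source):
 * the retry loop matters because BPF conditional jumps carry 8-bit
 * offsets.  When convert_code_r() meets a branch whose target is 256 or
 * more instructions away, it bumps p->longjt/p->longjf and fails; the
 * next iteration then counts those extra unconditional jumps into the
 * instruction total and lays the code out again.
 */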
2062 
2063 #ifdef BDEBUG
2064 static void
2065 opt_dump(struct block *root)
2066 {
2067 	struct bpf_program f;
2068 
2069 	memset(bids, 0, sizeof bids);
2070 	f.bf_insns = icode_to_fcode(root, &f.bf_len);
2071 	bpf_dump(&f, 1);
2072 	putchar('\n');
2073 	free((char *)f.bf_insns);
2074 }
2075 #endif
2076