xref: /minix/minix/net/lwip/bpf_filter.c (revision e4dbab1e)
1 /* LWIP service - bpf_filter.c - Berkeley Packet Filter core implementation */
2 /*
3  * This is basically a drop-in replacement of NetBSD's bpf_filter.c, which
4  * itself can be compiled for either the NetBSD kernel or for userland.  On
5  * MINIX 3, we would like to perform certain checks that NetBSD implements only
6  * for its kernel (e.g., memory store access validation) while replacing the
7  * NetBSD kernel specifics with our own (pbuf instead of mbuf, no BPF contexts
8  * for now, etc.).  As a result, it is easier to reimplement the whole thing,
9  * because there is not all that much to it.
10  *
11  * Support for the standard BSD API allows us to run standard tests against
12  * this module from userland, where _MINIX_SYSTEM is not defined.  MINIX 3
13  * specific extensions are enabled only if _MINIX_SYSTEM is defined.
14  */
15 #include <string.h>
16 #include <limits.h>
17 #include <net/bpf.h>
18 #include <minix/bitmap.h>
19 
20 #ifdef _MINIX_SYSTEM
21 #include "lwip.h"
22 
23 /*
24  * Obtain an unsigned 32-bit value in network byte order from the pbuf chain
25  * 'pbuf' at offset 'k'.  The given offset is guaranteed to be within bounds.
26  */
27 static uint32_t
28 bpf_get32_ext(const struct pbuf * pbuf, uint32_t k)
29 {
30 	uint32_t val;
31 	unsigned int i;
32 
33 	/*
34 	 * Find the pbuf that contains the first byte.  We expect that most
35 	 * filters will operate only on the headers of packets, so that we
36 	 * mostly avoid going through this O(n) loop.  Since only the superuser
37 	 * can open BPF devices at all, we need not be worried about abuse in
38 	 * this regard.  However, it turns out that this loop is particularly
39 	 * CPU-intensive after all, we can probably improve it by caching the
40 	 * last visited pbuf, as read locality is likely high.
41 	 */
42 	while (k >= pbuf->len) {
43 		k -= pbuf->len;
44 		pbuf = pbuf->next;
45 		assert(pbuf != NULL);
46 	}
47 
48 	/*
49 	 * We assume that every pbuf has some data, but we make no assumptions
50 	 * about any minimum amount of data per pbuf.  Therefore, we may have
51 	 * to take the bytes from anywhere between one and four pbufs.
52 	 * Hopefully the compiler will unroll this loop for us.
53 	 */
54 	val = (uint32_t)(((u_char *)pbuf->payload)[k]) << 24;
55 
56 	for (i = 0; i < 3; i++) {
57 		if (k >= (uint32_t)pbuf->len - 1) {
58 			k = 0;
59 			pbuf = pbuf->next;
60 			assert(pbuf != NULL);
61 		} else
62 			k++;
63 		val = (val << 8) | (uint32_t)(((u_char *)pbuf->payload)[k]);
64 	}
65 
66 	return val;
67 }
68 
69 /*
70  * Obtain an unsigned 16-bit value in network byte order from the pbuf chain
71  * 'pbuf' at offset 'k'.  The given offset is guaranteed to be within bounds.
72  */
73 static uint32_t
74 bpf_get16_ext(const struct pbuf * pbuf, uint32_t k)
75 {
76 
77 	/* As above. */
78 	while (k >= pbuf->len) {
79 		k -= pbuf->len;
80 		pbuf = pbuf->next;
81 		assert(pbuf != NULL);
82 	}
83 
84 	/*
85 	 * There are only two possible cases to cover here: either the two
86 	 * bytes are in the same pbuf, or they are in subsequent ones.
87 	 */
88 	if (k < (uint32_t)pbuf->len - 1) {
89 		return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) |
90 		    (uint32_t)(((u_char *)pbuf->next->payload)[k + 1]);
91 	} else {
92 		assert(pbuf->next != NULL);
93 		return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) |
94 		    (uint32_t)(((u_char *)pbuf->next->payload)[0]);
95 	}
96 }
97 
98 /*
99  * Obtain an unsigned 8-bit value from the pbuf chain 'pbuf' at offset 'k'.
100  * The given offset is guaranteed to be within bounds.
101  */
102 static uint32_t
103 bpf_get8_ext(const struct pbuf * pbuf, uint32_t k)
104 {
105 
106 	/* As above. */
107 	while (k >= pbuf->len) {
108 		k -= pbuf->len;
109 		pbuf = pbuf->next;
110 		assert(pbuf != NULL);
111 	}
112 
113 	return (uint32_t)(((u_char *)pbuf->payload)[k]);
114 }
115 
116 #endif /* _MINIX_SYSTEM */
117 
118 /*
119  * Execute a BPF filter program on (the first part of) a packet, and return the
120  * maximum size of the packet that should be delivered to the filter owner.
121  *
122  * The 'pc' parameter points to an array of BPF instructions that together form
123  * the filter program to be executed.  If 'pc' is NULL, the packet is fully
124  * accepted.  Otherwise, the given program MUST have passed a previous call to
125  * bpf_validate().  Not doing so will allow for arbitrary memory access.
126  *
127  * The 'packet' array contains up to the whole packet.  The value of 'total'
128  * denotes the total length of the packet; 'len' contains the size of the array
129  * 'packet'.  Chunked storage of the packet is not supported at this time.
130  *
131  * If executing the program succeeds, the return value is the maximum number of
132  * bytes from the packet to be delivered.  The return value may exceed the full
133  * packet size.  If the number of bytes returned is zero, the packet is to be
134  * ignored.  If the program fails to execute properly and return a value, a
135  * value of zero is returned as well, thus also indicating that the packet
136  * should be ignored.  This is intentional: it saves filter programs from
137  * having to perform explicit checks on the packet they are filtering.
138  */
u_int
bpf_filter(const struct bpf_insn * pc, const u_char * packet, u_int total,
	u_int len)
#ifdef _MINIX_SYSTEM
{

	/*
	 * Standard BSD entry point.  No pbuf chain is given here, so any
	 * load beyond the 'len' bytes in 'packet' makes the program fail
	 * (and thus return zero).
	 */
	return bpf_filter_ext(pc, NULL /*pbuf*/, packet, total, len);
}

u_int
bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf,
	const u_char * packet, u_int total, u_int len)
#endif /* _MINIX_SYSTEM */
{
	/* 'a' and 'x' are the BPF accumulator and index registers. */
	uint32_t k, a, x, mem[BPF_MEMWORDS];

	/* An empty program accepts all packets. */
	if (pc == NULL)
		return UINT_MAX;

	/*
	 * We need not clear 'mem': the checker guarantees that each memory
	 * store word is always written before it is read.
	 */
	a = 0;
	x = 0;

	/* Execute the program. */
	for (;; pc++) {
		k = pc->k;

		switch (pc->code) {
		case BPF_LD+BPF_W+BPF_IND:	/* A <- P[X+k:4] */
			/* Fail if the unsigned X+k addition would wrap. */
			if (k + x < k)
				return 0;
			k += x;
			/* FALLTHROUGH */
		case BPF_LD+BPF_W+BPF_ABS:	/* A <- P[k:4] */
			/*
			 * 'k' may have any value, so check bounds in such a
			 * way that 'k' cannot possibly overflow and wrap.
			 */
			if (len >= 3 && k < len - 3)
				a = ((uint32_t)packet[k] << 24) |
				    ((uint32_t)packet[k + 1] << 16) |
				    ((uint32_t)packet[k + 2] << 8) |
				    (uint32_t)packet[k + 3];
#ifdef _MINIX_SYSTEM
			else if (total >= 3 && k < total - 3)
				a = bpf_get32_ext(pbuf, k);
#endif /* _MINIX_SYSTEM */
			else
				return 0;
			break;
		case BPF_LD+BPF_H+BPF_IND:	/* A <- P[X+k:2] */
			/* As above: fail on X+k overflow. */
			if (k + x < k)
				return 0;
			k += x;
			/* FALLTHROUGH */
		case BPF_LD+BPF_H+BPF_ABS:	/* A <- P[k:2] */
			/* As above. */
			if (len >= 1 && k < len - 1)
				a = ((uint32_t)packet[k] << 8) |
				    (uint32_t)packet[k + 1];
#ifdef _MINIX_SYSTEM
			else if (total >= 1 && k < total - 1)
				a = bpf_get16_ext(pbuf, k);
#endif /* _MINIX_SYSTEM */
			else
				return 0;
			break;
		case BPF_LD+BPF_B+BPF_IND:	/* A <- P[X+k:1] */
			/* As above: fail on X+k overflow. */
			if (k + x < k)
				return 0;
			k += x;
			/* FALLTHROUGH */
		case BPF_LD+BPF_B+BPF_ABS:	/* A <- P[k:1] */
			if (k < len)
				a = (uint32_t)packet[k];
#ifdef _MINIX_SYSTEM
			else if (k < total)
				a = bpf_get8_ext(pbuf, k);
#endif /* _MINIX_SYSTEM */
			else
				return 0;
			break;
		case BPF_LD+BPF_W+BPF_LEN:	/* A <- len */
			/* 'total' is the full packet size, which may exceed
			 * the 'len' bytes available in 'packet'. */
			a = total;
			break;
		case BPF_LD+BPF_IMM:		/* A <- k */
			a = k;
			break;
		case BPF_LD+BPF_MEM:		/* A <- M[k] */
			/* bpf_validate() guarantees k < BPF_MEMWORDS. */
			a = mem[k];
			break;

		case BPF_LDX+BPF_IMM:		/* X <- k */
			x = k;
			break;
		case BPF_LDX+BPF_MEM:		/* X <- M[k] */
			/* bpf_validate() guarantees k < BPF_MEMWORDS. */
			x = mem[k];
			break;
		case BPF_LDX+BPF_LEN:		/* X <- len */
			x = total;
			break;
		case BPF_LDX+BPF_B+BPF_MSH:	/* X <- 4*(P[k:1]&0xf) */
			if (k < len)
				x = ((uint32_t)packet[k] & 0xf) << 2;
#ifdef _MINIX_SYSTEM
			else if (k < total)
				x = (bpf_get8_ext(pbuf, k) & 0xf) << 2;
#endif /* _MINIX_SYSTEM */
			else
				return 0;
			break;

		case BPF_ST:			/* M[k] <- A */
			/* bpf_validate() guarantees k < BPF_MEMWORDS. */
			mem[k] = a;
			break;

		case BPF_STX:			/* M[k] <- X */
			mem[k] = x;
			break;

		case BPF_ALU+BPF_ADD+BPF_K:	/* A <- A + k */
			a += k;
			break;
		case BPF_ALU+BPF_SUB+BPF_K:	/* A <- A - k */
			a -= k;
			break;
		case BPF_ALU+BPF_MUL+BPF_K:	/* A <- A * k */
			a *= k;
			break;
		case BPF_ALU+BPF_DIV+BPF_K:	/* A <- A / k */
			/* bpf_validate() rejects programs with k == 0 here. */
			a /= k;
			break;
		case BPF_ALU+BPF_MOD+BPF_K:	/* A <- A % k */
			/* bpf_validate() rejects programs with k == 0 here. */
			a %= k;
			break;
		case BPF_ALU+BPF_AND+BPF_K:	/* A <- A & k */
			a &= k;
			break;
		case BPF_ALU+BPF_OR+BPF_K:	/* A <- A | k */
			a |= k;
			break;
		case BPF_ALU+BPF_XOR+BPF_K:	/* A <- A ^ k */
			a ^= k;
			break;
		case BPF_ALU+BPF_LSH+BPF_K:	/* A <- A << k */
			/* bpf_validate() guarantees k < 32 here. */
			a <<= k;
			break;
		case BPF_ALU+BPF_RSH+BPF_K:	/* A <- A >> k */
			/* bpf_validate() guarantees k < 32 here. */
			a >>= k;
			break;
		case BPF_ALU+BPF_ADD+BPF_X:	/* A <- A + X */
			a += x;
			break;
		case BPF_ALU+BPF_SUB+BPF_X:	/* A <- A - X */
			a -= x;
			break;
		case BPF_ALU+BPF_MUL+BPF_X:	/* A <- A * X */
			a *= x;
			break;
		case BPF_ALU+BPF_DIV+BPF_X:	/* A <- A / X */
			if (x == 0)
				return 0;
			a /= x;
			break;
		case BPF_ALU+BPF_MOD+BPF_X:	/* A <- A % X */
			if (x == 0)
				return 0;
			a %= x;
			break;
		case BPF_ALU+BPF_AND+BPF_X:	/* A <- A & X */
			a &= x;
			break;
		case BPF_ALU+BPF_OR+BPF_X:	/* A <- A | X */
			a |= x;
			break;
		case BPF_ALU+BPF_XOR+BPF_X:	/* A <- A ^ X */
			a ^= x;
			break;
		case BPF_ALU+BPF_LSH+BPF_X:	/* A <- A << X */
			/* A shift by 32 or more bits would be undefined. */
			if (x >= 32)
				return 0;
			a <<= x;
			break;
		case BPF_ALU+BPF_RSH+BPF_X:	/* A <- A >> X */
			/* As above. */
			if (x >= 32)
				return 0;
			a >>= x;
			break;
		case BPF_ALU+BPF_NEG:		/* A <- -A */
			/* Unsigned negation is well defined (wraps). */
			a = -a;
			break;

		case BPF_JMP+BPF_JA:		/* pc += k */
			pc += k;
			break;
		case BPF_JMP+BPF_JGT+BPF_K:	/* pc += (A > k) ? jt : jf */
			pc += (a > k) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JGE+BPF_K:	/* pc += (A >= k) ? jt : jf */
			pc += (a >= k) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JEQ+BPF_K:	/* pc += (A == k) ? jt : jf */
			pc += (a == k) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JSET+BPF_K:	/* pc += (A & k) ? jt : jf */
			pc += (a & k) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JGT+BPF_X:	/* pc += (A > X) ? jt : jf */
			pc += (a > x) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JGE+BPF_X:	/* pc += (A >= X) ? jt : jf */
			pc += (a >= x) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JEQ+BPF_X:	/* pc += (A == X) ? jt : jf */
			pc += (a == x) ? pc->jt : pc->jf;
			break;
		case BPF_JMP+BPF_JSET+BPF_X:	/* pc += (A & X) ? jt : jf */
			pc += (a & x) ? pc->jt : pc->jf;
			break;

		case BPF_RET+BPF_A:		/* accept A bytes */
			return a;
		case BPF_RET+BPF_K:		/* accept K bytes */
			return k;

		case BPF_MISC+BPF_TAX:		/* X <- A */
			x = a;
			break;
		case BPF_MISC+BPF_TXA:		/* A <- X */
			a = x;
			break;

		default:			/* unknown instruction */
			return 0;
		}
	}

	/* NOTREACHED */
}
382 
/*
 * In order to avoid having to perform explicit memory allocation, we store
 * some validation state on the stack, using data types that are as small as
 * possible for the current definitions.  The data types, and in fact the whole
 * assumption that we can store the state on the stack, may need to be revised
 * if certain constants are increased in the future.  As of writing, the
 * validation routine uses a little over 1KB of stack memory.
 */
/* One bit per scratch memory word; must be able to hold BPF_MEMWORDS bits. */
#if BPF_MEMWORDS <= 16	/* value as of writing: 16 */
typedef uint16_t meminv_t;
#else
#error "increased BPF_MEMWORDS may require code revision"
#endif

#if BPF_MAXINSNS > 2048	/* value as of writing: 512 */
#error "increased BPF_MAXINSNS may require code revision"
#endif
400 
401 /*
402  * Verify that the given filter program is safe to execute, by performing as
403  * many static validity checks as possible.  The program is given as 'insns',
404  * which must be an array of 'ninsns' BPF instructions.  Unlike bpf_filter(),
405  * this function does not accept empty filter programs.  The function returns 1
406  * if the program was successfully validated, or 0 if the program should not be
407  * accepted.
408  */
409 int
410 bpf_validate(const struct bpf_insn * insns, int ninsns)
411 {
412 	bitchunk_t reachable[BITMAP_CHUNKS(BPF_MAXINSNS)];
413 	meminv_t invalid, meminv[BPF_MAXINSNS];
414 	const struct bpf_insn *insn;
415 	u_int pc, count, target;
416 	int advance;
417 
418 	if (insns == NULL || ninsns <= 0 || ninsns > BPF_MAXINSNS)
419 		return 0;
420 	count = (u_int)ninsns;
421 
422 	memset(reachable, 0, sizeof(reachable[0]) * BITMAP_CHUNKS(count));
423 	memset(meminv, 0, sizeof(meminv[0]) * count);
424 
425 	SET_BIT(reachable, 0);
426 	meminv[0] = (meminv_t)~0;
427 
428 	for (pc = 0; pc < count; pc++) {
429 		/* We completely ignore instructions that are not reachable. */
430 		if (!GET_BIT(reachable, pc))
431 			continue;
432 
433 		invalid = meminv[pc];
434 		advance = 1;
435 
436 		insn = &insns[pc];
437 
438 		switch (insn->code) {
439 		case BPF_LD+BPF_W+BPF_ABS:
440 		case BPF_LD+BPF_H+BPF_ABS:
441 		case BPF_LD+BPF_B+BPF_ABS:
442 		case BPF_LD+BPF_W+BPF_IND:
443 		case BPF_LD+BPF_H+BPF_IND:
444 		case BPF_LD+BPF_B+BPF_IND:
445 		case BPF_LD+BPF_LEN:
446 		case BPF_LD+BPF_IMM:
447 		case BPF_LDX+BPF_IMM:
448 		case BPF_LDX+BPF_LEN:
449 		case BPF_LDX+BPF_B+BPF_MSH:
450 		case BPF_ALU+BPF_ADD+BPF_K:
451 		case BPF_ALU+BPF_SUB+BPF_K:
452 		case BPF_ALU+BPF_MUL+BPF_K:
453 		case BPF_ALU+BPF_AND+BPF_K:
454 		case BPF_ALU+BPF_OR+BPF_K:
455 		case BPF_ALU+BPF_XOR+BPF_K:
456 		case BPF_ALU+BPF_ADD+BPF_X:
457 		case BPF_ALU+BPF_SUB+BPF_X:
458 		case BPF_ALU+BPF_MUL+BPF_X:
459 		case BPF_ALU+BPF_DIV+BPF_X:
460 		case BPF_ALU+BPF_MOD+BPF_X:
461 		case BPF_ALU+BPF_AND+BPF_X:
462 		case BPF_ALU+BPF_OR+BPF_X:
463 		case BPF_ALU+BPF_XOR+BPF_X:
464 		case BPF_ALU+BPF_LSH+BPF_X:
465 		case BPF_ALU+BPF_RSH+BPF_X:
466 		case BPF_ALU+BPF_NEG:
467 		case BPF_MISC+BPF_TAX:
468 		case BPF_MISC+BPF_TXA:
469 			/* Nothing we can check for these. */
470 			break;
471 		case BPF_ALU+BPF_DIV+BPF_K:
472 		case BPF_ALU+BPF_MOD+BPF_K:
473 			/* No division by zero. */
474 			if (insn->k == 0)
475 				return 0;
476 			break;
477 		case BPF_ALU+BPF_LSH+BPF_K:
478 		case BPF_ALU+BPF_RSH+BPF_K:
479 			/* Do not invoke undefined behavior. */
480 			if (insn->k >= 32)
481 				return 0;
482 			break;
483 		case BPF_LD+BPF_MEM:
484 		case BPF_LDX+BPF_MEM:
485 			/*
486 			 * Only allow loading words that have been stored in
487 			 * all execution paths leading up to this instruction.
488 			 */
489 			if (insn->k >= BPF_MEMWORDS ||
490 			    (invalid & (1 << insn->k)))
491 				return 0;
492 			break;
493 		case BPF_ST:
494 		case BPF_STX:
495 			if (insn->k >= BPF_MEMWORDS)
496 				return 0;
497 			invalid &= ~(1 << insn->k);
498 			break;
499 		case BPF_JMP+BPF_JA:
500 			/*
501 			 * Make sure that the target instruction of the jump is
502 			 * still part of the program, and mark it as reachable.
503 			 */
504 			if (insn->k >= count - pc - 1)
505 				return 0;
506 			target = pc + insn->k + 1;
507 			SET_BIT(reachable, target);
508 			meminv[target] |= invalid;
509 			advance = 0;
510 			break;
511 		case BPF_JMP+BPF_JGT+BPF_K:
512 		case BPF_JMP+BPF_JGE+BPF_K:
513 		case BPF_JMP+BPF_JEQ+BPF_K:
514 		case BPF_JMP+BPF_JSET+BPF_K:
515 		case BPF_JMP+BPF_JGT+BPF_X:
516 		case BPF_JMP+BPF_JGE+BPF_X:
517 		case BPF_JMP+BPF_JEQ+BPF_X:
518 		case BPF_JMP+BPF_JSET+BPF_X:
519 			/*
520 			 * Make sure that both target instructions are still
521 			 * part of the program, and mark both as reachable.
522 			 * There is no chance that the additions will overflow.
523 			 */
524 			target = pc + insn->jt + 1;
525 			if (target >= count)
526 				return 0;
527 			SET_BIT(reachable, target);
528 			meminv[target] |= invalid;
529 
530 			target = pc + insn->jf + 1;
531 			if (target >= count)
532 				return 0;
533 			SET_BIT(reachable, target);
534 			meminv[target] |= invalid;
535 
536 			advance = 0;
537 			break;
538 		case BPF_RET+BPF_A:
539 		case BPF_RET+BPF_K:
540 			advance = 0;
541 			break;
542 		default:
543 			return 0;
544 		}
545 
546 		/*
547 		 * After most instructions, we simply advance to the next.  For
548 		 * one thing, this means that there must be a next instruction
549 		 * at all.
550 		 */
551 		if (advance) {
552 			if (pc + 1 == count)
553 				return 0;
554 			SET_BIT(reachable, pc + 1);
555 			meminv[pc + 1] |= invalid;
556 		}
557 	}
558 
559 	/* The program has passed all our basic tests. */
560 	return 1;
561 }
562