1 /* LWIP service - bpf_filter.c - Berkeley Packet Filter core implementation */ 2 /* 3 * This is basically a drop-in replacement of NetBSD's bpf_filter.c, which 4 * itself can be compiled for either the NetBSD kernel or for userland. On 5 * MINIX 3, we would like to perform certain checks that NetBSD implements only 6 * for its kernel (e.g., memory store access validation) while replacing the 7 * NetBSD kernel specifics with our own (pbuf instead of mbuf, no BPF contexts 8 * for now, etc.). As a result, it is easier to reimplement the whole thing, 9 * because there is not all that much to it. 10 * 11 * Support for the standard BSD API allows us to run standard tests against 12 * this module from userland, where _MINIX_SYSTEM is not defined. MINIX 3 13 * specific extensions are enabled only if _MINIX_SYSTEM is defined. 14 */ 15 #include <string.h> 16 #include <limits.h> 17 #include <net/bpf.h> 18 #include <minix/bitmap.h> 19 20 #ifdef _MINIX_SYSTEM 21 #include "lwip.h" 22 23 /* 24 * Obtain an unsigned 32-bit value in network byte order from the pbuf chain 25 * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds. 26 */ 27 static uint32_t 28 bpf_get32_ext(const struct pbuf * pbuf, uint32_t k) 29 { 30 uint32_t val; 31 unsigned int i; 32 33 /* 34 * Find the pbuf that contains the first byte. We expect that most 35 * filters will operate only on the headers of packets, so that we 36 * mostly avoid going through this O(n) loop. Since only the superuser 37 * can open BPF devices at all, we need not be worried about abuse in 38 * this regard. However, it turns out that this loop is particularly 39 * CPU-intensive after all, we can probably improve it by caching the 40 * last visited pbuf, as read locality is likely high. 41 */ 42 while (k >= pbuf->len) { 43 k -= pbuf->len; 44 pbuf = pbuf->next; 45 assert(pbuf != NULL); 46 } 47 48 /* 49 * We assume that every pbuf has some data, but we make no assumptions 50 * about any minimum amount of data per pbuf. Therefore, we may have 51 * to take the bytes from anywhere between one and four pbufs. 52 * Hopefully the compiler will unroll this loop for us. 53 */ 54 val = (uint32_t)(((u_char *)pbuf->payload)[k]) << 24; 55 56 for (i = 0; i < 3; i++) { 57 if (k >= (uint32_t)pbuf->len - 1) { 58 k = 0; 59 pbuf = pbuf->next; 60 assert(pbuf != NULL); 61 } else 62 k++; 63 val = (val << 8) | (uint32_t)(((u_char *)pbuf->payload)[k]); 64 } 65 66 return val; 67 } 68 69 /* 70 * Obtain an unsigned 16-bit value in network byte order from the pbuf chain 71 * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds. 72 */ 73 static uint32_t 74 bpf_get16_ext(const struct pbuf * pbuf, uint32_t k) 75 { 76 77 /* As above. */ 78 while (k >= pbuf->len) { 79 k -= pbuf->len; 80 pbuf = pbuf->next; 81 assert(pbuf != NULL); 82 } 83 84 /* 85 * There are only two possible cases to cover here: either the two 86 * bytes are in the same pbuf, or they are in subsequent ones. 87 */ 88 if (k < (uint32_t)pbuf->len - 1) { 89 return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) | 90 (uint32_t)(((u_char *)pbuf->next->payload)[k + 1]); 91 } else { 92 assert(pbuf->next != NULL); 93 return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) | 94 (uint32_t)(((u_char *)pbuf->next->payload)[0]); 95 } 96 } 97 98 /* 99 * Obtain an unsigned 8-bit value from the pbuf chain 'pbuf' at offset 'k'. 100 * The given offset is guaranteed to be within bounds. 101 */ 102 static uint32_t 103 bpf_get8_ext(const struct pbuf * pbuf, uint32_t k) 104 { 105 106 /* As above. */ 107 while (k >= pbuf->len) { 108 k -= pbuf->len; 109 pbuf = pbuf->next; 110 assert(pbuf != NULL); 111 } 112 113 return (uint32_t)(((u_char *)pbuf->payload)[k]); 114 } 115 116 #endif /* _MINIX_SYSTEM */ 117 118 /* 119 * Execute a BPF filter program on (the first part of) a packet, and return the 120 * maximum size of the packet that should be delivered to the filter owner. 121 * 122 * The 'pc' parameter points to an array of BPF instructions that together form 123 * the filter program to be executed. If 'pc' is NULL, the packet is fully 124 * accepted. Otherwise, the given program MUST have passed a previous call to 125 * bpf_validate(). Not doing so will allow for arbitrary memory access. 126 * 127 * The 'packet' array contains up to the whole packet. The value of 'total' 128 * denotes the total length of the packet; 'len' contains the size of the array 129 * 'packet'. Chunked storage of the packet is not supported at this time. 130 * 131 * If executing the program succeeds, the return value is the maximum number of 132 * bytes from the packet to be delivered. The return value may exceed the full 133 * packet size. If the number of bytes returned is zero, the packet is to be 134 * ignored. If the program fails to execute properly and return a value, a 135 * value of zero is returned as well, thus also indicating that the packet 136 * should be ignored. This is intentional: it saves filter programs from 137 * having to perform explicit checks on the packet they are filtering. 138 */ 139 u_int 140 bpf_filter(const struct bpf_insn * pc, const u_char * packet, u_int total, 141 u_int len) 142 #ifdef _MINIX_SYSTEM 143 { 144 145 return bpf_filter_ext(pc, NULL /*pbuf*/, packet, total, len); 146 } 147 148 u_int 149 bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf, 150 const u_char * packet, u_int total, u_int len) 151 #endif /* _MINIX_SYSTEM */ 152 { 153 uint32_t k, a, x, mem[BPF_MEMWORDS]; 154 155 /* An empty program accepts all packets. */ 156 if (pc == NULL) 157 return UINT_MAX; 158 159 /* 160 * We need not clear 'mem': the checker guarantees that each memory 161 * store word is always written before it is read. 162 */ 163 a = 0; 164 x = 0; 165 166 /* Execute the program. */ 167 for (;; pc++) { 168 k = pc->k; 169 170 switch (pc->code) { 171 case BPF_LD+BPF_W+BPF_IND: /* A <- P[X+k:4] */ 172 if (k + x < k) 173 return 0; 174 k += x; 175 /* FALLTHROUGH */ 176 case BPF_LD+BPF_W+BPF_ABS: /* A <- P[k:4] */ 177 /* 178 * 'k' may have any value, so check bounds in such a 179 * way that 'k' cannot possibly overflow and wrap. 180 */ 181 if (len >= 3 && k < len - 3) 182 a = ((uint32_t)packet[k] << 24) | 183 ((uint32_t)packet[k + 1] << 16) | 184 ((uint32_t)packet[k + 2] << 8) | 185 (uint32_t)packet[k + 3]; 186 #ifdef _MINIX_SYSTEM 187 else if (total >= 3 && k < total - 3) 188 a = bpf_get32_ext(pbuf, k); 189 #endif /* _MINIX_SYSTEM */ 190 else 191 return 0; 192 break; 193 case BPF_LD+BPF_H+BPF_IND: /* A <- P[X+k:2] */ 194 if (k + x < k) 195 return 0; 196 k += x; 197 /* FALLTHROUGH */ 198 case BPF_LD+BPF_H+BPF_ABS: /* A <- P[k:2] */ 199 /* As above. */ 200 if (len >= 1 && k < len - 1) 201 a = ((uint32_t)packet[k] << 8) | 202 (uint32_t)packet[k + 1]; 203 #ifdef _MINIX_SYSTEM 204 else if (total >= 1 && k < total - 1) 205 a = bpf_get16_ext(pbuf, k); 206 #endif /* _MINIX_SYSTEM */ 207 else 208 return 0; 209 break; 210 case BPF_LD+BPF_B+BPF_IND: /* A <- P[X+k:1] */ 211 if (k + x < k) 212 return 0; 213 k += x; 214 /* FALLTHROUGH */ 215 case BPF_LD+BPF_B+BPF_ABS: /* A <- P[k:1] */ 216 if (k < len) 217 a = (uint32_t)packet[k]; 218 #ifdef _MINIX_SYSTEM 219 else if (k < total) 220 a = bpf_get8_ext(pbuf, k); 221 #endif /* _MINIX_SYSTEM */ 222 else 223 return 0; 224 break; 225 case BPF_LD+BPF_W+BPF_LEN: /* A <- len */ 226 a = total; 227 break; 228 case BPF_LD+BPF_IMM: /* A <- k */ 229 a = k; 230 break; 231 case BPF_LD+BPF_MEM: /* A <- M[k] */ 232 a = mem[k]; 233 break; 234 235 case BPF_LDX+BPF_IMM: /* X <- k */ 236 x = k; 237 break; 238 case BPF_LDX+BPF_MEM: /* X <- M[k] */ 239 x = mem[k]; 240 break; 241 case BPF_LDX+BPF_LEN: /* X <- len */ 242 x = total; 243 break; 244 case BPF_LDX+BPF_B+BPF_MSH: /* X <- 4*(P[k:1]&0xf) */ 245 if (k < len) 246 x = ((uint32_t)packet[k] & 0xf) << 2; 247 #ifdef _MINIX_SYSTEM 248 else if (k < total) 249 x = (bpf_get8_ext(pbuf, k) & 0xf) << 2; 250 #endif /* _MINIX_SYSTEM */ 251 else 252 return 0; 253 break; 254 255 case BPF_ST: /* M[k] <- A */ 256 mem[k] = a; 257 break; 258 259 case BPF_STX: /* M[k] <- X */ 260 mem[k] = x; 261 break; 262 263 case BPF_ALU+BPF_ADD+BPF_K: /* A <- A + k */ 264 a += k; 265 break; 266 case BPF_ALU+BPF_SUB+BPF_K: /* A <- A - k */ 267 a -= k; 268 break; 269 case BPF_ALU+BPF_MUL+BPF_K: /* A <- A * k */ 270 a *= k; 271 break; 272 case BPF_ALU+BPF_DIV+BPF_K: /* A <- A / k */ 273 a /= k; 274 break; 275 case BPF_ALU+BPF_MOD+BPF_K: /* A <- A % k */ 276 a %= k; 277 break; 278 case BPF_ALU+BPF_AND+BPF_K: /* A <- A & k */ 279 a &= k; 280 break; 281 case BPF_ALU+BPF_OR+BPF_K: /* A <- A | k */ 282 a |= k; 283 break; 284 case BPF_ALU+BPF_XOR+BPF_K: /* A <- A ^ k */ 285 a ^= k; 286 break; 287 case BPF_ALU+BPF_LSH+BPF_K: /* A <- A << k */ 288 a <<= k; 289 break; 290 case BPF_ALU+BPF_RSH+BPF_K: /* A <- A >> k */ 291 a >>= k; 292 break; 293 case BPF_ALU+BPF_ADD+BPF_X: /* A <- A + X */ 294 a += x; 295 break; 296 case BPF_ALU+BPF_SUB+BPF_X: /* A <- A - X */ 297 a -= x; 298 break; 299 case BPF_ALU+BPF_MUL+BPF_X: /* A <- A * X */ 300 a *= x; 301 break; 302 case BPF_ALU+BPF_DIV+BPF_X: /* A <- A / X */ 303 if (x == 0) 304 return 0; 305 a /= x; 306 break; 307 case BPF_ALU+BPF_MOD+BPF_X: /* A <- A % X */ 308 if (x == 0) 309 return 0; 310 a %= x; 311 break; 312 case BPF_ALU+BPF_AND+BPF_X: /* A <- A & X */ 313 a &= x; 314 break; 315 case BPF_ALU+BPF_OR+BPF_X: /* A <- A | X */ 316 a |= x; 317 break; 318 case BPF_ALU+BPF_XOR+BPF_X: /* A <- A ^ X */ 319 a ^= x; 320 break; 321 case BPF_ALU+BPF_LSH+BPF_X: /* A <- A << X */ 322 if (x >= 32) 323 return 0; 324 a <<= x; 325 break; 326 case BPF_ALU+BPF_RSH+BPF_X: /* A <- A >> X */ 327 if (x >= 32) 328 return 0; 329 a >>= x; 330 break; 331 case BPF_ALU+BPF_NEG: /* A <- -A */ 332 a = -a; 333 break; 334 335 case BPF_JMP+BPF_JA: /* pc += k */ 336 pc += k; 337 break; 338 case BPF_JMP+BPF_JGT+BPF_K: /* pc += (A > k) ? jt : jf */ 339 pc += (a > k) ? pc->jt : pc->jf; 340 break; 341 case BPF_JMP+BPF_JGE+BPF_K: /* pc += (A >= k) ? jt : jf */ 342 pc += (a >= k) ? pc->jt : pc->jf; 343 break; 344 case BPF_JMP+BPF_JEQ+BPF_K: /* pc += (A == k) ? jt : jf */ 345 pc += (a == k) ? pc->jt : pc->jf; 346 break; 347 case BPF_JMP+BPF_JSET+BPF_K: /* pc += (A & k) ? jt : jf */ 348 pc += (a & k) ? pc->jt : pc->jf; 349 break; 350 case BPF_JMP+BPF_JGT+BPF_X: /* pc += (A > X) ? jt : jf */ 351 pc += (a > x) ? pc->jt : pc->jf; 352 break; 353 case BPF_JMP+BPF_JGE+BPF_X: /* pc += (A >= X) ? jt : jf */ 354 pc += (a >= x) ? pc->jt : pc->jf; 355 break; 356 case BPF_JMP+BPF_JEQ+BPF_X: /* pc += (A == X) ? jt : jf */ 357 pc += (a == x) ? pc->jt : pc->jf; 358 break; 359 case BPF_JMP+BPF_JSET+BPF_X: /* pc += (A & X) ? jt : jf */ 360 pc += (a & x) ? pc->jt : pc->jf; 361 break; 362 363 case BPF_RET+BPF_A: /* accept A bytes */ 364 return a; 365 case BPF_RET+BPF_K: /* accept K bytes */ 366 return k; 367 368 case BPF_MISC+BPF_TAX: /* X <- A */ 369 x = a; 370 break; 371 case BPF_MISC+BPF_TXA: /* A <- X */ 372 a = x; 373 break; 374 375 default: /* unknown instruction */ 376 return 0; 377 } 378 } 379 380 /* NOTREACHED */ 381 } 382 383 /* 384 * In order to avoid having to perform explicit memory allocation, we store 385 * some validation state on the stack, using data types that are as small as 386 * possible for the current definitions. The data types, and in fact the whole 387 * assumption that we can store the state on the stack, may need to be revised 388 * if certain constants are increased in the future. As of writing, the 389 * validation routine uses a little over 1KB of stack memory. 390 */ 391 #if BPF_MEMWORDS <= 16 /* value as of writing: 16 */ 392 typedef uint16_t meminv_t; 393 #else 394 #error "increased BPF_MEMWORDS may require code revision" 395 #endif 396 397 #if BPF_MAXINSNS > 2048 /* value as of writing: 512 */ 398 #error "increased BPF_MAXINSNS may require code revision" 399 #endif 400 401 /* 402 * Verify that the given filter program is safe to execute, by performing as 403 * many static validity checks as possible. The program is given as 'insns', 404 * which must be an array of 'ninsns' BPF instructions. Unlike bpf_filter(), 405 * this function does not accept empty filter programs. The function returns 1 406 * if the program was successfully validated, or 0 if the program should not be 407 * accepted. 408 */ 409 int 410 bpf_validate(const struct bpf_insn * insns, int ninsns) 411 { 412 bitchunk_t reachable[BITMAP_CHUNKS(BPF_MAXINSNS)]; 413 meminv_t invalid, meminv[BPF_MAXINSNS]; 414 const struct bpf_insn *insn; 415 u_int pc, count, target; 416 int advance; 417 418 if (insns == NULL || ninsns <= 0 || ninsns > BPF_MAXINSNS) 419 return 0; 420 count = (u_int)ninsns; 421 422 memset(reachable, 0, sizeof(reachable[0]) * BITMAP_CHUNKS(count)); 423 memset(meminv, 0, sizeof(meminv[0]) * count); 424 425 SET_BIT(reachable, 0); 426 meminv[0] = (meminv_t)~0; 427 428 for (pc = 0; pc < count; pc++) { 429 /* We completely ignore instructions that are not reachable. */ 430 if (!GET_BIT(reachable, pc)) 431 continue; 432 433 invalid = meminv[pc]; 434 advance = 1; 435 436 insn = &insns[pc]; 437 438 switch (insn->code) { 439 case BPF_LD+BPF_W+BPF_ABS: 440 case BPF_LD+BPF_H+BPF_ABS: 441 case BPF_LD+BPF_B+BPF_ABS: 442 case BPF_LD+BPF_W+BPF_IND: 443 case BPF_LD+BPF_H+BPF_IND: 444 case BPF_LD+BPF_B+BPF_IND: 445 case BPF_LD+BPF_LEN: 446 case BPF_LD+BPF_IMM: 447 case BPF_LDX+BPF_IMM: 448 case BPF_LDX+BPF_LEN: 449 case BPF_LDX+BPF_B+BPF_MSH: 450 case BPF_ALU+BPF_ADD+BPF_K: 451 case BPF_ALU+BPF_SUB+BPF_K: 452 case BPF_ALU+BPF_MUL+BPF_K: 453 case BPF_ALU+BPF_AND+BPF_K: 454 case BPF_ALU+BPF_OR+BPF_K: 455 case BPF_ALU+BPF_XOR+BPF_K: 456 case BPF_ALU+BPF_ADD+BPF_X: 457 case BPF_ALU+BPF_SUB+BPF_X: 458 case BPF_ALU+BPF_MUL+BPF_X: 459 case BPF_ALU+BPF_DIV+BPF_X: 460 case BPF_ALU+BPF_MOD+BPF_X: 461 case BPF_ALU+BPF_AND+BPF_X: 462 case BPF_ALU+BPF_OR+BPF_X: 463 case BPF_ALU+BPF_XOR+BPF_X: 464 case BPF_ALU+BPF_LSH+BPF_X: 465 case BPF_ALU+BPF_RSH+BPF_X: 466 case BPF_ALU+BPF_NEG: 467 case BPF_MISC+BPF_TAX: 468 case BPF_MISC+BPF_TXA: 469 /* Nothing we can check for these. */ 470 break; 471 case BPF_ALU+BPF_DIV+BPF_K: 472 case BPF_ALU+BPF_MOD+BPF_K: 473 /* No division by zero. */ 474 if (insn->k == 0) 475 return 0; 476 break; 477 case BPF_ALU+BPF_LSH+BPF_K: 478 case BPF_ALU+BPF_RSH+BPF_K: 479 /* Do not invoke undefined behavior. */ 480 if (insn->k >= 32) 481 return 0; 482 break; 483 case BPF_LD+BPF_MEM: 484 case BPF_LDX+BPF_MEM: 485 /* 486 * Only allow loading words that have been stored in 487 * all execution paths leading up to this instruction. 488 */ 489 if (insn->k >= BPF_MEMWORDS || 490 (invalid & (1 << insn->k))) 491 return 0; 492 break; 493 case BPF_ST: 494 case BPF_STX: 495 if (insn->k >= BPF_MEMWORDS) 496 return 0; 497 invalid &= ~(1 << insn->k); 498 break; 499 case BPF_JMP+BPF_JA: 500 /* 501 * Make sure that the target instruction of the jump is 502 * still part of the program, and mark it as reachable. 503 */ 504 if (insn->k >= count - pc - 1) 505 return 0; 506 target = pc + insn->k + 1; 507 SET_BIT(reachable, target); 508 meminv[target] |= invalid; 509 advance = 0; 510 break; 511 case BPF_JMP+BPF_JGT+BPF_K: 512 case BPF_JMP+BPF_JGE+BPF_K: 513 case BPF_JMP+BPF_JEQ+BPF_K: 514 case BPF_JMP+BPF_JSET+BPF_K: 515 case BPF_JMP+BPF_JGT+BPF_X: 516 case BPF_JMP+BPF_JGE+BPF_X: 517 case BPF_JMP+BPF_JEQ+BPF_X: 518 case BPF_JMP+BPF_JSET+BPF_X: 519 /* 520 * Make sure that both target instructions are still 521 * part of the program, and mark both as reachable. 522 * There is no chance that the additions will overflow. 523 */ 524 target = pc + insn->jt + 1; 525 if (target >= count) 526 return 0; 527 SET_BIT(reachable, target); 528 meminv[target] |= invalid; 529 530 target = pc + insn->jf + 1; 531 if (target >= count) 532 return 0; 533 SET_BIT(reachable, target); 534 meminv[target] |= invalid; 535 536 advance = 0; 537 break; 538 case BPF_RET+BPF_A: 539 case BPF_RET+BPF_K: 540 advance = 0; 541 break; 542 default: 543 return 0; 544 } 545 546 /* 547 * After most instructions, we simply advance to the next. For 548 * one thing, this means that there must be a next instruction 549 * at all. 550 */ 551 if (advance) { 552 if (pc + 1 == count) 553 return 0; 554 SET_BIT(reachable, pc + 1); 555 meminv[pc + 1] |= invalid; 556 } 557 } 558 559 /* The program has passed all our basic tests. */ 560 return 1; 561 } 562