// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
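/* Usage sketch (illustrative, mirroring get_next_hop() further below):
 * helpers return CONTINUE_PROCESSING to hand control back to the caller,
 * and a TC_ACT_* verdict to end processing early. MAYBE_RETURN propagates
 * only the latter:
 *
 *	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
 *	// reached only if skip_next_hops returned CONTINUE_PROCESSING
 */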
/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}
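/* Typical buf_assign() pattern (illustrative; process_tcp() below does
 * exactly this): pass stack scratch space so the parse also works when the
 * header lies in the non-linear part of the skb. The returned pointer is
 * either into the packet or into the scratch buffer:
 *
 *	struct tcphdr _tcp;
 *	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
 *	if (tcp == NULL)
 *		return INVALID;
 */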
static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bits.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

#pragma clang loop unroll(full)
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

#pragma clang loop unroll(full)
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}
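/* Worked example of the skip arithmetic above: an extension header with
 * len == 1 occupies (1 + 1) * 8 = 16 octets on the wire. buf_copy() has
 * already consumed the two-octet {next, len} prefix, so buf_skip() must
 * advance the remaining 16 - 2 = 14 octets to land on the next header.
 */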
/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	unsigned int key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	// Change the ethertype if the encapsulated packet is IPv6.
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}
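/* Decap arithmetic in accept_locally() above, worked example (illustrative,
 * for hop_count == 2): payload_off = sizeof(encap_headers_t) +
 * 2 * sizeof(struct in_addr) covers everything up to the inner packet.
 * Shrinking the MAC-adjacent room by encap_overhead = payload_off -
 * sizeof(struct ethhdr) leaves an ordinary Ethernet frame around the inner
 * packet, which is then redirected to the ingress path of this interface.
 */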
static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes()/
	 * bpf_skb_store_bytes(), which handle split packets if needed (the data
	 * does not have to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2- and 4-byte arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* Swap the L2 addresses. This assumes that packets are received from a
	 * router. Just swapping the MAC addresses here will make the packet go
	 * back to the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
		/* fallthrough */
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
					   encap->unigue.next_hop - 1);
}
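/* Layout of the variable-length GLB hop list that get_next_hop() above
 * walks (illustrative, for hop_count == 2):
 *
 *	eth | ip | udp | gue | unigue | hop[0] | hop[1] | inner packet
 *
 * unigue.next_hop indexes into hop[]; the addressed hop becomes the new
 * destination, and pkt is left positioned at the inner packet.
 */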
/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}
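/* Usage sketch (illustrative; process_udp() below follows this shape):
 * because iphlen is a compile-time constant at each call site, the verifier
 * can prove which branch of fill_tuple() runs and how long the tuple is:
 *
 *	struct bpf_sock_tuple tuple;
 *	uint64_t tuplen = fill_tuple(&tuple, ipv4, sizeof(*ipv4),
 *				     udph->source, udph->dest);
 *	// tuplen is known to be sizeof(tuple.ipv4) here
 */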
static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length
		 * of the ip header.
		 */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}
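/* Example of the tuple reversal above (illustrative): a "fragmentation
 * needed" ICMP error quotes the offending packet, i.e. one we sent from
 * A:sport to B:dport. To find our local socket for that flow, the lookup
 * tuple must describe the return direction: saddr = B, daddr = A, and
 * pkt_parse_icmp_l4_ports() likewise swaps the quoted ports.
 */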
static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}
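/* Example invocation (illustrative; the interface name and object path are
 * placeholders, not part of this test): the classifier below is meant to be
 * attached to a clsact ingress hook with iproute2, e.g.
 *
 *	tc qdisc add dev eth0 clsact
 *	tc filter add dev eth0 ingress bpf direct-action \
 *		obj test_cls_redirect.o sec tc
 */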
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}