// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "bpf_compiler.h"
#include "test_cls_redirect.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
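/* Illustrative example: offsetofend(struct iphdr, saddr) evaluates to
 * offsetof (12) plus sizeof (4), i.e. 16, the first byte past saddr.
 */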

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to return TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
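
/* Usage sketch, mirroring the call sites below:
 *
 *	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
 *
 * A TC_ACT_* verdict returns to the caller immediately, while
 * CONTINUE_PROCESSING falls through to the next statement.
 */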

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}
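
/* A typical buf_assign call site (cf. process_udp below) passes a stack
 * variable as scratch so that non-linear skbs still parse:
 *
 *	struct udphdr _udp;
 *	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
 *
 * udph then points either into the packet or at _udp.
 */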

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	/* e.g. ihl == 7 means a 28 byte header, so skip
	 * (7 - 5) * 4 == 8 bytes of options.
	 */
	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

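/* A packet is an IPv4 fragment if the MF flag is set (true for all but
 * the last fragment, including the first) or the fragment offset is
 * non-zero.
 */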
static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

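/* Worked example of the double fold below (illustrative): csum = 0x1ffff
 * needs both rounds:
 *
 *	round 1: 0xffff + 0x1 = 0x10000
 *	round 2: 0x0000 + 0x1 = 0x0001
 *
 * yielding a final checksum of ~0x0001 = 0xfffe.
 */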
static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	__pragma_loop_unroll_full
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

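/* Worked example for the hdrlen arithmetic below (illustrative):
 * exthdr.len == 1 means a 16 octet extension header, of which
 * sizeof(exthdr) == 2 octets were already consumed by buf_copy, so
 * buf_skip advances by (1 + 1) * 8 - 2 == 14 octets.
 */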
static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	__pragma_loop_unroll_full
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

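/* Accept the packet for local processing: strip the encapsulation
 * headers in place and redirect the result to the ingress path of the
 * same interface.
 */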
static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Change the ethertype if the encapsulated packet is IPv6. */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

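/* Last hop handling: rewrite the GUE encapsulation into a plain
 * GRE-in-IPv4 header, decrementing the inner packet's TTL first as a
 * safeguard against forwarding loops.
 */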
static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. Since the only interesting field is the
	 * TTL (hop_limit for IPv6), it is easier to use bpf_skb_load_bytes/
	 * bpf_skb_store_bytes, as they handle split packets if needed (no need
	 * for the data to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one
		 * byte, bpf_l3_csum_replace only works with 2 and 4 byte sizes
		 * (the result is the same here).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* Swap the L2 addresses. This assumes that packets are received from a
	 * router, so just swapping the MAC addresses here will make the packet
	 * go back to the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

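/* Skip over n next hop entries in the GLB header. Only n == 0 and
 * n == 1 are supported; anything else is dropped as malformed.
 */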
static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
		/* fallthrough */
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
					   encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

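/* Classify a TCP packet via socket lookup. NB: sockets returned by
 * bpf_skc_lookup_tcp are reference counted and must be released via
 * bpf_sk_release on every path.
 */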
static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

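/* Main TC classifier: validate the GUE / unigue encapsulation, classify
 * the inner flow via socket lookup, and either accept the packet locally
 * or forward it to the next hop from the GLB header.
 */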
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	/* gue.hlen is in 32 bit words: the fixed unigue header plus one
	 * word per (IPv4) hop address.
	 */
	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}