xref: /qemu/tools/ebpf/rss.bpf.c (revision 370ed600)
1 /*
2  * eBPF RSS program
3  *
4  * Developed by Daynix Computing LTD (http://www.daynix.com)
5  *
6  * Authors:
7  *  Andrew Melnychenko <andrew@daynix.com>
8  *  Yuri Benditovich <yuri.benditovich@daynix.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2.  See
11  * the COPYING file in the top-level directory.
12  *
13  * Prepare:
14  * Requires llvm, clang, bpftool, linux kernel tree
15  *
16  * Build rss.bpf.skeleton.h:
17  * make -f Makefile.ebpf clean all
18  */
19 
20 #include <stddef.h>
21 #include <stdbool.h>
22 #include <linux/bpf.h>
23 
24 #include <linux/in.h>
25 #include <linux/if_ether.h>
26 #include <linux/ip.h>
27 #include <linux/ipv6.h>
28 
29 #include <linux/udp.h>
30 #include <linux/tcp.h>
31 
32 #include <bpf/bpf_helpers.h>
33 #include <bpf/bpf_endian.h>
34 #include <linux/virtio_net.h>
35 
/* Number of entries in the tap_rss_map_indirection_table array map. */
#define INDIRECTION_TABLE_SIZE 128
/*
 * Size in bytes of the Toeplitz hash input buffer and of the key tail
 * (toeplitz_key_data_t.next_byte).  The largest input assembled by
 * calculate_rss_hash() is two IPv6 addresses plus two 16-bit ports
 * (16 + 16 + 2 + 2 bytes).
 */
#define HASH_CALCULATION_BUFFER_SIZE 36
38 
/*
 * Value type of the single entry in tap_rss_map_configurations.
 * The structure is packed, so whoever fills the map must use the exact
 * same byte layout.
 */
struct rss_config_t {
    /* When zero, the program returns default_queue without hashing. */
    __u8 redirect;
    /* Not referenced by the program in this file. */
    __u8 populate_hash;
    /* Bit mask of VIRTIO_NET_RSS_HASH_TYPE_* values selecting hash inputs. */
    __u32 hash_types;
    /* Modulus applied to the hash to obtain the indirection table index. */
    __u16 indirections_len;
    /* Queue returned when no hash-based queue could be selected. */
    __u16 default_queue;
} __attribute__((packed));
46 
/*
 * Value type of the single entry in tap_rss_map_toeplitz_key.
 * net_toeplitz_add() starts from leftmost_32_bits as its 32-bit key window
 * and shifts in the bits of next_byte[], most significant bit first.
 */
struct toeplitz_key_data_t {
    /* Initial 32-bit key window, used as a plain __u32 value. */
    __u32 leftmost_32_bits;
    /* Key bytes that follow the initial window, one per input byte. */
    __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
};
51 
/*
 * Result of parse_packet(): which protocols were recognised and the
 * header fields that may be fed into the RSS hash.
 */
struct packet_hash_info_t {
    /* Protocol flags; each is set to a non-zero value when detected. */
    __u8 is_ipv4;
    __u8 is_ipv6;
    __u8 is_udp;
    __u8 is_tcp;
    /* Set when in6_ext_src was filled from a destination-options HAO. */
    __u8 is_ipv6_ext_src;
    /* Set when in6_ext_dst was filled from a type 2 routing header. */
    __u8 is_ipv6_ext_dst;
    /* Set for fragmented packets; L4 ports are not parsed in that case. */
    __u8 is_fragmented;

    /* L4 ports copied verbatim from the TCP/UDP header. */
    __u16 src_port;
    __u16 dst_port;

    /* Addresses copied verbatim from the IP header; IPv4 and IPv6 overlap. */
    union {
        struct {
            __be32 in_src;
            __be32 in_dst;
        };

        struct {
            struct in6_addr in6_src;
            struct in6_addr in6_dst;
            /* Addresses taken from IPv6 extension headers, when present. */
            struct in6_addr in6_ext_src;
            struct in6_addr in6_ext_dst;
        };
    };
};
78 
/*
 * Single-entry array map holding the struct rss_config_t read by
 * tun_rss_steering_prog() under key 0.
 */
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(struct rss_config_t));
    __uint(max_entries, 1);
} tap_rss_map_configurations SEC(".maps");
85 
/*
 * Single-entry array map holding the struct toeplitz_key_data_t read by
 * tun_rss_steering_prog() under key 0.
 */
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(struct toeplitz_key_data_t));
    __uint(max_entries, 1);
} tap_rss_map_toeplitz_key SEC(".maps");
92 
/*
 * Indirection table: array map of INDIRECTION_TABLE_SIZE __u16 queue
 * numbers, indexed by (hash % indirections_len).
 */
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(__u16));
    __uint(max_entries, INDIRECTION_TABLE_SIZE);
} tap_rss_map_indirection_table SEC(".maps");
99 
100 static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
101                                         const void *ptr, size_t size) {
102     __builtin_memcpy(&rss_input[*bytes_written], ptr, size);
103     *bytes_written += size;
104 }
105 
106 static inline
107 void net_toeplitz_add(__u32 *result,
108                       __u8 *input,
109                       __u32 len
110         , struct toeplitz_key_data_t *key) {
111 
112     __u32 accumulator = *result;
113     __u32 leftmost_32_bits = key->leftmost_32_bits;
114     __u32 byte;
115 
116     for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) {
117         __u8 input_byte = input[byte];
118         __u8 key_byte = key->next_byte[byte];
119         __u8 bit;
120 
121         for (bit = 0; bit < 8; bit++) {
122             if (input_byte & (1 << 7)) {
123                 accumulator ^= leftmost_32_bits;
124             }
125 
126             leftmost_32_bits =
127                     (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7);
128 
129             input_byte <<= 1;
130             key_byte <<= 1;
131         }
132     }
133 
134     *result = accumulator;
135 }
136 
137 
138 static inline int ip6_extension_header_type(__u8 hdr_type)
139 {
140     switch (hdr_type) {
141     case IPPROTO_HOPOPTS:
142     case IPPROTO_ROUTING:
143     case IPPROTO_FRAGMENT:
144     case IPPROTO_ICMPV6:
145     case IPPROTO_NONE:
146     case IPPROTO_DSTOPTS:
147     case IPPROTO_MH:
148         return 1;
149     default:
150         return 0;
151     }
152 }
/*
 * According to
 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
 * we expect that there will be no more than 11 extension headers in an IPv6
 * packet; there are also 27 TLV options for the Destination and Hop-by-hop
 * extensions.
 * We need to choose a reasonable maximum number of extensions/options to
 * check when looking for the ext src/dst addresses.
 */
161 #define IP6_EXTENSIONS_COUNT 11
162 #define IP6_OPTIONS_COUNT 30
163 
/*
 * Walk the chain of IPv6 extension headers.
 *
 * @skb:         packet being inspected
 * @info:        receives in6_ext_dst / in6_ext_src and the matching flags,
 *               and is_fragmented when a fragment header is seen
 * @l4_protocol: in: next-header value from the fixed IPv6 header;
 *               out: next-header value following the last header walked
 * @l4_offset:   in: offset of the first extension header relative to the
 *               network header; out: offset just past the last header walked
 *
 * At most IP6_EXTENSIONS_COUNT headers are walked; if the limit is reached
 * the function still returns 0 and the outputs describe the header reached
 * so far.
 *
 * Returns 0 on success, or the non-zero value returned by a failed
 * bpf_skb_load_bytes_relative() call.
 */
static inline int parse_ipv6_ext(struct __sk_buff *skb,
        struct packet_hash_info_t *info,
        __u8 *l4_protocol, size_t *l4_offset)
{
    int err = 0;

    /* Nothing to walk if the fixed header is followed by a payload type. */
    if (!ip6_extension_header_type(*l4_protocol)) {
        return 0;
    }

    struct ipv6_opt_hdr ext_hdr = {};

    for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {

        /* Generic view of the current header: next header and length. */
        err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
                                    sizeof(ext_hdr), BPF_HDR_START_NET);
        if (err) {
            goto error;
        }

        if (*l4_protocol == IPPROTO_ROUTING) {
            struct ipv6_rt_hdr ext_rt = {};

            err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
                                        sizeof(ext_rt), BPF_HDR_START_NET);
            if (err) {
                goto error;
            }

            /*
             * Only a type 2 routing header carrying exactly one address
             * (hdrlen of 2 eight-byte units) with one segment left is used
             * as the extension destination address.
             */
            if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
                    (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
                    (ext_rt.segments_left == 1)) {

                err = bpf_skb_load_bytes_relative(skb,
                    *l4_offset + offsetof(struct rt2_hdr, addr),
                    &info->in6_ext_dst, sizeof(info->in6_ext_dst),
                    BPF_HDR_START_NET);
                if (err) {
                    goto error;
                }

                info->is_ipv6_ext_dst = 1;
            }

        } else if (*l4_protocol == IPPROTO_DSTOPTS) {
            /* Type/length prefix of a TLV option. */
            struct ipv6_opt_t {
                __u8 type;
                __u8 length;
            } __attribute__((packed)) opt = {};

            /* Options start right after the two-byte extension header. */
            size_t opt_offset = sizeof(ext_hdr);

            for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
                err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
                                        &opt, sizeof(opt), BPF_HDR_START_NET);
                if (err) {
                    goto error;
                }

                /* Home Address option supplies the extension source. */
                if (opt.type == IPV6_TLV_HAO) {
                    err = bpf_skb_load_bytes_relative(skb,
                        *l4_offset + opt_offset
                        + offsetof(struct ipv6_destopt_hao, addr),
                        &info->in6_ext_src, sizeof(info->in6_ext_src),
                        BPF_HDR_START_NET);
                    if (err) {
                        goto error;
                    }

                    info->is_ipv6_ext_src = 1;
                    break;
                }

                /* Pad1 is a single byte; other options are type+len+data. */
                opt_offset += (opt.type == IPV6_TLV_PAD1) ?
                              1 : opt.length + sizeof(opt);

                /* Stop scanning once the offset passes this bound. */
                if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
                    break;
                }
            }
        } else if (*l4_protocol == IPPROTO_FRAGMENT) {
            info->is_fragmented = true;
        }

        /* Advance: hdrlen counts 8-byte units beyond the first 8 bytes. */
        *l4_protocol = ext_hdr.nexthdr;
        *l4_offset += (ext_hdr.hdrlen + 1) * 8;

        if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
            return 0;
        }
    }

    return 0;
error:
    return err;
}
260 
261 static __be16 parse_eth_type(struct __sk_buff *skb)
262 {
263     unsigned int offset = 12;
264     __be16 ret = 0;
265     int err = 0;
266 
267     err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
268                                 BPF_HDR_START_MAC);
269     if (err) {
270         return 0;
271     }
272 
273     switch (bpf_ntohs(ret)) {
274     case ETH_P_8021AD:
275         offset += 4;
276     case ETH_P_8021Q:
277         offset += 4;
278         err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
279                                     BPF_HDR_START_MAC);
280     default:
281         break;
282     }
283 
284     if (err) {
285         return 0;
286     }
287 
288     return ret;
289 }
290 
291 static inline int parse_packet(struct __sk_buff *skb,
292         struct packet_hash_info_t *info)
293 {
294     int err = 0;
295 
296     if (!info || !skb) {
297         return -1;
298     }
299 
300     size_t l4_offset = 0;
301     __u8 l4_protocol = 0;
302     __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
303     if (l3_protocol == 0) {
304         err = -1;
305         goto error;
306     }
307 
308     if (l3_protocol == ETH_P_IP) {
309         info->is_ipv4 = 1;
310 
311         struct iphdr ip = {};
312         err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
313                                     BPF_HDR_START_NET);
314         if (err) {
315             goto error;
316         }
317 
318         info->in_src = ip.saddr;
319         info->in_dst = ip.daddr;
320         info->is_fragmented = !!ip.frag_off;
321 
322         l4_protocol = ip.protocol;
323         l4_offset = ip.ihl * 4;
324     } else if (l3_protocol == ETH_P_IPV6) {
325         info->is_ipv6 = 1;
326 
327         struct ipv6hdr ip6 = {};
328         err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
329                                     BPF_HDR_START_NET);
330         if (err) {
331             goto error;
332         }
333 
334         info->in6_src = ip6.saddr;
335         info->in6_dst = ip6.daddr;
336 
337         l4_protocol = ip6.nexthdr;
338         l4_offset = sizeof(ip6);
339 
340         err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
341         if (err) {
342             goto error;
343         }
344     }
345 
346     if (l4_protocol != 0 && !info->is_fragmented) {
347         if (l4_protocol == IPPROTO_TCP) {
348             info->is_tcp = 1;
349 
350             struct tcphdr tcp = {};
351             err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
352                                         BPF_HDR_START_NET);
353             if (err) {
354                 goto error;
355             }
356 
357             info->src_port = tcp.source;
358             info->dst_port = tcp.dest;
359         } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
360             info->is_udp = 1;
361 
362             struct udphdr udp = {};
363             err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
364                                         BPF_HDR_START_NET);
365             if (err) {
366                 goto error;
367             }
368 
369             info->src_port = udp.source;
370             info->dst_port = udp.dest;
371         }
372     }
373 
374     return 0;
375 
376 error:
377     return err;
378 }
379 
380 static inline __u32 calculate_rss_hash(struct __sk_buff *skb,
381         struct rss_config_t *config, struct toeplitz_key_data_t *toe)
382 {
383     __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
384     size_t bytes_written = 0;
385     __u32 result = 0;
386     int err = 0;
387     struct packet_hash_info_t packet_info = {};
388 
389     err = parse_packet(skb, &packet_info);
390     if (err) {
391         return 0;
392     }
393 
394     if (packet_info.is_ipv4) {
395         if (packet_info.is_tcp &&
396             config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
397 
398             net_rx_rss_add_chunk(rss_input, &bytes_written,
399                                  &packet_info.in_src,
400                                  sizeof(packet_info.in_src));
401             net_rx_rss_add_chunk(rss_input, &bytes_written,
402                                  &packet_info.in_dst,
403                                  sizeof(packet_info.in_dst));
404             net_rx_rss_add_chunk(rss_input, &bytes_written,
405                                  &packet_info.src_port,
406                                  sizeof(packet_info.src_port));
407             net_rx_rss_add_chunk(rss_input, &bytes_written,
408                                  &packet_info.dst_port,
409                                  sizeof(packet_info.dst_port));
410         } else if (packet_info.is_udp &&
411                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
412 
413             net_rx_rss_add_chunk(rss_input, &bytes_written,
414                                  &packet_info.in_src,
415                                  sizeof(packet_info.in_src));
416             net_rx_rss_add_chunk(rss_input, &bytes_written,
417                                  &packet_info.in_dst,
418                                  sizeof(packet_info.in_dst));
419             net_rx_rss_add_chunk(rss_input, &bytes_written,
420                                  &packet_info.src_port,
421                                  sizeof(packet_info.src_port));
422             net_rx_rss_add_chunk(rss_input, &bytes_written,
423                                  &packet_info.dst_port,
424                                  sizeof(packet_info.dst_port));
425         } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
426             net_rx_rss_add_chunk(rss_input, &bytes_written,
427                                  &packet_info.in_src,
428                                  sizeof(packet_info.in_src));
429             net_rx_rss_add_chunk(rss_input, &bytes_written,
430                                  &packet_info.in_dst,
431                                  sizeof(packet_info.in_dst));
432         }
433     } else if (packet_info.is_ipv6) {
434         if (packet_info.is_tcp &&
435             config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
436 
437             if (packet_info.is_ipv6_ext_src &&
438                 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
439 
440                 net_rx_rss_add_chunk(rss_input, &bytes_written,
441                                      &packet_info.in6_ext_src,
442                                      sizeof(packet_info.in6_ext_src));
443             } else {
444                 net_rx_rss_add_chunk(rss_input, &bytes_written,
445                                      &packet_info.in6_src,
446                                      sizeof(packet_info.in6_src));
447             }
448             if (packet_info.is_ipv6_ext_dst &&
449                 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
450 
451                 net_rx_rss_add_chunk(rss_input, &bytes_written,
452                                      &packet_info.in6_ext_dst,
453                                      sizeof(packet_info.in6_ext_dst));
454             } else {
455                 net_rx_rss_add_chunk(rss_input, &bytes_written,
456                                      &packet_info.in6_dst,
457                                      sizeof(packet_info.in6_dst));
458             }
459             net_rx_rss_add_chunk(rss_input, &bytes_written,
460                                  &packet_info.src_port,
461                                  sizeof(packet_info.src_port));
462             net_rx_rss_add_chunk(rss_input, &bytes_written,
463                                  &packet_info.dst_port,
464                                  sizeof(packet_info.dst_port));
465         } else if (packet_info.is_udp &&
466                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
467 
468             if (packet_info.is_ipv6_ext_src &&
469                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
470 
471                 net_rx_rss_add_chunk(rss_input, &bytes_written,
472                                      &packet_info.in6_ext_src,
473                                      sizeof(packet_info.in6_ext_src));
474             } else {
475                 net_rx_rss_add_chunk(rss_input, &bytes_written,
476                                      &packet_info.in6_src,
477                                      sizeof(packet_info.in6_src));
478             }
479             if (packet_info.is_ipv6_ext_dst &&
480                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
481 
482                 net_rx_rss_add_chunk(rss_input, &bytes_written,
483                                      &packet_info.in6_ext_dst,
484                                      sizeof(packet_info.in6_ext_dst));
485             } else {
486                 net_rx_rss_add_chunk(rss_input, &bytes_written,
487                                      &packet_info.in6_dst,
488                                      sizeof(packet_info.in6_dst));
489             }
490 
491             net_rx_rss_add_chunk(rss_input, &bytes_written,
492                                  &packet_info.src_port,
493                                  sizeof(packet_info.src_port));
494             net_rx_rss_add_chunk(rss_input, &bytes_written,
495                                  &packet_info.dst_port,
496                                  sizeof(packet_info.dst_port));
497 
498         } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
499             if (packet_info.is_ipv6_ext_src &&
500                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
501 
502                 net_rx_rss_add_chunk(rss_input, &bytes_written,
503                                      &packet_info.in6_ext_src,
504                                      sizeof(packet_info.in6_ext_src));
505             } else {
506                 net_rx_rss_add_chunk(rss_input, &bytes_written,
507                                      &packet_info.in6_src,
508                                      sizeof(packet_info.in6_src));
509             }
510             if (packet_info.is_ipv6_ext_dst &&
511                 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
512 
513                 net_rx_rss_add_chunk(rss_input, &bytes_written,
514                                      &packet_info.in6_ext_dst,
515                                      sizeof(packet_info.in6_ext_dst));
516             } else {
517                 net_rx_rss_add_chunk(rss_input, &bytes_written,
518                                      &packet_info.in6_dst,
519                                      sizeof(packet_info.in6_dst));
520             }
521         }
522     }
523 
524     if (bytes_written) {
525         net_toeplitz_add(&result, rss_input, bytes_written, toe);
526     }
527 
528     return result;
529 }
530 
531 SEC("tun_rss_steering")
532 int tun_rss_steering_prog(struct __sk_buff *skb)
533 {
534 
535     struct rss_config_t *config;
536     struct toeplitz_key_data_t *toe;
537 
538     __u32 key = 0;
539     __u32 hash = 0;
540 
541     config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
542     toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
543 
544     if (config && toe) {
545         if (!config->redirect) {
546             return config->default_queue;
547         }
548 
549         hash = calculate_rss_hash(skb, config, toe);
550         if (hash) {
551             __u32 table_idx = hash % config->indirections_len;
552             __u16 *queue = 0;
553 
554             queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
555                                         &table_idx);
556 
557             if (queue) {
558                 return *queue;
559             }
560         }
561 
562         return config->default_queue;
563     }
564 
565     return -1;
566 }
567 
/* License string emitted into the "license" section of the object file. */
char _license[] SEC("license") = "GPL v2";
569