xref: /linux/net/netfilter/nf_flow_table_core.c (revision 9a6b55ac)
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>

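/* Global registry of flowtables, protected by flowtable_lock; it is walked by
 * nf_flow_table_cleanup() when a network device goes away.
 */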
static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);

static void
flow_offload_fill_dir(struct flow_offload *flow,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;
	ft->src_port = ctt->src.u.tcp.port;
	ft->dst_port = ctt->dst.u.tcp.port;
}

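/* Allocate a flow entry for @ct and fill both tuple directions from the
 * conntrack tuples. A reference on the conntrack entry is taken and kept for
 * the lifetime of the flow; NULL is returned if the conntrack entry is dying
 * or its refcount has already dropped to zero.
 */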
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct) ||
	    !atomic_inc_not_zero(&ct->ct_general.use)))
		return NULL;

	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
	if (!flow)
		goto err_ct_refcnt;

	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		flow->flags |= FLOW_OFFLOAD_SNAT;
	if (ct->status & IPS_DST_NAT)
		flow->flags |= FLOW_OFFLOAD_DNAT;

	return flow;

err_ct_refcnt:
	nf_ct_put(ct);

	return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

static int flow_offload_fill_route(struct flow_offload *flow,
				   const struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *other_dst = route->tuple[!dir].dst;
	struct dst_entry *dst = route->tuple[dir].dst;

	if (!dst_hold_safe(route->tuple[dir].dst))
		return -1;

	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
		break;
	}

	flow_tuple->iifidx = other_dst->dev->ifindex;
	flow_tuple->dst_cache = dst;

	return 0;
}

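/* Attach the cached routes to both directions of the flow and record the MTU
 * and ingress interface for each direction. On failure, the dst reference
 * taken for the original direction is released again.
 */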
int flow_offload_route_init(struct flow_offload *flow,
			    const struct nf_flow_route *route)
{
	int err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	if (err < 0)
		return err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
	if (err < 0)
		goto err_route_reply;

	flow->type = NF_FLOW_OFFLOAD_ROUTE;

	return 0;

err_route_reply:
	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);

	return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);

static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
	tcp->state = TCP_CONNTRACK_ESTABLISHED;
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[1].td_maxwin = 0;
}

#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
{
	return (__s32)(timeout - (u32)jiffies);
}

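/* Once a flow is handed back to classic conntrack, clamp the conntrack
 * timeout down to the flowtable pickup value (120s for TCP, 30s for UDP) if
 * the remaining timeout is larger than that.
 */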
static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	int l4num = nf_ct_protonum(ct);
	unsigned int timeout;

	l4proto = nf_ct_l4proto_find(l4num);
	if (!l4proto)
		return;

	if (l4num == IPPROTO_TCP)
		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
	else if (l4num == IPPROTO_UDP)
		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
	else
		return;

	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
		ct->timeout = nfct_time_stamp + timeout;
}

static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
	if (nf_ct_protonum(ct) == IPPROTO_TCP)
		flow_offload_fixup_tcp(&ct->proto.tcp);
}

static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}

static void flow_offload_route_release(struct flow_offload *flow)
{
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
}

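/* Release a flow entry: drop the cached dsts for route flows, delete the
 * conntrack entry if the flow was marked as dying, put the conntrack
 * reference taken in flow_offload_alloc() and free the flow after an RCU
 * grace period.
 */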
void flow_offload_free(struct flow_offload *flow)
{
	switch (flow->type) {
	case NF_FLOW_OFFLOAD_ROUTE:
		flow_offload_route_release(flow);
		break;
	default:
		break;
	}
	if (flow->flags & FLOW_OFFLOAD_DYING)
		nf_ct_delete(flow->ct, 0, 0);
	nf_ct_put(flow->ct);
	kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

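/* rhashtable glue: flows are keyed on the leading part of the tuple, i.e.
 * every field that precedes the 'dir' member, so each direction of a flow can
 * be looked up independently by its own tuple.
 */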
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
					const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};

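/* Insert both directions of a flow into the table and, if the flowtable has
 * hardware offload enabled, push the rule down to the driver as well.
 *
 * A typical caller (for instance the nft flow_offload expression) is expected
 * to do roughly the following, using flow_offload_free() on the error path
 * once allocation has succeeded:
 *
 *	flow = flow_offload_alloc(ct);
 *	if (!flow)
 *		goto err;
 *	if (flow_offload_route_init(flow, &route) < 0)
 *		goto err_flow;
 *	if (flow_offload_add(flowtable, flow) < 0)
 *		goto err_flow;
 */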
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD)
		nf_flow_offload_add(flow_table, flow);

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

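/* Unlink both directions from the table and clear IPS_OFFLOAD so conntrack
 * takes over the entry again. Expired flows get both their TCP state and
 * timeout fixed up, torn-down flows only the timeout, before the flow itself
 * is freed.
 */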
static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);

	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);

	if (nf_flow_has_expired(flow))
		flow_offload_fixup_ct(flow->ct);
	else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
		flow_offload_fixup_ct_timeout(flow->ct);

	flow_offload_free(flow);
}

void flow_offload_teardown(struct flow_offload *flow)
{
	flow->flags |= FLOW_OFFLOAD_TEARDOWN;

	flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

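/* Look up a flow by tuple. NULL is returned if there is no entry or if the
 * flow (or its conntrack entry) is dying or being torn down, so the caller
 * can fall back to the regular forwarding path.
 */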
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
		return NULL;

	if (unlikely(nf_ct_is_dying(flow->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);

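/* Walk the flowtable and call @iter once per flow. Only the original
 * direction entry is visited, so every flow is seen exactly once; -EAGAIN
 * from the rhashtable walker (a resize in progress) is simply retried.
 */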
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

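/* One garbage collection pass over a flow: refresh hardware counters for
 * offloaded flows, and remove flows that have expired, whose conntrack entry
 * is dying, or that have been marked dying or torn down. Hardware-offloaded
 * flows are first asked to be removed from the driver and are only freed once
 * the driver reports them dead.
 */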
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
	struct nf_flowtable *flow_table = data;

	if (flow->flags & FLOW_OFFLOAD_HW)
		nf_flow_offload_stats(flow_table, flow);

	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) {
		if (flow->flags & FLOW_OFFLOAD_HW) {
			if (!(flow->flags & FLOW_OFFLOAD_HW_DYING))
				nf_flow_offload_del(flow_table, flow);
			else if (flow->flags & FLOW_OFFLOAD_HW_DEAD)
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	}
}

static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

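/* Layer 4 helpers for port NAT: they only adjust the TCP/UDP checksum for the
 * old/new port pair; the port itself is rewritten by nf_flow_snat_port() and
 * nf_flow_dnat_port() below.
 */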
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
		return -1;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);

	return 0;
}

static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct udphdr *udph;

	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
		return -1;

	udph = (void *)(skb_network_header(skb) + thoff);
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, true);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}

	return 0;
}

static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			    u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	case IPPROTO_UDP:
		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	}

	return 0;
}

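/* Source and destination port NAT: the replacement port is taken from the
 * opposite direction's tuple, written into the transport header, and the
 * checksum is fixed up via nf_flow_nat_port().
 */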
int nf_flow_snat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

int nf_flow_dnat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

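/* Initialise a flowtable: set up the rhashtable and flow block, start the
 * periodic garbage collector (requeued every second on the power-efficient
 * workqueue) and register the table on the global flowtable list.
 */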
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

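/* Cleanup callback: with no device given, every flow is torn down; with a
 * device, only flows in the same netns that use it as their ingress interface
 * are marked as dead.
 */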
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}

	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_dead(flow);
}

static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
					  struct net_device *dev)
{
	nf_flow_table_offload_flush(flowtable);
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
}

void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_iterate_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);

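/* Destroy a flowtable: unregister it from the global list, stop the garbage
 * collector, tear down and release all remaining flows, then destroy the
 * rhashtable itself.
 */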
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);
	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

static int __init nf_flow_table_module_init(void)
{
	return nf_flow_table_offload_init();
}

static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
}

module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");