// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>

static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);

static void
flow_offload_fill_dir(struct flow_offload *flow,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;
	ft->src_port = ctt->src.u.tcp.port;
	ft->dst_port = ctt->dst.u.tcp.port;
}

struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct) ||
	    !atomic_inc_not_zero(&ct->ct_general.use)))
		return NULL;

	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
	if (!flow)
		goto err_ct_refcnt;

	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		flow->flags |= FLOW_OFFLOAD_SNAT;
	if (ct->status & IPS_DST_NAT)
		flow->flags |= FLOW_OFFLOAD_DNAT;

	return flow;

err_ct_refcnt:
	nf_ct_put(ct);

	return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

static int flow_offload_fill_route(struct flow_offload *flow,
				   const struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *other_dst = route->tuple[!dir].dst;
	struct dst_entry *dst = route->tuple[dir].dst;

	if (!dst_hold_safe(route->tuple[dir].dst))
		return -1;

	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
		break;
	}

	flow_tuple->iifidx = other_dst->dev->ifindex;
	flow_tuple->dst_cache = dst;

	return 0;
}

int flow_offload_route_init(struct flow_offload *flow,
			    const struct nf_flow_route *route)
{
	int err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	if (err < 0)
		return err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
	if (err < 0)
		goto err_route_reply;

	flow->type = NF_FLOW_OFFLOAD_ROUTE;

	return 0;

err_route_reply:
	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);

	return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);

static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
	tcp->state = TCP_CONNTRACK_ESTABLISHED;
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[1].td_maxwin = 0;
}

#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

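/*
 * Once a flow leaves the table, conntrack takes over again; the PICKUP
 * timeouts above give the connection a fresh, short window to be
 * re-tracked instead of inheriting a stale offload timeout. The helper
 * below returns the remaining lifetime as a signed delta, so comparisons
 * stay correct across u32 jiffies wraparound: a timeout one tick in the
 * past yields -1 (expired), not a huge positive value.
 */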
static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
{
	return (__s32)(timeout - (u32)jiffies);
}

static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	int l4num = nf_ct_protonum(ct);
	unsigned int timeout;

	l4proto = nf_ct_l4proto_find(l4num);
	if (!l4proto)
		return;

	if (l4num == IPPROTO_TCP)
		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
	else if (l4num == IPPROTO_UDP)
		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
	else
		return;

	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
		ct->timeout = nfct_time_stamp + timeout;
}

static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
	if (nf_ct_protonum(ct) == IPPROTO_TCP)
		flow_offload_fixup_tcp(&ct->proto.tcp);
}

static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}

static void flow_offload_route_release(struct flow_offload *flow)
{
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
}

void flow_offload_free(struct flow_offload *flow)
{
	switch (flow->type) {
	case NF_FLOW_OFFLOAD_ROUTE:
		flow_offload_route_release(flow);
		break;
	default:
		break;
	}
	if (flow->flags & FLOW_OFFLOAD_DYING)
		nf_ct_delete(flow->ct, 0, 0);
	nf_ct_put(flow->ct);
	kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
				 const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};

int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD)
		nf_flow_offload_add(flow_table, flow);

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

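/*
 * Typical use of the add path above, following the pattern in the
 * nft_flow_offload expression (a sketch; the error labels are
 * illustrative, not part of this file):
 *
 *	flow = flow_offload_alloc(ct);
 *	if (!flow)
 *		goto err_flow_alloc;
 *
 *	if (flow_offload_route_init(flow, &route) < 0)
 *		goto err_flow_add;
 *
 *	if (flow_offload_add(flowtable, flow) < 0)
 *		goto err_flow_add;
 *
 * Both tuple directions are inserted, so packets in either direction
 * reach the same flow entry via flow_offload_lookup().
 */
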
static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);

	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);

	if (nf_flow_has_expired(flow))
		flow_offload_fixup_ct(flow->ct);
	else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
		flow_offload_fixup_ct_timeout(flow->ct);

	flow_offload_free(flow);
}

void flow_offload_teardown(struct flow_offload *flow)
{
	flow->flags |= FLOW_OFFLOAD_TEARDOWN;

	flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
		return NULL;

	if (unlikely(nf_ct_is_dying(flow->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);

static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

/*
 * GC visits each flow once, via its original-direction tuple. A flow that
 * has expired, been torn down, or whose conntrack entry is dying gets
 * removed. Hardware-offloaded flows take two passes: first queue the
 * hardware removal (which sets FLOW_OFFLOAD_HW_DYING), then free the
 * software state once the driver confirms it (FLOW_OFFLOAD_HW_DEAD).
 */
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
	struct nf_flowtable *flow_table = data;

	if (flow->flags & FLOW_OFFLOAD_HW)
		nf_flow_offload_stats(flow_table, flow);

	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) {
		if (flow->flags & FLOW_OFFLOAD_HW) {
			if (!(flow->flags & FLOW_OFFLOAD_HW_DYING))
				nf_flow_offload_del(flow_table, flow);
			else if (flow->flags & FLOW_OFFLOAD_HW_DEAD)
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	}
}

static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

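/*
 * Layer 4 port rewriting for the fast path below. TCP always folds the
 * old/new port pair into tcph->check; UDP may legitimately carry a zero
 * (disabled) checksum over IPv4, in which case it is left untouched, and
 * a recomputed checksum of zero is stored as CSUM_MANGLED_0, since on
 * the wire zero means "no checksum".
 */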
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
		return -1;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);

	return 0;
}

static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct udphdr *udph;

	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
		return -1;

	udph = (void *)(skb_network_header(skb) + thoff);
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, true);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}

	return 0;
}

static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			    u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	case IPPROTO_UDP:
		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	}

	return 0;
}

int nf_flow_snat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

int nf_flow_dnat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

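/*
 * Flowtable lifecycle: nf_flow_table_init() below registers the table and
 * arms the periodic GC; nf_flow_table_free() undoes both. Device removal
 * is handled through nf_flow_table_cleanup(), which users typically drive
 * from a netdevice notifier, e.g. (a sketch modelled on nft_flow_offload;
 * the notifier function name is illustrative):
 *
 *	static int flow_offload_netdev_event(struct notifier_block *this,
 *					     unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event != NETDEV_DOWN)
 *			return NOTIFY_DONE;
 *
 *		nf_flow_table_cleanup(dev);
 *
 *		return NOTIFY_DONE;
 *	}
 */
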
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}

	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_dead(flow);
}

static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
					  struct net_device *dev)
{
	nf_flow_table_offload_flush(flowtable);
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
}

void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_iterate_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);

void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);
	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

static int __init nf_flow_table_module_init(void)
{
	return nf_flow_table_offload_init();
}

static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
}

module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");