1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/kernel.h> 3 #include <linux/init.h> 4 #include <linux/module.h> 5 #include <linux/netfilter.h> 6 #include <linux/rhashtable.h> 7 #include <linux/netdevice.h> 8 #include <net/ip.h> 9 #include <net/ip6_route.h> 10 #include <net/netfilter/nf_tables.h> 11 #include <net/netfilter/nf_flow_table.h> 12 #include <net/netfilter/nf_conntrack.h> 13 #include <net/netfilter/nf_conntrack_core.h> 14 #include <net/netfilter/nf_conntrack_tuple.h> 15 16 struct flow_offload_entry { 17 struct flow_offload flow; 18 struct nf_conn *ct; 19 struct rcu_head rcu_head; 20 }; 21 22 static DEFINE_MUTEX(flowtable_lock); 23 static LIST_HEAD(flowtables); 24 25 static void 26 flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, 27 struct nf_flow_route *route, 28 enum flow_offload_tuple_dir dir) 29 { 30 struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; 31 struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; 32 struct dst_entry *other_dst = route->tuple[!dir].dst; 33 struct dst_entry *dst = route->tuple[dir].dst; 34 35 ft->dir = dir; 36 37 switch (ctt->src.l3num) { 38 case NFPROTO_IPV4: 39 ft->src_v4 = ctt->src.u3.in; 40 ft->dst_v4 = ctt->dst.u3.in; 41 ft->mtu = ip_dst_mtu_maybe_forward(dst, true); 42 break; 43 case NFPROTO_IPV6: 44 ft->src_v6 = ctt->src.u3.in6; 45 ft->dst_v6 = ctt->dst.u3.in6; 46 ft->mtu = ip6_dst_mtu_forward(dst); 47 break; 48 } 49 50 ft->l3proto = ctt->src.l3num; 51 ft->l4proto = ctt->dst.protonum; 52 ft->src_port = ctt->src.u.tcp.port; 53 ft->dst_port = ctt->dst.u.tcp.port; 54 55 ft->iifidx = other_dst->dev->ifindex; 56 ft->oifidx = dst->dev->ifindex; 57 ft->dst_cache = dst; 58 } 59 60 struct flow_offload * 61 flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) 62 { 63 struct flow_offload_entry *entry; 64 struct flow_offload *flow; 65 66 if (unlikely(nf_ct_is_dying(ct) || 67 !atomic_inc_not_zero(&ct->ct_general.use))) 68 return NULL; 69 70 entry = kzalloc(sizeof(*entry), GFP_ATOMIC); 71 if (!entry) 72 goto err_ct_refcnt; 73 74 flow = &entry->flow; 75 76 if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) 77 goto err_dst_cache_original; 78 79 if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) 80 goto err_dst_cache_reply; 81 82 entry->ct = ct; 83 84 flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); 85 flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); 86 87 if (ct->status & IPS_SRC_NAT) 88 flow->flags |= FLOW_OFFLOAD_SNAT; 89 if (ct->status & IPS_DST_NAT) 90 flow->flags |= FLOW_OFFLOAD_DNAT; 91 92 return flow; 93 94 err_dst_cache_reply: 95 dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); 96 err_dst_cache_original: 97 kfree(entry); 98 err_ct_refcnt: 99 nf_ct_put(ct); 100 101 return NULL; 102 } 103 EXPORT_SYMBOL_GPL(flow_offload_alloc); 104 105 static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) 106 { 107 tcp->state = TCP_CONNTRACK_ESTABLISHED; 108 tcp->seen[0].td_maxwin = 0; 109 tcp->seen[1].td_maxwin = 0; 110 } 111 112 #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) 113 #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) 114 115 static void flow_offload_fixup_ct_state(struct nf_conn *ct) 116 { 117 const struct nf_conntrack_l4proto *l4proto; 118 unsigned int timeout; 119 int l4num; 120 121 l4num = nf_ct_protonum(ct); 122 if (l4num == IPPROTO_TCP) 123 flow_offload_fixup_tcp(&ct->proto.tcp); 124 125 l4proto = nf_ct_l4proto_find(l4num); 126 if (!l4proto) 127 return; 128 129 if (l4num == IPPROTO_TCP) 130 timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; 131 else if (l4num == IPPROTO_UDP) 132 timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; 133 else 134 return; 135 136 ct->timeout = nfct_time_stamp + timeout; 137 } 138 139 void flow_offload_free(struct flow_offload *flow) 140 { 141 struct flow_offload_entry *e; 142 143 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); 144 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); 145 e = container_of(flow, struct flow_offload_entry, flow); 146 if (flow->flags & FLOW_OFFLOAD_DYING) 147 nf_ct_delete(e->ct, 0, 0); 148 nf_ct_put(e->ct); 149 kfree_rcu(e, rcu_head); 150 } 151 EXPORT_SYMBOL_GPL(flow_offload_free); 152 153 static u32 flow_offload_hash(const void *data, u32 len, u32 seed) 154 { 155 const struct flow_offload_tuple *tuple = data; 156 157 return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); 158 } 159 160 static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) 161 { 162 const struct flow_offload_tuple_rhash *tuplehash = data; 163 164 return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); 165 } 166 167 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, 168 const void *ptr) 169 { 170 const struct flow_offload_tuple *tuple = arg->key; 171 const struct flow_offload_tuple_rhash *x = ptr; 172 173 if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) 174 return 1; 175 176 return 0; 177 } 178 179 static const struct rhashtable_params nf_flow_offload_rhash_params = { 180 .head_offset = offsetof(struct flow_offload_tuple_rhash, node), 181 .hashfn = flow_offload_hash, 182 .obj_hashfn = flow_offload_hash_obj, 183 .obj_cmpfn = flow_offload_hash_cmp, 184 .automatic_shrinking = true, 185 }; 186 187 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) 188 { 189 int err; 190 191 err = rhashtable_insert_fast(&flow_table->rhashtable, 192 &flow->tuplehash[0].node, 193 nf_flow_offload_rhash_params); 194 if (err < 0) 195 return err; 196 197 err = rhashtable_insert_fast(&flow_table->rhashtable, 198 &flow->tuplehash[1].node, 199 nf_flow_offload_rhash_params); 200 if (err < 0) { 201 rhashtable_remove_fast(&flow_table->rhashtable, 202 &flow->tuplehash[0].node, 203 nf_flow_offload_rhash_params); 204 return err; 205 } 206 207 flow->timeout = (u32)jiffies; 208 return 0; 209 } 210 EXPORT_SYMBOL_GPL(flow_offload_add); 211 212 static void flow_offload_del(struct nf_flowtable *flow_table, 213 struct flow_offload *flow) 214 { 215 struct flow_offload_entry *e; 216 217 rhashtable_remove_fast(&flow_table->rhashtable, 218 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, 219 nf_flow_offload_rhash_params); 220 rhashtable_remove_fast(&flow_table->rhashtable, 221 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, 222 nf_flow_offload_rhash_params); 223 224 e = container_of(flow, struct flow_offload_entry, flow); 225 clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); 226 227 flow_offload_free(flow); 228 } 229 230 void flow_offload_teardown(struct flow_offload *flow) 231 { 232 struct flow_offload_entry *e; 233 234 flow->flags |= FLOW_OFFLOAD_TEARDOWN; 235 236 e = container_of(flow, struct flow_offload_entry, flow); 237 flow_offload_fixup_ct_state(e->ct); 238 } 239 EXPORT_SYMBOL_GPL(flow_offload_teardown); 240 241 struct flow_offload_tuple_rhash * 242 flow_offload_lookup(struct nf_flowtable *flow_table, 243 struct flow_offload_tuple *tuple) 244 { 245 struct flow_offload_tuple_rhash *tuplehash; 246 struct flow_offload *flow; 247 struct flow_offload_entry *e; 248 int dir; 249 250 tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, 251 nf_flow_offload_rhash_params); 252 if (!tuplehash) 253 return NULL; 254 255 dir = tuplehash->tuple.dir; 256 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 257 if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) 258 return NULL; 259 260 e = container_of(flow, struct flow_offload_entry, flow); 261 if (unlikely(nf_ct_is_dying(e->ct))) 262 return NULL; 263 264 return tuplehash; 265 } 266 EXPORT_SYMBOL_GPL(flow_offload_lookup); 267 268 static int 269 nf_flow_table_iterate(struct nf_flowtable *flow_table, 270 void (*iter)(struct flow_offload *flow, void *data), 271 void *data) 272 { 273 struct flow_offload_tuple_rhash *tuplehash; 274 struct rhashtable_iter hti; 275 struct flow_offload *flow; 276 int err = 0; 277 278 rhashtable_walk_enter(&flow_table->rhashtable, &hti); 279 rhashtable_walk_start(&hti); 280 281 while ((tuplehash = rhashtable_walk_next(&hti))) { 282 if (IS_ERR(tuplehash)) { 283 if (PTR_ERR(tuplehash) != -EAGAIN) { 284 err = PTR_ERR(tuplehash); 285 break; 286 } 287 continue; 288 } 289 if (tuplehash->tuple.dir) 290 continue; 291 292 flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); 293 294 iter(flow, data); 295 } 296 rhashtable_walk_stop(&hti); 297 rhashtable_walk_exit(&hti); 298 299 return err; 300 } 301 302 static inline bool nf_flow_has_expired(const struct flow_offload *flow) 303 { 304 return (__s32)(flow->timeout - (u32)jiffies) <= 0; 305 } 306 307 static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) 308 { 309 struct nf_flowtable *flow_table = data; 310 struct flow_offload_entry *e; 311 312 e = container_of(flow, struct flow_offload_entry, flow); 313 if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) || 314 (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) 315 flow_offload_del(flow_table, flow); 316 } 317 318 static void nf_flow_offload_work_gc(struct work_struct *work) 319 { 320 struct nf_flowtable *flow_table; 321 322 flow_table = container_of(work, struct nf_flowtable, gc_work.work); 323 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); 324 queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); 325 } 326 327 static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, 328 __be16 port, __be16 new_port) 329 { 330 struct tcphdr *tcph; 331 332 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || 333 skb_try_make_writable(skb, thoff + sizeof(*tcph))) 334 return -1; 335 336 tcph = (void *)(skb_network_header(skb) + thoff); 337 inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); 338 339 return 0; 340 } 341 342 static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, 343 __be16 port, __be16 new_port) 344 { 345 struct udphdr *udph; 346 347 if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || 348 skb_try_make_writable(skb, thoff + sizeof(*udph))) 349 return -1; 350 351 udph = (void *)(skb_network_header(skb) + thoff); 352 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { 353 inet_proto_csum_replace2(&udph->check, skb, port, 354 new_port, true); 355 if (!udph->check) 356 udph->check = CSUM_MANGLED_0; 357 } 358 359 return 0; 360 } 361 362 static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, 363 u8 protocol, __be16 port, __be16 new_port) 364 { 365 switch (protocol) { 366 case IPPROTO_TCP: 367 if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) 368 return NF_DROP; 369 break; 370 case IPPROTO_UDP: 371 if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) 372 return NF_DROP; 373 break; 374 } 375 376 return 0; 377 } 378 379 int nf_flow_snat_port(const struct flow_offload *flow, 380 struct sk_buff *skb, unsigned int thoff, 381 u8 protocol, enum flow_offload_tuple_dir dir) 382 { 383 struct flow_ports *hdr; 384 __be16 port, new_port; 385 386 if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || 387 skb_try_make_writable(skb, thoff + sizeof(*hdr))) 388 return -1; 389 390 hdr = (void *)(skb_network_header(skb) + thoff); 391 392 switch (dir) { 393 case FLOW_OFFLOAD_DIR_ORIGINAL: 394 port = hdr->source; 395 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; 396 hdr->source = new_port; 397 break; 398 case FLOW_OFFLOAD_DIR_REPLY: 399 port = hdr->dest; 400 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; 401 hdr->dest = new_port; 402 break; 403 default: 404 return -1; 405 } 406 407 return nf_flow_nat_port(skb, thoff, protocol, port, new_port); 408 } 409 EXPORT_SYMBOL_GPL(nf_flow_snat_port); 410 411 int nf_flow_dnat_port(const struct flow_offload *flow, 412 struct sk_buff *skb, unsigned int thoff, 413 u8 protocol, enum flow_offload_tuple_dir dir) 414 { 415 struct flow_ports *hdr; 416 __be16 port, new_port; 417 418 if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || 419 skb_try_make_writable(skb, thoff + sizeof(*hdr))) 420 return -1; 421 422 hdr = (void *)(skb_network_header(skb) + thoff); 423 424 switch (dir) { 425 case FLOW_OFFLOAD_DIR_ORIGINAL: 426 port = hdr->dest; 427 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; 428 hdr->dest = new_port; 429 break; 430 case FLOW_OFFLOAD_DIR_REPLY: 431 port = hdr->source; 432 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; 433 hdr->source = new_port; 434 break; 435 default: 436 return -1; 437 } 438 439 return nf_flow_nat_port(skb, thoff, protocol, port, new_port); 440 } 441 EXPORT_SYMBOL_GPL(nf_flow_dnat_port); 442 443 int nf_flow_table_init(struct nf_flowtable *flowtable) 444 { 445 int err; 446 447 INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); 448 449 err = rhashtable_init(&flowtable->rhashtable, 450 &nf_flow_offload_rhash_params); 451 if (err < 0) 452 return err; 453 454 queue_delayed_work(system_power_efficient_wq, 455 &flowtable->gc_work, HZ); 456 457 mutex_lock(&flowtable_lock); 458 list_add(&flowtable->list, &flowtables); 459 mutex_unlock(&flowtable_lock); 460 461 return 0; 462 } 463 EXPORT_SYMBOL_GPL(nf_flow_table_init); 464 465 static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) 466 { 467 struct net_device *dev = data; 468 struct flow_offload_entry *e; 469 470 e = container_of(flow, struct flow_offload_entry, flow); 471 472 if (!dev) { 473 flow_offload_teardown(flow); 474 return; 475 } 476 if (net_eq(nf_ct_net(e->ct), dev_net(dev)) && 477 (flow->tuplehash[0].tuple.iifidx == dev->ifindex || 478 flow->tuplehash[1].tuple.iifidx == dev->ifindex)) 479 flow_offload_dead(flow); 480 } 481 482 static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, 483 struct net_device *dev) 484 { 485 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); 486 flush_delayed_work(&flowtable->gc_work); 487 } 488 489 void nf_flow_table_cleanup(struct net_device *dev) 490 { 491 struct nf_flowtable *flowtable; 492 493 mutex_lock(&flowtable_lock); 494 list_for_each_entry(flowtable, &flowtables, list) 495 nf_flow_table_iterate_cleanup(flowtable, dev); 496 mutex_unlock(&flowtable_lock); 497 } 498 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); 499 500 void nf_flow_table_free(struct nf_flowtable *flow_table) 501 { 502 mutex_lock(&flowtable_lock); 503 list_del(&flow_table->list); 504 mutex_unlock(&flowtable_lock); 505 cancel_delayed_work_sync(&flow_table->gc_work); 506 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); 507 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); 508 rhashtable_destroy(&flow_table->rhashtable); 509 } 510 EXPORT_SYMBOL_GPL(nf_flow_table_free); 511 512 MODULE_LICENSE("GPL"); 513 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 514