// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)
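/* The gc worker re-arms itself from a running average of the remaining
 * timeouts seen during the previous scan, clamped to the interval bounds
 * defined above.
 */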
/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wake up often just because we have three entries with a 1s timeout while still
 * allowing non-idle machines to wake up more often when needed.
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	u64 a, b, c, d;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, handle usable tuplehash members manually */
	a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3];
	b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3];

	c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16;
	c |= tuple->dst.protonum;

	d = (u64)zoneid << 32 | net_hash_mix(net);

	/* IPv4: u3.all[1,2,3] == 0 */
	c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2];
	d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2];

	return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because associated skb
	 * might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it.  Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);
	return true;
}

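/* Insert an unconfirmed entry into the hash table; used by paths that
 * create entries outside of packet processing, e.g. ctnetlink.  Takes the
 * per-bucket locks itself and marks the entry confirmed on success.
 */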
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	ct->status |= IPS_CONFIRMED;
	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race.  This may happen for bridge (br_flood)
	 * or broadcast/multicast packets that do skb_clone with
	 * an unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held,
	 * but will go out of scope soon, we need to remove
	 * this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
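/* Scan one hash chain and evict entries that are expired or not yet
 * ASSURED so that a new entry can be allocated when the table is full.
 */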
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
		return false;
	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				if (!nf_conntrack_max95)
					continue;
			}

			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Other CPU might have obtained a pointer to this object before it was
	 * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else other
	 * core might observe CONFIRMED bit which means the entry is valid and
	 * in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

/* Returns verdict for packet, or -1 for invalid. */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)?  Ignore. */
*/ 1977 if ((tmpl && !nf_ct_is_template(tmpl)) || 1978 ctinfo == IP_CT_UNTRACKED) 1979 return NF_ACCEPT; 1980 skb->_nfct = 0; 1981 } 1982 1983 /* rcu_read_lock()ed by nf_hook_thresh */ 1984 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1985 if (dataoff <= 0) { 1986 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1987 ret = NF_ACCEPT; 1988 goto out; 1989 } 1990 1991 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1992 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1993 protonum, state); 1994 if (ret <= 0) { 1995 ret = -ret; 1996 goto out; 1997 } 1998 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1999 if (skb->_nfct) 2000 goto out; 2001 } 2002 repeat: 2003 ret = resolve_normal_ct(tmpl, skb, dataoff, 2004 protonum, state); 2005 if (ret < 0) { 2006 /* Too stressed to deal. */ 2007 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2008 ret = NF_DROP; 2009 goto out; 2010 } 2011 2012 ct = nf_ct_get(skb, &ctinfo); 2013 if (!ct) { 2014 /* Not valid part of a connection */ 2015 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2016 ret = NF_ACCEPT; 2017 goto out; 2018 } 2019 2020 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2021 if (ret <= 0) { 2022 /* Invalid: inverse of the return code tells 2023 * the netfilter core what to do */ 2024 nf_ct_put(ct); 2025 skb->_nfct = 0; 2026 /* Special case: TCP tracker reports an attempt to reopen a 2027 * closed/aborted connection. We have to go back and create a 2028 * fresh conntrack. 2029 */ 2030 if (ret == -NF_REPEAT) 2031 goto repeat; 2032 2033 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2034 if (ret == -NF_DROP) 2035 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2036 2037 ret = -ret; 2038 goto out; 2039 } 2040 2041 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2042 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2043 nf_conntrack_event_cache(IPCT_REPLY, ct); 2044 out: 2045 if (tmpl) 2046 nf_ct_put(tmpl); 2047 2048 return ret; 2049 } 2050 EXPORT_SYMBOL_GPL(nf_conntrack_in); 2051 2052 /* Alter reply tuple (maybe alter helper). 
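 * A minimal sketch of the intended caller, modelled on (but not copied from)
 * the NAT setup path; "new_orig" is a placeholder for the NAT-mangled
 * original-direction tuple:
 *
 *	struct nf_conntrack_tuple reply;
 *
 *	nf_ct_invert_tuple(&reply, &new_orig);
 *	nf_conntrack_alter_reply(ct, &reply);
 *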
This is for NAT, and is 2053 implicitly racy: see __nf_conntrack_confirm */ 2054 void nf_conntrack_alter_reply(struct nf_conn *ct, 2055 const struct nf_conntrack_tuple *newreply) 2056 { 2057 struct nf_conn_help *help = nfct_help(ct); 2058 2059 /* Should be unconfirmed, so not in hash table yet */ 2060 WARN_ON(nf_ct_is_confirmed(ct)); 2061 2062 nf_ct_dump_tuple(newreply); 2063 2064 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 2065 if (ct->master || (help && !hlist_empty(&help->expectations))) 2066 return; 2067 } 2068 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 2069 2070 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2071 void __nf_ct_refresh_acct(struct nf_conn *ct, 2072 enum ip_conntrack_info ctinfo, 2073 const struct sk_buff *skb, 2074 u32 extra_jiffies, 2075 bool do_acct) 2076 { 2077 /* Only update if this is not a fixed timeout */ 2078 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2079 goto acct; 2080 2081 /* If not in hash table, timer will not be active yet */ 2082 if (nf_ct_is_confirmed(ct)) 2083 extra_jiffies += nfct_time_stamp; 2084 2085 if (READ_ONCE(ct->timeout) != extra_jiffies) 2086 WRITE_ONCE(ct->timeout, extra_jiffies); 2087 acct: 2088 if (do_acct) 2089 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2090 } 2091 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2092 2093 bool nf_ct_kill_acct(struct nf_conn *ct, 2094 enum ip_conntrack_info ctinfo, 2095 const struct sk_buff *skb) 2096 { 2097 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2098 2099 return nf_ct_delete(ct, 0, 0); 2100 } 2101 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2102 2103 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2104 2105 #include <linux/netfilter/nfnetlink.h> 2106 #include <linux/netfilter/nfnetlink_conntrack.h> 2107 #include <linux/mutex.h> 2108 2109 /* Generic function for tcp/udp/sctp/dccp and alike. */ 2110 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2111 const struct nf_conntrack_tuple *tuple) 2112 { 2113 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2114 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2115 goto nla_put_failure; 2116 return 0; 2117 2118 nla_put_failure: 2119 return -1; 2120 } 2121 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2122 2123 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2124 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2125 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2126 }; 2127 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2128 2129 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2130 struct nf_conntrack_tuple *t, 2131 u_int32_t flags) 2132 { 2133 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2134 if (!tb[CTA_PROTO_SRC_PORT]) 2135 return -EINVAL; 2136 2137 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2138 } 2139 2140 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2141 if (!tb[CTA_PROTO_DST_PORT]) 2142 return -EINVAL; 2143 2144 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2145 } 2146 2147 return 0; 2148 } 2149 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2150 2151 unsigned int nf_ct_port_nlattr_tuple_size(void) 2152 { 2153 static unsigned int size __read_mostly; 2154 2155 if (!size) 2156 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2157 2158 return size; 2159 } 2160 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2161 #endif 2162 2163 /* Used by ipt_REJECT and ip6t_REJECT. 
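 * A hedged sketch of the caller side (the REJECT targets themselves live
 * outside this file): after building the RST or ICMP error skb "nskb",
 * the target calls
 *
 *	nf_ct_attach(nskb, oldskb);
 *
 * which lands in nf_conntrack_attach() below through the nf_ct_hook->attach
 * pointer registered later in this file.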
*/ 2164 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2165 { 2166 struct nf_conn *ct; 2167 enum ip_conntrack_info ctinfo; 2168 2169 /* This ICMP is in the reverse direction to the packet which caused it */ 2170 ct = nf_ct_get(skb, &ctinfo); 2171 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2172 ctinfo = IP_CT_RELATED_REPLY; 2173 else 2174 ctinfo = IP_CT_RELATED; 2175 2176 /* Attach to new skbuff, and increment count */ 2177 nf_ct_set(nskb, ct, ctinfo); 2178 nf_conntrack_get(skb_nfct(nskb)); 2179 } 2180 2181 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2182 struct nf_conn *ct, 2183 enum ip_conntrack_info ctinfo) 2184 { 2185 const struct nf_nat_hook *nat_hook; 2186 struct nf_conntrack_tuple_hash *h; 2187 struct nf_conntrack_tuple tuple; 2188 unsigned int status; 2189 int dataoff; 2190 u16 l3num; 2191 u8 l4num; 2192 2193 l3num = nf_ct_l3num(ct); 2194 2195 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2196 if (dataoff <= 0) 2197 return -1; 2198 2199 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2200 l4num, net, &tuple)) 2201 return -1; 2202 2203 if (ct->status & IPS_SRC_NAT) { 2204 memcpy(tuple.src.u3.all, 2205 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2206 sizeof(tuple.src.u3.all)); 2207 tuple.src.u.all = 2208 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2209 } 2210 2211 if (ct->status & IPS_DST_NAT) { 2212 memcpy(tuple.dst.u3.all, 2213 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2214 sizeof(tuple.dst.u3.all)); 2215 tuple.dst.u.all = 2216 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2217 } 2218 2219 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2220 if (!h) 2221 return 0; 2222 2223 /* Store status bits of the conntrack that is clashing to re-do NAT 2224 * mangling according to what has already been done to this packet. 2225 */ 2226 status = ct->status; 2227 2228 nf_ct_put(ct); 2229 ct = nf_ct_tuplehash_to_ctrack(h); 2230 nf_ct_set(skb, ct, ctinfo); 2231 2232 nat_hook = rcu_dereference(nf_nat_hook); 2233 if (!nat_hook) 2234 return 0; 2235 2236 if (status & IPS_SRC_NAT && 2237 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 2238 IP_CT_DIR_ORIGINAL) == NF_DROP) 2239 return -1; 2240 2241 if (status & IPS_DST_NAT && 2242 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 2243 IP_CT_DIR_ORIGINAL) == NF_DROP) 2244 return -1; 2245 2246 return 0; 2247 } 2248 2249 /* This packet is coming from userspace via nf_queue, complete the packet 2250 * processing after the helper invocation in nf_confirm(). 2251 */ 2252 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2253 enum ip_conntrack_info ctinfo) 2254 { 2255 const struct nf_conntrack_helper *helper; 2256 const struct nf_conn_help *help; 2257 int protoff; 2258 2259 help = nfct_help(ct); 2260 if (!help) 2261 return 0; 2262 2263 helper = rcu_dereference(help->helper); /* the helper can be removed concurrently; bail out if it is gone */ if (!helper) return 0; 2264 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2265 return 0; 2266 2267 switch (nf_ct_l3num(ct)) { 2268 case NFPROTO_IPV4: 2269 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2270 break; 2271 #if IS_ENABLED(CONFIG_IPV6) 2272 case NFPROTO_IPV6: { 2273 __be16 frag_off; 2274 u8 pnum; 2275 2276 pnum = ipv6_hdr(skb)->nexthdr; 2277 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2278 &frag_off); 2279 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2280 return 0; 2281 break; 2282 } 2283 #endif 2284 default: 2285 return 0; 2286 } 2287 2288 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2289 !nf_is_loopback_packet(skb)) { 2290 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2291 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2292 return -1; 2293 } 2294 } 2295 2296 /* We've seen it coming out the other side: confirm it */ 2297 return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0; 2298 } 2299 2300 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2301 { 2302 enum ip_conntrack_info ctinfo; 2303 struct nf_conn *ct; 2304 int err; 2305 2306 ct = nf_ct_get(skb, &ctinfo); 2307 if (!ct) 2308 return 0; 2309 2310 if (!nf_ct_is_confirmed(ct)) { 2311 err = __nf_conntrack_update(net, skb, ct, ctinfo); 2312 if (err < 0) 2313 return err; 2314 2315 ct = nf_ct_get(skb, &ctinfo); 2316 } 2317 2318 return nf_confirm_cthelper(skb, ct, ctinfo); 2319 } 2320 2321 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2322 const struct sk_buff *skb) 2323 { 2324 const struct nf_conntrack_tuple *src_tuple; 2325 const struct nf_conntrack_tuple_hash *hash; 2326 struct nf_conntrack_tuple srctuple; 2327 enum ip_conntrack_info ctinfo; 2328 struct nf_conn *ct; 2329 2330 ct = nf_ct_get(skb, &ctinfo); 2331 if (ct) { 2332 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2333 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2334 return true; 2335 } 2336 2337 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2338 NFPROTO_IPV4, dev_net(skb->dev), 2339 &srctuple)) 2340 return false; 2341 2342 hash = nf_conntrack_find_get(dev_net(skb->dev), 2343 &nf_ct_zone_dflt, 2344 &srctuple); 2345 if (!hash) 2346 return false; 2347 2348 ct = nf_ct_tuplehash_to_ctrack(hash); 2349 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2350 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2351 nf_ct_put(ct); 2352 2353 return true; 2354 } 2355 2356 /* Bring out ya dead!
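 * get_next_corpse() below walks the hash table from *bucket onwards under
 * the per-bucket lock, looks only at REPLY-direction entries so that each
 * conntrack is evaluated once, and returns the first entry the iter
 * callback accepts, with an extra reference held; it returns NULL once the
 * whole table has been scanned.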
*/ 2357 static struct nf_conn * 2358 get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 2359 const struct nf_ct_iter_data *iter_data, unsigned int *bucket) 2360 { 2361 struct nf_conntrack_tuple_hash *h; 2362 struct nf_conn *ct; 2363 struct hlist_nulls_node *n; 2364 spinlock_t *lockp; 2365 2366 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 2367 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; 2368 2369 if (hlist_nulls_empty(hslot)) 2370 continue; 2371 2372 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 2373 local_bh_disable(); 2374 nf_conntrack_lock(lockp); 2375 hlist_nulls_for_each_entry(h, n, hslot, hnnode) { 2376 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) 2377 continue; 2378 /* All nf_conn objects are added to hash table twice, once 2379 * for the original direction tuple, once for the reply tuple. 2380 * 2381 * Exception: In the IPS_NAT_CLASH case, only the reply 2382 * tuple is added (the original tuple already existed for 2383 * a different object). 2384 * 2385 * We only need to call the iterator once for each 2386 * conntrack, so we just use the 'reply' direction 2387 * tuple while iterating. 2388 */ 2389 ct = nf_ct_tuplehash_to_ctrack(h); 2390 2391 if (iter_data->net && 2392 !net_eq(iter_data->net, nf_ct_net(ct))) 2393 continue; 2394 2395 if (iter(ct, iter_data->data)) 2396 goto found; 2397 } 2398 spin_unlock(lockp); 2399 local_bh_enable(); 2400 cond_resched(); 2401 } 2402 2403 return NULL; 2404 found: 2405 refcount_inc(&ct->ct_general.use); 2406 spin_unlock(lockp); 2407 local_bh_enable(); 2408 return ct; 2409 } 2410 2411 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), 2412 const struct nf_ct_iter_data *iter_data) 2413 { 2414 unsigned int bucket = 0; 2415 struct nf_conn *ct; 2416 2417 might_sleep(); 2418 2419 mutex_lock(&nf_conntrack_mutex); 2420 while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { 2421 /* Time to push up daisies... */ 2422 2423 nf_ct_delete(ct, iter_data->portid, iter_data->report); 2424 nf_ct_put(ct); 2425 cond_resched(); 2426 } 2427 mutex_unlock(&nf_conntrack_mutex); 2428 } 2429 2430 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), 2431 const struct nf_ct_iter_data *iter_data) 2432 { 2433 struct net *net = iter_data->net; 2434 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2435 2436 might_sleep(); 2437 2438 if (atomic_read(&cnet->count) == 0) 2439 return; 2440 2441 nf_ct_iterate_cleanup(iter, iter_data); 2442 } 2443 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); 2444 2445 /** 2446 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table 2447 * @iter: callback to invoke for each conntrack 2448 * @data: data to pass to @iter 2449 * 2450 * Like nf_ct_iterate_cleanup, but first marks conntracks on the 2451 * unconfirmed list as dying (so they will not be inserted into 2452 * the main table). 2453 * 2454 * Can only be called from the module exit path.
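 *
 * A minimal illustrative use from a module exit path (hypothetical names,
 * not taken from an existing module); the callback returns 1 for entries
 * that must be removed:
 *
 *	static int kill_my_proto(struct nf_conn *ct, void *data)
 *	{
 *		return nf_ct_protonum(ct) == IPPROTO_GRE;
 *	}
 *
 *	static void __exit my_tracker_exit(void)
 *	{
 *		nf_ct_iterate_destroy(kill_my_proto, NULL);
 *	}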
2455 */ 2456 void 2457 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) 2458 { 2459 struct nf_ct_iter_data iter_data = {}; 2460 struct net *net; 2461 2462 down_read(&net_rwsem); 2463 for_each_net(net) { 2464 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2465 2466 if (atomic_read(&cnet->count) == 0) 2467 continue; 2468 nf_queue_nf_hook_drop(net); 2469 } 2470 up_read(&net_rwsem); 2471 2472 /* Need to wait for the netns cleanup worker to finish, if it's 2473 * running -- it might have deleted a net namespace from 2474 * the global list, so the hook drop above might not have 2475 * affected all namespaces. 2476 */ 2477 net_ns_barrier(); 2478 2479 /* An skb with an unconfirmed conntrack could have been reinjected just 2480 * before we called nf_queue_nf_hook_drop(). 2481 * 2482 * This makes sure it is inserted into the conntrack table. 2483 */ 2484 synchronize_net(); 2485 2486 nf_ct_ext_bump_genid(); 2487 iter_data.data = data; 2488 nf_ct_iterate_cleanup(iter, &iter_data); 2489 2490 /* Another CPU might be in an RCU read-side section with an 2491 * RCU-protected pointer that was cleared in the iter callback 2492 * or hidden via nf_ct_ext_bump_genid() above. 2493 * 2494 * Wait until those are done. 2495 */ 2496 synchronize_rcu(); 2497 } 2498 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2499 2500 static int kill_all(struct nf_conn *i, void *data) 2501 { 2502 return 1; 2503 } 2504 2505 void nf_conntrack_cleanup_start(void) 2506 { 2507 cleanup_nf_conntrack_bpf(); 2508 conntrack_gc_work.exiting = true; 2509 } 2510 2511 void nf_conntrack_cleanup_end(void) 2512 { 2513 RCU_INIT_POINTER(nf_ct_hook, NULL); 2514 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2515 kvfree(nf_conntrack_hash); 2516 2517 nf_conntrack_proto_fini(); 2518 nf_conntrack_helper_fini(); 2519 nf_conntrack_expect_fini(); 2520 2521 kmem_cache_destroy(nf_conntrack_cachep); 2522 } 2523 2524 /* 2525 * Mishearing the voices in his head, our hero wonders how he's 2526 * supposed to kill the mall. 2527 */ 2528 void nf_conntrack_cleanup_net(struct net *net) 2529 { 2530 LIST_HEAD(single); 2531 2532 list_add(&net->exit_list, &single); 2533 nf_conntrack_cleanup_net_list(&single); 2534 } 2535 2536 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2537 { 2538 struct nf_ct_iter_data iter_data = {}; 2539 struct net *net; 2540 int busy; 2541 2542 /* 2543 * This makes sure all current packets have passed through 2544 * the netfilter framework. Roll on, two-stage module 2545 * delete...
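	 * After that, the loop below repeatedly flushes every netns on the
	 * list with kill_all() and reschedules until each per-netns
	 * conntrack count has dropped to zero, and only then releases the
	 * remaining per-netns state.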
2546 */ 2547 synchronize_net(); 2548 i_see_dead_people: 2549 busy = 0; 2550 list_for_each_entry(net, net_exit_list, exit_list) { 2551 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2552 2553 iter_data.net = net; 2554 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2555 if (atomic_read(&cnet->count) != 0) 2556 busy = 1; 2557 } 2558 if (busy) { 2559 schedule(); 2560 goto i_see_dead_people; 2561 } 2562 2563 list_for_each_entry(net, net_exit_list, exit_list) { 2564 nf_conntrack_ecache_pernet_fini(net); 2565 nf_conntrack_expect_pernet_fini(net); 2566 free_percpu(net->ct.stat); 2567 } 2568 } 2569 2570 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2571 { 2572 struct hlist_nulls_head *hash; 2573 unsigned int nr_slots, i; 2574 2575 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2576 return NULL; 2577 2578 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2579 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2580 2581 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2582 2583 if (hash && nulls) 2584 for (i = 0; i < nr_slots; i++) 2585 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2586 2587 return hash; 2588 } 2589 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2590 2591 int nf_conntrack_hash_resize(unsigned int hashsize) 2592 { 2593 int i, bucket; 2594 unsigned int old_size; 2595 struct hlist_nulls_head *hash, *old_hash; 2596 struct nf_conntrack_tuple_hash *h; 2597 struct nf_conn *ct; 2598 2599 if (!hashsize) 2600 return -EINVAL; 2601 2602 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2603 if (!hash) 2604 return -ENOMEM; 2605 2606 mutex_lock(&nf_conntrack_mutex); 2607 old_size = nf_conntrack_htable_size; 2608 if (old_size == hashsize) { 2609 mutex_unlock(&nf_conntrack_mutex); 2610 kvfree(hash); 2611 return 0; 2612 } 2613 2614 local_bh_disable(); 2615 nf_conntrack_all_lock(); 2616 write_seqcount_begin(&nf_conntrack_generation); 2617 2618 /* Lookups in the old hash might happen in parallel, which means we 2619 * might get false negatives during connection lookup. New connections 2620 * created because of a false negative won't make it into the hash 2621 * though since that required taking the locks. 2622 */ 2623 2624 for (i = 0; i < nf_conntrack_htable_size; i++) { 2625 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2626 unsigned int zone_id; 2627 2628 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2629 struct nf_conntrack_tuple_hash, hnnode); 2630 ct = nf_ct_tuplehash_to_ctrack(h); 2631 hlist_nulls_del_rcu(&h->hnnode); 2632 2633 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2634 bucket = __hash_conntrack(nf_ct_net(ct), 2635 &h->tuple, zone_id, hashsize); 2636 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2637 } 2638 } 2639 old_hash = nf_conntrack_hash; 2640 2641 nf_conntrack_hash = hash; 2642 nf_conntrack_htable_size = hashsize; 2643 2644 write_seqcount_end(&nf_conntrack_generation); 2645 nf_conntrack_all_unlock(); 2646 local_bh_enable(); 2647 2648 mutex_unlock(&nf_conntrack_mutex); 2649 2650 synchronize_net(); 2651 kvfree(old_hash); 2652 return 0; 2653 } 2654 2655 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2656 { 2657 unsigned int hashsize; 2658 int rc; 2659 2660 if (current->nsproxy->net_ns != &init_net) 2661 return -EOPNOTSUPP; 2662 2663 /* On boot, we can set this without any fancy locking. 
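	 * At runtime this handler is reached through the "hashsize" module
	 * parameter; assuming the usual sysfs wiring for module parameters,
	 * something like
	 *
	 *	echo 262144 > /sys/module/nf_conntrack/parameters/hashsize
	 *
	 * ends up in nf_conntrack_hash_resize() above.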
*/ 2664 if (!nf_conntrack_hash) 2665 return param_set_uint(val, kp); 2666 2667 rc = kstrtouint(val, 0, &hashsize); 2668 if (rc) 2669 return rc; 2670 2671 return nf_conntrack_hash_resize(hashsize); 2672 } 2673 2674 int nf_conntrack_init_start(void) 2675 { 2676 unsigned long nr_pages = totalram_pages(); 2677 int max_factor = 8; 2678 int ret = -ENOMEM; 2679 int i; 2680 2681 seqcount_spinlock_init(&nf_conntrack_generation, 2682 &nf_conntrack_locks_all_lock); 2683 2684 for (i = 0; i < CONNTRACK_LOCKS; i++) 2685 spin_lock_init(&nf_conntrack_locks[i]); 2686 2687 if (!nf_conntrack_htable_size) { 2688 nf_conntrack_htable_size 2689 = (((nr_pages << PAGE_SHIFT) / 16384) 2690 / sizeof(struct hlist_head)); 2691 if (BITS_PER_LONG >= 64 && 2692 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2693 nf_conntrack_htable_size = 262144; 2694 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2695 nf_conntrack_htable_size = 65536; 2696 2697 if (nf_conntrack_htable_size < 1024) 2698 nf_conntrack_htable_size = 1024; 2699 /* Use a max. factor of one by default to keep the average 2700 * hash chain length at 2 entries. Each entry has to be added 2701 * twice (once for original direction, once for reply). 2702 * When a table size is given we use the old value of 8 to 2703 * avoid implicit reduction of the max entries setting. 2704 */ 2705 max_factor = 1; 2706 } 2707 2708 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2709 if (!nf_conntrack_hash) 2710 return -ENOMEM; 2711 2712 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2713 2714 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2715 sizeof(struct nf_conn), 2716 NFCT_INFOMASK + 1, 2717 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2718 if (!nf_conntrack_cachep) 2719 goto err_cachep; 2720 2721 ret = nf_conntrack_expect_init(); 2722 if (ret < 0) 2723 goto err_expect; 2724 2725 ret = nf_conntrack_helper_init(); 2726 if (ret < 0) 2727 goto err_helper; 2728 2729 ret = nf_conntrack_proto_init(); 2730 if (ret < 0) 2731 goto err_proto; 2732 2733 conntrack_gc_work_init(&conntrack_gc_work); 2734 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2735 2736 ret = register_nf_conntrack_bpf(); 2737 if (ret < 0) 2738 goto err_kfunc; 2739 2740 return 0; 2741 2742 err_kfunc: 2743 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2744 nf_conntrack_proto_fini(); 2745 err_proto: 2746 nf_conntrack_helper_fini(); 2747 err_helper: 2748 nf_conntrack_expect_fini(); 2749 err_expect: 2750 kmem_cache_destroy(nf_conntrack_cachep); 2751 err_cachep: 2752 kvfree(nf_conntrack_hash); 2753 return ret; 2754 } 2755 2756 static void nf_conntrack_set_closing(struct nf_conntrack *nfct) 2757 { 2758 struct nf_conn *ct = nf_ct_to_nf_conn(nfct); 2759 2760 switch (nf_ct_protonum(ct)) { 2761 case IPPROTO_TCP: 2762 nf_conntrack_tcp_set_closing(ct); 2763 break; 2764 } 2765 } 2766 2767 static const struct nf_ct_hook nf_conntrack_hook = { 2768 .update = nf_conntrack_update, 2769 .destroy = nf_ct_destroy, 2770 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2771 .attach = nf_conntrack_attach, 2772 .set_closing = nf_conntrack_set_closing, 2773 }; 2774 2775 void nf_conntrack_init_end(void) 2776 { 2777 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2778 } 2779 2780 /* 2781 * We need to use special "null" values, not used in hash table 2782 */ 2783 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2784 2785 int nf_conntrack_init_net(struct net *net) 2786 { 2787 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2788 int ret = -ENOMEM; 2789 2790 
BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2791 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2792 atomic_set(&cnet->count, 0); 2793 2794 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2795 if (!net->ct.stat) 2796 return ret; 2797 2798 ret = nf_conntrack_expect_pernet_init(net); 2799 if (ret < 0) 2800 goto err_expect; 2801 2802 nf_conntrack_acct_pernet_init(net); 2803 nf_conntrack_tstamp_pernet_init(net); 2804 nf_conntrack_ecache_pernet_init(net); 2805 nf_conntrack_proto_pernet_init(net); 2806 2807 return 0; 2808 2809 err_expect: 2810 free_percpu(net->ct.stat); 2811 return ret; 2812 } 2813 2814 /* Code shared by ctnetlink and nf_conntrack_bpf */ 2815 2816 int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) 2817 { 2818 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2819 return -EPERM; 2820 2821 __nf_ct_set_timeout(ct, timeout); 2822 2823 if (test_bit(IPS_DYING_BIT, &ct->status)) 2824 return -ETIME; 2825 2826 return 0; 2827 } 2828 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); 2829 2830 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) 2831 { 2832 unsigned int bit; 2833 2834 /* Ignore these unchangeable bits */ 2835 on &= ~IPS_UNCHANGEABLE_MASK; 2836 off &= ~IPS_UNCHANGEABLE_MASK; 2837 2838 for (bit = 0; bit < __IPS_MAX_BIT; bit++) { 2839 if (on & (1 << bit)) 2840 set_bit(bit, &ct->status); 2841 else if (off & (1 << bit)) 2842 clear_bit(bit, &ct->status); 2843 } 2844 } 2845 EXPORT_SYMBOL_GPL(__nf_ct_change_status); 2846 2847 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) 2848 { 2849 unsigned long d; 2850 2851 d = ct->status ^ status; 2852 2853 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) 2854 /* unchangeable */ 2855 return -EBUSY; 2856 2857 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) 2858 /* SEEN_REPLY bit can only be set, not cleared */ 2859 return -EBUSY; 2860 2861 if (d & IPS_ASSURED && !(status & IPS_ASSURED)) 2862 /* ASSURED bit can only be set, not cleared */ 2863 return -EBUSY; 2864 2865 __nf_ct_change_status(ct, status, 0); 2866 return 0; 2867 } 2868 EXPORT_SYMBOL_GPL(nf_ct_change_status_common); 2869
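/* A minimal illustrative sketch of how the two helpers above are meant to be
 * combined by ctnetlink/bpf style update code ("sketch_update_ct" is a
 * hypothetical name, the real callers live outside this file); status first,
 * then the timeout, propagating the first error:
 *
 *	static int sketch_update_ct(struct nf_conn *ct, unsigned int status,
 *				    u64 timeout_jiffies)
 *	{
 *		int err = nf_ct_change_status_common(ct, status);
 *
 *		if (err < 0)
 *			return err;
 *
 *		return __nf_ct_change_timeout(ct, timeout_jiffies);
 *	}
 */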