// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 *
 * Description of forwarding methods:
 * - all transmitters are called from LOCAL_IN (remote clients) and
 *   LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
 * - not all connections have destination server, for example,
 *   connections in backup server when fwmark is used
 * - bypass connections use daddr from packet
 * - we can use dst without ref while sending in RCU section, we use
 *   ref when returning NF_ACCEPT for NAT-ed packet via loopback
 * LOCAL_OUT rules:
 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
 * - skb->pkt_type is not set yet
 * - the only place where we can see skb->sk != NULL
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
#include <net/gre.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};

static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
{
	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}

/* RCU callback that releases the cached route and frees the entry;
 * it is queued by __ip_vs_dst_set() below when an old dest_dst is replaced.
 */
static void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst = container_of(head,
						       struct ip_vs_dest_dst,
						       rcu_head);

	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}

/*
 *	Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}

static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (dst->obsolete &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}

/* Return true when the packet (or, for packets already defragmented by
 * conntrack, its largest fragment) does not fit the route MTU.
 */
static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tells us that this packet has been
		 * defragmented by the netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true;	/* largest fragment violates MTU */
	} else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true;		/* packet size violates MTU */
	}
	return false;
}

/* Get route to daddr, update *saddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;
	bool loop = false;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		/* Invalid saddr ? */
		if (PTR_ERR(rt) == -EINVAL && *saddr &&
		    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
			*saddr = 0;
			flowi4_update_output(&fl4, 0, 0, daddr, 0);
			goto retry;
		}
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		ip_rt_put(rt);
		*saddr = fl4.saddr;
		flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
		loop = true;
		goto retry;
	}
	*saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif

static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			(struct rt6_info *)skb_dst(skb));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}

static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}
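
/* Check that the packet fits the MTU of the chosen route.  Returns true when
 * the packet may be sent as-is.  Returns false when it must not be forwarded,
 * normally after reporting ICMPV6_PKT_TOOBIG (IPv6) or ICMP_FRAG_NEEDED
 * (IPv4 with DF set) back towards the sender.  For tunneled IPv4 packets the
 * check is skipped when path MTU discovery is disabled, since such packets
 * are simply fragmented after encapsulation.
 */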
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}

static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rtable *) dest_dst->dst_cache;
		else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		__be32 saddr = htonl(INADDR_ANY);

		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, &saddr);
		if (!rt)
			goto err_unreach;
		if (ret_saddr)
			*ret_saddr = saddr;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			__be16 tflags = 0;

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				tflags |= TUNNEL_CSUM;
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	dst_link_failure(skb);
	return -1;
}

#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}

/*
 * Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rt6_info *) dest_dst->dst_cache;
		else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = (struct rt6_info *) dst;
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  rcuref_read(&rt->dst.__rcuref));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = (struct rt6_info *) dst;
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			__be16 tflags = 0;

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				tflags |= TUNNEL_CSUM;
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif


/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
	}
	return ret;
}

/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et al. expect regular sockets
 * and invalid memory accesses will happen) so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it may only be used in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr  *iph = ip_hdr(skb);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

 tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

 tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	 * length becomes larger than the MTU of the outgoing device, there
	 * will still be an MTU problem.
	 */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	return rc;

 tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	 * length becomes larger than the MTU of the outgoing device, there
	 * will still be an MTU problem.
	 */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb.  This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again.  On success the (possibly reallocated) skb is returned; on
 * allocation failure the skb is freed and ERR_PTR(-ENOMEM) is returned.
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = skb_ip_totlen(skb);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}

static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}

/* Build a GUE header (plus the private remote-checksum-offload data when
 * needed) and a UDP header in front of the inner packet and update
 * *next_protocol to IPPROTO_UDP.
 */
static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr  *udph;	/* Our new UDP header */
	struct guehdr  *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}

/* Build a GRE base header (with checksum when requested) in front of the
 * inner packet and update *next_protocol to IPPROTO_GRE.
 */
static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
				htons(ETH_P_IP) : htons(ETH_P_IPV6);
	__be16 tflags = 0;
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		tflags |= TUNNEL_CSUM;

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		size_t gre_hdrlen;
		__be16 tflags = 0;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			tflags |= TUNNEL_CSUM;
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr)>>2;
	iph->frag_off		=	df;
	iph->protocol		=	next_protocol;
	iph->tos		=	dsfield;
	iph->daddr		=	cp->daddr.ip;
	iph->saddr		=	saddr;
	iph->ttl		=	ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr  *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = (struct rt6_info *) skb_dst(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		size_t gre_hdrlen;
		__be16 tflags = 0;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			tflags |= TUNNEL_CSUM;
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	ipv6_hdr(skb);
	iph->version		=	6;
	iph->nexthdr		=	next_protocol;
	iph->payload_len	=	htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr		=	cp->daddr.in6;
	iph->saddr		=	saddr;
	iph->hop_limit		=	ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

 tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

 tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *	ICMP packet transmitter
 *	called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	 * forwarded directly here, because there is no need to
	 * translate address/port back.
	 */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

 tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	 * forwarded directly here, because there is no need to
	 * translate address/port back.
	 */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
#endif
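
/* Note: which of the transmitters above handles a given connection is
 * decided outside this file; ip_vs_bind_xmit()/ip_vs_bind_xmit_v6() in
 * ip_vs_conn.c set cp->packet_xmit according to the forwarding method
 * (NAT, tunneling, direct routing, local node or bypass).
 */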