// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
 */

/* Prefix used by pr_*() messages emitted from this file */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>                 /* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>

#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */

#include <net/ip.h>
#include <net/sock.h>

#include <net/ip_vs.h>

#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848          /* multicast port */

#define SYNC_PROTO_VER  1		/* Protocol version in header */

static struct lock_class_key __ipvs_sync_key;
/*
 *	IPVS sync connection entry
 *	Version 0, i.e. original version.
 *
 *	Fixed part of one connection record inside a version 0 message.
 *	Multi-byte members are carried in network byte order (__be16/__be32).
 *	When the connection has IP_VS_CONN_F_SEQ_MASK flags set, a
 *	struct ip_vs_sync_conn_options follows directly after this record.
 */
struct ip_vs_sync_conn_v0 {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;       /* Which protocol (TCP/UDP) */
	__be16			cport;
	__be16                  vport;
	__be16                  dport;
	__be32                  caddr;          /* client address */
	__be32                  vaddr;          /* virtual address */
	__be32                  daddr;          /* destination address */

	/* Flags and state transition */
	__be16                  flags;          /* status flags */
	__be16                  state;          /* state info */

	/* The sequence options start here */
};

/*
 *	Sequence adjustment data for both directions of a connection.
 *	Appended to a version 0 record, or carried as the IPVS_OPT_SEQ_DATA
 *	option of a version 1 record.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};

/*
     Sync Connection format (sync_conn)

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |    Type       |    Protocol   | Ver.
  |           Size        |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             Flags                             |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            State              |         cport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            vport              |         dport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             fwmark                            |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             timeout  (in sec.)                |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                              ...                              |
      |                        IP-Addresses  (v4 or v6)               |
      |                              ...                              |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  Optional Parameters.
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      | Param. Type   | Param. Length |   Param. data                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
      |                              ...                              |
      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                               | Param Type    | Param. Length |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                           Param  data                         |
      |         Last Param data should be padded for 32 bit alignment |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/

/*
 *	Type 0, IPv4 sync connection format
 *
 *	Fixed part of a version 1 sync_conn for an IPv4 connection.
 *	Multi-byte members are in network byte order.
 */
struct ip_vs_sync_v4 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
/*
 * Type 2 messages IPv6
 *
 * Same leading members as struct ip_vs_sync_v4 up to and including
 * 'timeout'; only the three addresses are wider.  The sender marks such a
 * record by setting STYPE_F_INET6 in 'type'.
 */
struct ip_vs_sync_v6 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

/* One version 1 sync_conn, viewed either as the IPv4 or the IPv6 layout */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};

/* Bits in Type field in above */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

/* ver_size: upper 4 bits hold the version, lower 12 bits the record size */
#define SVER_SHIFT		12		/* Shift to get version */
#define SVER_MASK		0x0fff		/* Mask to strip version */

/* Option (parameter) types that may follow the fixed part of a sync_conn */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/*
 * Bit flags derived from the option types.  The first three are used to
 * detect an option appearing twice.  IPVS_OPT_F_PARAM is tested on the
 * option type byte itself: an unknown option without this bit is treated
 * as mandatory and the sync_conn is dropped by the receiver.
 */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))

/* Per sync thread bookkeeping */
struct ip_vs_sync_thread_data {
	struct task_struct *task;
	struct netns_ipvs *ipvs;
	struct socket *sock;
	char *buf;
	int id;
};

/* Version 0 definition of packet sizes */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))


/*
  The master multicasts messages (Datagrams) to the backup load balancers
  in the following format.
214 215 Version 1: 216 Note, first byte should be Zero, so ver 0 receivers will drop the packet. 217 218 0 1 2 3 219 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 220 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 221 | 0 | SyncID | Size | 222 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 223 | Count Conns | Version | Reserved, set to Zero | 224 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 225 | | 226 | IPVS Sync Connection (1) | 227 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 228 | . | 229 ~ . ~ 230 | . | 231 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 232 | | 233 | IPVS Sync Connection (n) | 234 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 235 236 Version 0 Header 237 0 1 2 3 238 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 239 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 240 | Count Conns | SyncID | Size | 241 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 242 | IPVS Sync Connection (1) | 243 */ 244 245 /* Version 0 header */ 246 struct ip_vs_sync_mesg_v0 { 247 __u8 nr_conns; 248 __u8 syncid; 249 __be16 size; 250 251 /* ip_vs_sync_conn entries start here */ 252 }; 253 254 /* Version 1 header */ 255 struct ip_vs_sync_mesg { 256 __u8 reserved; /* must be zero */ 257 __u8 syncid; 258 __be16 size; 259 __u8 nr_conns; 260 __s8 version; /* SYNC_PROTO_VER */ 261 __u16 spare; 262 /* ip_vs_sync_conn entries start here */ 263 }; 264 265 union ipvs_sockaddr { 266 struct sockaddr_in in; 267 struct sockaddr_in6 in6; 268 }; 269 270 struct ip_vs_sync_buff { 271 struct list_head list; 272 unsigned long firstuse; 273 274 /* pointers for the message data */ 275 struct ip_vs_sync_mesg *mesg; 276 unsigned char *head; 277 unsigned char *end; 278 }; 279 280 /* 281 * Copy of struct ip_vs_seq 282 * From unaligned network order to aligned host order 283 */ 284 static void ntoh_seq(struct 
ip_vs_seq *no, struct ip_vs_seq *ho) 285 { 286 memset(ho, 0, sizeof(*ho)); 287 ho->init_seq = get_unaligned_be32(&no->init_seq); 288 ho->delta = get_unaligned_be32(&no->delta); 289 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 290 } 291 292 /* 293 * Copy of struct ip_vs_seq 294 * From Aligned host order to unaligned network order 295 */ 296 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 297 { 298 put_unaligned_be32(ho->init_seq, &no->init_seq); 299 put_unaligned_be32(ho->delta, &no->delta); 300 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 301 } 302 303 static inline struct ip_vs_sync_buff * 304 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 305 { 306 struct ip_vs_sync_buff *sb; 307 308 spin_lock_bh(&ipvs->sync_lock); 309 if (list_empty(&ms->sync_queue)) { 310 sb = NULL; 311 __set_current_state(TASK_INTERRUPTIBLE); 312 } else { 313 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 314 list); 315 list_del(&sb->list); 316 ms->sync_queue_len--; 317 if (!ms->sync_queue_len) 318 ms->sync_queue_delay = 0; 319 } 320 spin_unlock_bh(&ipvs->sync_lock); 321 322 return sb; 323 } 324 325 /* 326 * Create a new sync buffer for Version 1 proto. 327 */ 328 static inline struct ip_vs_sync_buff * 329 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 330 { 331 struct ip_vs_sync_buff *sb; 332 333 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 334 return NULL; 335 336 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 337 ipvs->mcfg.sync_maxlen); 338 sb->mesg = kmalloc(len, GFP_ATOMIC); 339 if (!sb->mesg) { 340 kfree(sb); 341 return NULL; 342 } 343 sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 344 sb->mesg->version = SYNC_PROTO_VER; 345 sb->mesg->syncid = ipvs->mcfg.syncid; 346 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 347 sb->mesg->nr_conns = 0; 348 sb->mesg->spare = 0; 349 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 350 sb->end = (unsigned char *)sb->mesg + len; 351 352 sb->firstuse = jiffies; 353 return sb; 354 } 355 356 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 357 { 358 kfree(sb->mesg); 359 kfree(sb); 360 } 361 362 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 363 struct ipvs_master_sync_state *ms) 364 { 365 struct ip_vs_sync_buff *sb = ms->sync_buff; 366 367 spin_lock(&ipvs->sync_lock); 368 if (ipvs->sync_state & IP_VS_STATE_MASTER && 369 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 370 if (!ms->sync_queue_len) 371 schedule_delayed_work(&ms->master_wakeup_work, 372 max(IPVS_SYNC_SEND_DELAY, 1)); 373 ms->sync_queue_len++; 374 list_add_tail(&sb->list, &ms->sync_queue); 375 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { 376 int id = (int)(ms - ipvs->ms); 377 378 wake_up_process(ipvs->master_tinfo[id].task); 379 } 380 } else 381 ip_vs_sync_buff_release(sb); 382 spin_unlock(&ipvs->sync_lock); 383 } 384 385 /* 386 * Get the current sync buffer if it has been created for more 387 * than the specified time or the specified time is zero. 
388 */ 389 static inline struct ip_vs_sync_buff * 390 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 391 unsigned long time) 392 { 393 struct ip_vs_sync_buff *sb; 394 395 spin_lock_bh(&ipvs->sync_buff_lock); 396 sb = ms->sync_buff; 397 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 398 ms->sync_buff = NULL; 399 __set_current_state(TASK_RUNNING); 400 } else 401 sb = NULL; 402 spin_unlock_bh(&ipvs->sync_buff_lock); 403 return sb; 404 } 405 406 static inline int 407 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 408 { 409 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 410 } 411 412 /* 413 * Create a new sync buffer for Version 0 proto. 414 */ 415 static inline struct ip_vs_sync_buff * 416 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 417 { 418 struct ip_vs_sync_buff *sb; 419 struct ip_vs_sync_mesg_v0 *mesg; 420 421 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 422 return NULL; 423 424 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 425 ipvs->mcfg.sync_maxlen); 426 sb->mesg = kmalloc(len, GFP_ATOMIC); 427 if (!sb->mesg) { 428 kfree(sb); 429 return NULL; 430 } 431 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 432 mesg->nr_conns = 0; 433 mesg->syncid = ipvs->mcfg.syncid; 434 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 435 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 436 sb->end = (unsigned char *)mesg + len; 437 sb->firstuse = jiffies; 438 return sb; 439 } 440 441 /* Check if connection is controlled by persistence */ 442 static inline bool in_persistence(struct ip_vs_conn *cp) 443 { 444 for (cp = cp->control; cp; cp = cp->control) { 445 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 446 return true; 447 } 448 return false; 449 } 450 451 /* Check if conn should be synced. 452 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 453 * - (1) sync_refresh_period: reduce sync rate. 
Additionally, retry
 *	sync_retries times with period of sync_refresh_period/8
 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
 *	for state changes or only once when pkts matches sync_threshold
 * - (3) templates: rate can be reduced only with sync_refresh_period or
 *	with (2)
 *
 * Returns non-zero when a sync message should be sent for cp, 0 otherwise.
 * When the rate checks pass, cp->old_state is updated and an update of
 * cp->sync_endtime is attempted.
 */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	/* End time stored by the previous decision; low 2 bits = retry count */
	unsigned long orig = READ_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* Candidate end time with the two retry bits cleared */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		/* Only the TCP states listed here are ever synced */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		/* A state change since the last sync forces a message */
		force = cp->state != cp->old_state;
		/* Changes to a non-established state bypass the rate checks */
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		/* Only the SCTP states listed here are ever synced */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* Retry count kept in the low bits of the end time */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			/* Too early for the next retry */
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			/* Record one more retry in the new end time */
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		/* Non-templates sync once per sync_period packets */
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* Store the new end time only if it still equals our snapshot */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	/* Sync when our update was the one stored, or on a state change */
	return n == orig || force;
}

/*
 *      Version 0 , could be switched in by sys_ctl.
 *      Add an ip_vs_conn information into the current sync_buff.
 *
 *      Only AF_INET connections without IP_VS_CONN_F_ONE_PACKET are
 *      handled.  After cp has been added, its controlling connection (if
 *      any) is passed to ip_vs_sync_conn().
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	/* The version 0 record only has room for IPv4 addresses */
	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	/* Record size depends on whether sequence options are appended */
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1 */
		/* (a v1 header has 0 in the byte that is nr_conns in v0) */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		/* Options are placed right behind the fixed record */
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
	}

	/* Account for the new record in the message header */
	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_inc_return(&cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}

/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
628 * Sending Version 1 messages 629 */ 630 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 631 { 632 struct ip_vs_sync_mesg *m; 633 union ip_vs_sync_conn *s; 634 struct ip_vs_sync_buff *buff; 635 struct ipvs_master_sync_state *ms; 636 int id; 637 __u8 *p; 638 unsigned int len, pe_name_len, pad; 639 640 /* Handle old version of the protocol */ 641 if (sysctl_sync_ver(ipvs) == 0) { 642 ip_vs_sync_conn_v0(ipvs, cp, pkts); 643 return; 644 } 645 /* Do not sync ONE PACKET */ 646 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 647 goto control; 648 sloop: 649 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 650 goto control; 651 652 /* Sanity checks */ 653 pe_name_len = 0; 654 if (cp->pe_data_len) { 655 if (!cp->pe_data || !cp->dest) { 656 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 657 return; 658 } 659 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 660 } 661 662 spin_lock_bh(&ipvs->sync_buff_lock); 663 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 664 spin_unlock_bh(&ipvs->sync_buff_lock); 665 return; 666 } 667 668 id = select_master_thread_id(ipvs, cp); 669 ms = &ipvs->ms[id]; 670 671 #ifdef CONFIG_IP_VS_IPV6 672 if (cp->af == AF_INET6) 673 len = sizeof(struct ip_vs_sync_v6); 674 else 675 #endif 676 len = sizeof(struct ip_vs_sync_v4); 677 678 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 679 len += sizeof(struct ip_vs_sync_conn_options) + 2; 680 681 if (cp->pe_data_len) 682 len += cp->pe_data_len + 2; /* + Param hdr field */ 683 if (pe_name_len) 684 len += pe_name_len + 2; 685 686 /* check if there is a space for this one */ 687 pad = 0; 688 buff = ms->sync_buff; 689 if (buff) { 690 m = buff->mesg; 691 pad = (4 - (size_t) buff->head) & 3; 692 /* Send buffer if it is for v0 */ 693 if (buff->head + len + pad > buff->end || m->reserved) { 694 sb_queue_tail(ipvs, ms); 695 ms->sync_buff = NULL; 696 buff = NULL; 697 pad = 0; 698 } 699 } 700 701 if (!buff) { 702 buff = ip_vs_sync_buff_create(ipvs, len); 703 if (!buff) { 704 
spin_unlock_bh(&ipvs->sync_buff_lock); 705 pr_err("ip_vs_sync_buff_create failed.\n"); 706 return; 707 } 708 ms->sync_buff = buff; 709 m = buff->mesg; 710 } 711 712 p = buff->head; 713 buff->head += pad + len; 714 m->size = htons(ntohs(m->size) + pad + len); 715 /* Add ev. padding from prev. sync_conn */ 716 while (pad--) 717 *(p++) = 0; 718 719 s = (union ip_vs_sync_conn *)p; 720 721 /* Set message type & copy members */ 722 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 723 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 724 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 725 s->v4.state = htons(cp->state); 726 s->v4.protocol = cp->protocol; 727 s->v4.cport = cp->cport; 728 s->v4.vport = cp->vport; 729 s->v4.dport = cp->dport; 730 s->v4.fwmark = htonl(cp->fwmark); 731 s->v4.timeout = htonl(cp->timeout / HZ); 732 m->nr_conns++; 733 734 #ifdef CONFIG_IP_VS_IPV6 735 if (cp->af == AF_INET6) { 736 p += sizeof(struct ip_vs_sync_v6); 737 s->v6.caddr = cp->caddr.in6; 738 s->v6.vaddr = cp->vaddr.in6; 739 s->v6.daddr = cp->daddr.in6; 740 } else 741 #endif 742 { 743 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 744 s->v4.caddr = cp->caddr.ip; 745 s->v4.vaddr = cp->vaddr.ip; 746 s->v4.daddr = cp->daddr.ip; 747 } 748 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 749 *(p++) = IPVS_OPT_SEQ_DATA; 750 *(p++) = sizeof(struct ip_vs_sync_conn_options); 751 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 752 p += sizeof(struct ip_vs_seq); 753 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 754 p += sizeof(struct ip_vs_seq); 755 } 756 /* Handle pe data */ 757 if (cp->pe_data_len && cp->pe_data) { 758 *(p++) = IPVS_OPT_PE_DATA; 759 *(p++) = cp->pe_data_len; 760 memcpy(p, cp->pe_data, cp->pe_data_len); 761 p += cp->pe_data_len; 762 if (pe_name_len) { 763 /* Add PE_NAME */ 764 *(p++) = IPVS_OPT_PE_NAME; 765 *(p++) = pe_name_len; 766 memcpy(p, cp->pe->name, pe_name_len); 767 p += pe_name_len; 768 } 769 } 770 771 spin_unlock_bh(&ipvs->sync_buff_lock); 772 773 
control: 774 /* synchronize its controller if it has */ 775 cp = cp->control; 776 if (!cp) 777 return; 778 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 779 pkts = atomic_inc_return(&cp->in_pkts); 780 else 781 pkts = sysctl_sync_threshold(ipvs); 782 goto sloop; 783 } 784 785 /* 786 * fill_param used by version 1 787 */ 788 static inline int 789 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 790 struct ip_vs_conn_param *p, 791 __u8 *pe_data, unsigned int pe_data_len, 792 __u8 *pe_name, unsigned int pe_name_len) 793 { 794 #ifdef CONFIG_IP_VS_IPV6 795 if (af == AF_INET6) 796 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 797 (const union nf_inet_addr *)&sc->v6.caddr, 798 sc->v6.cport, 799 (const union nf_inet_addr *)&sc->v6.vaddr, 800 sc->v6.vport, p); 801 else 802 #endif 803 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 804 (const union nf_inet_addr *)&sc->v4.caddr, 805 sc->v4.cport, 806 (const union nf_inet_addr *)&sc->v4.vaddr, 807 sc->v4.vport, p); 808 /* Handle pe data */ 809 if (pe_data_len) { 810 if (pe_name_len) { 811 char buff[IP_VS_PENAME_MAXLEN+1]; 812 813 memcpy(buff, pe_name, pe_name_len); 814 buff[pe_name_len]=0; 815 p->pe = __ip_vs_pe_getbyname(buff); 816 if (!p->pe) { 817 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 818 buff); 819 return 1; 820 } 821 } else { 822 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 823 return 1; 824 } 825 826 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 827 if (!p->pe_data) { 828 module_put(p->pe->module); 829 return -ENOMEM; 830 } 831 p->pe_data_len = pe_data_len; 832 } 833 return 0; 834 } 835 836 /* 837 * Connection Add / Update. 838 * Common for version 0 and 1 reception of backup sync_conns. 839 * Param: ... 840 * timeout is in sec. 
841 */ 842 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 843 unsigned int flags, unsigned int state, 844 unsigned int protocol, unsigned int type, 845 const union nf_inet_addr *daddr, __be16 dport, 846 unsigned long timeout, __u32 fwmark, 847 struct ip_vs_sync_conn_options *opt) 848 { 849 struct ip_vs_dest *dest; 850 struct ip_vs_conn *cp; 851 852 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 853 cp = ip_vs_conn_in_get(param); 854 if (cp && ((cp->dport != dport) || 855 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 856 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 857 ip_vs_conn_expire_now(cp); 858 __ip_vs_conn_put(cp); 859 cp = NULL; 860 } else { 861 /* This is the expiration message for the 862 * connection that was already replaced, so we 863 * just ignore it. 864 */ 865 __ip_vs_conn_put(cp); 866 kfree(param->pe_data); 867 return; 868 } 869 } 870 } else { 871 cp = ip_vs_ct_in_get(param); 872 } 873 874 if (cp) { 875 /* Free pe_data */ 876 kfree(param->pe_data); 877 878 dest = cp->dest; 879 spin_lock_bh(&cp->lock); 880 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 881 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 882 if (flags & IP_VS_CONN_F_INACTIVE) { 883 atomic_dec(&dest->activeconns); 884 atomic_inc(&dest->inactconns); 885 } else { 886 atomic_inc(&dest->activeconns); 887 atomic_dec(&dest->inactconns); 888 } 889 } 890 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 891 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 892 cp->flags = flags; 893 spin_unlock_bh(&cp->lock); 894 if (!dest) 895 ip_vs_try_bind_dest(cp); 896 } else { 897 /* 898 * Find the appropriate destination for the connection. 899 * If it is not found the connection will remain unbound 900 * but still handled. 901 */ 902 rcu_read_lock(); 903 /* This function is only invoked by the synchronization 904 * code. 
We do not currently support heterogeneous pools 905 * with synchronization, so we can make the assumption that 906 * the svc_af is the same as the dest_af 907 */ 908 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 909 param->vaddr, param->vport, protocol, 910 fwmark, flags); 911 912 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 913 fwmark); 914 rcu_read_unlock(); 915 if (!cp) { 916 kfree(param->pe_data); 917 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 918 return; 919 } 920 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 921 kfree(param->pe_data); 922 } 923 924 if (opt) { 925 cp->in_seq = opt->in_seq; 926 cp->out_seq = opt->out_seq; 927 } 928 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 929 cp->state = state; 930 cp->old_state = cp->state; 931 /* 932 * For Ver 0 messages style 933 * - Not possible to recover the right timeout for templates 934 * - can not find the right fwmark 935 * virtual service. If needed, we can do it for 936 * non-fwmark persistent services. 937 * Ver 1 messages style. 938 * - No problem. 
939 */ 940 if (timeout) { 941 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 942 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 943 cp->timeout = timeout*HZ; 944 } else { 945 struct ip_vs_proto_data *pd; 946 947 pd = ip_vs_proto_data_get(ipvs, protocol); 948 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 949 cp->timeout = pd->timeout_table[state]; 950 else 951 cp->timeout = (3*60*HZ); 952 } 953 ip_vs_conn_put(cp); 954 } 955 956 /* 957 * Process received multicast message for Version 0 958 */ 959 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 960 const size_t buflen) 961 { 962 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 963 struct ip_vs_sync_conn_v0 *s; 964 struct ip_vs_sync_conn_options *opt; 965 struct ip_vs_protocol *pp; 966 struct ip_vs_conn_param param; 967 char *p; 968 int i; 969 970 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 971 for (i=0; i<m->nr_conns; i++) { 972 unsigned int flags, state; 973 974 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 975 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 976 return; 977 } 978 s = (struct ip_vs_sync_conn_v0 *) p; 979 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 980 flags &= ~IP_VS_CONN_F_HASHED; 981 if (flags & IP_VS_CONN_F_SEQ_MASK) { 982 opt = (struct ip_vs_sync_conn_options *)&s[1]; 983 p += FULL_CONN_SIZE; 984 if (p > buffer+buflen) { 985 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 986 return; 987 } 988 } else { 989 opt = NULL; 990 p += SIMPLE_CONN_SIZE; 991 } 992 993 state = ntohs(s->state); 994 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 995 pp = ip_vs_proto_get(s->protocol); 996 if (!pp) { 997 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 998 s->protocol); 999 continue; 1000 } 1001 if (state >= pp->num_states) { 1002 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1003 pp->name, state); 1004 continue; 1005 } 1006 } else { 1007 if (state >= IP_VS_CTPL_S_LAST) 1008 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", 1009 state); 
1010 } 1011 1012 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1013 (const union nf_inet_addr *)&s->caddr, 1014 s->cport, 1015 (const union nf_inet_addr *)&s->vaddr, 1016 s->vport, ¶m); 1017 1018 /* Send timeout as Zero */ 1019 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1020 (union nf_inet_addr *)&s->daddr, s->dport, 1021 0, 0, opt); 1022 } 1023 } 1024 1025 /* 1026 * Handle options 1027 */ 1028 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1029 __u32 *opt_flags, 1030 struct ip_vs_sync_conn_options *opt) 1031 { 1032 struct ip_vs_sync_conn_options *topt; 1033 1034 topt = (struct ip_vs_sync_conn_options *)p; 1035 1036 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1037 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1038 return -EINVAL; 1039 } 1040 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1041 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1042 return -EINVAL; 1043 } 1044 ntoh_seq(&topt->in_seq, &opt->in_seq); 1045 ntoh_seq(&topt->out_seq, &opt->out_seq); 1046 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1047 return 0; 1048 } 1049 1050 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1051 __u8 **data, unsigned int maxlen, 1052 __u32 *opt_flags, __u32 flag) 1053 { 1054 if (plen > maxlen) { 1055 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1056 return -EINVAL; 1057 } 1058 if (*opt_flags & flag) { 1059 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1060 return -EINVAL; 1061 } 1062 *data_len = plen; 1063 *data = p; 1064 *opt_flags |= flag; 1065 return 0; 1066 } 1067 /* 1068 * Process a Version 1 sync. 
connection 1069 */ 1070 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1071 { 1072 struct ip_vs_sync_conn_options opt; 1073 union ip_vs_sync_conn *s; 1074 struct ip_vs_protocol *pp; 1075 struct ip_vs_conn_param param; 1076 __u32 flags; 1077 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1078 __u8 *pe_data=NULL, *pe_name=NULL; 1079 __u32 opt_flags=0; 1080 int retc=0; 1081 1082 s = (union ip_vs_sync_conn *) p; 1083 1084 if (s->v6.type & STYPE_F_INET6) { 1085 #ifdef CONFIG_IP_VS_IPV6 1086 af = AF_INET6; 1087 p += sizeof(struct ip_vs_sync_v6); 1088 #else 1089 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1090 retc = 10; 1091 goto out; 1092 #endif 1093 } else if (!s->v4.type) { 1094 af = AF_INET; 1095 p += sizeof(struct ip_vs_sync_v4); 1096 } else { 1097 return -10; 1098 } 1099 if (p > msg_end) 1100 return -20; 1101 1102 /* Process optional params check Type & Len. */ 1103 while (p < msg_end) { 1104 int ptype; 1105 int plen; 1106 1107 if (p+2 > msg_end) 1108 return -30; 1109 ptype = *(p++); 1110 plen = *(p++); 1111 1112 if (!plen || ((p + plen) > msg_end)) 1113 return -40; 1114 /* Handle seq option p = param data */ 1115 switch (ptype & ~IPVS_OPT_F_PARAM) { 1116 case IPVS_OPT_SEQ_DATA: 1117 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1118 return -50; 1119 break; 1120 1121 case IPVS_OPT_PE_DATA: 1122 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1123 IP_VS_PEDATA_MAXLEN, &opt_flags, 1124 IPVS_OPT_F_PE_DATA)) 1125 return -60; 1126 break; 1127 1128 case IPVS_OPT_PE_NAME: 1129 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1130 IP_VS_PENAME_MAXLEN, &opt_flags, 1131 IPVS_OPT_F_PE_NAME)) 1132 return -70; 1133 break; 1134 1135 default: 1136 /* Param data mandatory ? 
*/ 1137 if (!(ptype & IPVS_OPT_F_PARAM)) { 1138 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1139 ptype & ~IPVS_OPT_F_PARAM); 1140 retc = 20; 1141 goto out; 1142 } 1143 } 1144 p += plen; /* Next option */ 1145 } 1146 1147 /* Get flags and Mask off unsupported */ 1148 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1149 flags |= IP_VS_CONN_F_SYNC; 1150 state = ntohs(s->v4.state); 1151 1152 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1153 pp = ip_vs_proto_get(s->v4.protocol); 1154 if (!pp) { 1155 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1156 s->v4.protocol); 1157 retc = 30; 1158 goto out; 1159 } 1160 if (state >= pp->num_states) { 1161 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1162 pp->name, state); 1163 retc = 40; 1164 goto out; 1165 } 1166 } else { 1167 if (state >= IP_VS_CTPL_S_LAST) 1168 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", 1169 state); 1170 } 1171 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1172 pe_data_len, pe_name, pe_name_len)) { 1173 retc = 50; 1174 goto out; 1175 } 1176 /* If only IPv4, just silent skip IPv6 */ 1177 if (af == AF_INET) 1178 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1179 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1180 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1181 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1182 ); 1183 #ifdef CONFIG_IP_VS_IPV6 1184 else 1185 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1186 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1187 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1188 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1189 ); 1190 #endif 1191 ip_vs_pe_put(param.pe); 1192 return 0; 1193 /* Error exit */ 1194 out: 1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1196 return retc; 1197 1198 } 1199 /* 1200 * Process received multicast message and create the corresponding 1201 * ip_vs_conn entries. 
1202 * Handles Version 0 & 1 1203 */ 1204 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1205 const size_t buflen) 1206 { 1207 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1208 __u8 *p, *msg_end; 1209 int i, nr_conns; 1210 1211 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1212 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1213 return; 1214 } 1215 1216 if (buflen != ntohs(m2->size)) { 1217 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1218 return; 1219 } 1220 /* SyncID sanity check */ 1221 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1222 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1223 return; 1224 } 1225 /* Handle version 1 message */ 1226 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1227 && (m2->spare == 0)) { 1228 1229 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1230 nr_conns = m2->nr_conns; 1231 1232 for (i=0; i<nr_conns; i++) { 1233 union ip_vs_sync_conn *s; 1234 unsigned int size; 1235 int retc; 1236 1237 p = msg_end; 1238 if (p + sizeof(s->v4) > buffer+buflen) { 1239 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); 1240 return; 1241 } 1242 s = (union ip_vs_sync_conn *)p; 1243 size = ntohs(s->v4.ver_size) & SVER_MASK; 1244 msg_end = p + size; 1245 /* Basic sanity checks */ 1246 if (msg_end > buffer+buflen) { 1247 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1248 return; 1249 } 1250 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1252 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1253 return; 1254 } 1255 /* Process a single sync_conn */ 1256 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1257 if (retc < 0) { 1258 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1259 retc); 1260 return; 1261 } 1262 /* Make sure we have 32 bit alignment */ 1263 msg_end = p + ((size + 3) & ~3); 1264 } 1265 } else { 1266 /* Old type of message */ 1267 ip_vs_process_message_v0(ipvs, 
buffer, buflen); 1268 return; 1269 } 1270 } 1271 1272 1273 /* 1274 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1275 */ 1276 static void set_sock_size(struct sock *sk, int mode, int val) 1277 { 1278 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1279 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1280 lock_sock(sk); 1281 if (mode) { 1282 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1283 READ_ONCE(sysctl_wmem_max)); 1284 sk->sk_sndbuf = val * 2; 1285 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1286 } else { 1287 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1288 READ_ONCE(sysctl_rmem_max)); 1289 sk->sk_rcvbuf = val * 2; 1290 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1291 } 1292 release_sock(sk); 1293 } 1294 1295 /* 1296 * Setup loopback of outgoing multicasts on a sending socket 1297 */ 1298 static void set_mcast_loop(struct sock *sk, u_char loop) 1299 { 1300 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1301 lock_sock(sk); 1302 inet_assign_bit(MC_LOOP, sk, loop); 1303 #ifdef CONFIG_IP_VS_IPV6 1304 if (sk->sk_family == AF_INET6) { 1305 struct ipv6_pinfo *np = inet6_sk(sk); 1306 1307 /* IPV6_MULTICAST_LOOP */ 1308 np->mc_loop = loop ? 
1 : 0; 1309 } 1310 #endif 1311 release_sock(sk); 1312 } 1313 1314 /* 1315 * Specify TTL for outgoing multicasts on a sending socket 1316 */ 1317 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1318 { 1319 struct inet_sock *inet = inet_sk(sk); 1320 1321 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1322 lock_sock(sk); 1323 inet->mc_ttl = ttl; 1324 #ifdef CONFIG_IP_VS_IPV6 1325 if (sk->sk_family == AF_INET6) { 1326 struct ipv6_pinfo *np = inet6_sk(sk); 1327 1328 /* IPV6_MULTICAST_HOPS */ 1329 np->mcast_hops = ttl; 1330 } 1331 #endif 1332 release_sock(sk); 1333 } 1334 1335 /* Control fragmentation of messages */ 1336 static void set_mcast_pmtudisc(struct sock *sk, int val) 1337 { 1338 struct inet_sock *inet = inet_sk(sk); 1339 1340 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1341 lock_sock(sk); 1342 inet->pmtudisc = val; 1343 #ifdef CONFIG_IP_VS_IPV6 1344 if (sk->sk_family == AF_INET6) { 1345 struct ipv6_pinfo *np = inet6_sk(sk); 1346 1347 /* IPV6_MTU_DISCOVER */ 1348 np->pmtudisc = val; 1349 } 1350 #endif 1351 release_sock(sk); 1352 } 1353 1354 /* 1355 * Specifiy default interface for outgoing multicasts 1356 */ 1357 static int set_mcast_if(struct sock *sk, struct net_device *dev) 1358 { 1359 struct inet_sock *inet = inet_sk(sk); 1360 1361 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1362 return -EINVAL; 1363 1364 lock_sock(sk); 1365 inet->mc_index = dev->ifindex; 1366 /* inet->mc_addr = 0; */ 1367 #ifdef CONFIG_IP_VS_IPV6 1368 if (sk->sk_family == AF_INET6) { 1369 struct ipv6_pinfo *np = inet6_sk(sk); 1370 1371 /* IPV6_MULTICAST_IF */ 1372 np->mcast_oif = dev->ifindex; 1373 } 1374 #endif 1375 release_sock(sk); 1376 1377 return 0; 1378 } 1379 1380 1381 /* 1382 * Join a multicast group. 1383 * the group is specified by a class D multicast address 224.0.0.0/8 1384 * in the in_addr structure passed in as a parameter. 
1385 */ 1386 static int 1387 join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) 1388 { 1389 struct ip_mreqn mreq; 1390 int ret; 1391 1392 memset(&mreq, 0, sizeof(mreq)); 1393 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1394 1395 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1396 return -EINVAL; 1397 1398 mreq.imr_ifindex = dev->ifindex; 1399 1400 lock_sock(sk); 1401 ret = ip_mc_join_group(sk, &mreq); 1402 release_sock(sk); 1403 1404 return ret; 1405 } 1406 1407 #ifdef CONFIG_IP_VS_IPV6 1408 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1409 struct net_device *dev) 1410 { 1411 int ret; 1412 1413 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1414 return -EINVAL; 1415 1416 lock_sock(sk); 1417 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1418 release_sock(sk); 1419 1420 return ret; 1421 } 1422 #endif 1423 1424 static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) 1425 { 1426 __be32 addr; 1427 struct sockaddr_in sin; 1428 1429 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1430 if (!addr) 1431 pr_err("You probably need to specify IP address on " 1432 "multicast interface.\n"); 1433 1434 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1435 dev->name, &addr); 1436 1437 /* Now bind the socket with the address of multicast interface */ 1438 sin.sin_family = AF_INET; 1439 sin.sin_addr.s_addr = addr; 1440 sin.sin_port = 0; 1441 1442 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1443 } 1444 1445 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1446 struct ipvs_sync_daemon_cfg *c, int id) 1447 { 1448 if (AF_INET6 == c->mcast_af) { 1449 sa->in6 = (struct sockaddr_in6) { 1450 .sin6_family = AF_INET6, 1451 .sin6_port = htons(c->mcast_port + id), 1452 }; 1453 sa->in6.sin6_addr = c->mcast_group.in6; 1454 *salen = sizeof(sa->in6); 1455 } else { 1456 sa->in = (struct sockaddr_in) { 1457 .sin_family = AF_INET, 1458 
.sin_port = htons(c->mcast_port + id), 1459 }; 1460 sa->in.sin_addr = c->mcast_group.in; 1461 *salen = sizeof(sa->in); 1462 } 1463 } 1464 1465 /* 1466 * Set up sending multicast socket over UDP 1467 */ 1468 static int make_send_sock(struct netns_ipvs *ipvs, int id, 1469 struct net_device *dev, struct socket **sock_ret) 1470 { 1471 /* multicast addr */ 1472 union ipvs_sockaddr mcast_addr; 1473 struct socket *sock; 1474 int result, salen; 1475 1476 /* First create a socket */ 1477 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1478 IPPROTO_UDP, &sock); 1479 if (result < 0) { 1480 pr_err("Error during creation of socket; terminating\n"); 1481 goto error; 1482 } 1483 *sock_ret = sock; 1484 result = set_mcast_if(sock->sk, dev); 1485 if (result < 0) { 1486 pr_err("Error setting outbound mcast interface\n"); 1487 goto error; 1488 } 1489 1490 set_mcast_loop(sock->sk, 0); 1491 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1492 /* Allow fragmentation if MTU changes */ 1493 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1494 result = sysctl_sync_sock_size(ipvs); 1495 if (result > 0) 1496 set_sock_size(sock->sk, 1, result); 1497 1498 if (AF_INET == ipvs->mcfg.mcast_af) 1499 result = bind_mcastif_addr(sock, dev); 1500 else 1501 result = 0; 1502 if (result < 0) { 1503 pr_err("Error binding address of the mcast interface\n"); 1504 goto error; 1505 } 1506 1507 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1508 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1509 salen, 0); 1510 if (result < 0) { 1511 pr_err("Error connecting to the multicast addr\n"); 1512 goto error; 1513 } 1514 1515 return 0; 1516 1517 error: 1518 return result; 1519 } 1520 1521 1522 /* 1523 * Set up receiving multicast socket over UDP 1524 */ 1525 static int make_receive_sock(struct netns_ipvs *ipvs, int id, 1526 struct net_device *dev, struct socket **sock_ret) 1527 { 1528 /* multicast addr */ 1529 union ipvs_sockaddr mcast_addr; 1530 struct socket *sock; 
1531 int result, salen; 1532 1533 /* First create a socket */ 1534 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1535 IPPROTO_UDP, &sock); 1536 if (result < 0) { 1537 pr_err("Error during creation of socket; terminating\n"); 1538 goto error; 1539 } 1540 *sock_ret = sock; 1541 /* it is equivalent to the REUSEADDR option in user-space */ 1542 sock->sk->sk_reuse = SK_CAN_REUSE; 1543 result = sysctl_sync_sock_size(ipvs); 1544 if (result > 0) 1545 set_sock_size(sock->sk, 0, result); 1546 1547 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1548 sock->sk->sk_bound_dev_if = dev->ifindex; 1549 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1550 if (result < 0) { 1551 pr_err("Error binding to the multicast addr\n"); 1552 goto error; 1553 } 1554 1555 /* join the multicast group */ 1556 #ifdef CONFIG_IP_VS_IPV6 1557 if (ipvs->bcfg.mcast_af == AF_INET6) 1558 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1559 dev); 1560 else 1561 #endif 1562 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1563 dev); 1564 if (result < 0) { 1565 pr_err("Error joining to the multicast group\n"); 1566 goto error; 1567 } 1568 1569 return 0; 1570 1571 error: 1572 return result; 1573 } 1574 1575 1576 static int 1577 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1578 { 1579 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1580 struct kvec iov; 1581 int len; 1582 1583 iov.iov_base = (void *)buffer; 1584 iov.iov_len = length; 1585 1586 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1587 1588 return len; 1589 } 1590 1591 static int 1592 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1593 { 1594 int msize; 1595 int ret; 1596 1597 msize = ntohs(msg->size); 1598 1599 ret = ip_vs_send_async(sock, (char *)msg, msize); 1600 if (ret >= 0 || ret == -EAGAIN) 1601 return ret; 1602 pr_err("ip_vs_send_async error %d\n", ret); 1603 return 0; 1604 } 
1605 1606 static int 1607 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1608 { 1609 struct msghdr msg = {NULL,}; 1610 struct kvec iov = {buffer, buflen}; 1611 int len; 1612 1613 /* Receive a packet */ 1614 iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen); 1615 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); 1616 if (len < 0) 1617 return len; 1618 1619 return len; 1620 } 1621 1622 /* Wakeup the master thread for sending */ 1623 static void master_wakeup_work_handler(struct work_struct *work) 1624 { 1625 struct ipvs_master_sync_state *ms = 1626 container_of(work, struct ipvs_master_sync_state, 1627 master_wakeup_work.work); 1628 struct netns_ipvs *ipvs = ms->ipvs; 1629 1630 spin_lock_bh(&ipvs->sync_lock); 1631 if (ms->sync_queue_len && 1632 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1633 int id = (int)(ms - ipvs->ms); 1634 1635 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1636 wake_up_process(ipvs->master_tinfo[id].task); 1637 } 1638 spin_unlock_bh(&ipvs->sync_lock); 1639 } 1640 1641 /* Get next buffer to send */ 1642 static inline struct ip_vs_sync_buff * 1643 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1644 { 1645 struct ip_vs_sync_buff *sb; 1646 1647 sb = sb_dequeue(ipvs, ms); 1648 if (sb) 1649 return sb; 1650 /* Do not delay entries in buffer for more than 2 seconds */ 1651 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1652 } 1653 1654 static int sync_thread_master(void *data) 1655 { 1656 struct ip_vs_sync_thread_data *tinfo = data; 1657 struct netns_ipvs *ipvs = tinfo->ipvs; 1658 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1659 struct sock *sk = tinfo->sock->sk; 1660 struct ip_vs_sync_buff *sb; 1661 1662 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1663 "syncid = %d, id = %d\n", 1664 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1665 1666 for (;;) { 1667 sb = next_sync_buff(ipvs, ms); 1668 if (unlikely(kthread_should_stop())) 1669 break; 1670 
if (!sb) { 1671 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1672 continue; 1673 } 1674 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1675 /* (Ab)use interruptible sleep to avoid increasing 1676 * the load avg. 1677 */ 1678 __wait_event_interruptible(*sk_sleep(sk), 1679 sock_writeable(sk) || 1680 kthread_should_stop()); 1681 if (unlikely(kthread_should_stop())) 1682 goto done; 1683 } 1684 ip_vs_sync_buff_release(sb); 1685 } 1686 1687 done: 1688 __set_current_state(TASK_RUNNING); 1689 if (sb) 1690 ip_vs_sync_buff_release(sb); 1691 1692 /* clean up the sync_buff queue */ 1693 while ((sb = sb_dequeue(ipvs, ms))) 1694 ip_vs_sync_buff_release(sb); 1695 __set_current_state(TASK_RUNNING); 1696 1697 /* clean up the current sync_buff */ 1698 sb = get_curr_sync_buff(ipvs, ms, 0); 1699 if (sb) 1700 ip_vs_sync_buff_release(sb); 1701 1702 return 0; 1703 } 1704 1705 1706 static int sync_thread_backup(void *data) 1707 { 1708 struct ip_vs_sync_thread_data *tinfo = data; 1709 struct netns_ipvs *ipvs = tinfo->ipvs; 1710 struct sock *sk = tinfo->sock->sk; 1711 struct udp_sock *up = udp_sk(sk); 1712 int len; 1713 1714 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1715 "syncid = %d, id = %d\n", 1716 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1717 1718 while (!kthread_should_stop()) { 1719 wait_event_interruptible(*sk_sleep(sk), 1720 !skb_queue_empty_lockless(&sk->sk_receive_queue) || 1721 !skb_queue_empty_lockless(&up->reader_queue) || 1722 kthread_should_stop()); 1723 1724 /* do we have data now? 
*/ 1725 while (!skb_queue_empty_lockless(&sk->sk_receive_queue) || 1726 !skb_queue_empty_lockless(&up->reader_queue)) { 1727 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1728 ipvs->bcfg.sync_maxlen); 1729 if (len <= 0) { 1730 if (len != -EAGAIN) 1731 pr_err("receiving message error\n"); 1732 break; 1733 } 1734 1735 ip_vs_process_message(ipvs, tinfo->buf, len); 1736 } 1737 } 1738 1739 return 0; 1740 } 1741 1742 1743 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1744 int state) 1745 { 1746 struct ip_vs_sync_thread_data *ti = NULL, *tinfo; 1747 struct task_struct *task; 1748 struct net_device *dev; 1749 char *name; 1750 int (*threadfn)(void *data); 1751 int id = 0, count, hlen; 1752 int result = -ENOMEM; 1753 u16 mtu, min_mtu; 1754 1755 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1756 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1757 sizeof(struct ip_vs_sync_conn_v0)); 1758 1759 /* increase the module use count */ 1760 if (!ip_vs_use_count_inc()) 1761 return -ENOPROTOOPT; 1762 1763 /* Do not hold one mutex and then to block on another */ 1764 for (;;) { 1765 rtnl_lock(); 1766 if (mutex_trylock(&ipvs->sync_mutex)) 1767 break; 1768 rtnl_unlock(); 1769 mutex_lock(&ipvs->sync_mutex); 1770 if (rtnl_trylock()) 1771 break; 1772 mutex_unlock(&ipvs->sync_mutex); 1773 } 1774 1775 if (!ipvs->sync_state) { 1776 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1777 ipvs->threads_mask = count - 1; 1778 } else 1779 count = ipvs->threads_mask + 1; 1780 1781 if (c->mcast_af == AF_UNSPEC) { 1782 c->mcast_af = AF_INET; 1783 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1784 } 1785 if (!c->mcast_port) 1786 c->mcast_port = IP_VS_SYNC_PORT; 1787 if (!c->mcast_ttl) 1788 c->mcast_ttl = 1; 1789 1790 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1791 if (!dev) { 1792 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1793 result = -ENODEV; 1794 goto out_early; 1795 } 1796 hlen = (AF_INET6 == 
c->mcast_af) ? 1797 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1798 sizeof(struct iphdr) + sizeof(struct udphdr); 1799 mtu = (state == IP_VS_STATE_BACKUP) ? 1800 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1801 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1802 1803 if (c->sync_maxlen) 1804 c->sync_maxlen = clamp_t(unsigned int, 1805 c->sync_maxlen, min_mtu, 1806 65535 - hlen); 1807 else 1808 c->sync_maxlen = mtu - hlen; 1809 1810 if (state == IP_VS_STATE_MASTER) { 1811 result = -EEXIST; 1812 if (ipvs->ms) 1813 goto out_early; 1814 1815 ipvs->mcfg = *c; 1816 name = "ipvs-m:%d:%d"; 1817 threadfn = sync_thread_master; 1818 } else if (state == IP_VS_STATE_BACKUP) { 1819 result = -EEXIST; 1820 if (ipvs->backup_tinfo) 1821 goto out_early; 1822 1823 ipvs->bcfg = *c; 1824 name = "ipvs-b:%d:%d"; 1825 threadfn = sync_thread_backup; 1826 } else { 1827 result = -EINVAL; 1828 goto out_early; 1829 } 1830 1831 if (state == IP_VS_STATE_MASTER) { 1832 struct ipvs_master_sync_state *ms; 1833 1834 result = -ENOMEM; 1835 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1836 if (!ipvs->ms) 1837 goto out; 1838 ms = ipvs->ms; 1839 for (id = 0; id < count; id++, ms++) { 1840 INIT_LIST_HEAD(&ms->sync_queue); 1841 ms->sync_queue_len = 0; 1842 ms->sync_queue_delay = 0; 1843 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1844 master_wakeup_work_handler); 1845 ms->ipvs = ipvs; 1846 } 1847 } 1848 result = -ENOMEM; 1849 ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), 1850 GFP_KERNEL); 1851 if (!ti) 1852 goto out; 1853 1854 for (id = 0; id < count; id++) { 1855 tinfo = &ti[id]; 1856 tinfo->ipvs = ipvs; 1857 if (state == IP_VS_STATE_BACKUP) { 1858 result = -ENOMEM; 1859 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1860 GFP_KERNEL); 1861 if (!tinfo->buf) 1862 goto out; 1863 } 1864 tinfo->id = id; 1865 if (state == IP_VS_STATE_MASTER) 1866 result = make_send_sock(ipvs, id, dev, &tinfo->sock); 1867 else 1868 result = make_receive_sock(ipvs, id, dev, &tinfo->sock); 1869 if 
(result < 0) 1870 goto out; 1871 1872 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1873 if (IS_ERR(task)) { 1874 result = PTR_ERR(task); 1875 goto out; 1876 } 1877 tinfo->task = task; 1878 } 1879 1880 /* mark as active */ 1881 1882 if (state == IP_VS_STATE_MASTER) 1883 ipvs->master_tinfo = ti; 1884 else 1885 ipvs->backup_tinfo = ti; 1886 spin_lock_bh(&ipvs->sync_buff_lock); 1887 ipvs->sync_state |= state; 1888 spin_unlock_bh(&ipvs->sync_buff_lock); 1889 1890 mutex_unlock(&ipvs->sync_mutex); 1891 rtnl_unlock(); 1892 1893 return 0; 1894 1895 out: 1896 /* We do not need RTNL lock anymore, release it here so that 1897 * sock_release below can use rtnl_lock to leave the mcast group. 1898 */ 1899 rtnl_unlock(); 1900 id = min(id, count - 1); 1901 if (ti) { 1902 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1903 if (tinfo->task) 1904 kthread_stop(tinfo->task); 1905 } 1906 } 1907 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1908 kfree(ipvs->ms); 1909 ipvs->ms = NULL; 1910 } 1911 mutex_unlock(&ipvs->sync_mutex); 1912 1913 /* No more mutexes, release socks */ 1914 if (ti) { 1915 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1916 if (tinfo->sock) 1917 sock_release(tinfo->sock); 1918 kfree(tinfo->buf); 1919 } 1920 kfree(ti); 1921 } 1922 1923 /* decrease the module use count */ 1924 ip_vs_use_count_dec(); 1925 return result; 1926 1927 out_early: 1928 mutex_unlock(&ipvs->sync_mutex); 1929 rtnl_unlock(); 1930 1931 /* decrease the module use count */ 1932 ip_vs_use_count_dec(); 1933 return result; 1934 } 1935 1936 1937 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1938 { 1939 struct ip_vs_sync_thread_data *ti, *tinfo; 1940 int id; 1941 int retc = -EINVAL; 1942 1943 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1944 1945 mutex_lock(&ipvs->sync_mutex); 1946 if (state == IP_VS_STATE_MASTER) { 1947 retc = -ESRCH; 1948 if (!ipvs->ms) 1949 goto err; 1950 ti = ipvs->master_tinfo; 1951 1952 /* 1953 * The lock synchronizes with sb_queue_tail(), 
so that we don't 1954 * add sync buffers to the queue, when we are already in 1955 * progress of stopping the master sync daemon. 1956 */ 1957 1958 spin_lock_bh(&ipvs->sync_buff_lock); 1959 spin_lock(&ipvs->sync_lock); 1960 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1961 spin_unlock(&ipvs->sync_lock); 1962 spin_unlock_bh(&ipvs->sync_buff_lock); 1963 1964 retc = 0; 1965 for (id = ipvs->threads_mask; id >= 0; id--) { 1966 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1967 int ret; 1968 1969 tinfo = &ti[id]; 1970 pr_info("stopping master sync thread %d ...\n", 1971 task_pid_nr(tinfo->task)); 1972 cancel_delayed_work_sync(&ms->master_wakeup_work); 1973 ret = kthread_stop(tinfo->task); 1974 if (retc >= 0) 1975 retc = ret; 1976 } 1977 kfree(ipvs->ms); 1978 ipvs->ms = NULL; 1979 ipvs->master_tinfo = NULL; 1980 } else if (state == IP_VS_STATE_BACKUP) { 1981 retc = -ESRCH; 1982 if (!ipvs->backup_tinfo) 1983 goto err; 1984 ti = ipvs->backup_tinfo; 1985 1986 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1987 retc = 0; 1988 for (id = ipvs->threads_mask; id >= 0; id--) { 1989 int ret; 1990 1991 tinfo = &ti[id]; 1992 pr_info("stopping backup sync thread %d ...\n", 1993 task_pid_nr(tinfo->task)); 1994 ret = kthread_stop(tinfo->task); 1995 if (retc >= 0) 1996 retc = ret; 1997 } 1998 ipvs->backup_tinfo = NULL; 1999 } else { 2000 goto err; 2001 } 2002 id = ipvs->threads_mask; 2003 mutex_unlock(&ipvs->sync_mutex); 2004 2005 /* No more mutexes, release socks */ 2006 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 2007 if (tinfo->sock) 2008 sock_release(tinfo->sock); 2009 kfree(tinfo->buf); 2010 } 2011 kfree(ti); 2012 2013 /* decrease the module use count */ 2014 ip_vs_use_count_dec(); 2015 return retc; 2016 2017 err: 2018 mutex_unlock(&ipvs->sync_mutex); 2019 return retc; 2020 } 2021 2022 /* 2023 * Initialize data struct for each netns 2024 */ 2025 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2026 { 2027 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 
2028 spin_lock_init(&ipvs->sync_lock); 2029 spin_lock_init(&ipvs->sync_buff_lock); 2030 return 0; 2031 } 2032 2033 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2034 { 2035 int retc; 2036 2037 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2038 if (retc && retc != -ESRCH) 2039 pr_err("Failed to stop Master Daemon\n"); 2040 2041 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2042 if (retc && retc != -ESRCH) 2043 pr_err("Failed to stop Backup Daemon\n"); 2044 } 2045