/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1987 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#include "mpd_defs.h"
#include "mpd_tables.h"

/*
 * Probe types for probe(). The type is carried in the pr_icmp_mtype field
 * of the probe packet (stored with htonl()) and is used by in_data() and
 * in6_data() to dispatch the reply to the matching incoming_*_reply()
 * handler.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */

/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6.
 *
 * probe() sends sizeof (struct pr_icmp) bytes of this structure as the
 * packet. pr_icmp_seq, pr_icmp_timestamp and pr_icmp_mtype are stored in
 * network byte order (htons/htonll/htonl); pr_icmp_id is copied verbatim
 * from pii_icmpid.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t pr_icmp_mtype;		/* Message type */
};

/* IPv6 destination of PROBE_MULTI probes: ff02::1 */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* IPv4 destination of PROBE_MULTI probes: 224.0.0.1 */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */

static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
    int cmsg_type);
static void		pi_set_crtt(struct target *tg, int64_t m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);
static int		ns2ms(int64_t ns);
static int64_t		tv2ns(struct timeval *);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
 * 	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 * 			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 3-tuple <pi_state, pg_state, I>.
 *
 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
 * of the link (according to the driver). If the phyint is also configured
 * with a test address (the common case) and probe targets, then a phyint must
 * also successfully be able to send and receive probes in order to remain in
 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not mark the interface FAILED, but reports it as
 * IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
* In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state. When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state - PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
 *	PG_OK: All interfaces in the group are OK.
 *	PG_DEGRADED: Some interfaces in the group are unusable.
 *	PG_FAILED: All interfaces in the group are unusable.
 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all-hosts address, to
 *	reconstruct the target list. So the phyints are in the PI_NOTARGETS
 *	state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: This phyint will not send or receive packets.
 *	Usually, inactive is tied to standby interfaces that are not yet
 *	needed (e.g., no non-standby interfaces in the group have failed).
 *	When failback has been disabled (FAILBACK=no configured), phyint can
 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 *	subsequently recovers after a failure.
 *
 * Not all 9 possible combinations of the above 3-tuple are possible.
 *
 * I is tracked by IP. pi_state is tracked by mpathd.
153 * 154 * pi_state state machine 155 * --------------------------------------------------------------------------- 156 * Event State New State 157 * Action: 158 * --------------------------------------------------------------------------- 159 * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 160 * detection : set IFF_FAILED on this phyint 161 * 162 * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 163 * detection : set IFF_FAILED on this phyint 164 * 165 * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) 166 * detection -> (PI_RUNNING, I == 0) 167 * : clear IFF_FAILED on this phyint 168 * 169 * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) 170 * detection -> (PI_RUNNING, I == 1) 171 * : clear IFF_FAILED on this phyint 172 * : if failback is disabled set I == 1 173 * 174 * Group failure (perform on all phyints in the group) 175 * detection PI_RUNNING PI_FAILED 176 * (Router targets) : set IFF_FAILED 177 * 178 * Group failure (perform on all phyints in the group) 179 * detection PI_RUNNING PI_NOTARGETS 180 * (Host targets) : set IFF_FAILED 181 * : delete the target list on all phyints 182 * --------------------------------------------------------------------------- 183 */ 184 185 struct probes_missed probes_missed; 186 187 /* 188 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 189 * will be added on by the kernel. The id field identifies this phyint. 190 * and the sequence number is an increasing (modulo 2^^16) integer. The data 191 * portion holds the time value when the packet is sent. On echo this is 192 * extracted to compute the round-trip time. Three different types of 193 * probe packets are used. 194 * 195 * PROBE_UNI: This type is used to do failure detection / failure recovery 196 * and RTT calculation. PROBE_UNI probes are spaced apart in time, 197 * not less than the current CRTT. pii_probes[] stores data 198 * about these probes. These packets consume sequence number space. 
199 * 200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these 201 * are not used. Under heavy network load, the rtt may go up very high, 202 * due to a spike, or may appear to go high, due to extreme scheduling 203 * delays. Once the network stress is removed, mpathd takes long time to 204 * recover, because the probe_interval is already high, and it takes 205 * a long time to send out sufficient number of probes to bring down the 206 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 207 * user_probe_interval ms. and will cause only rtt updates. These packets 208 * do not consume sequence number space nor is information about these 209 * packets stored in the pii_probes[] 210 * 211 * PROBE_MULTI: This type is only used to construct a list of targets, when 212 * no targets are known. The packet is multicast to the all hosts addr. 213 */ 214 static void 215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) 216 { 217 hrtime_t sent_hrtime; 218 struct timeval sent_tv; 219 struct pr_icmp probe_pkt; /* Probe packet */ 220 struct sockaddr_storage targ; /* target address */ 221 uint_t targaddrlen; /* targed address length */ 222 int pr_ndx; /* probe index in pii->pii_probes[] */ 223 boolean_t sent = _B_TRUE; 224 225 if (debug & D_TARGET) { 226 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), 227 pii->pii_name, probe_type, start_hrtime); 228 } 229 230 assert(pii->pii_probe_sock != -1); 231 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 232 probe_type == PROBE_RTT); 233 234 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 235 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 236 probe_pkt.pr_icmp_code = 0; 237 probe_pkt.pr_icmp_cksum = 0; 238 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 239 240 /* 241 * Since there is no need to do arithmetic on the icmpid, 242 * (only equality check is done) pii_icmpid is stored in 243 * network byte order at initialization itself. 
244 */ 245 probe_pkt.pr_icmp_id = pii->pii_icmpid; 246 probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); 247 probe_pkt.pr_icmp_mtype = htonl(probe_type); 248 249 /* 250 * If probe_type is PROBE_MULTI, this packet will be multicast to 251 * the all hosts address. Otherwise it is unicast to the next target. 252 */ 253 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 254 pii->pii_rtt_target_next != NULL)); 255 256 bzero(&targ, sizeof (targ)); 257 targ.ss_family = pii->pii_af; 258 259 if (pii->pii_af == AF_INET6) { 260 struct in6_addr *addr6; 261 262 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; 263 targaddrlen = sizeof (struct sockaddr_in6); 264 if (probe_type == PROBE_MULTI) { 265 *addr6 = all_nodes_mcast_v6; 266 } else if (probe_type == PROBE_UNI) { 267 *addr6 = pii->pii_target_next->tg_address; 268 } else { /* type is PROBE_RTT */ 269 *addr6 = pii->pii_rtt_target_next->tg_address; 270 } 271 } else { 272 struct in_addr *addr4; 273 274 addr4 = &((struct sockaddr_in *)&targ)->sin_addr; 275 targaddrlen = sizeof (struct sockaddr_in); 276 if (probe_type == PROBE_MULTI) { 277 *addr4 = all_nodes_mcast_v4; 278 } else if (probe_type == PROBE_UNI) { 279 IN6_V4MAPPED_TO_INADDR( 280 &pii->pii_target_next->tg_address, addr4); 281 } else { /* type is PROBE_RTT */ 282 IN6_V4MAPPED_TO_INADDR( 283 &pii->pii_rtt_target_next->tg_address, addr4); 284 } 285 286 /* 287 * Compute the IPv4 icmp checksum. Does not cover the IP header. 288 */ 289 probe_pkt.pr_icmp_cksum = 290 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 291 } 292 293 /* 294 * Use the current time as the time we sent. Not atomic, but the best 295 * we can do from here. 
296 */ 297 sent_hrtime = gethrtime(); 298 (void) gettimeofday(&sent_tv, NULL); 299 if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, 300 (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) { 301 logperror_pii(pii, "probe: probe sendto"); 302 sent = _B_FALSE; 303 } 304 305 /* 306 * If this is a PROBE_UNI probe packet being unicast to a target, then 307 * update our tables. We will need this info in processing the probe 308 * response. PROBE_MULTI and PROBE_RTT packets are not used for 309 * the purpose of failure or recovery detection. PROBE_MULTI packets 310 * are only used to construct a list of targets. PROBE_RTT packets are 311 * used only for updating the rtt and not for failure detection. 312 */ 313 if (probe_type == PROBE_UNI && sent) { 314 pr_ndx = pii->pii_probe_next; 315 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 316 317 /* Collect statistics, before we reuse the last slot. */ 318 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 319 pii->pii_cum_stats.lost++; 320 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 321 pii->pii_cum_stats.acked++; 322 pii->pii_cum_stats.sent++; 323 324 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; 325 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; 326 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; 327 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; 328 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 329 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); 330 331 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 332 pii->pii_target_next = target_next(pii->pii_target_next); 333 assert(pii->pii_target_next != NULL); 334 /* 335 * If we have a single variable to denote the next target to 336 * probe for both rtt probes and failure detection probes, we 337 * could end up with a situation where the failure detection 338 * probe targets become disjoint from the rtt probe targets. 339 * Eg. 
if 2 targets and the actual fdt is double the user 340 * specified fdt. So we have 2 variables. In this scheme 341 * we also reset pii_rtt_target_next for every fdt probe, 342 * though that may not be necessary. 343 */ 344 pii->pii_rtt_target_next = pii->pii_target_next; 345 pii->pii_snxt++; 346 } else if (probe_type == PROBE_RTT) { 347 pii->pii_rtt_target_next = 348 target_next(pii->pii_rtt_target_next); 349 assert(pii->pii_rtt_target_next != NULL); 350 } 351 } 352 353 /* 354 * Incoming IPv4 data from wire, is received here. Called from main. 355 */ 356 void 357 in_data(struct phyint_instance *pii) 358 { 359 struct sockaddr_in from; 360 struct in6_addr fromaddr; 361 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 362 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 363 struct ip *ip; 364 int iphlen; 365 int len; 366 char abuf[INET_ADDRSTRLEN]; 367 struct msghdr msg; 368 struct iovec iov; 369 struct pr_icmp *reply; 370 struct timeval *recv_tvp; 371 372 if (debug & D_PROBE) { 373 logdebug("in_data(%s %s)\n", 374 AF_STR(pii->pii_af), pii->pii_name); 375 } 376 377 iov.iov_base = (char *)in_packet; 378 iov.iov_len = sizeof (in_packet); 379 msg.msg_iov = &iov; 380 msg.msg_iovlen = 1; 381 msg.msg_name = (struct sockaddr *)&from; 382 msg.msg_namelen = sizeof (from); 383 msg.msg_control = ancillary_data; 384 msg.msg_controllen = sizeof (ancillary_data); 385 386 /* 387 * Poll has already told us that a message is waiting, 388 * on this socket. Read it now. We should not block. 389 */ 390 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 391 logperror_pii(pii, "in_data: recvmsg"); 392 return; 393 } 394 395 /* 396 * If the datalink has indicated the link is down, don't go 397 * any further. 
398 */ 399 if (LINK_DOWN(pii->pii_phyint)) 400 return; 401 402 /* Get the printable address for error reporting */ 403 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 404 405 /* Ignore packets > 64k or control buffers that don't fit */ 406 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 407 if (debug & D_PKTBAD) { 408 logdebug("Truncated message: msg_flags 0x%x from %s\n", 409 msg.msg_flags, abuf); 410 } 411 return; 412 } 413 414 /* Make sure packet contains at least minimum ICMP header */ 415 ip = (struct ip *)in_packet; 416 iphlen = ip->ip_hl << 2; 417 if (len < iphlen + ICMP_MINLEN) { 418 if (debug & D_PKTBAD) { 419 logdebug("in_data: packet too short (%d bytes)" 420 " from %s\n", len, abuf); 421 } 422 return; 423 } 424 425 /* 426 * Subtract the IP hdr length, 'len' will be length of the probe 427 * reply, starting from the icmp hdr. 428 */ 429 len -= iphlen; 430 /* LINTED */ 431 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 432 433 /* Probe replies are icmp echo replies. Ignore anything else */ 434 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 435 return; 436 437 /* 438 * The icmp id should match what we sent, which is stored 439 * in pi_icmpid. The icmp code for reply must be 0. 
440 * The reply content must be a struct pr_icmp 441 */ 442 if (reply->pr_icmp_id != pii->pii_icmpid) { 443 /* Not in response to our probe */ 444 return; 445 } 446 447 if (reply->pr_icmp_code != 0) { 448 logtrace("probe reply code %d from %s on %s\n", 449 reply->pr_icmp_code, abuf, pii->pii_name); 450 return; 451 } 452 453 if (len < sizeof (struct pr_icmp)) { 454 logtrace("probe reply too short: %d bytes from %s on %s\n", 455 len, abuf, pii->pii_name); 456 return; 457 } 458 459 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 460 if (recv_tvp == NULL) { 461 logtrace("message without timestamp from %s on %s\n", 462 abuf, pii->pii_name); 463 return; 464 } 465 466 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 467 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 468 /* Unicast probe reply */ 469 incoming_echo_reply(pii, reply, fromaddr, recv_tvp); 470 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 471 /* Multicast reply */ 472 incoming_mcast_reply(pii, reply, fromaddr); 473 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 474 incoming_rtt_reply(pii, reply, fromaddr); 475 } else { 476 /* Probably not in response to our probe */ 477 logtrace("probe reply type: %d from %s on %s\n", 478 reply->pr_icmp_mtype, abuf, pii->pii_name); 479 return; 480 } 481 } 482 483 /* 484 * Incoming IPv6 data from wire is received here. Called from main. 
485 */ 486 void 487 in6_data(struct phyint_instance *pii) 488 { 489 struct sockaddr_in6 from; 490 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 491 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 492 int len; 493 char abuf[INET6_ADDRSTRLEN]; 494 struct msghdr msg; 495 struct iovec iov; 496 void *opt; 497 struct pr_icmp *reply; 498 struct timeval *recv_tvp; 499 500 if (debug & D_PROBE) { 501 logdebug("in6_data(%s %s)\n", 502 AF_STR(pii->pii_af), pii->pii_name); 503 } 504 505 iov.iov_base = (char *)in_packet; 506 iov.iov_len = sizeof (in_packet); 507 msg.msg_iov = &iov; 508 msg.msg_iovlen = 1; 509 msg.msg_name = (struct sockaddr *)&from; 510 msg.msg_namelen = sizeof (from); 511 msg.msg_control = ancillary_data; 512 msg.msg_controllen = sizeof (ancillary_data); 513 514 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 515 logperror_pii(pii, "in6_data: recvmsg"); 516 return; 517 } 518 519 /* 520 * If the datalink has indicated that the link is down, don't go 521 * any further. 522 */ 523 if (LINK_DOWN(pii->pii_phyint)) 524 return; 525 526 /* Get the printable address for error reporting */ 527 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 528 if (len < ICMP_MINLEN) { 529 if (debug & D_PKTBAD) { 530 logdebug("Truncated message: msg_flags 0x%x from %s\n", 531 msg.msg_flags, abuf); 532 } 533 return; 534 } 535 /* Ignore packets > 64k or control buffers that don't fit */ 536 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 537 if (debug & D_PKTBAD) { 538 logdebug("Truncated message: msg_flags 0x%x from %s\n", 539 msg.msg_flags, abuf); 540 } 541 return; 542 } 543 544 reply = (struct pr_icmp *)in_packet; 545 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 546 return; 547 548 if (reply->pr_icmp_id != pii->pii_icmpid) { 549 /* Not in response to our probe */ 550 return; 551 } 552 553 /* 554 * The kernel has already verified the the ICMP checksum. 
555 */ 556 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 557 logtrace("ICMPv6 echo reply source address not linklocal from " 558 "%s on %s\n", abuf, pii->pii_name); 559 return; 560 } 561 opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); 562 if (opt != NULL) { 563 /* Can't allow routing headers in probe replies */ 564 logtrace("message with routing header from %s on %s\n", 565 abuf, pii->pii_name); 566 return; 567 } 568 569 if (reply->pr_icmp_code != 0) { 570 logtrace("probe reply code: %d from %s on %s\n", 571 reply->pr_icmp_code, abuf, pii->pii_name); 572 return; 573 } 574 if (len < (sizeof (struct pr_icmp))) { 575 logtrace("probe reply too short: %d bytes from %s on %s\n", 576 len, abuf, pii->pii_name); 577 return; 578 } 579 580 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 581 if (recv_tvp == NULL) { 582 logtrace("message without timestamp from %s on %s\n", 583 abuf, pii->pii_name); 584 return; 585 } 586 587 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 588 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); 589 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 590 incoming_mcast_reply(pii, reply, from.sin6_addr); 591 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 592 incoming_rtt_reply(pii, reply, from.sin6_addr); 593 } else { 594 /* Probably not in response to our probe */ 595 logtrace("probe reply type: %d from %s on %s\n", 596 reply->pr_icmp_mtype, abuf, pii->pii_name); 597 } 598 } 599 600 /* 601 * Process the incoming rtt reply, in response to our rtt probe. 602 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 603 * have any stored information about the probe we sent. So we don't log 604 * any errors if we receive bad replies. 
605 */ 606 static void 607 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 608 struct in6_addr fromaddr) 609 { 610 int64_t m; /* rtt measurement in ns */ 611 char abuf[INET6_ADDRSTRLEN]; 612 struct target *target; 613 struct phyint_group *pg; 614 615 /* Get the printable address for error reporting */ 616 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 617 618 if (debug & D_PROBE) { 619 logdebug("incoming_rtt_reply: %s %s %s\n", 620 AF_STR(pii->pii_af), pii->pii_name, abuf); 621 } 622 623 /* Do we know this target ? */ 624 target = target_lookup(pii, fromaddr); 625 if (target == NULL) 626 return; 627 628 m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); 629 /* Invalid rtt. It has wrapped around */ 630 if (m < 0) 631 return; 632 633 /* 634 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 635 * The initial few responses after the interface is repaired may 636 * contain high rtt's because they could have been queued up waiting 637 * for ARP/NDP resolution on a failed interface. 638 */ 639 pg = pii->pii_phyint->pi_group; 640 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 641 return; 642 643 /* 644 * Update rtt only if the new rtt is lower than the current rtt. 645 * (specified by the 3rd parameter to pi_set_crtt). 646 * If a spike has caused the current probe_interval to be > 647 * user_probe_interval, then this mechanism is used to bring down 648 * the rtt rapidly once the network stress is removed. 649 * If the new rtt is higher than the current rtt, we don't want to 650 * update the rtt. We are having more than 1 outstanding probe and 651 * the increase in rtt we are seeing is being unnecessarily weighted 652 * many times. The regular rtt update will be handled by 653 * incoming_echo_reply() and will take care of any rtt increase. 
654 */ 655 pi_set_crtt(target, m, _B_FALSE); 656 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 657 (user_failure_detection_time < pg->pg_fdt) && 658 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 659 /* 660 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 661 * investigate if we can improve the failure detection time to 662 * meet whatever the user specified. 663 */ 664 if (check_pg_crtt_improved(pg)) { 665 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 666 user_failure_detection_time); 667 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 668 if (pii->pii_phyint->pi_group != phyint_anongroup) { 669 logerr("Improved failure detection time %d ms " 670 "on (%s %s) for group \"%s\"\n", 671 pg->pg_fdt, AF_STR(pii->pii_af), 672 pii->pii_name, 673 pii->pii_phyint->pi_group->pg_name); 674 } 675 if (user_failure_detection_time == pg->pg_fdt) { 676 /* Avoid any truncation or rounding errors */ 677 pg->pg_probeint = user_probe_interval; 678 /* 679 * No more rtt probes will be sent. The actual 680 * fdt has dropped to the user specified value. 681 * pii_fd_snxt_basetime and pii_snxt_basetime 682 * will be in sync henceforth. 683 */ 684 reset_snxt_basetimes(); 685 } 686 } 687 } 688 } 689 690 /* 691 * Process the incoming echo reply, in response to our unicast probe. 
* Common for both IPv4 and IPv6
 *
 * The reply is matched against the pii_probes[] slot of the probe it
 * answers, rejected if it is out of window, from the wrong address or a
 * duplicate, and otherwise used to update the target's crtt, possibly the
 * group's failure detection time, and finally the slot itself (marked
 * PR_ACKED) and pii_rack.
 *
 *	pii		phyint instance the reply was received on
 *	reply		the received probe reply; multi-byte fields are still
 *			in network byte order
 *	fromaddr	source address of the reply, compared with the address
 *			of the target recorded for the probe
 *	recv_tvp	receive timestamp of the reply, used to derive
 *			pr_hrtime_ackrecv
 */
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr, struct timeval *recv_tvp)
{
	int64_t	m;		/* rtt measurement in ns */
	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
	char	abuf[INET6_ADDRSTRLEN];
	int	pr_ndx;
	struct	target	*target;
	boolean_t exception;
	uint64_t pr_icmp_timestamp;
	uint16_t pr_icmp_seq;
	struct	probe_stats *pr_statp;
	struct	phyint_group *pg = pii->pii_phyint->pi_group;

	/* Get the printable address for error reporting */
	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));

	if (debug & D_PROBE) {
		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
		    AF_STR(pii->pii_af), pii->pii_name, abuf,
		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
	}

	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
	pr_icmp_seq = ntohs(reply->pr_icmp_seq);

	/* Reject out of window probe replies */
	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		pii->pii_cum_stats.unknown++;
		return;
	}

	cur_hrtime = gethrtime();
	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
	if (m < 0) {
		/*
		 * This is a ridiculously high value of rtt. rtt has wrapped
		 * around. Log a message, and ignore the rtt. Processing
		 * continues: the (m < 0) test further down skips the rtt
		 * update but the probe is still marked as acked.
		 */
		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
	}

	/*
	 * Get the probe index pr_ndx corresponding to the received icmp seq.
	 * number in our pii->pii_probes[] array. The icmp sequence number
	 * pii_snxt corresponds to the probe index pii->pii_probe_next
	 */
	pr_ndx = MOD_SUB(pii->pii_probe_next,
	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);

	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));

	target = pii->pii_probes[pr_ndx].pr_target;

	/*
	 * Perform sanity checks, whether this probe reply that we
	 * have received is genuine
	 */
	if (target != NULL) {
		/*
		 * Compare the src. addr of the received ICMP or ICMPv6
		 * probe reply with the target address in our tables.
		 */
		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
			/*
			 * We don't have any record of having sent a probe to
			 * this target. This is a fake probe reply. Log an error
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			/*
			 * The address matches, but our tables indicate that
			 * this probe reply has been acked already. So this
			 * is a duplicate probe reply. Log an error
			 */
			logtrace("probe status %d Duplicate probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		}
	} else {
		/*
		 * Target must not be NULL in the PR_UNACKED state
		 */
		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
			/*
			 * The probe stats slot is unused. So we didn't
			 * send out any probe to this target. This is a fake.
			 * Log an error.
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		}
		pii->pii_cum_stats.unknown++;
		return;
	}

	/*
	 * If the rtt does not appear to be right, don't update the
	 * rtt stats. This can happen if the system dropped into the
	 * debugger, or the system was hung or too busy for a
	 * substantial time that we didn't get a chance to run.
	 */
	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
		/*
		 * If the probe corresponding to this received response
		 * was truly sent 'm' ns. ago, then this response must
		 * have been rejected by the sequence number checks. The
		 * fact that it has passed the sequence number checks
		 * means that the measured rtt is wrong. We were probably
		 * scheduled long after the packet was received.
		 */
		goto out;
	}

	/*
	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
	 * The initial few responses after the interface is repaired may
	 * contain high rtt's because they could have been queued up waiting
	 * for ARP/NDP resolution on a failed interface.
	 */
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
		goto out;

	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
	if (!highest_ack_tg(pr_icmp_seq, target))
		goto out;

	/*
	 * Always update the rtt. This is a failure detection probe
	 * and we want to measure both increase / decrease in rtt.
	 */
	pi_set_crtt(target, m, _B_TRUE);

	/*
	 * If the crtt exceeds the average time between probes,
	 * investigate if this slow target is an exception. If so we
	 * can avoid this target and still meet the failure detection
	 * time. Otherwise we can't meet the failure detection time.
	 */
	if (target->tg_crtt > pg->pg_probeint) {
		exception = check_exception_target(pii, target);
		if (exception) {
			/*
			 * This target is exceptionally slow. Don't use it
			 * for future probes. check_exception_target() has
			 * made sure that we have at least MIN_PROBE_TARGETS
			 * other active targets
			 */
			if (pii->pii_targets_are_routers) {
				/*
				 * This is a slow router, mark it as slow
				 * and don't use it for further probes. We
				 * don't delete it, since it will be populated
				 * again when we do a router scan. Hence we
				 * need to maintain extra state (unlike the
				 * host case below).  Mark it as TG_SLOW.
				 */
				if (target->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				target->tg_status = TG_SLOW;
				target->tg_latime = gethrtime();
				target->tg_rtt_sa = -1;
				target->tg_crtt = 0;
				target->tg_rtt_sd = 0;
				if (pii->pii_target_next == target) {
					pii->pii_target_next =
					    target_next(target);
				}
			} else {
				/*
				 * the slow target is not a router, we can
				 * just delete it. Send an icmp multicast and
				 * pick the fastest responder that is not
				 * already an active target. target_delete()
				 * adjusts pii->pii_target_next
				 */
				target_delete(target);
				probe(pii, PROBE_MULTI, cur_hrtime);
			}
		} else {
			/*
			 * We can't meet the failure detection time.
			 * Log a message, and update the detection time to
			 * whatever we can achieve.
			 */
			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
			last_fdt_bumpup_time = gethrtime();
			if (pg != phyint_anongroup) {
				logerr("Cannot meet requested failure detection"
				    " time of %d ms on (%s %s) new failure"
				    " detection time for group \"%s\" is %d"
				    " ms\n", user_failure_detection_time,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name, pg->pg_fdt);
			}
		}
	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
	    (user_failure_detection_time < pg->pg_fdt) &&
	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
		/*
		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
		 * investigate if we can improve the failure detection time to
		 * meet whatever the user specified.
		 */
		if (check_pg_crtt_improved(pg)) {
			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
			    user_failure_detection_time);
			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
			if (pg != phyint_anongroup) {
				logerr("Improved failure detection time %d ms "
				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name);
			}
			if (user_failure_detection_time == pg->pg_fdt) {
				/* Avoid any truncation or rounding errors */
				pg->pg_probeint = user_probe_interval;
				/*
				 * No more rtt probes will be sent. The actual
				 * fdt has dropped to the user specified value.
				 * pii_fd_snxt_basetime and pii_snxt_basetime
				 * will be in sync henceforth.
				 */
				reset_snxt_basetimes();
			}
		}
	}
out:
	/*
	 * Record when the ack was processed and estimate when it was
	 * received: the hrtime at which the probe was sent plus the
	 * wall-clock interval between sending it and receiving the reply.
	 */
	pr_statp = &pii->pii_probes[pr_ndx];
	pr_statp->pr_hrtime_ackproc = cur_hrtime;
	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));

	probe_chstate(pr_statp, pii, PR_ACKED);

	/*
	 * Update pii->pii_rack, i.e. the sequence number of the last received
	 * probe response, based on the echo reply we have received now, if
	 * either of the following conditions are satisfied.
	 * a. pii_rack is outside the current receive window of
	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
	 *    This means we have not received probe responses for a
	 *    long time, and the sequence number has wrapped around.
	 * b. pii_rack is within the current receive window and this echo
	 *    reply corresponds to the highest sequence number we have seen
	 *    so far.
	 */
	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
		pii->pii_rack = pr_icmp_seq;
	}
}

/*
 * Returns false if a probe to target tg with a sequence number higher than
 * seq has already been acked, i.e. seq is not the highest acked seq for tg.
 * Returns true otherwise.
 *
 * The probe stats array of tg's phyint instance is walked backwards from
 * the most recently sent probe; the walk stops on reaching the slot at
 * pii_probe_next, which is itself not examined.
 */
static boolean_t
highest_ack_tg(uint16_t seq, struct target *tg)
{
	struct phyint_instance *pii;
	int	 pr_ndx;
	uint16_t pr_seq;

	pii = tg->tg_phyint_inst;

	/*
	 * Get the seq number of the most recent probe sent so far,
	 * and also get the corresponding probe index in the probe stats
	 * array.
	 */
	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	pr_seq = pii->pii_snxt;
	pr_seq--;

	/*
	 * Start from the most recent probe and walk back, trying to find
	 * an acked probe corresponding to target tg. pr_seq is decremented
	 * in step with pr_ndx so that it is always the sequence number of
	 * the slot being examined.
	 */
	for (; pr_ndx != pii->pii_probe_next;
	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
		if (pii->pii_probes[pr_ndx].pr_target == tg &&
		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			if (SEQ_GT(pr_seq, seq))
				return (_B_FALSE);
		}
	}
	return (_B_TRUE);
}

/*
 * Check whether the crtt for the group has improved by a factor of
 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
 * detection time flapping in the face of small crtt changes.
1010 */ 1011 static boolean_t 1012 check_pg_crtt_improved(struct phyint_group *pg) 1013 { 1014 struct phyint *pi; 1015 1016 if (debug & D_PROBE) 1017 logdebug("check_pg_crtt_improved()\n"); 1018 1019 /* 1020 * The crtt for the group is only improved if each phyint_instance 1021 * for both ipv4 and ipv6 is improved. 1022 */ 1023 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1024 if (!check_pii_crtt_improved(pi->pi_v4) || 1025 !check_pii_crtt_improved(pi->pi_v6)) 1026 return (_B_FALSE); 1027 } 1028 1029 return (_B_TRUE); 1030 } 1031 1032 /* 1033 * Check whether the crtt has improved substantially on this phyint_instance. 1034 * Returns _B_TRUE if there's no crtt information available, because pii 1035 * is NULL or the phyint_instance is not capable of probing. 1036 */ 1037 boolean_t 1038 check_pii_crtt_improved(struct phyint_instance *pii) { 1039 struct target *tg; 1040 1041 if (pii == NULL) 1042 return (_B_TRUE); 1043 1044 if (!PROBE_CAPABLE(pii) || 1045 pii->pii_phyint->pi_state == PI_FAILED) 1046 return (_B_TRUE); 1047 1048 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1049 if (tg->tg_status != TG_ACTIVE) 1050 continue; 1051 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1052 LOWER_FDT_TRIGGER)) { 1053 return (_B_FALSE); 1054 } 1055 } 1056 1057 return (_B_TRUE); 1058 } 1059 1060 /* 1061 * This target responds very slowly to probes. The target's crtt exceeds 1062 * the probe interval of its group. 
Compare against other targets
 * and determine if this target is an exception, if so return true, else false
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	char addrbuf[INET6_ADDRSTRLEN];
	struct target *other;

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    addrbuf, sizeof (addrbuf)));
	}

	/*
	 * Without at least MIN_PROBE_TARGETS + 1 targets there is not
	 * enough evidence to single this one out, so keep it.
	 */
	if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
		return (_B_FALSE);

	/*
	 * The caller has established that this target's crtt exceeds the
	 * group's probe interval.  It is an exception only if no other
	 * active target has a crtt above
	 * (this group's probe interval) / EXCEPTION_FACTOR.
	 */
	other = pii->pii_targets;
	while (other != NULL) {
		if (other != target && other->tg_status == TG_ACTIVE &&
		    other->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint / EXCEPTION_FACTOR)
			return (_B_FALSE);

		other = other->tg_next;
	}

	return (_B_TRUE);
}

/*
 * Update the target list. The icmp all hosts multicast has given us
 * some host to which we can send probes. If we already have sufficient
 * targets, discard it.
1109 */ 1110 static void 1111 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1112 struct in6_addr fromaddr) 1113 /* ARGSUSED */ 1114 { 1115 int af; 1116 char abuf[INET6_ADDRSTRLEN]; 1117 struct phyint *pi; 1118 1119 if (debug & D_PROBE) { 1120 logdebug("incoming_mcast_reply(%s %s %s)\n", 1121 AF_STR(pii->pii_af), pii->pii_name, 1122 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1123 } 1124 1125 /* 1126 * Using host targets is a fallback mechanism. If we have 1127 * found a router, don't add this host target. If we already 1128 * know MAX_PROBE_TARGETS, don't add another target. 1129 */ 1130 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1131 if (pii->pii_targets != NULL) { 1132 if (pii->pii_targets_are_routers || 1133 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1134 return; 1135 } 1136 } 1137 1138 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1139 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1140 /* 1141 * Guard against response from 0.0.0.0 1142 * and ::. Log a trace message 1143 */ 1144 logtrace("probe response from %s on %s\n", 1145 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1146 pii->pii_name); 1147 return; 1148 } 1149 1150 /* 1151 * This address is one of our own, so reject this address as a 1152 * valid probe target. 1153 */ 1154 af = pii->pii_af; 1155 if (own_address(fromaddr)) 1156 return; 1157 1158 /* 1159 * If the phyint is part a named group, then add the address to all 1160 * members of the group. Otherwise, add the address only to the 1161 * phyint itself, since other phyints in the anongroup may not be on 1162 * the same subnet. 
1163 */ 1164 pi = pii->pii_phyint; 1165 if (pi->pi_group == phyint_anongroup) { 1166 target_add(pii, fromaddr, _B_FALSE); 1167 } else { 1168 pi = pi->pi_group->pg_phyint; 1169 for (; pi != NULL; pi = pi->pi_pgnext) 1170 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1171 } 1172 } 1173 1174 /* 1175 * Compute CRTT given an existing scaled average, scaled deviation estimate 1176 * and a new rtt time. The formula is from Jacobson and Karels' 1177 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1178 * are the same as those in Appendix A.2 of that paper. 1179 * 1180 * m = new measurement 1181 * sa = scaled RTT average (8 * average estimates) 1182 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1183 * crtt = Conservative round trip time. Used to determine whether probe 1184 * has timed out. 1185 * 1186 * New scaled average and deviation are passed back via sap and svp 1187 */ 1188 static int64_t 1189 compute_crtt(int64_t *sap, int64_t *svp, int64_t m) 1190 { 1191 int64_t sa = *sap; 1192 int64_t sv = *svp; 1193 int64_t crtt; 1194 int64_t saved_m = m; 1195 1196 assert(*sap >= -1); 1197 assert(*svp >= 0); 1198 1199 if (sa != -1) { 1200 /* 1201 * Update average estimator: 1202 * new rtt = old rtt + 1/8 Error 1203 * where Error = m - old rtt 1204 * i.e. 8 * new rtt = 8 * old rtt + Error 1205 * i.e. new sa = old sa + Error 1206 */ 1207 m -= sa >> 3; /* m is now Error in estimate. */ 1208 if ((sa += m) < 0) { 1209 /* Don't allow the smoothed average to be negative. */ 1210 sa = 0; 1211 } 1212 1213 /* 1214 * Update deviation estimator: 1215 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1216 * i.e. 4 * new mdev = 4 * old mdev + 1217 * (abs(Error) - old mdev) 1218 * i.e. new sv = old sv + (abs(Error) - old mdev) 1219 */ 1220 if (m < 0) 1221 m = -m; 1222 m -= sv >> 2; 1223 sv += m; 1224 } else { 1225 /* Initialization. This is the first response received. 
*/ 1226 sa = (m << 3); 1227 sv = (m << 1); 1228 } 1229 1230 crtt = (sa >> 3) + sv; 1231 1232 if (debug & D_PROBE) { 1233 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " 1234 "crtt = %lld\n", saved_m, sa, sv, crtt); 1235 } 1236 1237 *sap = sa; 1238 *svp = sv; 1239 1240 /* 1241 * CRTT = average estimates + 4 * deviation estimates 1242 * = sa / 8 + sv 1243 */ 1244 return (crtt); 1245 } 1246 1247 static void 1248 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) 1249 { 1250 struct phyint_instance *pii = tg->tg_phyint_inst; 1251 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1252 int64_t sa = tg->tg_rtt_sa; 1253 int64_t sv = tg->tg_rtt_sd; 1254 int new_crtt; 1255 int i; 1256 1257 if (debug & D_PROBE) 1258 logdebug("pi_set_crtt: target - m %lld\n", m); 1259 1260 /* store the round trip time, in case we need to defer computation */ 1261 tg->tg_deferred[tg->tg_num_deferred] = m; 1262 1263 new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); 1264 1265 /* 1266 * If this probe's round trip time would singlehandedly cause an 1267 * increase in the group's probe interval consider it suspect. 1268 */ 1269 if ((new_crtt > probe_interval) && is_probe_uni) { 1270 if (debug & D_PROBE) { 1271 logdebug("Received a suspect probe on %s, new_crtt =" 1272 " %d, probe_interval = %d, num_deferred = %d\n", 1273 pii->pii_probe_logint->li_name, new_crtt, 1274 probe_interval, tg->tg_num_deferred); 1275 } 1276 1277 /* 1278 * If we've deferred as many rtts as we plan on deferring, then 1279 * assume the link really did slow down and process all queued 1280 * rtts 1281 */ 1282 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1283 if (debug & D_PROBE) { 1284 logdebug("Received MAXDEFERREDRTT probes which " 1285 "would cause an increased probe_interval. 
" 1286 "Integrating queued rtt data points.\n"); 1287 } 1288 1289 for (i = 0; i <= tg->tg_num_deferred; i++) { 1290 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, 1291 &tg->tg_rtt_sd, tg->tg_deferred[i])); 1292 } 1293 1294 tg->tg_num_deferred = 0; 1295 } else { 1296 tg->tg_num_deferred++; 1297 } 1298 return; 1299 } 1300 1301 /* 1302 * If this is a normal probe, or an RTT probe that would lead to a 1303 * reduced CRTT, then update our CRTT data. Further, if this was 1304 * a normal probe, pitch any deferred probes since our probes are 1305 * again being answered within our CRTT estimates. 1306 */ 1307 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1308 tg->tg_rtt_sa = sa; 1309 tg->tg_rtt_sd = sv; 1310 tg->tg_crtt = new_crtt; 1311 if (is_probe_uni) 1312 tg->tg_num_deferred = 0; 1313 } 1314 } 1315 1316 /* 1317 * Return a pointer to the specified option buffer. 1318 * If not found return NULL. 1319 */ 1320 static void * 1321 find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) 1322 { 1323 struct cmsghdr *cmsg; 1324 1325 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1326 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1327 if (cmsg->cmsg_level == cmsg_level && 1328 cmsg->cmsg_type == cmsg_type) { 1329 return (CMSG_DATA(cmsg)); 1330 } 1331 } 1332 return (NULL); 1333 } 1334 1335 /* 1336 * Try to activate another INACTIVE interface in the same group as `pi'. 1337 * Prefer STANDBY INACTIVE to just INACTIVE. 
1338 */ 1339 void 1340 phyint_activate_another(struct phyint *pi) 1341 { 1342 struct phyint *pi2; 1343 struct phyint *inactivepi = NULL; 1344 1345 if (pi->pi_group == phyint_anongroup) 1346 return; 1347 1348 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1349 if (pi == pi2 || pi2->pi_state != PI_RUNNING || 1350 !(pi2->pi_flags & IFF_INACTIVE)) 1351 continue; 1352 1353 inactivepi = pi2; 1354 if (pi2->pi_flags & IFF_STANDBY) 1355 break; 1356 } 1357 1358 if (inactivepi != NULL) 1359 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); 1360 } 1361 1362 /* 1363 * Transition a phyint back to PI_RUNNING (from PI_FAILED or PI_OFFLINE). The 1364 * caller must ensure that the transition is appropriate. Clears IFF_OFFLINE 1365 * or IFF_FAILED, as appropriate. Also sets IFF_INACTIVE on this or other 1366 * interfaces as appropriate (see comment below). Finally, also updates the 1367 * phyint's group state to account for the change. 1368 */ 1369 void 1370 phyint_transition_to_running(struct phyint *pi) 1371 { 1372 struct phyint *pi2; 1373 struct phyint *actstandbypi = NULL; 1374 uint_t nactive = 0, nnonstandby = 0; 1375 boolean_t onlining = (pi->pi_state == PI_OFFLINE); 1376 uint64_t set, clear; 1377 1378 /* 1379 * The interface is running again, but should it or another interface 1380 * in the group end up INACTIVE? There are three cases: 1381 * 1382 * 1. If it's a STANDBY interface, it should be end up INACTIVE if 1383 * the group is operating at capacity (i.e., there are at least as 1384 * many active interfaces as non-STANDBY interfaces in the group). 1385 * No other interfaces should be changed. 1386 * 1387 * 2. If it's a non-STANDBY interface and we're onlining it or 1388 * FAILBACK is enabled, then it should *not* end up INACTIVE. 1389 * Further, if the group is above capacity as a result of this 1390 * interface, then an active STANDBY interface in the group should 1391 * end up INACTIVE. 1392 * 1393 * 3. 
If it's a non-STANDBY interface, we're repairing it, and 1394 * FAILBACK is disabled, then it should end up INACTIVE *unless* 1395 * the group was failed (in which case we have no choice but to 1396 * use it). No other interfaces should be changed. 1397 */ 1398 if (pi->pi_group != phyint_anongroup) { 1399 pi2 = pi->pi_group->pg_phyint; 1400 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1401 if (!(pi2->pi_flags & IFF_STANDBY)) 1402 nnonstandby++; 1403 1404 if (pi2->pi_state == PI_RUNNING) { 1405 if (!(pi2->pi_flags & IFF_INACTIVE)) { 1406 nactive++; 1407 if (pi2->pi_flags & IFF_STANDBY) 1408 actstandbypi = pi2; 1409 } 1410 } 1411 } 1412 } 1413 1414 set = 0; 1415 clear = (onlining ? IFF_OFFLINE : IFF_FAILED); 1416 1417 if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ 1418 if (nactive >= nnonstandby) 1419 set |= IFF_INACTIVE; 1420 else 1421 clear |= IFF_INACTIVE; 1422 } else if (onlining || failback_enabled) { /* case 2 */ 1423 if (nactive >= nnonstandby && actstandbypi != NULL) 1424 (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); 1425 } else if (!GROUP_FAILED(pi->pi_group)) { /* case 3 */ 1426 set |= IFF_INACTIVE; 1427 } 1428 (void) change_pif_flags(pi, set, clear); 1429 1430 phyint_chstate(pi, PI_RUNNING); 1431 1432 /* 1433 * Update the group state to account for the change. 1434 */ 1435 phyint_group_refresh_state(pi->pi_group); 1436 } 1437 1438 /* 1439 * See if a previously failed interface has started working again. 1440 */ 1441 void 1442 phyint_check_for_repair(struct phyint *pi) 1443 { 1444 if (!phyint_repaired(pi)) 1445 return; 1446 1447 if (pi->pi_group == phyint_anongroup) { 1448 logerr("IP interface repair detected on %s\n", pi->pi_name); 1449 } else { 1450 logerr("IP interface repair detected on %s of group %s\n", 1451 pi->pi_name, pi->pi_group->pg_name); 1452 } 1453 1454 /* 1455 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. 
1456 * So just clear IFF_OFFLINE and defer phyint_transition_to_running() 1457 * until it is brought back online. 1458 */ 1459 if (pi->pi_state == PI_OFFLINE) { 1460 (void) change_pif_flags(pi, 0, IFF_FAILED); 1461 return; 1462 } 1463 1464 phyint_transition_to_running(pi); /* calls phyint_chstate() */ 1465 } 1466 1467 /* 1468 * See if an interface has failed, or if the whole group of interfaces has 1469 * failed. 1470 */ 1471 static void 1472 phyint_inst_check_for_failure(struct phyint_instance *pii) 1473 { 1474 struct phyint *pi = pii->pii_phyint; 1475 struct phyint *pi2; 1476 boolean_t was_active; 1477 1478 switch (failure_state(pii)) { 1479 case PHYINT_FAILURE: 1480 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); 1481 1482 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); 1483 if (pi->pi_group == phyint_anongroup) { 1484 logerr("IP interface failure detected on %s\n", 1485 pii->pii_name); 1486 } else { 1487 logerr("IP interface failure detected on %s of group" 1488 " %s\n", pii->pii_name, pi->pi_group->pg_name); 1489 } 1490 1491 /* 1492 * If the failed interface was active, activate another 1493 * INACTIVE interface in the group if possible. 1494 */ 1495 if (was_active) 1496 phyint_activate_another(pi); 1497 1498 /* 1499 * If the interface is offline, the state change will be 1500 * noted when it comes back online. 1501 */ 1502 if (pi->pi_state != PI_OFFLINE) { 1503 phyint_chstate(pi, PI_FAILED); 1504 reset_crtt_all(pi); 1505 } 1506 break; 1507 1508 case GROUP_FAILURE: 1509 pi2 = pi->pi_group->pg_phyint; 1510 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1511 (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); 1512 if (pi2->pi_state == PI_OFFLINE) /* see comment above */ 1513 continue; 1514 1515 reset_crtt_all(pi2); 1516 /* 1517 * In the case of host targets, we would have flushed 1518 * the targets, and gone to PI_NOTARGETS state. 
1519 */ 1520 if (pi2->pi_state == PI_RUNNING) 1521 phyint_chstate(pi2, PI_FAILED); 1522 } 1523 break; 1524 1525 default: 1526 break; 1527 } 1528 } 1529 1530 /* 1531 * Determines if any timeout event has occurred and returns the number of 1532 * milliseconds until the next timeout event for the phyint. Returns 1533 * TIMER_INFINITY for "never". 1534 */ 1535 uint_t 1536 phyint_inst_timer(struct phyint_instance *pii) 1537 { 1538 int pr_ndx; 1539 uint_t timeout; 1540 struct target *cur_tg; 1541 struct probe_stats *pr_statp; 1542 struct phyint_instance *pii_other; 1543 struct phyint *pi; 1544 int valid_unack_count; 1545 int i; 1546 int interval; 1547 uint_t check_time; 1548 uint_t cur_time; 1549 hrtime_t cur_hrtime; 1550 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1551 1552 cur_hrtime = gethrtime(); 1553 cur_time = ns2ms(cur_hrtime); 1554 1555 if (debug & D_TIMER) { 1556 logdebug("phyint_inst_timer(%s %s)\n", 1557 AF_STR(pii->pii_af), pii->pii_name); 1558 } 1559 1560 pii_other = phyint_inst_other(pii); 1561 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1562 /* 1563 * Check to see if we're here due to link up/down flapping; If 1564 * enough time has passed, then try to bring the interface 1565 * back up; otherwise, schedule a timer to bring it back up 1566 * when enough time *has* elapsed. 1567 */ 1568 pi = pii->pii_phyint; 1569 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1570 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1571 if (check_time > cur_time) 1572 return (check_time - cur_time); 1573 1574 phyint_check_for_repair(pi); 1575 } 1576 } 1577 1578 /* 1579 * If probing is not enabled on this phyint instance, don't proceed. 
1580 */ 1581 if (!PROBE_ENABLED(pii)) 1582 return (TIMER_INFINITY); 1583 1584 /* 1585 * If the timer has fired too soon, probably triggered 1586 * by some other phyint instance, return the remaining 1587 * time 1588 */ 1589 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1590 return (pii->pii_snxt_time - cur_time); 1591 1592 /* 1593 * If the link is down, don't send any probes for now. 1594 */ 1595 if (LINK_DOWN(pii->pii_phyint)) 1596 return (TIMER_INFINITY); 1597 1598 /* 1599 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1600 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1601 * Base probe time is strictly periodic. 1602 */ 1603 interval = GET_RANDOM( 1604 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1605 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1606 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1607 1608 /* 1609 * Check if the current time > next time to probe. If so, we missed 1610 * sending 1 or more probes, probably due to heavy system load. At least 1611 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1612 * were scheduled. Make adjustments to the times, in multiples of 1613 * user_probe_interval. 
1614 */ 1615 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1616 int n; 1617 1618 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1619 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1620 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1621 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1622 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1623 pii->pii_snxt_basetime); 1624 1625 /* Collect statistics about missed probes */ 1626 probes_missed.pm_nprobes += n + 1; 1627 probes_missed.pm_ntimes++; 1628 } 1629 pii->pii_snxt_basetime += user_probe_interval; 1630 interval = pii->pii_snxt_time - cur_time; 1631 if (debug & D_TARGET) { 1632 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1633 " interval %u\n", cur_time, pii->pii_snxt_time, 1634 pii->pii_snxt_basetime, interval); 1635 } 1636 1637 /* 1638 * If no targets are known, we need to send an ICMP multicast. The 1639 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1640 * to see if we found a target. 1641 */ 1642 if (pii->pii_target_next == NULL) { 1643 assert(pii->pii_ntargets == 0); 1644 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1645 probe(pii, PROBE_MULTI, cur_time); 1646 return (interval); 1647 } 1648 1649 if ((user_probe_interval != probe_interval) && 1650 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1651 /* 1652 * the failure detection (fd) probe timer has not yet fired. 1653 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1654 */ 1655 probe(pii, PROBE_RTT, cur_hrtime); 1656 return (interval); 1657 } 1658 /* 1659 * the fd probe timer has fired. Need to do all failure 1660 * detection / recovery calculations, and then send an fd probe 1661 * of type PROBE_UNI. 1662 */ 1663 if (user_probe_interval == probe_interval) { 1664 /* 1665 * We could have missed some probes, and then adjusted 1666 * pii_snxt_basetime above. Otherwise we could have 1667 * blindly added probe_interval to pii_fd_snxt_basetime. 
1668 */ 1669 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1670 } else { 1671 pii->pii_fd_snxt_basetime += probe_interval; 1672 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1673 int n; 1674 1675 n = (cur_time - pii->pii_fd_snxt_basetime) / 1676 probe_interval; 1677 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1678 } 1679 } 1680 1681 /* 1682 * We can have at most, the latest 2 probes that we sent, in 1683 * the PR_UNACKED state. All previous probes sent, are either 1684 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1685 * timed out if the probe's time_start + the CRTT < currenttime. 1686 * For each of the last 2 probes, examine whether it has timed 1687 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1688 */ 1689 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1690 valid_unack_count = 0; 1691 1692 for (i = 0; i < 2; i++) { 1693 pr_statp = &pii->pii_probes[pr_ndx]; 1694 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1695 switch (pr_statp->pr_status) { 1696 case PR_ACKED: 1697 /* 1698 * We received back an ACK, so the switch clearly 1699 * is not dropping our traffic, and thus we can 1700 * enable failure detection immediately. 1701 */ 1702 if (pii->pii_fd_hrtime > gethrtime()) { 1703 if (debug & D_PROBE) { 1704 logdebug("successful probe on %s; " 1705 "ending quiet period\n", 1706 pii->pii_phyint->pi_name); 1707 } 1708 pii->pii_fd_hrtime = gethrtime(); 1709 } 1710 break; 1711 1712 case PR_UNACKED: 1713 assert(cur_tg != NULL); 1714 /* 1715 * The crtt could be zero for some reason, 1716 * Eg. the phyint could be failed. If the crtt is 1717 * not available use group's probe interval, 1718 * which is a worst case estimate. 
1719 */ 1720 timeout = ns2ms(pr_statp->pr_hrtime_start); 1721 if (cur_tg->tg_crtt != 0) { 1722 timeout += cur_tg->tg_crtt; 1723 } else { 1724 timeout += probe_interval; 1725 } 1726 if (TIME_LT(timeout, cur_time)) { 1727 pr_statp->pr_time_lost = timeout; 1728 probe_chstate(pr_statp, pii, PR_LOST); 1729 } else if (i == 1) { 1730 /* 1731 * We are forced to consider this probe 1732 * lost, as we can have at most 2 unack. 1733 * probes any time, and we will be sending a 1734 * probe at the end of this function. 1735 * Normally, we should not be here, but 1736 * this can happen if an incoming response 1737 * that was considered lost has increased 1738 * the crtt for this target, and also bumped 1739 * up the FDT. Note that we never cancel or 1740 * increase the current pii_time_left, so 1741 * when the timer fires, we find 2 valid 1742 * unacked probes, and they are yet to timeout 1743 */ 1744 pr_statp->pr_time_lost = cur_time; 1745 probe_chstate(pr_statp, pii, PR_LOST); 1746 } else { 1747 /* 1748 * Only the most recent probe can enter 1749 * this 'else' arm. The second most recent 1750 * probe must take either of the above arms, 1751 * if it is unacked. 1752 */ 1753 valid_unack_count++; 1754 } 1755 break; 1756 } 1757 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1758 } 1759 1760 /* 1761 * We send out 1 probe randomly in the interval between one half 1762 * and one probe interval for the group. Given that the CRTT is always 1763 * less than the group's probe interval, we can have at most 1 1764 * unacknowledged probe now. All previous probes are either lost or 1765 * acked. 1766 */ 1767 assert(valid_unack_count == 0 || valid_unack_count == 1); 1768 1769 /* 1770 * The timer has fired. Take appropriate action depending 1771 * on the current state of the phyint. 
1772 * 1773 * PI_RUNNING state - Failure detection 1774 * PI_FAILED state - Repair detection 1775 */ 1776 switch (pii->pii_phyint->pi_state) { 1777 case PI_FAILED: 1778 /* 1779 * If the most recent probe (excluding unacked probes that 1780 * are yet to time out) has been acked, check whether the 1781 * phyint is now repaired. 1782 */ 1783 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1784 phyint_check_for_repair(pii->pii_phyint); 1785 } 1786 break; 1787 1788 case PI_RUNNING: 1789 /* 1790 * It's possible our probes have been lost because of a 1791 * spanning-tree mandated quiet period on the switch. If so, 1792 * ignore the lost probes. 1793 */ 1794 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1795 break; 1796 1797 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1798 /* 1799 * We have 1 or more failed probes (excluding unacked 1800 * probes that are yet to time out). Determine if the 1801 * phyint has failed. 1802 */ 1803 phyint_inst_check_for_failure(pii); 1804 } 1805 break; 1806 1807 default: 1808 logerr("phyint_inst_timer: invalid state %d\n", 1809 pii->pii_phyint->pi_state); 1810 abort(); 1811 } 1812 1813 /* 1814 * Start the next probe. probe() will also set pii->pii_probe_time_left 1815 * to the group's probe interval. If phyint_failed -> target_flush_hosts 1816 * was called, the target list may be empty. 1817 */ 1818 if (pii->pii_target_next != NULL) { 1819 probe(pii, PROBE_UNI, cur_hrtime); 1820 /* 1821 * If we have just the one probe target, and we're not using 1822 * router targets, try to find another as we presently have 1823 * no resilience. 1824 */ 1825 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1826 probe(pii, PROBE_MULTI, cur_hrtime); 1827 } else { 1828 probe(pii, PROBE_MULTI, cur_hrtime); 1829 } 1830 return (interval); 1831 } 1832 1833 /* 1834 * Start the probe timer for an interface instance. 
1835 */ 1836 void 1837 start_timer(struct phyint_instance *pii) 1838 { 1839 uint32_t interval; 1840 1841 /* 1842 * Spread the base probe times (pi_snxt_basetime) across phyints 1843 * uniformly over the (curtime..curtime + the group's probe_interval). 1844 * pi_snxt_basetime is strictly periodic with a frequency of 1845 * the group's probe interval. The actual probe time pi_snxt_time 1846 * adds some randomness to pi_snxt_basetime and happens in probe(). 1847 * For the 1st probe on each phyint after the timer is started, 1848 * pi_snxt_time and pi_snxt_basetime are the same. 1849 */ 1850 interval = GET_RANDOM(0, 1851 (int)pii->pii_phyint->pi_group->pg_probeint); 1852 1853 pii->pii_snxt_basetime = getcurrenttime() + interval; 1854 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1855 pii->pii_snxt_time = pii->pii_snxt_basetime; 1856 timer_schedule(interval); 1857 } 1858 1859 /* 1860 * Restart the probe timer on an interface instance. 1861 */ 1862 static void 1863 restart_timer(struct phyint_instance *pii) 1864 { 1865 /* 1866 * We don't need to restart the timer if it was never started in 1867 * the first place (pii->pii_basetime_inited not set), as the timer 1868 * won't have gone off yet. 1869 */ 1870 if (pii->pii_basetime_inited != 0) { 1871 1872 if (debug & D_LINKNOTE) 1873 logdebug("restart timer: restarting timer on %s, " 1874 "address family %s\n", pii->pii_phyint->pi_name, 1875 AF_STR(pii->pii_af)); 1876 1877 start_timer(pii); 1878 } 1879 } 1880 1881 static void 1882 process_link_state_down(struct phyint *pi) 1883 { 1884 logerr("The link has gone down on %s\n", pi->pi_name); 1885 1886 /* 1887 * Clear the probe statistics arrays, we don't want the repair 1888 * detection logic relying on probes that were successful prior 1889 * to the link going down. 1890 */ 1891 if (PROBE_CAPABLE(pi->pi_v4)) 1892 clear_pii_probe_stats(pi->pi_v4); 1893 if (PROBE_CAPABLE(pi->pi_v6)) 1894 clear_pii_probe_stats(pi->pi_v6); 1895 /* 1896 * Check for interface failure. 
Although we know the interface 1897 * has failed, we don't know if all the other interfaces in the 1898 * group have failed as well. 1899 */ 1900 if ((pi->pi_state == PI_RUNNING) || 1901 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1902 if (debug & D_LINKNOTE) { 1903 logdebug("process_link_state_down:" 1904 " checking for failure on %s\n", pi->pi_name); 1905 } 1906 1907 if (pi->pi_v4 != NULL) 1908 phyint_inst_check_for_failure(pi->pi_v4); 1909 else if (pi->pi_v6 != NULL) 1910 phyint_inst_check_for_failure(pi->pi_v6); 1911 } 1912 } 1913 1914 static void 1915 process_link_state_up(struct phyint *pi) 1916 { 1917 logerr("The link has come up on %s\n", pi->pi_name); 1918 1919 /* 1920 * We stopped any running timers on each instance when the link 1921 * went down, so restart them. 1922 */ 1923 if (pi->pi_v4) 1924 restart_timer(pi->pi_v4); 1925 if (pi->pi_v6) 1926 restart_timer(pi->pi_v6); 1927 1928 phyint_check_for_repair(pi); 1929 1930 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1931 if (pi->pi_whendx == LINK_UP_PERMIN) 1932 pi->pi_whendx = 0; 1933 } 1934 1935 /* 1936 * Process any changes in link state passed up from the interfaces. 1937 */ 1938 void 1939 process_link_state_changes(void) 1940 { 1941 struct phyint *pi; 1942 1943 /* Look for interfaces where the link state has just changed */ 1944 1945 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1946 boolean_t old_link_state_up = LINK_UP(pi); 1947 1948 /* 1949 * Except when the "phyint" structure is created, this is 1950 * the only place the link state is updated. This allows 1951 * this routine to detect changes in link state, rather 1952 * than just the current state. 1953 */ 1954 UPDATE_LINK_STATE(pi); 1955 1956 if (LINK_DOWN(pi)) { 1957 /* 1958 * Has link just gone down? 1959 */ 1960 if (old_link_state_up) 1961 process_link_state_down(pi); 1962 } else { 1963 /* 1964 * Has link just gone back up? 
1965 */ 1966 if (!old_link_state_up) 1967 process_link_state_up(pi); 1968 } 1969 } 1970 } 1971 1972 void 1973 reset_crtt_all(struct phyint *pi) 1974 { 1975 struct phyint_instance *pii; 1976 struct target *tg; 1977 1978 pii = pi->pi_v4; 1979 if (pii != NULL) { 1980 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1981 tg->tg_crtt = 0; 1982 tg->tg_rtt_sa = -1; 1983 tg->tg_rtt_sd = 0; 1984 } 1985 } 1986 1987 pii = pi->pi_v6; 1988 if (pii != NULL) { 1989 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1990 tg->tg_crtt = 0; 1991 tg->tg_rtt_sa = -1; 1992 tg->tg_rtt_sd = 0; 1993 } 1994 } 1995 } 1996 1997 /* 1998 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 1999 * probes on both instances IPv4 and IPv6. 2000 * If the interface has failed, return the time of the first probe failure 2001 * in "tff". 2002 */ 2003 static int 2004 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 2005 { 2006 uint_t pi_tff; 2007 struct target *cur_tg; 2008 struct probe_fail_count pfinfo; 2009 struct phyint_instance *pii_other; 2010 int pr_ndx; 2011 2012 /* 2013 * Get the number of consecutive failed probes on 2014 * this phyint across all targets. Also get the number 2015 * of consecutive failed probes on this target only 2016 */ 2017 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2018 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2019 probe_fail_info(pii, cur_tg, &pfinfo); 2020 2021 /* Get the time of first failure, for later use */ 2022 pi_tff = pfinfo.pf_tff; 2023 2024 /* 2025 * If the current target has not responded to the 2026 * last NUM_PROBE_FAILS probes, and other targets are 2027 * responding delete this target. Dead gateway detection 2028 * will eventually remove this target (if router) from the 2029 * routing tables. If that does not occur, we may end 2030 * up adding this to our list again. 
2031 */ 2032 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2033 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2034 if (pii->pii_targets_are_routers) { 2035 if (cur_tg->tg_status == TG_ACTIVE) 2036 pii->pii_ntargets--; 2037 cur_tg->tg_status = TG_DEAD; 2038 cur_tg->tg_crtt = 0; 2039 cur_tg->tg_rtt_sa = -1; 2040 cur_tg->tg_rtt_sd = 0; 2041 if (pii->pii_target_next == cur_tg) 2042 pii->pii_target_next = target_next(cur_tg); 2043 } else { 2044 target_delete(cur_tg); 2045 probe(pii, PROBE_MULTI, gethrtime()); 2046 } 2047 return (PHYINT_OK); 2048 } 2049 2050 /* 2051 * If the phyint has lost NUM_PROBE_FAILS or more 2052 * consecutive probes, on both IPv4 and IPv6 protocol 2053 * instances of the phyint, then trigger failure 2054 * detection, else return false 2055 */ 2056 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2057 return (PHYINT_OK); 2058 2059 pii_other = phyint_inst_other(pii); 2060 if (PROBE_CAPABLE(pii_other)) { 2061 probe_fail_info(pii_other, NULL, &pfinfo); 2062 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2063 /* 2064 * We have NUM_PROBE_FAILS or more failures 2065 * on both IPv4 and IPv6. Get the earliest 2066 * time when failure was detected on this 2067 * phyint across IPv4 and IPv6. 2068 */ 2069 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2070 pi_tff = pfinfo.pf_tff; 2071 } else { 2072 /* 2073 * This instance has < NUM_PROBE_FAILS failure. 2074 * So return false 2075 */ 2076 return (PHYINT_OK); 2077 } 2078 } 2079 *tff = pi_tff; 2080 return (PHYINT_FAILURE); 2081 } 2082 2083 /* 2084 * Check if the link has gone down on this phyint, or it has failed the 2085 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2086 * Also look at other phyints of this group, for group failures. 
2087 */ 2088 int 2089 failure_state(struct phyint_instance *pii) 2090 { 2091 struct probe_success_count psinfo; 2092 uint_t pi2_tls; /* time last success */ 2093 uint_t pi_tff; /* time first fail */ 2094 struct phyint *pi2; 2095 struct phyint *pi; 2096 struct phyint_instance *pii2; 2097 struct phyint_group *pg; 2098 int retval; 2099 2100 if (debug & D_FAILREP) 2101 logdebug("phyint_failed(%s)\n", pii->pii_name); 2102 2103 pi = pii->pii_phyint; 2104 pg = pi->pi_group; 2105 2106 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2107 PHYINT_OK) 2108 return (PHYINT_OK); 2109 2110 /* 2111 * At this point, the link is down, or the phyint is suspect, as it 2112 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not 2113 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue 2114 * on to determine whether this should be considered a PHYINT_FAILURE 2115 * or GROUP_FAILURE. 2116 */ 2117 if (pg == phyint_anongroup) 2118 return (PHYINT_FAILURE); 2119 2120 /* 2121 * Need to compare against other phyints of the same group 2122 * to exclude group failures. If the failure was detected via 2123 * probing, then if the time of last success (tls) of any 2124 * phyint is more recent than the time of first fail (tff) of the 2125 * phyint in question, and the link is up on the phyint, 2126 * then it is a phyint failure. Otherwise it is a group failure. 2127 * If failure was detected via a link down notification sent from 2128 * the driver to IP, we see if any phyints in the group are still 2129 * running and haven't received a link down notification. We 2130 * will usually be processing the link down notification shortly 2131 * after it was received, so there is no point looking at the tls 2132 * of other phyints. 
2133 */ 2134 retval = GROUP_FAILURE; 2135 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2136 /* Exclude ourself from comparison */ 2137 if (pi2 == pi) 2138 continue; 2139 2140 if (LINK_DOWN(pi)) { 2141 /* 2142 * We use FLAGS_TO_LINK_STATE() to test the flags 2143 * directly, rather then LINK_UP() or LINK_DOWN(), as 2144 * we may not have got round to processing the link 2145 * state for the other phyints in the group yet. 2146 * 2147 * The check for PI_RUNNING and group failure handles 2148 * the case when the group begins to recover. 2149 * PI_RUNNING will be set, and group failure cleared 2150 * only after receipt of NUM_PROBE_REPAIRS, by which 2151 * time the other phyints should have received at 2152 * least 1 packet, and so will not have NUM_PROBE_FAILS. 2153 */ 2154 if ((pi2->pi_state == PI_RUNNING) && 2155 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { 2156 retval = PHYINT_FAILURE; 2157 break; 2158 } 2159 continue; 2160 } 2161 2162 if (LINK_DOWN(pi2)) 2163 continue; 2164 2165 /* 2166 * If there's no probe-based failure detection on this 2167 * interface, and its link is still up, then it's still 2168 * working and thus the group has not failed. 2169 */ 2170 if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { 2171 retval = PHYINT_FAILURE; 2172 break; 2173 } 2174 2175 /* 2176 * Need to compare against both IPv4 and IPv6 instances. 2177 */ 2178 pii2 = pi2->pi_v4; 2179 if (pii2 != NULL) { 2180 probe_success_info(pii2, NULL, &psinfo); 2181 if (psinfo.ps_tls_valid) { 2182 pi2_tls = psinfo.ps_tls; 2183 /* 2184 * See comment above regarding check 2185 * for PI_RUNNING and group failure. 
2186 */ 2187 if (TIME_GT(pi2_tls, pi_tff) && 2188 (pi2->pi_state == PI_RUNNING) && 2189 !GROUP_FAILED(pg) && 2190 FLAGS_TO_LINK_STATE(pi2)) { 2191 retval = PHYINT_FAILURE; 2192 break; 2193 } 2194 } 2195 } 2196 2197 pii2 = pi2->pi_v6; 2198 if (pii2 != NULL) { 2199 probe_success_info(pii2, NULL, &psinfo); 2200 if (psinfo.ps_tls_valid) { 2201 pi2_tls = psinfo.ps_tls; 2202 /* 2203 * See comment above regarding check 2204 * for PI_RUNNING and group failure. 2205 */ 2206 if (TIME_GT(pi2_tls, pi_tff) && 2207 (pi2->pi_state == PI_RUNNING) && 2208 !GROUP_FAILED(pg) && 2209 FLAGS_TO_LINK_STATE(pi2)) { 2210 retval = PHYINT_FAILURE; 2211 break; 2212 } 2213 } 2214 } 2215 } 2216 2217 /* 2218 * Update the group state to account for the changes. 2219 */ 2220 phyint_group_refresh_state(pg); 2221 return (retval); 2222 } 2223 2224 /* 2225 * Return the information associated with consecutive probe successes 2226 * starting with the most recent probe. At most the last 2 probes can be 2227 * in the unacknowledged state. All previous probes have either failed 2228 * or succeeded. 2229 */ 2230 static void 2231 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2232 struct probe_success_count *psinfo) 2233 { 2234 uint_t i; 2235 struct probe_stats *pr_statp; 2236 uint_t most_recent; 2237 uint_t second_most_recent; 2238 boolean_t pi_found_failure = _B_FALSE; 2239 boolean_t tg_found_failure = _B_FALSE; 2240 uint_t now; 2241 uint_t timeout; 2242 struct target *tg; 2243 2244 if (debug & D_FAILREP) 2245 logdebug("probe_success_info(%s)\n", pii->pii_name); 2246 2247 bzero(psinfo, sizeof (*psinfo)); 2248 now = getcurrenttime(); 2249 2250 /* 2251 * Start with the most recent probe, and count the number 2252 * of consecutive probe successes. Latch the number of successes 2253 * on hitting a failure. 
2254 */ 2255 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2256 second_most_recent = PROBE_INDEX_PREV(most_recent); 2257 2258 for (i = most_recent; i != pii->pii_probe_next; 2259 i = PROBE_INDEX_PREV(i)) { 2260 pr_statp = &pii->pii_probes[i]; 2261 2262 switch (pr_statp->pr_status) { 2263 case PR_UNACKED: 2264 /* 2265 * Only the most recent 2 probes can be unacknowledged 2266 */ 2267 assert(i == most_recent || i == second_most_recent); 2268 2269 tg = pr_statp->pr_target; 2270 assert(tg != NULL); 2271 /* 2272 * The crtt could be zero for some reason, 2273 * Eg. the phyint could be failed. If the crtt is 2274 * not available use the value of the group's probe 2275 * interval which is a worst case estimate. 2276 */ 2277 timeout = ns2ms(pr_statp->pr_hrtime_start); 2278 if (tg->tg_crtt != 0) { 2279 timeout += tg->tg_crtt; 2280 } else { 2281 timeout += 2282 pii->pii_phyint->pi_group->pg_probeint; 2283 } 2284 2285 if (TIME_LT(timeout, now)) { 2286 /* 2287 * We hit a failure. Latch the total number of 2288 * recent consecutive successes. 2289 */ 2290 pr_statp->pr_time_lost = timeout; 2291 probe_chstate(pr_statp, pii, PR_LOST); 2292 pi_found_failure = _B_TRUE; 2293 if (cur_tg != NULL && tg == cur_tg) { 2294 /* 2295 * We hit a failure for the desired 2296 * target. Latch the number of recent 2297 * consecutive successes for this target 2298 */ 2299 tg_found_failure = _B_TRUE; 2300 } 2301 } 2302 break; 2303 2304 case PR_ACKED: 2305 /* 2306 * Bump up the count of probe successes, if we 2307 * have not seen any failure so far. 2308 */ 2309 if (!pi_found_failure) 2310 psinfo->ps_nsucc++; 2311 2312 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2313 !tg_found_failure) { 2314 psinfo->ps_nsucc_tg++; 2315 } 2316 2317 /* 2318 * Record the time of last success, if this is 2319 * the most recent probe success. 
2320 */ 2321 if (!psinfo->ps_tls_valid) { 2322 psinfo->ps_tls = 2323 ns2ms(pr_statp->pr_hrtime_ackproc); 2324 psinfo->ps_tls_valid = _B_TRUE; 2325 } 2326 break; 2327 2328 case PR_LOST: 2329 /* 2330 * We hit a failure. Latch the total number of 2331 * recent consecutive successes. 2332 */ 2333 pi_found_failure = _B_TRUE; 2334 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2335 /* 2336 * We hit a failure for the desired target. 2337 * Latch the number of recent consecutive 2338 * successes for this target 2339 */ 2340 tg_found_failure = _B_TRUE; 2341 } 2342 break; 2343 2344 default: 2345 return; 2346 2347 } 2348 } 2349 } 2350 2351 /* 2352 * Return the information associated with consecutive probe failures 2353 * starting with the most recent probe. Only the last 2 probes can be in the 2354 * unacknowledged state. All previous probes have either failed or succeeded. 2355 */ 2356 static void 2357 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2358 struct probe_fail_count *pfinfo) 2359 { 2360 int i; 2361 struct probe_stats *pr_statp; 2362 boolean_t tg_found_success = _B_FALSE; 2363 boolean_t pi_found_success = _B_FALSE; 2364 int most_recent; 2365 int second_most_recent; 2366 uint_t now; 2367 uint_t timeout; 2368 struct target *tg; 2369 2370 if (debug & D_FAILREP) 2371 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2372 2373 bzero(pfinfo, sizeof (*pfinfo)); 2374 now = getcurrenttime(); 2375 2376 /* 2377 * Start with the most recent probe, and count the number 2378 * of consecutive probe failures. Latch the number of failures 2379 * on hitting a probe success. 
2380 */ 2381 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2382 second_most_recent = PROBE_INDEX_PREV(most_recent); 2383 2384 for (i = most_recent; i != pii->pii_probe_next; 2385 i = PROBE_INDEX_PREV(i)) { 2386 pr_statp = &pii->pii_probes[i]; 2387 2388 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2389 2390 switch (pr_statp->pr_status) { 2391 case PR_UNACKED: 2392 /* 2393 * Only the most recent 2 probes can be unacknowledged 2394 */ 2395 assert(i == most_recent || i == second_most_recent); 2396 2397 tg = pr_statp->pr_target; 2398 /* 2399 * Target is guaranteed to exist in the unack. state 2400 */ 2401 assert(tg != NULL); 2402 /* 2403 * The crtt could be zero for some reason, 2404 * Eg. the phyint could be failed. If the crtt is 2405 * not available use the group's probe interval, 2406 * which is a worst case estimate. 2407 */ 2408 timeout = ns2ms(pr_statp->pr_hrtime_start); 2409 if (tg->tg_crtt != 0) { 2410 timeout += tg->tg_crtt; 2411 } else { 2412 timeout += 2413 pii->pii_phyint->pi_group->pg_probeint; 2414 } 2415 2416 if (TIME_GT(timeout, now)) 2417 break; 2418 2419 pr_statp->pr_time_lost = timeout; 2420 probe_chstate(pr_statp, pii, PR_LOST); 2421 /* FALLTHRU */ 2422 2423 case PR_LOST: 2424 if (!pi_found_success) { 2425 pfinfo->pf_nfail++; 2426 pfinfo->pf_tff = pr_statp->pr_time_lost; 2427 } 2428 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2429 !tg_found_success) { 2430 pfinfo->pf_nfail_tg++; 2431 } 2432 break; 2433 2434 default: 2435 /* 2436 * We hit a success or unused slot. Latch the 2437 * total number of recent consecutive failures. 2438 */ 2439 pi_found_success = _B_TRUE; 2440 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2441 /* 2442 * We hit a success for the desired target. 2443 * Latch the number of recent consecutive 2444 * failures for this target 2445 */ 2446 tg_found_success = _B_TRUE; 2447 } 2448 } 2449 } 2450 } 2451 2452 /* 2453 * Change the state of probe `pr' on phyint_instance `pii' to state `state'. 
2454 */ 2455 void 2456 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) 2457 { 2458 if (pr->pr_status == state) 2459 return; 2460 2461 pr->pr_status = state; 2462 (void) probe_state_event(pr, pii); 2463 } 2464 2465 /* 2466 * Check if the phyint has been repaired. If no test address has been 2467 * configured, then consider the interface repaired if the link is up (unless 2468 * the link is flapping; see below). Otherwise, look for proof of probes 2469 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2470 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 2471 */ 2472 static boolean_t 2473 phyint_repaired(struct phyint *pi) 2474 { 2475 struct probe_success_count psinfo; 2476 struct phyint_instance *pii; 2477 struct target *cur_tg; 2478 int pr_ndx; 2479 uint_t cur_time; 2480 2481 if (debug & D_FAILREP) 2482 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2483 2484 if (LINK_DOWN(pi)) 2485 return (_B_FALSE); 2486 2487 /* 2488 * If we don't have any test addresses and the link is up, then 2489 * consider the interface repaired, unless we've received more than 2490 * LINK_UP_PERMIN link up notifications in the last minute, in 2491 * which case we keep the link down until we drop back below 2492 * the threshold. 
2493 */ 2494 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2495 cur_time = getcurrenttime(); 2496 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2497 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2498 pi->pi_lfmsg_printed = 0; 2499 return (_B_TRUE); 2500 } 2501 if (!pi->pi_lfmsg_printed) { 2502 logerr("The link has come up on %s more than %d times " 2503 "in the last minute; disabling repair until it " 2504 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2505 pi->pi_lfmsg_printed = 1; 2506 } 2507 2508 return (_B_FALSE); 2509 } 2510 2511 pii = pi->pi_v4; 2512 if (PROBE_CAPABLE(pii)) { 2513 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2514 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2515 probe_success_info(pii, cur_tg, &psinfo); 2516 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2517 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2518 return (_B_TRUE); 2519 } 2520 2521 pii = pi->pi_v6; 2522 if (PROBE_CAPABLE(pii)) { 2523 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2524 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2525 probe_success_info(pii, cur_tg, &psinfo); 2526 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2527 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2528 return (_B_TRUE); 2529 } 2530 2531 return (_B_FALSE); 2532 } 2533 2534 /* 2535 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2536 */ 2537 boolean_t 2538 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) 2539 { 2540 int ifsock; 2541 struct lifreq lifr; 2542 uint64_t old_flags; 2543 2544 if (debug & D_FAILREP) { 2545 logdebug("change_pif_flags(%s): set %llx clear %llx\n", 2546 pi->pi_name, set, clear); 2547 } 2548 2549 if (pi->pi_v4 != NULL) 2550 ifsock = ifsock_v4; 2551 else 2552 ifsock = ifsock_v6; 2553 2554 /* 2555 * Get the current flags from the kernel, and set/clear the 2556 * desired phyint flags. Since we set only phyint flags, we can 2557 * do it on either IPv4 or IPv6 instance. 
2558 */ 2559 (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2560 2561 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2562 if (errno != ENXIO) 2563 logperror("change_pif_flags: ioctl (get flags)"); 2564 return (_B_FALSE); 2565 } 2566 2567 old_flags = lifr.lifr_flags; 2568 lifr.lifr_flags |= set; 2569 lifr.lifr_flags &= ~clear; 2570 2571 if (old_flags == lifr.lifr_flags) { 2572 /* No change in the flags. No need to send ioctl */ 2573 return (_B_TRUE); 2574 } 2575 2576 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2577 if (errno != ENXIO) 2578 logperror("change_pif_flags: ioctl (set flags)"); 2579 return (_B_FALSE); 2580 } 2581 2582 /* 2583 * Keep pi_flags in synch. with actual flags. Assumes flags are 2584 * phyint flags. 2585 */ 2586 pi->pi_flags |= set; 2587 pi->pi_flags &= ~clear; 2588 2589 if (pi->pi_v4 != NULL) 2590 pi->pi_v4->pii_flags = pi->pi_flags; 2591 2592 if (pi->pi_v6 != NULL) 2593 pi->pi_v6->pii_flags = pi->pi_flags; 2594 2595 return (_B_TRUE); 2596 } 2597 2598 /* 2599 * icmp cksum computation for IPv4. 2600 */ 2601 static int 2602 in_cksum(ushort_t *addr, int len) 2603 { 2604 register int nleft = len; 2605 register ushort_t *w = addr; 2606 register ushort_t answer; 2607 ushort_t odd_byte = 0; 2608 register int sum = 0; 2609 2610 /* 2611 * Our algorithm is simple, using a 32 bit accumulator (sum), 2612 * we add sequential 16 bit words to it, and at the end, fold 2613 * back all the carry bits from the top 16 bits into the lower 2614 * 16 bits. 
2615 */ 2616 while (nleft > 1) { 2617 sum += *w++; 2618 nleft -= 2; 2619 } 2620 2621 /* mop up an odd byte, if necessary */ 2622 if (nleft == 1) { 2623 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2624 sum += odd_byte; 2625 } 2626 2627 /* 2628 * add back carry outs from top 16 bits to low 16 bits 2629 */ 2630 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2631 sum += (sum >> 16); /* add carry */ 2632 answer = ~sum; /* truncate to 16 bits */ 2633 return (answer); 2634 } 2635 2636 static void 2637 reset_snxt_basetimes(void) 2638 { 2639 struct phyint_instance *pii; 2640 2641 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2642 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2643 } 2644 } 2645 2646 /* 2647 * Is the address one of our own addresses? Unfortunately, 2648 * we cannot check our phyint tables to determine if the address 2649 * is our own. This is because, we don't track interfaces that 2650 * are not part of any group. We have to either use a 'bind' or 2651 * get the complete list of all interfaces using SIOCGLIFCONF, 2652 * to do this check. We could also use SIOCTMYADDR. 2653 * Bind fails for the local zone address, so we might include local zone 2654 * address as target address. If local zone address is a target address 2655 * and it is up, it is not possible to detect the interface failure. 2656 * SIOCTMYADDR also doesn't consider local zone address as own address. 2657 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2658 * are stored in `localaddrs' 2659 */ 2660 boolean_t 2661 own_address(struct in6_addr addr) 2662 { 2663 addrlist_t *addrp; 2664 struct sockaddr_storage ss; 2665 int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; 2666 2667 addr2storage(af, &addr, &ss); 2668 for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { 2669 if (sockaddrcmp(&ss, &addrp->al_addr)) 2670 return (_B_TRUE); 2671 } 2672 return (_B_FALSE); 2673 } 2674 2675 static int 2676 ns2ms(int64_t ns) 2677 { 2678 return (ns / (NANOSEC / MILLISEC)); 2679 } 2680 2681 static int64_t 2682 tv2ns(struct timeval *tvp) 2683 { 2684 return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); 2685 } 2686