/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/smp.h>
#include <machine/clock.h>
#include <machine/atomic.h>

struct ipiq_stats {
	int64_t ipiq_count;	/* total calls to lwkt_send_ipiq*() */
	int64_t ipiq_fifofull;	/* number of fifo full conditions detected */
	int64_t ipiq_avoided;	/* interlock with target avoids cpu ipi */
	int64_t ipiq_passive;	/* passive IPI messages */
	int64_t ipiq_cscount;	/* number of cpu synchronizations */
} __cachealign;

static struct ipiq_stats ipiq_stats_percpu[MAXCPU];
#define ipiq_stat(gd)	ipiq_stats_percpu[(gd)->gd_cpuid]

static int ipiq_debug;		/* set to 1 for debug */
#ifdef PANIC_DEBUG
static int panic_ipiq_cpu = -1;
static int panic_ipiq_count = 100;
#endif

SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0,
    "");
#ifdef PANIC_DEBUG
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
#endif

#define IPIQ_STRING	"func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
#define IPIQ_ARGS	void *func, void *arg1, int arg2, int scpu, int dcpu

#if !defined(KTR_IPIQ)
#define KTR_IPIQ	KTR_ALL
#endif
KTR_INFO_MASTER(ipiq);
KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_quick, 9, "cpumask=%08lx", unsigned long mask);

#define logipiq(name, func, arg1, arg2, sgd, dgd)	\
	KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
#define logipiq2(name, arg)	\
	KTR_LOG(ipiq_ ## name, arg)

static void lwkt_process_ipiq_nested(void);
static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
				    struct intrframe *frame, int limit);
static void lwkt_cpusync_remote1(lwkt_cpusync_t cs);
static void lwkt_cpusync_remote2(lwkt_cpusync_t cs);

#define IPIQ_SYSCTL(name)				\
static int						\
sysctl_##name(SYSCTL_HANDLER_ARGS)			\
{							\
	int64_t val = 0;				\
	int cpu, error;					\
							\
	for (cpu = 0; cpu < ncpus; ++cpu)		\
		val += ipiq_stats_percpu[cpu].name;	\
							\
	error = sysctl_handle_quad(oidp, &val, 0, req);	\
	if (error || req->newptr == NULL)		\
		return error;				\
							\
	for (cpu = 0; cpu < ncpus; ++cpu)		\
		ipiq_stats_percpu[cpu].name = val;	\
							\
	return 0;					\
}

IPIQ_SYSCTL(ipiq_count);
IPIQ_SYSCTL(ipiq_fifofull);
IPIQ_SYSCTL(ipiq_avoided);
IPIQ_SYSCTL(ipiq_passive);
IPIQ_SYSCTL(ipiq_cscount);
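
/*
 * For reference, IPIQ_SYSCTL(ipiq_count) above mechanically expands to a
 * handler along the following lines: a read sums the per-cpu counter and
 * a write stores the same value into every cpu's slot (typically used to
 * zero the statistic).  Sketch of the expansion only, not additional code:
 *
 *	static int
 *	sysctl_ipiq_count(SYSCTL_HANDLER_ARGS)
 *	{
 *		int64_t val = 0;
 *		int cpu, error;
 *
 *		for (cpu = 0; cpu < ncpus; ++cpu)
 *			val += ipiq_stats_percpu[cpu].ipiq_count;
 *		error = sysctl_handle_quad(oidp, &val, 0, req);
 *		if (error || req->newptr == NULL)
 *			return error;
 *		for (cpu = 0; cpu < ncpus; ++cpu)
 *			ipiq_stats_percpu[cpu].ipiq_count = val;
 *		return 0;
 *	}
 */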

SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_count, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_count, "Q", "Number of IPI's sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_fifofull, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_fifofull, "Q",
    "Number of fifo full conditions detected");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_avoided, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_avoided, "Q",
    "Number of IPI's avoided by interlock with target cpu");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_passive, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_passive, "Q",
    "Number of passive IPI messages sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_cscount, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_cscount, "Q",
    "Number of cpu synchronizations");

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  The FIFO is written only by the owning (sending)
 * cpu and drained by the target cpu.
 *
 * If the FIFO fills up we have to enable interrupts to avoid an APIC
 * deadlock and process pending IPIQs while waiting for it to empty.
 * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * The actual hardware IPI is avoided if the target cpu is already processing
 * the queue from a prior IPI.  It is possible to pipeline IPI messages
 * very quickly between cpus due to the FIFO hysteresis.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
{
	lwkt_ipiq_t ip;
	int windex;
	int level1;
	int level2;
	long rflags;
	struct globaldata *gd = mycpu;

	logipiq(send_norm, func, arg1, arg2, gd, target);

	if (target == gd) {
		func(arg1, arg2, NULL);
		logipiq(send_end, func, arg1, arg2, gd, target);
		return(0);
	}
	crit_enter();
	++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
	if (gd->gd_intr_nesting_level > 20)
		panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
	KKASSERT(curthread->td_critcount);
	++ipiq_stat(gd).ipiq_count;
	ip = &gd->gd_ipiq[target->gd_cpuid];

	/*
	 * Do not allow the FIFO to become full.  Interrupts must be physically
	 * enabled while we liveloop to avoid deadlocking the APIC.
	 *
	 * When we are not nested inside a processing loop we allow the FIFO
	 * to get 1/2 full.  Once it exceeds 1/2 full we must wait for it to
	 * drain, executing any incoming IPIs while we wait.
	 *
	 * When we are nested we allow the FIFO to get almost completely full.
	 * This allows us to queue IPIs sent from IPI callbacks.  The processing
	 * code will only process incoming FIFOs that are trying to drain while
	 * we wait, and only to the only-slightly-less-full point, to avoid a
	 * deadlock.
	 *
	 * We are guaranteed
	 */

	if (gd->gd_processing_ipiq == 0) {
		level1 = MAXCPUFIFO / 2;
		level2 = MAXCPUFIFO / 4;
	} else {
		level1 = MAXCPUFIFO - 3;
		level2 = MAXCPUFIFO - 5;
	}

	if (ip->ip_windex - ip->ip_rindex > level1) {
#ifndef _KERNEL_VIRTUAL
		uint64_t tsc_base = rdtsc();
#endif
		int repeating = 0;
		int olimit;

		rflags = read_rflags();
		cpu_enable_intr();
		++ipiq_stat(gd).ipiq_fifofull;
		DEBUG_PUSH_INFO("send_ipiq3");
		olimit = atomic_swap_int(&ip->ip_drain, level2);
		while (ip->ip_windex - ip->ip_rindex > level2) {
			KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
			lwkt_process_ipiq_nested();
			cpu_pause();

			/*
			 * Check for target not draining issue.  This should be fixed but
			 * leave the code in-place anyway as it can recover an otherwise
			 * dead system.
			 */
#ifdef _KERNEL_VIRTUAL
			if (repeating++ > 10)
				vkernel_yield();
#else
			if (rdtsc() - tsc_base > tsc_frequency) {
				++repeating;
				if (repeating > 10) {
					kprintf("send_ipiq %d->%d tgt not draining (%d) sniff=%p,%p\n",
						gd->gd_cpuid, target->gd_cpuid, repeating,
						target->gd_sample_pc, target->gd_sample_sp);
					smp_sniff();
					cpu_disable_intr();
					ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
					cpu_send_ipiq(target->gd_cpuid);
					cpu_enable_intr();
				} else {
					kprintf("send_ipiq %d->%d tgt not draining (%d)\n",
						gd->gd_cpuid, target->gd_cpuid, repeating);
					smp_sniff();
				}
				tsc_base = rdtsc();
			}
#endif
		}
		atomic_swap_int(&ip->ip_drain, olimit);
		DEBUG_POP_INFO();
#if defined(__x86_64__)
		write_rflags(rflags);
#else
#error "no write_*flags"
#endif
	}

	/*
	 * Queue the new message and signal the target cpu.  For now we need to
	 * physically disable interrupts because the target will not get signalled
	 * by other cpus once we set target->gd_npoll and we don't want to get
	 * interrupted.
	 *
	 * XXX not sure why this is a problem, the critical section should prevent
	 *     any stalls (incoming interrupts except Xinvltlb and Xsnoop will
	 *     just be made pending).
	 */
	rflags = read_rflags();
#ifndef _KERNEL_VIRTUAL
	cpu_disable_intr();
#endif

	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	ip->ip_info[windex].func = func;
	ip->ip_info[windex].arg1 = arg1;
	ip->ip_info[windex].arg2 = arg2;
	cpu_sfence();
	++ip->ip_windex;
	ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

	/*
	 * signal the target cpu that there is work pending.
	 */
	if (atomic_swap_int(&target->gd_npoll, 1) == 0) {
		logipiq(cpu_send, func, arg1, arg2, gd, target);
		cpu_send_ipiq(target->gd_cpuid);
	} else {
		++ipiq_stat(gd).ipiq_avoided;
	}
	write_rflags(rflags);

	--gd->gd_intr_nesting_level;
	crit_exit();
	logipiq(send_end, func, arg1, arg2, gd, target);

	return(ip->ip_windex);
}
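
/*
 * Illustrative usage sketch (not part of this file): queue a remote call
 * with lwkt_send_ipiq3() and use the returned windex together with
 * lwkt_wait_ipiq(), defined further below, to wait for the target cpu to
 * finish executing it.  remote_bump() and counterp are hypothetical.
 *
 *	static void
 *	remote_bump(void *arg1, int arg2, struct intrframe *frame)
 *	{
 *		*(int *)arg1 += arg2;		(runs on the target cpu)
 *	}
 *
 *	globaldata_t target = globaldata_find(cpuid);
 *	int seq;
 *
 *	seq = lwkt_send_ipiq3(target, remote_bump, counterp, 1);
 *	crit_enter();
 *	lwkt_wait_ipiq(target, seq);		(requires a critical section)
 *	crit_exit();
 */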

/*
 * Similar to lwkt_send_ipiq() but this function does not actually initiate
 * the IPI to the target cpu unless the FIFO is greater than 1/4 full.
 * This function is usually very fast.
 *
 * This function is used for non-critical IPI messages, such as memory
 * deallocations.  The queue will typically be flushed by the target cpu at
 * the next clock interrupt.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
			void *arg1, int arg2)
{
	lwkt_ipiq_t ip;
	int windex;
	struct globaldata *gd = mycpu;

	KKASSERT(target != gd);
	crit_enter_gd(gd);
	++gd->gd_intr_nesting_level;
	ip = &gd->gd_ipiq[target->gd_cpuid];

	/*
	 * If the FIFO is too full send the IPI actively.
	 *
	 * WARNING! This level must be low enough not to trigger a wait loop
	 *	    in the active sending code since we are not signalling the
	 *	    target cpu.
	 */
	if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO / 4) {
		--gd->gd_intr_nesting_level;
		crit_exit_gd(gd);
		return lwkt_send_ipiq3(target, func, arg1, arg2);
	}

	/*
	 * Else we can do it passively.
	 */
	logipiq(send_pasv, func, arg1, arg2, gd, target);
	++ipiq_stat(gd).ipiq_count;
	++ipiq_stat(gd).ipiq_passive;

	/*
	 * Queue the new message
	 */
	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	ip->ip_info[windex].func = func;
	ip->ip_info[windex].arg1 = arg1;
	ip->ip_info[windex].arg2 = arg2;
	cpu_sfence();
	++ip->ip_windex;
	ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
	--gd->gd_intr_nesting_level;

	/*
	 * Do not signal the target cpu, it will pick up the IPI when it next
	 * polls (typically on the next tick).
	 */
	crit_exit();
	logipiq(send_end, func, arg1, arg2, gd, target);

	return(ip->ip_windex);
}

/*
 * deprecated, used only by fast int forwarding.
 */
int
lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
{
	return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 *
 * To prevent treating low-numbered cpus as favored sons, the IPIs are
 * issued in order starting at mycpu upward, then from 0 through mycpu.
 * This is particularly important to prevent random scheduler pickups
 * from favoring cpu 0.
 */
int
lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2)
{
	int cpuid;
	int count = 0;
	cpumask_t amask;

	CPUMASK_NANDMASK(mask, stopped_cpus);

	/*
	 * All cpus in mask which are >= mycpu
	 */
	CPUMASK_ASSBMASK(amask, mycpu->gd_cpuid);
	CPUMASK_INVMASK(amask);
	CPUMASK_ANDMASK(amask, mask);
	while (CPUMASK_TESTNZERO(amask)) {
		cpuid = BSFCPUMASK(amask);
		lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
		CPUMASK_NANDBIT(amask, cpuid);
		++count;
	}

	/*
	 * All cpus in mask which are < mycpu
	 */
	CPUMASK_ASSBMASK(amask, mycpu->gd_cpuid);
	CPUMASK_ANDMASK(amask, mask);
	while (CPUMASK_TESTNZERO(amask)) {
		cpuid = BSFCPUMASK(amask);
		lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
		CPUMASK_NANDBIT(amask, cpuid);
		++count;
	}
	return(count);
}
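
/*
 * Illustrative sketch (not part of this file): broadcast a hypothetical
 * callback to every other active cpu.  The mask construction mirrors what
 * lwkt_synchronize_ipiqs() below does; poke_cpu is a made-up ipifunc3_t.
 * The return value is the number of cpus the message was queued to.
 *
 *	cpumask_t mask = smp_active_mask;
 *	int nsent;
 *
 *	CPUMASK_ANDMASK(mask, mycpu->gd_other_cpus);
 *	nsent = lwkt_send_ipiq3_mask(mask, poke_cpu, NULL, 0);
 */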

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
	lwkt_ipiq_t ip;

	if (target != mycpu) {
		ip = &mycpu->gd_ipiq[target->gd_cpuid];
		if ((int)(ip->ip_xindex - seq) < 0) {
#if defined(__x86_64__)
			unsigned long rflags = read_rflags();
#else
#error "no read_*flags"
#endif
			int64_t time_tgt = tsc_get_target(1000000000LL);
			int time_loops = 10;
			int benice = 0;
#ifdef _KERNEL_VIRTUAL
			int repeating = 0;
#endif

			cpu_enable_intr();
			DEBUG_PUSH_INFO("wait_ipiq");
			while ((int)(ip->ip_xindex - seq) < 0) {
				crit_enter();
				lwkt_process_ipiq();
				crit_exit();
#ifdef _KERNEL_VIRTUAL
				if (repeating++ > 10)
					vkernel_yield();
#endif

				/*
				 * IPIQs must be handled within 10 seconds and this code
				 * will warn after one second.
				 */
				if ((benice & 255) == 0 && tsc_test_target(time_tgt) > 0) {
					kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
						mycpu->gd_cpuid, target->gd_cpuid,
						ip->ip_xindex - seq);
					if (--time_loops == 0)
						panic("LWKT_WAIT_IPIQ");
					time_tgt = tsc_get_target(1000000000LL);
				}
				++benice;

				/*
				 * xindex may be modified by another cpu, use a load fence
				 * to ensure that the loop does not use a speculative value
				 * (which may improve performance).
				 */
				cpu_pause();
				cpu_lfence();
			}
			DEBUG_POP_INFO();
#if defined(__x86_64__)
			write_rflags(rflags);
#else
#error "no write_*flags"
#endif
		}
	}
}

/*
 * Called from IPI interrupt (like a fast interrupt), and numerous
 * other locations, and might also be called recursively.  Caller must
 * hold a critical section across this call.
 *
 * When called from doreti, splz, or an IPI interrupt, npoll is cleared
 * by the caller using an atomic xchgl, thus synchronizing the incoming
 * ipimask against npoll.  A new IPI will be received if new traffic
 * occurs versus the windex we read.
 *
 * However, ipimask might not be synchronized when called from other
 * locations.  Our processing will be more heuristic.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 *
 * When the current cpu is mastering a cpusync we do NOT internally loop
 * on the cpusyncq poll.  We also do not re-flag a pending ipi due to
 * the cpusyncq poll because this can cause doreti/splz to loop internally.
 * The cpusync master's own loop must be allowed to run to avoid a deadlock.
 */
void
lwkt_process_ipiq(void)
{
	globaldata_t gd = mycpu;
	globaldata_t sgd;
	lwkt_ipiq_t ip;
	cpumask_t mask;
	int n;

	++gd->gd_processing_ipiq;
again:
	mask = gd->gd_ipimask;
	cpu_ccfence();
	while (CPUMASK_TESTNZERO(mask)) {
		n = BSFCPUMASK(mask);
		if (n != gd->gd_cpuid) {
			sgd = globaldata_find(n);
			ip = sgd->gd_ipiq;
			if (ip != NULL) {
				ip += gd->gd_cpuid;
				while (lwkt_process_ipiq_core(sgd, ip, NULL, 0))
					;
				/*
				 * Can't NAND before-hand as it will prevent recursive
				 * processing.  Sender will adjust windex before adjusting
				 * ipimask.
				 */
				ATOMIC_CPUMASK_NANDBIT(gd->gd_ipimask, n);
				if (ip->ip_rindex != ip->ip_windex)
					ATOMIC_CPUMASK_ORBIT(gd->gd_ipimask, n);
			}
		}
		CPUMASK_NANDBIT(mask, n);
	}

	/*
	 * Process pending cpusyncs.  If the current thread has an active
	 * cpusync we only run the list once and do not re-flag it, as the
	 * thread itself is processing its interlock.
	 */
	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL, 0)) {
		if (gd->gd_curthread->td_cscount == 0)
			goto again;
		/* need_ipiq(); do not reflag */
	}

	/*
	 * Interlock to allow more IPI interrupts.
	 */
	--gd->gd_processing_ipiq;
}
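
/*
 * Illustrative sketch (not part of this file): code that spins waiting on
 * another cpu should drain its own incoming queues while it waits, exactly
 * as lwkt_wait_ipiq() above does, so a sender whose FIFO is full can make
 * progress.  done_flag is a hypothetical flag set by a remote IPI callback.
 *
 *	crit_enter();
 *	while (done_flag == 0) {
 *		lwkt_process_ipiq();
 *		cpu_pause();
 *		cpu_lfence();
 *	}
 *	crit_exit();
 */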

void
lwkt_process_ipiq_frame(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	globaldata_t sgd;
	lwkt_ipiq_t ip;
	cpumask_t mask;
	int n;

	++gd->gd_processing_ipiq;
again:
	mask = gd->gd_ipimask;
	cpu_ccfence();
	while (CPUMASK_TESTNZERO(mask)) {
		n = BSFCPUMASK(mask);
		if (n != gd->gd_cpuid) {
			sgd = globaldata_find(n);
			ip = sgd->gd_ipiq;
			if (ip != NULL) {
				ip += gd->gd_cpuid;
				while (lwkt_process_ipiq_core(sgd, ip, frame, 0))
					;
				/*
				 * Can't NAND before-hand as it will prevent recursive
				 * processing.  Sender will adjust windex before adjusting
				 * ipimask.
				 */
				ATOMIC_CPUMASK_NANDBIT(gd->gd_ipimask, n);
				if (ip->ip_rindex != ip->ip_windex)
					ATOMIC_CPUMASK_ORBIT(gd->gd_ipimask, n);
			}
		}
		CPUMASK_NANDBIT(mask, n);
	}
	if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
		if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame, 0)) {
			if (gd->gd_curthread->td_cscount == 0)
				goto again;
			/* need_ipiq(); do not reflag */
		}
	}
	--gd->gd_processing_ipiq;
}

/*
 * Only process incoming IPIQs from draining senders and only process them
 * to the point where the draining sender is able to continue.  This is
 * necessary to avoid deadlocking the IPI subsystem because we are acting on
 * incoming messages and the callback may queue additional messages.
 *
 * We only want to have to act on senders that are blocked to limit the
 * number of additional messages sent.  At the same time, recipients are
 * trying to drain our own queue.  Theoretically this creates a pipeline
 * that cannot deadlock.
 */
static void
lwkt_process_ipiq_nested(void)
{
	globaldata_t gd = mycpu;
	globaldata_t sgd;
	lwkt_ipiq_t ip;
	cpumask_t mask;
	int n;
	int limit;

	++gd->gd_processing_ipiq;
again:
	mask = gd->gd_ipimask;
	cpu_ccfence();
	while (CPUMASK_TESTNZERO(mask)) {
		n = BSFCPUMASK(mask);
		if (n != gd->gd_cpuid) {
			sgd = globaldata_find(n);
			ip = sgd->gd_ipiq;

			/*
			 * NOTE: We do not mess with the cpumask at all, instead we allow
			 *	 the top-level ipiq processor to deal with it.
			 */
			if (ip != NULL) {
				ip += gd->gd_cpuid;
				if ((limit = ip->ip_drain) != 0) {
					lwkt_process_ipiq_core(sgd, ip, NULL, limit);
					/* no gd_ipimask when doing limited processing */
				}
			}
		}
		CPUMASK_NANDBIT(mask, n);
	}

	/*
	 * Process pending cpusyncs.  If the current thread has an active
	 * cpusync we only run the list once and do not re-flag it, as the
	 * thread itself is processing its interlock.
	 */
	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL, 0)) {
		if (gd->gd_curthread->td_cscount == 0)
			goto again;
		/* need_ipiq(); do not reflag */
	}
	--gd->gd_processing_ipiq;
}

/*
 * Process incoming IPI requests until only <limit> are left (0 to exhaust
 * all incoming IPI requests).
 */
static int
lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
		       struct intrframe *frame, int limit)
{
	globaldata_t mygd = mycpu;
	int ri;
	int wi;
	ipifunc3_t copy_func;
	void *copy_arg1;
	int copy_arg2;

	/*
	 * Clear the originating core from our ipimask, we will process all
	 * incoming messages.
	 *
	 * Obtain the current write index, which is modified by a remote cpu.
	 * Issue a load fence to prevent speculative reads of e.g. data written
	 * by the other cpu prior to them updating the windex.
	 */
	KKASSERT(curthread->td_critcount);
	wi = ip->ip_windex;
	cpu_lfence();
	++mygd->gd_intr_nesting_level;

	/*
	 * NOTE: xindex is only updated after we are sure the function has
	 *	 finished execution.  Beware lwkt_process_ipiq() reentrancy!
	 *	 The function may send an IPI which may block/drain.
	 *
	 * NOTE: Due to additional IPI operations that the callback function
	 *	 may make, it is possible for both rindex and windex to advance
	 *	 and thus for rindex to advance past our cached windex.
	 *
	 *	 We must process only through our cached (wi) to ensure that
	 *	 speculative reads of ip_info[] content do not occur without
	 *	 a memory barrier.
	 *
	 * NOTE: Single pass only.  Returns non-zero if the queue is not empty
	 *	 on return.
	 *
	 * NOTE: Our 'wi' guarantees that memory loads will not be out of order.
	 *	 Do NOT reload wi with windex in the below loop unless you also
	 *	 issue another lfence after reloading it.
	 */
	while (wi - (ri = ip->ip_rindex) > limit) {
		ri &= MAXCPUFIFO_MASK;
		copy_func = ip->ip_info[ri].func;
		copy_arg1 = ip->ip_info[ri].arg1;
		copy_arg2 = ip->ip_info[ri].arg2;
		cpu_ccfence();
		++ip->ip_rindex;
		logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
#ifdef INVARIANTS
		if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) {
			kprintf("cpu %d ipifunc %p %p %d (frame %p)\n",
				mycpu->gd_cpuid,
				copy_func, copy_arg1, copy_arg2,
#if defined(__x86_64__)
				(frame ? (void *)frame->if_rip : NULL));
#else
				NULL);
#endif
		}
#endif
		copy_func(copy_arg1, copy_arg2, frame);
		cpu_sfence();
		ip->ip_xindex = ip->ip_rindex;

#ifdef PANIC_DEBUG
		/*
		 * Simulate panics during the processing of an IPI
		 */
		if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
			if (--panic_ipiq_count == 0) {
#ifdef DDB
				Debugger("PANIC_DEBUG");
#else
				panic("PANIC_DEBUG");
#endif
			}
		}
#endif
	}
	--mygd->gd_intr_nesting_level;

	/*
	 * Return non-zero if there is still more in the queue.  Don't worry
	 * about fencing, we will get another interrupt if necessary.
	 */
	return (ip->ip_rindex != ip->ip_windex);
}

static void
lwkt_sync_ipiq(void *arg)
{
	volatile cpumask_t *cpumask = arg;

	ATOMIC_CPUMASK_NANDBIT(*cpumask, mycpu->gd_cpuid);
	if (CPUMASK_TESTZERO(*cpumask))
		wakeup(cpumask);
}

void
lwkt_synchronize_ipiqs(const char *wmesg)
{
	volatile cpumask_t other_cpumask;

	other_cpumask = smp_active_mask;
	CPUMASK_ANDMASK(other_cpumask, mycpu->gd_other_cpus);
	lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq,
			    __DEVOLATILE(void *, &other_cpumask));

	while (CPUMASK_TESTNZERO(other_cpumask)) {
		tsleep_interlock(&other_cpumask, 0);
		if (CPUMASK_TESTNZERO(other_cpumask))
			tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
	}
}

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_interlock()	- Place specified cpus in a quiescent state.
 *				  The current cpu is placed in a hard critical
 *				  section.
 *
 * lwkt_cpusync_deinterlock()	- Execute cs_func on specified cpus, including
 *				  current cpu if specified, then return.
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg)
{
	struct lwkt_cpusync cs;

	lwkt_cpusync_init(&cs, mask, func, arg);
	lwkt_cpusync_interlock(&cs);
	lwkt_cpusync_deinterlock(&cs);
}


void
lwkt_cpusync_interlock(lwkt_cpusync_t cs)
{
	globaldata_t gd = mycpu;
	cpumask_t mask;

	/*
	 * mask acknowledge (cs_mack):  0->mask for stage 1
	 *
	 * mack does not include the current cpu.
	 */
	mask = cs->cs_mask;
	CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ASSZERO(cs->cs_mack);

	crit_enter_id("cpusync");
	if (CPUMASK_TESTNZERO(mask)) {
		DEBUG_PUSH_INFO("cpusync_interlock");
		++ipiq_stat(gd).ipiq_cscount;
		++gd->gd_curthread->td_cscount;
		lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs);
		logipiq2(sync_start, (long)CPUMASK_LOWMASK(mask));
		while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
			lwkt_process_ipiq();
			cpu_pause();
#ifdef _KERNEL_VIRTUAL
			vkernel_yield();
#endif
		}
		DEBUG_POP_INFO();
	}
}

/*
 * Interlocked cpus have executed remote1 and are polling in remote2.
 * To deinterlock we clear cs_mack and wait for the cpus to execute
 * the func and set their bit in cs_mack again.
 *
 */
void
lwkt_cpusync_deinterlock(lwkt_cpusync_t cs)
{
	globaldata_t gd = mycpu;
	cpumask_t mask;

	/*
	 * mask acknowledge (cs_mack):  mack->0->mack for stage 2
	 *
	 * Clearing cpu bits for polling cpus in cs_mack will cause them to
	 * execute stage 2, which executes the cs_func(cs_data) and then sets
	 * their bit in cs_mack again.
	 *
	 * mack does not include the current cpu.
	 */
	mask = cs->cs_mack;
	cpu_ccfence();
	CPUMASK_ASSZERO(cs->cs_mack);
	cpu_ccfence();
	if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
		cs->cs_func(cs->cs_data);
	if (CPUMASK_TESTNZERO(mask)) {
		DEBUG_PUSH_INFO("cpusync_deinterlock");
		while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
			lwkt_process_ipiq();
			cpu_pause();
#ifdef _KERNEL_VIRTUAL
			vkernel_yield();
#endif
		}
		DEBUG_POP_INFO();
		/*
		 * cpusyncq ipis may be left queued without the RQF flag set due to
		 * a non-zero td_cscount, so be sure to process any laggards after
		 * decrementing td_cscount.
		 */
		--gd->gd_curthread->td_cscount;
		lwkt_process_ipiq();
		logipiq2(sync_end, (long)CPUMASK_LOWMASK(mask));
	}
	crit_exit_id("cpusync");
}
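
/*
 * Illustrative usage sketch (not part of this file): quiesce a set of cpus,
 * modify shared state while they are held in the interlock, then release
 * them; each released cpu runs do_fixup() before resuming.  do_fixup(),
 * update_shared_state() and the mask are hypothetical.
 * lwkt_cpusync_simple() above packages the same two-step sequence when no
 * work is needed between the steps.
 *
 *	struct lwkt_cpusync cs;
 *
 *	lwkt_cpusync_init(&cs, mask, do_fixup, data);
 *	lwkt_cpusync_interlock(&cs);
 *	update_shared_state();
 *	lwkt_cpusync_deinterlock(&cs);
 */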

/*
 * The quick version does not quiesce the target cpu(s) but instead executes
 * the function on the target cpu(s) and waits for all to acknowledge.  This
 * avoids spinning on the target cpus.
 *
 * This function is typically only used for kernel_pmap updates.  User pmaps
 * have to be quiesced.
 */
void
lwkt_cpusync_quick(lwkt_cpusync_t cs)
{
	globaldata_t gd = mycpu;
	cpumask_t mask;

	/*
	 * stage-2 cs_mack only.
	 */
	mask = cs->cs_mask;
	CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ASSZERO(cs->cs_mack);

	crit_enter_id("cpusync");
	if (CPUMASK_TESTNZERO(mask)) {
		DEBUG_PUSH_INFO("cpusync_interlock");
		++ipiq_stat(gd).ipiq_cscount;
		++gd->gd_curthread->td_cscount;
		lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote2, cs);
		logipiq2(sync_quick, (long)CPUMASK_LOWMASK(mask));
		while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
			lwkt_process_ipiq();
			cpu_pause();
#ifdef _KERNEL_VIRTUAL
			vkernel_yield();
#endif
		}

		/*
		 * cpusyncq ipis may be left queued without the RQF flag set due to
		 * a non-zero td_cscount, so be sure to process any laggards after
		 * decrementing td_cscount.
		 */
		DEBUG_POP_INFO();
		--gd->gd_curthread->td_cscount;
		lwkt_process_ipiq();
	}
	if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
		cs->cs_func(cs->cs_data);
	crit_exit_id("cpusync");
}

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Acknowledge stage 1 by setting our bit in cs_mack, then
 * fall into the stage-2 polling loop below.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t cs)
{
	globaldata_t gd = mycpu;

	ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
	lwkt_cpusync_remote2(cs);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t cs)
{
	globaldata_t gd = mycpu;

	if (CPUMASK_TESTMASK(cs->cs_mack, gd->gd_cpumask) == 0) {
		if (cs->cs_func)
			cs->cs_func(cs->cs_data);
		ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
		/* cs can be ripped out at this point */
	} else {
		lwkt_ipiq_t ip;
		int wi;

		cpu_pause();
#ifdef _KERNEL_VIRTUAL
		vkernel_yield();
#endif
		cpu_lfence();

		/*
		 * Requeue our IPI to avoid a deep stack recursion.  If no other
		 * IPIs are pending we can just loop up, which should help VMs
		 * better-detect spin loops.
		 */
		ip = &gd->gd_cpusyncq;

		wi = ip->ip_windex & MAXCPUFIFO_MASK;
		ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
		ip->ip_info[wi].arg1 = cs;
		ip->ip_info[wi].arg2 = 0;
		cpu_sfence();
		KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
		++ip->ip_windex;
		if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
			kprintf("cpu %d cm=%016jx %016jx f=%p\n",
				gd->gd_cpuid,
				(intmax_t)CPUMASK_LOWMASK(cs->cs_mask),
				(intmax_t)CPUMASK_LOWMASK(cs->cs_mack),
				cs->cs_func);
		}
	}
}

#define LWKT_IPIQ_NLATENCY	8
#define LWKT_IPIQ_NLATENCY_MASK	(LWKT_IPIQ_NLATENCY - 1)

struct lwkt_ipiq_latency_log {
	int		idx;	/* unmasked index */
	int		pad;
	uint64_t	latency[LWKT_IPIQ_NLATENCY];
};

static struct lwkt_ipiq_latency_log	lwkt_ipiq_latency_logs[MAXCPU];
static uint64_t save_tsc;

/*
 * IPI callback (already in a critical section)
 */
static void
lwkt_ipiq_latency_testfunc(void *arg __unused)
{
	uint64_t delta_tsc;
	struct globaldata *gd;
	struct lwkt_ipiq_latency_log *lat;

	/*
	 * Get delta TSC (assume TSCs are synchronized) as quickly as
	 * possible and then convert to nanoseconds.
	 */
	delta_tsc = rdtsc_ordered() - save_tsc;
	delta_tsc = delta_tsc * 1000000000LU / tsc_frequency;

	/*
	 * Record in our save array.
	 */
	gd = mycpu;
	lat = &lwkt_ipiq_latency_logs[gd->gd_cpuid];
	lat->latency[lat->idx & LWKT_IPIQ_NLATENCY_MASK] = delta_tsc;
	++lat->idx;
}

/*
 * Send IPI from cpu0 to other cpus
 *
 * NOTE: Machine must be idle for test to run dependably, and also probably
 *	 a good idea not to be running powerd.
 *
 * NOTE: Caller should use 'usched :1 <command>' to lock itself to cpu 0.
 *	 See 'ipitest' script in /usr/src/test/sysperf/ipitest
 */
static int
lwkt_ipiq_latency_test(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int cpu = 0, orig_cpu, error;

	error = sysctl_handle_int(oidp, &cpu, arg2, req);
	if (error || req->newptr == NULL)
		return error;

	if (cpu == 0)
		return 0;
	else if (cpu >= ncpus || cpu < 0)
		return EINVAL;

	orig_cpu = mycpuid;
	lwkt_migratecpu(0);

	gd = globaldata_find(cpu);

	save_tsc = rdtsc_ordered();
	lwkt_send_ipiq(gd, lwkt_ipiq_latency_testfunc, NULL);

	lwkt_migratecpu(orig_cpu);
	return 0;
}

SYSCTL_NODE(_debug, OID_AUTO, ipiq, CTLFLAG_RW, 0, "");
SYSCTL_PROC(_debug_ipiq, OID_AUTO, latency_test, CTLTYPE_INT | CTLFLAG_RW,
    NULL, 0, lwkt_ipiq_latency_test, "I",
    "ipi latency test, arg: remote cpuid");

static int
lwkt_ipiq_latency(SYSCTL_HANDLER_ARGS)
{
	struct lwkt_ipiq_latency_log *latency = arg1;
	uint64_t lat[LWKT_IPIQ_NLATENCY];
	int i;

	for (i = 0; i < LWKT_IPIQ_NLATENCY; ++i)
		lat[i] = latency->latency[i];

	return sysctl_handle_opaque(oidp, lat, sizeof(lat), req);
}

static void
lwkt_ipiq_latency_init(void *dummy __unused)
{
	int cpu;

	for (cpu = 0; cpu < ncpus; ++cpu) {
		char name[32];

		ksnprintf(name, sizeof(name), "latency%d", cpu);
		SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_debug_ipiq),
		    OID_AUTO, name, CTLTYPE_OPAQUE | CTLFLAG_RD,
		    &lwkt_ipiq_latency_logs[cpu], 0, lwkt_ipiq_latency,
		    "LU", "8 latest ipi latency measurement results");
	}
}
SYSINIT(lwkt_ipiq_latency, SI_SUB_CONFIGURE, SI_ORDER_ANY,
    lwkt_ipiq_latency_init, NULL);
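
/*
 * Illustrative userland sketch (not part of this file): trigger the latency
 * test against cpu 2 through the sysctl created above, then read back the
 * last LWKT_IPIQ_NLATENCY samples recorded for that cpu.  Hypothetical
 * example, error handling omitted.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int cpu = 2;
 *		uint64_t lat[8];
 *		size_t len = sizeof(lat);
 *		size_t i;
 *
 *		sysctlbyname("debug.ipiq.latency_test", NULL, NULL,
 *			     &cpu, sizeof(cpu));
 *		sysctlbyname("debug.ipiq.latency2", lat, &len, NULL, 0);
 *		for (i = 0; i < len / sizeof(lat[0]); ++i)
 *			printf("%ju ns\n", (uintmax_t)lat[i]);
 *		return 0;
 *	}
 */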