/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/smp.h>
#include <machine/clock.h>
#include <machine/atomic.h>

#ifdef _KERNEL_VIRTUAL
#include <pthread.h>
#endif

struct ipiq_stats {
        int64_t ipiq_count;     /* total calls to lwkt_send_ipiq*() */
        int64_t ipiq_fifofull;  /* number of fifo full conditions detected */
        int64_t ipiq_avoided;   /* interlock with target avoids cpu ipi */
        int64_t ipiq_passive;   /* passive IPI messages */
        int64_t ipiq_cscount;   /* number of cpu synchronizations */
} __cachealign;

static struct ipiq_stats ipiq_stats_percpu[MAXCPU];
#define ipiq_stat(gd)   ipiq_stats_percpu[(gd)->gd_cpuid]

static int ipiq_debug;          /* set to 1 for debug */
#ifdef PANIC_DEBUG
static int panic_ipiq_cpu = -1;
static int panic_ipiq_count = 100;
#endif

SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0,
    "");
#ifdef PANIC_DEBUG
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
#endif

#define IPIQ_STRING     "func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
#define IPIQ_ARGS       void *func, void *arg1, int arg2, int scpu, int dcpu

#if !defined(KTR_IPIQ)
#define KTR_IPIQ        KTR_ALL
#endif
KTR_INFO_MASTER(ipiq);
KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_quick, 9, "cpumask=%08lx", unsigned long mask);

#define logipiq(name, func, arg1, arg2, sgd, dgd)       \
        KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
#define logipiq2(name, arg)     \
        KTR_LOG(ipiq_ ## name, arg)

static void lwkt_process_ipiq_nested(void);
static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
                        struct intrframe *frame, int limit);
static void lwkt_cpusync_remote1(lwkt_cpusync_t cs);
static void lwkt_cpusync_remote2(lwkt_cpusync_t cs);

#define IPIQ_SYSCTL(name)                                       \
static int                                                      \
sysctl_##name(SYSCTL_HANDLER_ARGS)                              \
{                                                               \
        int64_t val = 0;                                        \
        int cpu, error;                                         \
                                                                \
        for (cpu = 0; cpu < ncpus; ++cpu)                       \
                val += ipiq_stats_percpu[cpu].name;             \
                                                                \
        error = sysctl_handle_quad(oidp, &val, 0, req);         \
        if (error || req->newptr == NULL)                       \
                return error;                                   \
                                                                \
        for (cpu = 0; cpu < ncpus; ++cpu)                       \
                ipiq_stats_percpu[cpu].name = val;              \
                                                                \
        return 0;                                               \
}

IPIQ_SYSCTL(ipiq_count);
IPIQ_SYSCTL(ipiq_fifofull);
IPIQ_SYSCTL(ipiq_avoided);
IPIQ_SYSCTL(ipiq_passive);
IPIQ_SYSCTL(ipiq_cscount);

SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_count, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_count, "Q", "Number of IPI's sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_fifofull, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_fifofull, "Q",
    "Number of fifo full conditions detected");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_avoided, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_avoided, "Q",
    "Number of IPI's avoided by interlock with target cpu");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_passive, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_passive, "Q",
    "Number of passive IPI messages sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_cscount, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_cscount, "Q",
    "Number of cpu synchronizations");

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  The FIFO is written only by the owning (source) cpu
 * and drained by the target cpu.
 *
 * If the FIFO fills up we have to enable interrupts to avoid an APIC
 * deadlock and process pending IPIQs while waiting for it to empty.
 * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * The actual hardware IPI is avoided if the target cpu is already processing
 * the queue from a prior IPI.  It is possible to pipeline IPI messages
 * very quickly between cpus due to the FIFO hysteresis.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
{
        lwkt_ipiq_t ip;
        int windex;
        int level1;
        int level2;
        long rflags;
        struct globaldata *gd = mycpu;

        logipiq(send_norm, func, arg1, arg2, gd, target);

        if (target == gd) {
                func(arg1, arg2, NULL);
                logipiq(send_end, func, arg1, arg2, gd, target);
                return(0);
        }
        crit_enter();
        ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
        if (gd->gd_intr_nesting_level > 20)
                panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
        KKASSERT(curthread->td_critcount);
        ++ipiq_stat(gd).ipiq_count;
        ip = &gd->gd_ipiq[target->gd_cpuid];

        /*
         * Do not allow the FIFO to become full.  Interrupts must be
         * physically enabled while we liveloop to avoid deadlocking the
         * APIC.
         *
         * When we are not nested inside a processing loop we allow the FIFO
         * to get 1/2 full.  Once it exceeds 1/2 full we must wait for it to
         * drain, executing any incoming IPIs while we wait.
         *
         * When we are nested we allow the FIFO to get almost completely
         * full.  This allows us to queue IPIs sent from IPI callbacks.  The
         * processing code will only process incoming FIFOs that are trying
         * to drain while we wait, and only to the only-slightly-less-full
         * point, to avoid a deadlock.
         *
         * We are guaranteed a few slots of headroom in the nested case, so
         * the FIFO never completely fills even when IPIs are queued from
         * within IPI callbacks.
         */

        if (gd->gd_processing_ipiq == 0) {
                level1 = MAXCPUFIFO / 2;
                level2 = MAXCPUFIFO / 4;
        } else {
                level1 = MAXCPUFIFO - 3;
                level2 = MAXCPUFIFO - 5;
        }

        if (ip->ip_windex - ip->ip_rindex > level1) {
#ifndef _KERNEL_VIRTUAL
                uint64_t tsc_base = rdtsc();
#endif
                int repeating = 0;
                int olimit;

                rflags = read_rflags();
                cpu_enable_intr();
                ++ipiq_stat(gd).ipiq_fifofull;
                DEBUG_PUSH_INFO("send_ipiq3");
                olimit = atomic_swap_int(&ip->ip_drain, level2);
                while (ip->ip_windex - ip->ip_rindex > level2) {
                        KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
                        lwkt_process_ipiq_nested();
                        cpu_pause();

                        /*
                         * Check for target not draining issue.  This should
                         * be fixed but leave the code in-place anyway as it
                         * can recover an otherwise dead system.
                         */
#ifdef _KERNEL_VIRTUAL
                        if (repeating++ > 10)
                                pthread_yield();
#else
                        if (rdtsc() - tsc_base > tsc_frequency) {
                                ++repeating;
                                if (repeating > 10) {
                                        kprintf("send_ipiq %d->%d tgt not draining (%d) sniff=%p,%p\n",
                                                gd->gd_cpuid, target->gd_cpuid, repeating,
                                                target->gd_sample_pc, target->gd_sample_sp);
                                        smp_sniff();
                                        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
                                        cpu_send_ipiq(target->gd_cpuid);
                                } else {
                                        kprintf("send_ipiq %d->%d tgt not draining (%d)\n",
                                                gd->gd_cpuid, target->gd_cpuid, repeating);
                                        smp_sniff();
                                }
                                tsc_base = rdtsc();
                        }
#endif
                }
                atomic_swap_int(&ip->ip_drain, olimit);
                DEBUG_POP_INFO();
#if defined(__x86_64__)
                write_rflags(rflags);
#else
#error "no write_*flags"
#endif
        }

        /*
         * Queue the new message and signal the target cpu.  For now we need
         * to physically disable interrupts because the target will not get
         * signalled by other cpus once we set target->gd_npoll and we don't
         * want to get interrupted.
         *
         * XXX not sure why this is a problem, the critical section should
         *     prevent any stalls (incoming interrupts except Xinvltlb and
         *     Xsnoop will just be made pending).
         */
        rflags = read_rflags();
        cpu_disable_intr();

        windex = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_info[windex].func = func;
        ip->ip_info[windex].arg1 = arg1;
        ip->ip_info[windex].arg2 = arg2;
        cpu_sfence();
        ++ip->ip_windex;
        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

        /*
         * signal the target cpu that there is work pending.
         */
        if (atomic_swap_int(&target->gd_npoll, 1) == 0) {
                logipiq(cpu_send, func, arg1, arg2, gd, target);
                cpu_send_ipiq(target->gd_cpuid);
        } else {
                ++ipiq_stat(gd).ipiq_avoided;
        }
        write_rflags(rflags);

        --gd->gd_intr_nesting_level;
        crit_exit();
        logipiq(send_end, func, arg1, arg2, gd, target);

        return(ip->ip_windex);
}
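
/*
 * Editor's illustrative usage sketch (not compiled): a hypothetical caller
 * queues a remote function with lwkt_send_ipiq3() and, when it must
 * synchronize, waits on the returned sequence number with lwkt_wait_ipiq().
 * The callback signature matches ipifunc3_t as used in this file; the names
 * example_ipi_func() and example_send_and_wait() are made up for
 * illustration only.
 */
#if 0
static void
example_ipi_func(void *arg1, int arg2, struct intrframe *frame __unused)
{
        /* runs on the target cpu, inside a critical section */
        kprintf("cpu %d: got arg1=%p arg2=%d\n", mycpu->gd_cpuid, arg1, arg2);
}

static void
example_send_and_wait(globaldata_t target, void *data)
{
        int seq;

        /* queue the request; a hardware IPI is sent only if needed */
        seq = lwkt_send_ipiq3(target, example_ipi_func, data, 42);

        /* lwkt_wait_ipiq() must be called from a critical section */
        crit_enter();
        lwkt_wait_ipiq(target, seq);
        crit_exit();
}
#endif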

/*
 * Similar to lwkt_send_ipiq() but this function does not actually initiate
 * the IPI to the target cpu unless the FIFO is greater than 1/4 full.
 * This function is usually very fast.
 *
 * This function is used for non-critical IPI messages, such as memory
 * deallocations.  The queue will typically be flushed by the target cpu at
 * the next clock interrupt.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
                        void *arg1, int arg2)
{
        lwkt_ipiq_t ip;
        int windex;
        struct globaldata *gd = mycpu;

        KKASSERT(target != gd);
        crit_enter_gd(gd);
        ++gd->gd_intr_nesting_level;
        ip = &gd->gd_ipiq[target->gd_cpuid];

        /*
         * If the FIFO is too full send the IPI actively.
         *
         * WARNING! This level must be low enough not to trigger a wait loop
         *          in the active sending code since we are not signalling
         *          the target cpu.
         */
        if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO / 4) {
                --gd->gd_intr_nesting_level;
                crit_exit_gd(gd);
                return lwkt_send_ipiq3(target, func, arg1, arg2);
        }

        /*
         * Else we can do it passively.
         */
        logipiq(send_pasv, func, arg1, arg2, gd, target);
        ++ipiq_stat(gd).ipiq_count;
        ++ipiq_stat(gd).ipiq_passive;

        /*
         * Queue the new message
         */
        windex = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_info[windex].func = func;
        ip->ip_info[windex].arg1 = arg1;
        ip->ip_info[windex].arg2 = arg2;
        cpu_sfence();
        ++ip->ip_windex;
        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
        --gd->gd_intr_nesting_level;

        /*
         * Do not signal the target cpu, it will pick up the IPI when it
         * next polls (typically on the next tick).
         */
        crit_exit();
        logipiq(send_end, func, arg1, arg2, gd, target);

        return(ip->ip_windex);
}

/*
 * deprecated, used only by fast int forwarding.
 */
int
lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
{
        return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 *
 * To prevent treating low-numbered cpus as favored sons, the IPIs are
 * issued in order starting at mycpu upward, then from 0 through mycpu.
 * This is particularly important to prevent random scheduler pickups
 * from favoring cpu 0.
 */
int
lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2)
{
        int cpuid;
        int count = 0;
        cpumask_t amask;

        CPUMASK_NANDMASK(mask, stopped_cpus);

        /*
         * All cpus in mask which are >= mycpu
         */
        CPUMASK_ASSBMASK(amask, mycpu->gd_cpuid);
        CPUMASK_INVMASK(amask);
        CPUMASK_ANDMASK(amask, mask);
        while (CPUMASK_TESTNZERO(amask)) {
                cpuid = BSFCPUMASK(amask);
                lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
                CPUMASK_NANDBIT(amask, cpuid);
                ++count;
        }

        /*
         * All cpus in mask which are < mycpu
         */
        CPUMASK_ASSBMASK(amask, mycpu->gd_cpuid);
        CPUMASK_ANDMASK(amask, mask);
        while (CPUMASK_TESTNZERO(amask)) {
                cpuid = BSFCPUMASK(amask);
                lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
                CPUMASK_NANDBIT(amask, cpuid);
                ++count;
        }
        return(count);
}
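
/*
 * Editor's illustrative usage sketch (not compiled): broadcasting a request
 * to all other active cpus with lwkt_send_ipiq3_mask(), and a
 * latency-tolerant send with lwkt_send_ipiq3_passive().  The names
 * example_broadcast(), example_lazy_send() and the callback
 * example_ipi_func() (sketched earlier) are hypothetical.
 */
#if 0
static int
example_broadcast(void *data)
{
        cpumask_t mask;

        /* all active cpus except ourselves */
        mask = smp_active_mask;
        CPUMASK_ANDMASK(mask, mycpu->gd_other_cpus);

        /* returns the number of cpus the message was queued to */
        return lwkt_send_ipiq3_mask(mask, example_ipi_func, data, 0);
}

static void
example_lazy_send(globaldata_t target, void *data)
{
        /*
         * No hardware IPI unless the FIFO is already 1/4 full; the target
         * typically drains the queue on its next clock interrupt.
         */
        lwkt_send_ipiq3_passive(target, example_ipi_func, data, 0);
}
#endif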

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
        lwkt_ipiq_t ip;

        if (target != mycpu) {
                ip = &mycpu->gd_ipiq[target->gd_cpuid];
                if ((int)(ip->ip_xindex - seq) < 0) {
#if defined(__x86_64__)
                        unsigned long rflags = read_rflags();
#else
#error "no read_*flags"
#endif
                        int64_t time_tgt = tsc_get_target(1000000000LL);
                        int time_loops = 10;
                        int benice = 0;
#ifdef _KERNEL_VIRTUAL
                        int repeating = 0;
#endif

                        cpu_enable_intr();
                        DEBUG_PUSH_INFO("wait_ipiq");
                        while ((int)(ip->ip_xindex - seq) < 0) {
                                crit_enter();
                                lwkt_process_ipiq();
                                crit_exit();
#ifdef _KERNEL_VIRTUAL
                                if (repeating++ > 10)
                                        pthread_yield();
#endif

                                /*
                                 * IPIQs must be handled within 10 seconds
                                 * and this code will warn after one second.
                                 */
                                if ((benice & 255) == 0 && tsc_test_target(time_tgt) > 0) {
                                        kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
                                                mycpu->gd_cpuid, target->gd_cpuid,
                                                ip->ip_xindex - seq);
                                        if (--time_loops == 0)
                                                panic("LWKT_WAIT_IPIQ");
                                        time_tgt = tsc_get_target(1000000000LL);
                                }
                                ++benice;

                                /*
                                 * xindex may be modified by another cpu,
                                 * use a load fence to ensure that the loop
                                 * does not use a speculative value (which
                                 * may improve performance).
                                 */
                                cpu_pause();
                                cpu_lfence();
                        }
                        DEBUG_POP_INFO();
#if defined(__x86_64__)
                        write_rflags(rflags);
#else
#error "no write_*flags"
#endif
                }
        }
}

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_info[].func we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 *
 * When the current cpu is mastering a cpusync we do NOT internally loop
 * on the cpusyncq poll.  We also do not re-flag a pending ipi due to
 * the cpusyncq poll because this can cause doreti/splz to loop internally.
 * The cpusync master's own loop must be allowed to run to avoid a deadlock.
 */
void
lwkt_process_ipiq(void)
{
        globaldata_t gd = mycpu;
        globaldata_t sgd;
        lwkt_ipiq_t ip;
        cpumask_t mask;
        int n;

        ++gd->gd_processing_ipiq;
again:
        mask = gd->gd_ipimask;
        cpu_ccfence();
        while (CPUMASK_TESTNZERO(mask)) {
                n = BSFCPUMASK(mask);
                if (n != gd->gd_cpuid) {
                        sgd = globaldata_find(n);
                        ip = sgd->gd_ipiq;
                        if (ip != NULL) {
                                ip += gd->gd_cpuid;
                                while (lwkt_process_ipiq_core(sgd, ip, NULL, 0))
                                        ;
                                ATOMIC_CPUMASK_NANDBIT(gd->gd_ipimask, n);
                                if (ip->ip_rindex != ip->ip_windex)
                                        ATOMIC_CPUMASK_ORBIT(gd->gd_ipimask, n);
                        }
                }
                CPUMASK_NANDBIT(mask, n);
        }

        /*
         * Process pending cpusyncs.  If the current thread has an active
         * cpusync we only run the list once and do not re-flag, as the
         * thread itself is processing its interlock.
         */
        if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL, 0)) {
                if (gd->gd_curthread->td_cscount == 0)
                        goto again;
                /* need_ipiq(); do not reflag */
        }

        /*
         * Interlock to allow more IPI interrupts.
         */
        --gd->gd_processing_ipiq;
}

void
lwkt_process_ipiq_frame(struct intrframe *frame)
{
        globaldata_t gd = mycpu;
        globaldata_t sgd;
        lwkt_ipiq_t ip;
        cpumask_t mask;
        int n;

        ++gd->gd_processing_ipiq;
again:
        mask = gd->gd_ipimask;
        cpu_ccfence();
        while (CPUMASK_TESTNZERO(mask)) {
                n = BSFCPUMASK(mask);
                if (n != gd->gd_cpuid) {
                        sgd = globaldata_find(n);
                        ip = sgd->gd_ipiq;
                        if (ip != NULL) {
                                ip += gd->gd_cpuid;
                                while (lwkt_process_ipiq_core(sgd, ip, frame, 0))
                                        ;
                                ATOMIC_CPUMASK_NANDBIT(gd->gd_ipimask, n);
                                if (ip->ip_rindex != ip->ip_windex)
                                        ATOMIC_CPUMASK_ORBIT(gd->gd_ipimask, n);
                        }
                }
                CPUMASK_NANDBIT(mask, n);
        }
        if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
                if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame, 0)) {
                        if (gd->gd_curthread->td_cscount == 0)
                                goto again;
                        /* need_ipiq(); do not reflag */
                }
        }
        --gd->gd_processing_ipiq;
}

/*
 * Only process incoming IPIQs from draining senders and only process them
 * to the point where the draining sender is able to continue.  This is
 * necessary to avoid deadlocking the IPI subsystem because we are acting on
 * incoming messages and the callback may queue additional messages.
 *
 * We only act on senders that are blocked, to limit the number of
 * additional messages sent.  At the same time, recipients are draining our
 * own queue.  Theoretically this creates a pipeline that cannot deadlock.
 */
static void
lwkt_process_ipiq_nested(void)
{
        globaldata_t gd = mycpu;
        globaldata_t sgd;
        lwkt_ipiq_t ip;
        cpumask_t mask;
        int n;
        int limit;

        ++gd->gd_processing_ipiq;
again:
        mask = gd->gd_ipimask;
        cpu_ccfence();
        while (CPUMASK_TESTNZERO(mask)) {
                n = BSFCPUMASK(mask);
                if (n != gd->gd_cpuid) {
                        sgd = globaldata_find(n);
                        ip = sgd->gd_ipiq;

                        /*
                         * NOTE: We do not mess with the cpumask at all,
                         *       instead we allow the top-level ipiq
                         *       processor to deal with it.
                         */
                        if (ip != NULL) {
                                ip += gd->gd_cpuid;
                                if ((limit = ip->ip_drain) != 0) {
                                        lwkt_process_ipiq_core(sgd, ip, NULL, limit);
                                        /* no gd_ipimask when doing limited processing */
                                }
                        }
                }
                CPUMASK_NANDBIT(mask, n);
        }

        /*
         * Process pending cpusyncs.  If the current thread has an active
         * cpusync we only run the list once and do not re-flag, as the
         * thread itself is processing its interlock.
         */
        if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL, 0)) {
                if (gd->gd_curthread->td_cscount == 0)
                        goto again;
                /* need_ipiq(); do not reflag */
        }
        --gd->gd_processing_ipiq;
}

/*
 * Process incoming IPI requests until only <limit> are left (0 to exhaust
 * all incoming IPI requests).
 */
static int
lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
                       struct intrframe *frame, int limit)
{
        globaldata_t mygd = mycpu;
        int ri;
        int wi;
        ipifunc3_t copy_func;
        void *copy_arg1;
        int copy_arg2;

        /*
         * Clear the originating core from our ipimask; we will process all
         * incoming messages.
         *
         * Obtain the current write index, which is modified by a remote
         * cpu.  Issue a load fence to prevent speculative reads of e.g.
         * data written by the other cpu prior to it updating the index.
         */
        KKASSERT(curthread->td_critcount);
        wi = ip->ip_windex;
        cpu_lfence();
        ++mygd->gd_intr_nesting_level;

        /*
         * NOTE: xindex is only updated after we are sure the function has
         *       finished execution.  Beware lwkt_process_ipiq() reentrancy!
         *       The function may send an IPI which may block/drain.
         *
         * NOTE: Due to additional IPI operations that the callback function
         *       may make, it is possible for both rindex and windex to
         *       advance and thus for rindex to advance past our cached
         *       windex.
         *
         * NOTE: A load fence is required to prevent speculative loads prior
         *       to the loading of ip_rindex.  Even though stores might be
         *       ordered, loads are probably not.  A memory fence is
         *       required to prevent reordering of the loads after the
         *       ip_rindex update.
         *
         * NOTE: Single pass only.  Returns non-zero if the queue is not
         *       empty on return.
         */
        while (wi - (ri = ip->ip_rindex) > limit) {
                ri &= MAXCPUFIFO_MASK;
                cpu_lfence();
                copy_func = ip->ip_info[ri].func;
                copy_arg1 = ip->ip_info[ri].arg1;
                copy_arg2 = ip->ip_info[ri].arg2;
                cpu_mfence();
                ++ip->ip_rindex;
                KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) ==
                         ((ri + 1) & MAXCPUFIFO_MASK));
                logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
#ifdef INVARIANTS
                if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) {
                        kprintf("cpu %d ipifunc %p %p %d (frame %p)\n",
                                mycpu->gd_cpuid,
                                copy_func, copy_arg1, copy_arg2,
#if defined(__x86_64__)
                                (frame ? (void *)frame->if_rip : NULL));
#else
                                NULL);
#endif
                }
#endif
                copy_func(copy_arg1, copy_arg2, frame);
                cpu_sfence();
                ip->ip_xindex = ip->ip_rindex;

#ifdef PANIC_DEBUG
                /*
                 * Simulate panics during the processing of an IPI
                 */
                if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
                        if (--panic_ipiq_count == 0) {
#ifdef DDB
                                Debugger("PANIC_DEBUG");
#else
                                panic("PANIC_DEBUG");
#endif
                        }
                }
#endif
        }
        --mygd->gd_intr_nesting_level;

        /*
         * Return non-zero if there is still more in the queue.  Don't worry
         * about fencing, we will get another interrupt if necessary.
         */
        return (ip->ip_rindex != ip->ip_windex);
}

static void
lwkt_sync_ipiq(void *arg)
{
        volatile cpumask_t *cpumask = arg;

        ATOMIC_CPUMASK_NANDBIT(*cpumask, mycpu->gd_cpuid);
        if (CPUMASK_TESTZERO(*cpumask))
                wakeup(cpumask);
}

void
lwkt_synchronize_ipiqs(const char *wmesg)
{
        volatile cpumask_t other_cpumask;

        other_cpumask = smp_active_mask;
        CPUMASK_ANDMASK(other_cpumask, mycpu->gd_other_cpus);
        lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq,
                            __DEVOLATILE(void *, &other_cpumask));

        while (CPUMASK_TESTNZERO(other_cpumask)) {
                tsleep_interlock(&other_cpumask, 0);
                if (CPUMASK_TESTNZERO(other_cpumask))
                        tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
        }
}

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_interlock()     - Place specified cpus in a quiescent state.
 *                                The current cpu is placed in a hard
 *                                critical section.
 *
 * lwkt_cpusync_deinterlock()   - Execute cs_func on specified cpus,
 *                                including the current cpu if specified,
 *                                then return.
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg)
{
        struct lwkt_cpusync cs;

        lwkt_cpusync_init(&cs, mask, func, arg);
        lwkt_cpusync_interlock(&cs);
        lwkt_cpusync_deinterlock(&cs);
}

void
lwkt_cpusync_interlock(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;
        cpumask_t mask;

        /*
         * mask acknowledge (cs_mack): 0->mask for stage 1
         *
         * mack does not include the current cpu.
         */
        mask = cs->cs_mask;
        CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
        CPUMASK_ANDMASK(mask, smp_active_mask);
        CPUMASK_ASSZERO(cs->cs_mack);

        crit_enter_id("cpusync");
        if (CPUMASK_TESTNZERO(mask)) {
                DEBUG_PUSH_INFO("cpusync_interlock");
                ++ipiq_stat(gd).ipiq_cscount;
                ++gd->gd_curthread->td_cscount;
                lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs);
                logipiq2(sync_start, (long)CPUMASK_LOWMASK(mask));
                while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        pthread_yield();
#endif
                }
                DEBUG_POP_INFO();
        }
}

/*
 * Interlocked cpus have executed remote1 and are polling in remote2.
 * To deinterlock we clear cs_mack and wait for the cpus to execute
 * the func and set their bit in cs_mack again.
 */
void
lwkt_cpusync_deinterlock(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;
        cpumask_t mask;

        /*
         * mask acknowledge (cs_mack): mack->0->mack for stage 2
         *
         * Clearing cpu bits for polling cpus in cs_mack will cause them to
         * execute stage 2, which executes the cs_func(cs_data) and then
         * sets their bit in cs_mack again.
         *
         * mack does not include the current cpu.
         */
        mask = cs->cs_mack;
        cpu_ccfence();
        CPUMASK_ASSZERO(cs->cs_mack);
        cpu_ccfence();
        if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
                cs->cs_func(cs->cs_data);
        if (CPUMASK_TESTNZERO(mask)) {
                DEBUG_PUSH_INFO("cpusync_deinterlock");
                while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        pthread_yield();
#endif
                }
                DEBUG_POP_INFO();
                /*
                 * cpusyncq ipis may be left queued without the RQF flag set
                 * due to a non-zero td_cscount, so be sure to process any
                 * laggards after decrementing td_cscount.
                 */
                --gd->gd_curthread->td_cscount;
                lwkt_process_ipiq();
                logipiq2(sync_end, (long)CPUMASK_LOWMASK(mask));
        }
        crit_exit_id("cpusync");
}
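
/*
 * Editor's illustrative usage sketch (not compiled): the typical
 * interlock/deinterlock pattern.  The targeted remote cpus are quiescent
 * between the two calls, which is when the caller can safely adjust shared
 * state; cs_func then runs on each cpu in the mask during deinterlock.
 * The names example_sync_func() and example_update() are hypothetical.
 */
#if 0
static void
example_sync_func(void *arg)
{
        /* runs on each cpu in the mask during lwkt_cpusync_deinterlock() */
}

static void
example_update(void)
{
        struct lwkt_cpusync cs;

        lwkt_cpusync_init(&cs, smp_active_mask, example_sync_func, NULL);
        lwkt_cpusync_interlock(&cs);
        /* other targeted cpus are now spinning in lwkt_cpusync_remote2() */
        lwkt_cpusync_deinterlock(&cs);

        /* equivalently, in a single call: */
        lwkt_cpusync_simple(smp_active_mask, example_sync_func, NULL);
}
#endif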

/*
 * The quick version does not quiesce the target cpu(s) but instead executes
 * the function on the target cpu(s) and waits for all to acknowledge.  This
 * avoids spinning on the target cpus.
 *
 * This function is typically only used for kernel_pmap updates.  User pmaps
 * have to be quiesced.
 */
void
lwkt_cpusync_quick(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;
        cpumask_t mask;

        /*
         * stage-2 cs_mack only.
         */
        mask = cs->cs_mask;
        CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
        CPUMASK_ANDMASK(mask, smp_active_mask);
        CPUMASK_ASSZERO(cs->cs_mack);

        crit_enter_id("cpusync");
        if (CPUMASK_TESTNZERO(mask)) {
                DEBUG_PUSH_INFO("cpusync_interlock");
                ++ipiq_stat(gd).ipiq_cscount;
                ++gd->gd_curthread->td_cscount;
                lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote2, cs);
                logipiq2(sync_quick, (long)CPUMASK_LOWMASK(mask));
                while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        pthread_yield();
#endif
                }

                /*
                 * cpusyncq ipis may be left queued without the RQF flag set
                 * due to a non-zero td_cscount, so be sure to process any
                 * laggards after decrementing td_cscount.
                 */
                DEBUG_POP_INFO();
                --gd->gd_curthread->td_cscount;
                lwkt_process_ipiq();
        }
        if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
                cs->cs_func(cs->cs_data);
        crit_exit_id("cpusync");
}

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Execute the run function and adjust cs_count, then requeue
 * the request so we spin on it.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;

        ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
        lwkt_cpusync_remote2(cs);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;

        if (CPUMASK_TESTMASK(cs->cs_mack, gd->gd_cpumask) == 0) {
                if (cs->cs_func)
                        cs->cs_func(cs->cs_data);
                ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
                /* cs can be ripped out at this point */
        } else {
                lwkt_ipiq_t ip;
                int wi;

                cpu_pause();
#ifdef _KERNEL_VIRTUAL
                pthread_yield();
#endif
                cpu_lfence();

                /*
                 * Requeue our IPI to avoid a deep stack recursion.  If no
                 * other IPIs are pending we can just loop up, which should
                 * help VMs better-detect spin loops.
                 */
                ip = &gd->gd_cpusyncq;

                wi = ip->ip_windex & MAXCPUFIFO_MASK;
                ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
                ip->ip_info[wi].arg1 = cs;
                ip->ip_info[wi].arg2 = 0;
                cpu_sfence();
                KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
                ++ip->ip_windex;
                if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
                        kprintf("cpu %d cm=%016jx %016jx f=%p\n",
                                gd->gd_cpuid,
                                (intmax_t)CPUMASK_LOWMASK(cs->cs_mask),
                                (intmax_t)CPUMASK_LOWMASK(cs->cs_mack),
                                cs->cs_func);
                }
        }
}

#define LWKT_IPIQ_NLATENCY      8
#define LWKT_IPIQ_NLATENCY_MASK (LWKT_IPIQ_NLATENCY - 1)

struct lwkt_ipiq_latency_log {
        int             idx;    /* unmasked index */
        int             pad;
        uint64_t        latency[LWKT_IPIQ_NLATENCY];
};

static struct lwkt_ipiq_latency_log     lwkt_ipiq_latency_logs[MAXCPU];
static uint64_t save_tsc;

/*
 * IPI callback (already in a critical section)
 */
static void
lwkt_ipiq_latency_testfunc(void *arg __unused)
{
        uint64_t delta_tsc;
        struct globaldata *gd;
        struct lwkt_ipiq_latency_log *lat;

        /*
         * Get delta TSC (assume TSCs are synchronized) as quickly as
         * possible and then convert to nanoseconds.
         */
        delta_tsc = rdtsc_ordered() - save_tsc;
        delta_tsc = delta_tsc * 1000000000LU / tsc_frequency;

        /*
         * Record in our save array.
         */
        gd = mycpu;
        lat = &lwkt_ipiq_latency_logs[gd->gd_cpuid];
        lat->latency[lat->idx & LWKT_IPIQ_NLATENCY_MASK] = delta_tsc;
        ++lat->idx;
}
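
/*
 * Editor's illustrative sketch (not compiled): reading the per-cpu latency
 * ring.  idx is an unmasked, monotonically increasing index, so the most
 * recent sample lives at (idx - 1) & LWKT_IPIQ_NLATENCY_MASK and older
 * samples follow in decreasing index order.  example_print_latencies() is
 * a hypothetical consumer; in practice the ring is exported through the
 * debug.ipiq.latencyN sysctls created below.
 */
#if 0
static void
example_print_latencies(int cpu)
{
        struct lwkt_ipiq_latency_log *lat = &lwkt_ipiq_latency_logs[cpu];
        int i;

        for (i = 1; i <= LWKT_IPIQ_NLATENCY; ++i) {
                kprintf("cpu%d sample[-%d] = %ju ns\n", cpu, i,
                        (uintmax_t)lat->latency[(lat->idx - i) &
                                                LWKT_IPIQ_NLATENCY_MASK]);
        }
}
#endif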

/*
 * Send IPI from cpu0 to other cpus
 *
 * NOTE: Machine must be idle for test to run dependably, and also probably
 *       a good idea not to be running powerd.
 *
 * NOTE: Caller should use 'usched :1 <command>' to lock itself to cpu 0.
 *       See 'ipitest' script in /usr/src/test/sysperf/ipitest
 */
static int
lwkt_ipiq_latency_test(SYSCTL_HANDLER_ARGS)
{
        struct globaldata *gd;
        int cpu = 0, orig_cpu, error;

        error = sysctl_handle_int(oidp, &cpu, arg2, req);
        if (error || req->newptr == NULL)
                return error;

        if (cpu == 0)
                return 0;
        else if (cpu >= ncpus || cpu < 0)
                return EINVAL;

        orig_cpu = mycpuid;
        lwkt_migratecpu(0);

        gd = globaldata_find(cpu);

        save_tsc = rdtsc_ordered();
        lwkt_send_ipiq(gd, lwkt_ipiq_latency_testfunc, NULL);

        lwkt_migratecpu(orig_cpu);
        return 0;
}

SYSCTL_NODE(_debug, OID_AUTO, ipiq, CTLFLAG_RW, 0, "");
SYSCTL_PROC(_debug_ipiq, OID_AUTO, latency_test, CTLTYPE_INT | CTLFLAG_RW,
    NULL, 0, lwkt_ipiq_latency_test, "I",
    "ipi latency test, arg: remote cpuid");

static int
lwkt_ipiq_latency(SYSCTL_HANDLER_ARGS)
{
        struct lwkt_ipiq_latency_log *latency = arg1;
        uint64_t lat[LWKT_IPIQ_NLATENCY];
        int i;

        for (i = 0; i < LWKT_IPIQ_NLATENCY; ++i)
                lat[i] = latency->latency[i];

        return sysctl_handle_opaque(oidp, lat, sizeof(lat), req);
}

static void
lwkt_ipiq_latency_init(void *dummy __unused)
{
        int cpu;

        for (cpu = 0; cpu < ncpus; ++cpu) {
                char name[32];

                ksnprintf(name, sizeof(name), "latency%d", cpu);
                SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_debug_ipiq),
                    OID_AUTO, name, CTLTYPE_OPAQUE | CTLFLAG_RD,
                    &lwkt_ipiq_latency_logs[cpu], 0, lwkt_ipiq_latency,
                    "LU", "7 latest ipi latency measurement results");
        }
}
SYSINIT(lwkt_ipiq_latency, SI_SUB_CONFIGURE, SI_ORDER_ANY,
    lwkt_ipiq_latency_init, NULL);