/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/smp.h>
#include <machine/clock.h>
#include <machine/atomic.h>

#ifdef _KERNEL_VIRTUAL
#include <pthread.h>
#endif

struct ipiq_stats {
    int64_t ipiq_count;		/* total calls to lwkt_send_ipiq*() */
    int64_t ipiq_fifofull;	/* number of fifo full conditions detected */
    int64_t ipiq_avoided;	/* interlock with target avoids cpu ipi */
    int64_t ipiq_passive;	/* passive IPI messages */
    int64_t ipiq_cscount;	/* number of cpu synchronizations */
} __cachealign;

static struct ipiq_stats ipiq_stats_percpu[MAXCPU];
#define ipiq_stat(gd)	ipiq_stats_percpu[(gd)->gd_cpuid]

static int ipiq_debug;		/* set to 1 for debug */
#ifdef PANIC_DEBUG
static int panic_ipiq_cpu = -1;
static int panic_ipiq_count = 100;
#endif

SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0,
    "");
#ifdef PANIC_DEBUG
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
#endif

#define IPIQ_STRING	"func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
#define IPIQ_ARGS	void *func, void *arg1, int arg2, int scpu, int dcpu

#if !defined(KTR_IPIQ)
#define KTR_IPIQ	KTR_ALL
#endif
KTR_INFO_MASTER(ipiq);
KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_quick, 9, "cpumask=%08lx", unsigned long mask);

#define logipiq(name, func, arg1, arg2, sgd, dgd)	\
	KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
#define logipiq2(name, arg)	\
	KTR_LOG(ipiq_ ## name, arg)

static void lwkt_process_ipiq_nested(void);
static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
		    struct intrframe *frame, int limit);
static void lwkt_cpusync_remote1(lwkt_cpusync_t cs);
static void lwkt_cpusync_remote2(lwkt_cpusync_t cs);

/*
 * Each IPIQ_SYSCTL(name) instantiation defines a sysctl handler that sums
 * the named per-cpu counter across all cpus on read, and writes the given
 * value back to every cpu on set (used to zero the counters).
 */
#define IPIQ_SYSCTL(name)				\
static int						\
sysctl_##name(SYSCTL_HANDLER_ARGS)			\
{							\
    int64_t val = 0;					\
    int cpu, error;					\
							\
    for (cpu = 0; cpu < ncpus; ++cpu)			\
	val += ipiq_stats_percpu[cpu].name;		\
							\
    error = sysctl_handle_quad(oidp, &val, 0, req);	\
    if (error || req->newptr == NULL)			\
	return error;					\
							\
    for (cpu = 0; cpu < ncpus; ++cpu)			\
	ipiq_stats_percpu[cpu].name = val;		\
							\
    return 0;						\
}

IPIQ_SYSCTL(ipiq_count);
IPIQ_SYSCTL(ipiq_fifofull);
IPIQ_SYSCTL(ipiq_avoided);
IPIQ_SYSCTL(ipiq_passive);
IPIQ_SYSCTL(ipiq_cscount);

SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_count, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_count, "Q", "Number of IPI's sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_fifofull, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_fifofull, "Q",
    "Number of fifo full conditions detected");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_avoided, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_avoided, "Q",
    "Number of IPI's avoided by interlock with target cpu");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_passive, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_passive, "Q",
    "Number of passive IPI messages sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_cscount, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_cscount, "Q",
    "Number of cpu synchronizations");

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu; only the owning (source) cpu writes new messages to
 * it, while the target cpu advances the read index as it processes them.
 *
 * If the FIFO fills up we have to enable interrupts to avoid an APIC
 * deadlock and process pending IPIQs while waiting for it to empty.
 * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * The actual hardware IPI is avoided if the target cpu is already processing
 * the queue from a prior IPI.  It is possible to pipeline IPI messages
 * very quickly between cpus due to the FIFO hysteresis.
 *
 * Need not be called from a critical section.  A usage sketch follows the
 * function body below.
 */
int
lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
{
    lwkt_ipiq_t ip;
    int windex;
    int level1;
    int level2;
    long rflags;
    struct globaldata *gd = mycpu;

    logipiq(send_norm, func, arg1, arg2, gd, target);

    if (target == gd) {
	func(arg1, arg2, NULL);
	logipiq(send_end, func, arg1, arg2, gd, target);
	return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_critcount);
    ++ipiq_stat(gd).ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * Do not allow the FIFO to become full.  Interrupts must be physically
     * enabled while we liveloop to avoid deadlocking the APIC.
     *
     * When we are not nested inside a processing loop we allow the FIFO
     * to get 1/2 full.  Once it exceeds 1/2 full we must wait for it to
     * drain, executing any incoming IPIs while we wait.
     *
     * When we are nested we allow the FIFO to get almost completely full.
     * This allows us to queue IPIs sent from IPI callbacks.  The processing
     * code will only process incoming FIFOs that are trying to drain while
     * we wait, and only to the only-slightly-less-full point, to avoid a
     * deadlock.
     */
    if (gd->gd_processing_ipiq == 0) {
	level1 = MAXCPUFIFO / 2;
	level2 = MAXCPUFIFO / 4;
    } else {
	level1 = MAXCPUFIFO - 3;
	level2 = MAXCPUFIFO - 5;
    }

    if (ip->ip_windex - ip->ip_rindex > level1) {
#ifndef _KERNEL_VIRTUAL
	uint64_t tsc_base = rdtsc();
#endif
	int repeating = 0;
	int olimit;

	rflags = read_rflags();
	cpu_enable_intr();
	++ipiq_stat(gd).ipiq_fifofull;
	DEBUG_PUSH_INFO("send_ipiq3");
	olimit = atomic_swap_int(&ip->ip_drain, level2);
	while (ip->ip_windex - ip->ip_rindex > level2) {
	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	    lwkt_process_ipiq_nested();
	    cpu_pause();

	    /*
	     * Check for the target-not-draining issue.  This should be
	     * fixed, but leave the code in-place anyway as it can recover
	     * an otherwise dead system.
	     */
#ifdef _KERNEL_VIRTUAL
	    if (repeating++ > 10)
		pthread_yield();
#else
	    if (rdtsc() - tsc_base > tsc_frequency) {
		++repeating;
		if (repeating > 10) {
		    kprintf("send_ipiq %d->%d tgt not draining (%d) "
			    "sniff=%p,%p\n",
			    gd->gd_cpuid, target->gd_cpuid, repeating,
			    target->gd_sample_pc, target->gd_sample_sp);
		    smp_sniff();
		    ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
		    cpu_send_ipiq(target->gd_cpuid);
		} else {
		    kprintf("send_ipiq %d->%d tgt not draining (%d)\n",
			    gd->gd_cpuid, target->gd_cpuid, repeating);
		    smp_sniff();
		}
		tsc_base = rdtsc();
	    }
#endif
	}
	atomic_swap_int(&ip->ip_drain, olimit);
	DEBUG_POP_INFO();
#if defined(__x86_64__)
	write_rflags(rflags);
#else
#error "no write_*flags"
#endif
    }

    /*
     * Queue the new message and signal the target cpu.  For now we need to
     * physically disable interrupts because the target will not get
     * signalled by other cpus once we set target->gd_npoll and we don't
     * want to get interrupted.
     *
     * XXX not sure why this is a problem, the critical section should
     * prevent any stalls (incoming interrupts except Xinvltlb and Xsnoop
     * will just be made pending).
     */
    rflags = read_rflags();
    cpu_disable_intr();

    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_info[windex].func = func;
    ip->ip_info[windex].arg1 = arg1;
    ip->ip_info[windex].arg2 = arg2;
    cpu_sfence();
    ++ip->ip_windex;
    ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

    /*
     * signal the target cpu that there is work pending.
     */
    if (atomic_swap_int(&target->gd_npoll, 1) == 0) {
	logipiq(cpu_send, func, arg1, arg2, gd, target);
	cpu_send_ipiq(target->gd_cpuid);
    } else {
	++ipiq_stat(gd).ipiq_avoided;
    }
    write_rflags(rflags);

    --gd->gd_intr_nesting_level;
    crit_exit();
    logipiq(send_end, func, arg1, arg2, gd, target);

    return(ip->ip_windex);
}
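
/*
 * Usage sketch (illustrative only, not compiled; my_ipi_func, dcpu and
 * my_data are hypothetical).  The callback runs on the target cpu from
 * its IPI processing path, inside a critical section, and receives the
 * two arguments plus an interrupt frame (NULL when none is available):
 *
 *	static void
 *	my_ipi_func(void *arg1, int arg2, struct intrframe *frame)
 *	{
 *		kprintf("cpu%d: got %p/%d\n", mycpu->gd_cpuid, arg1, arg2);
 *	}
 *
 *	(void)lwkt_send_ipiq3(globaldata_find(dcpu), my_ipi_func,
 *			      my_data, 42);
 */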

/*
 * Similar to lwkt_send_ipiq() but this function does not actually initiate
 * the IPI to the target cpu unless the FIFO is greater than 1/4 full.
 * This function is usually very fast.
 *
 * This function is used for non-critical IPI messages, such as memory
 * deallocations.  The queue will typically be flushed by the target cpu
 * at the next clock interrupt.
 *
 * Need not be called from a critical section.  See the usage sketch
 * following lwkt_send_ipiq3_mask() below.
 */
int
lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
			void *arg1, int arg2)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    KKASSERT(target != gd);
    crit_enter_gd(gd);
    ++gd->gd_intr_nesting_level;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * If the FIFO is too full send the IPI actively.
     *
     * WARNING! This level must be low enough not to trigger a wait loop
     *		in the active sending code since we are not signalling the
     *		target cpu.
     */
    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO / 4) {
	--gd->gd_intr_nesting_level;
	crit_exit_gd(gd);
	return lwkt_send_ipiq3(target, func, arg1, arg2);
    }

    /*
     * Else we can do it passively.
     */
    logipiq(send_pasv, func, arg1, arg2, gd, target);
    ++ipiq_stat(gd).ipiq_count;
    ++ipiq_stat(gd).ipiq_passive;

    /*
     * Queue the new message
     */
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_info[windex].func = func;
    ip->ip_info[windex].arg1 = arg1;
    ip->ip_info[windex].arg2 = arg2;
    cpu_sfence();
    ++ip->ip_windex;
    ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
    --gd->gd_intr_nesting_level;

    /*
     * Do not signal the target cpu, it will pick up the IPI when it next
     * polls (typically on the next tick).
     */
    crit_exit();
    logipiq(send_end, func, arg1, arg2, gd, target);

    return(ip->ip_windex);
}

/*
 * Deprecated, used only by fast interrupt forwarding.
 */
int
lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
{
    return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 */
int
lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2)
{
    int cpuid;
    int count = 0;

    CPUMASK_NANDMASK(mask, stopped_cpus);
    while (CPUMASK_TESTNZERO(mask)) {
	cpuid = BSFCPUMASK(mask);
	lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
	CPUMASK_NANDBIT(mask, cpuid);
	++count;
    }
    return(count);
}
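
/*
 * Usage sketch for the passive and mask variants (illustrative only;
 * my_ipi_func is the hypothetical callback from the earlier sketch).
 * A passive send typically completes without raising a hardware IPI;
 * the target picks the message up at its next poll.  Note the target
 * must be a remote cpu.  The mask variant fans the same message out
 * to several cpus:
 *
 *	lwkt_send_ipiq3_passive(globaldata_find(dcpu), my_ipi_func,
 *				NULL, 0);
 *
 *	cpumask_t mask = smp_active_mask;
 *	CPUMASK_ANDMASK(mask, mycpu->gd_other_cpus);
 *	lwkt_send_ipiq3_mask(mask, my_ipi_func, NULL, 0);
 */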

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).  See the pairing sketch following lwkt_process_ipiq() below.
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    lwkt_ipiq_t ip;

    if (target != mycpu) {
	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	if ((int)(ip->ip_xindex - seq) < 0) {
#if defined(__x86_64__)
	    unsigned long rflags = read_rflags();
#else
#error "no read_*flags"
#endif
	    int64_t time_tgt = tsc_get_target(1000000000LL);
	    int time_loops = 10;
	    int benice = 0;
#ifdef _KERNEL_VIRTUAL
	    int repeating = 0;
#endif

	    cpu_enable_intr();
	    DEBUG_PUSH_INFO("wait_ipiq");
	    while ((int)(ip->ip_xindex - seq) < 0) {
		crit_enter();
		lwkt_process_ipiq();
		crit_exit();
#ifdef _KERNEL_VIRTUAL
		if (repeating++ > 10)
		    pthread_yield();
#endif

		/*
		 * IPIQs must be handled within 10 seconds and this code
		 * will warn after one second.
		 */
		if ((benice & 255) == 0 && tsc_test_target(time_tgt) > 0) {
		    kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
			    mycpu->gd_cpuid, target->gd_cpuid,
			    ip->ip_xindex - seq);
		    if (--time_loops == 0)
			panic("LWKT_WAIT_IPIQ");
		    time_tgt = tsc_get_target(1000000000LL);
		}
		++benice;

		/*
		 * xindex may be modified by another cpu, use a load fence
		 * to ensure that the loop does not use a speculative value
		 * (which may improve performance).
		 */
		cpu_pause();
		cpu_lfence();
	    }
	    DEBUG_POP_INFO();
#if defined(__x86_64__)
	    write_rflags(rflags);
#else
#error "no write_*flags"
#endif
	}
    }
}

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_info[].func we run.
 *
 * There are two versions, one where no interrupt frame is available
 * (when called from the send code and from splz), and one where an
 * interrupt frame is available.
 *
 * When the current cpu is mastering a cpusync we do NOT internally loop
 * on the cpusyncq poll.  We also do not re-flag a pending ipi due to
 * the cpusyncq poll because this can cause doreti/splz to loop internally.
 * The cpusync master's own loop must be allowed to run to avoid a deadlock.
 */
void
lwkt_process_ipiq(void)
{
    globaldata_t gd = mycpu;
    globaldata_t sgd;
    lwkt_ipiq_t ip;
    cpumask_t mask;
    int n;

    ++gd->gd_processing_ipiq;
again:
    mask = gd->gd_ipimask;
    cpu_ccfence();
    while (CPUMASK_TESTNZERO(mask)) {
	n = BSFCPUMASK(mask);
	if (n != gd->gd_cpuid) {
	    sgd = globaldata_find(n);
	    ip = sgd->gd_ipiq;
	    if (ip != NULL) {
		ip += gd->gd_cpuid;
		while (lwkt_process_ipiq_core(sgd, ip, NULL, 0))
		    ;
		ATOMIC_CPUMASK_NANDBIT(gd->gd_ipimask, n);
		if (ip->ip_rindex != ip->ip_windex)
		    ATOMIC_CPUMASK_ORBIT(gd->gd_ipimask, n);
	    }
	}
	CPUMASK_NANDBIT(mask, n);
    }

    /*
     * Process pending cpusyncs.  If the current thread has an active
     * cpusync we only run the list once and do not re-flag it, as the
     * thread itself is processing its own interlock.
     */
    if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL, 0)) {
	if (gd->gd_curthread->td_cscount == 0)
	    goto again;
	/* need_ipiq(); do not reflag */
    }

    /*
     * Interlock to allow more IPI interrupts.
     */
    --gd->gd_processing_ipiq;
}
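
/*
 * Pairing sketch (illustrative only).  lwkt_send_ipiq3() returns the
 * post-increment ip_windex sequence number, which can be handed to
 * lwkt_wait_ipiq() to spin until the target cpu has finished executing
 * the message.  lwkt_wait_ipiq() must be called from a critical section:
 *
 *	int seq;
 *
 *	crit_enter();
 *	seq = lwkt_send_ipiq3(target, my_ipi_func, NULL, 0);
 *	lwkt_wait_ipiq(target, seq);
 *	crit_exit();
 */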

void
lwkt_process_ipiq_frame(struct intrframe *frame)
{
    globaldata_t gd = mycpu;
    globaldata_t sgd;
    lwkt_ipiq_t ip;
    cpumask_t mask;
    int n;

    ++gd->gd_processing_ipiq;
again:
    mask = gd->gd_ipimask;
    cpu_ccfence();
    while (CPUMASK_TESTNZERO(mask)) {
	n = BSFCPUMASK(mask);
	if (n != gd->gd_cpuid) {
	    sgd = globaldata_find(n);
	    ip = sgd->gd_ipiq;
	    if (ip != NULL) {
		ip += gd->gd_cpuid;
		while (lwkt_process_ipiq_core(sgd, ip, frame, 0))
		    ;
		ATOMIC_CPUMASK_NANDBIT(gd->gd_ipimask, n);
		if (ip->ip_rindex != ip->ip_windex)
		    ATOMIC_CPUMASK_ORBIT(gd->gd_ipimask, n);
	    }
	}
	CPUMASK_NANDBIT(mask, n);
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame, 0)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	    /* need_ipiq(); do not reflag */
	}
    }
    --gd->gd_processing_ipiq;
}

/*
 * Only process incoming IPIQs from draining senders and only process them
 * to the point where the draining sender is able to continue.  This is
 * necessary to avoid deadlocking the IPI subsystem because we are acting on
 * incoming messages and the callback may queue additional messages.
 *
 * We only want to have to act on senders that are blocked, to limit the
 * number of additional messages sent.  At the same time, recipients are
 * trying to drain our own queue.  Theoretically this creates a pipeline
 * that cannot deadlock.  The bookkeeping sketch following this function
 * illustrates the drain limit.
 */
static void
lwkt_process_ipiq_nested(void)
{
    globaldata_t gd = mycpu;
    globaldata_t sgd;
    lwkt_ipiq_t ip;
    cpumask_t mask;
    int n;
    int limit;

    ++gd->gd_processing_ipiq;
again:
    mask = gd->gd_ipimask;
    cpu_ccfence();
    while (CPUMASK_TESTNZERO(mask)) {
	n = BSFCPUMASK(mask);
	if (n != gd->gd_cpuid) {
	    sgd = globaldata_find(n);
	    ip = sgd->gd_ipiq;

	    /*
	     * NOTE: We do not mess with the cpumask at all, instead we let
	     *	     the top-level ipiq processor deal with it.
	     */
	    if (ip != NULL) {
		ip += gd->gd_cpuid;
		if ((limit = ip->ip_drain) != 0) {
		    lwkt_process_ipiq_core(sgd, ip, NULL, limit);
		    /* no gd_ipimask when doing limited processing */
		}
	    }
	}
	CPUMASK_NANDBIT(mask, n);
    }

    /*
     * Process pending cpusyncs.  If the current thread has an active
     * cpusync we only run the list once and do not re-flag it, as the
     * thread itself is processing its own interlock.
     */
    if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL, 0)) {
	if (gd->gd_curthread->td_cscount == 0)
	    goto again;
	/* need_ipiq(); do not reflag */
    }
    --gd->gd_processing_ipiq;
}
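
/*
 * FIFO bookkeeping sketch (illustrative only).  ip_windex and ip_rindex
 * increment monotonically and are masked only when the backing array is
 * actually indexed, so their difference is always the number of queued
 * messages (bounded by MAXCPUFIFO) even after the counters wrap:
 *
 *	entries = ip->ip_windex - ip->ip_rindex;
 *	slot    = ip->ip_windex & MAXCPUFIFO_MASK;	(producer side)
 *	slot    = ip->ip_rindex & MAXCPUFIFO_MASK;	(consumer side)
 *
 * ip_drain, set non-zero by a blocked sender in lwkt_send_ipiq3(), tells
 * lwkt_process_ipiq_nested() above to process that queue only down to
 * <limit> remaining entries, which unblocks the sender without letting
 * nested callbacks overcommit the FIFOs.
 */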

/*
 * Process incoming IPI requests until only <limit> are left (0 to exhaust
 * all incoming IPI requests).
 */
static int
lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
		       struct intrframe *frame, int limit)
{
    globaldata_t mygd = mycpu;
    int ri;
    int wi;
    ipifunc3_t copy_func;
    void *copy_arg1;
    int copy_arg2;

    /*
     * Clear the originating core from our ipimask, we will process all
     * incoming messages.
     *
     * Obtain the current write index, which is modified by a remote cpu.
     * Issue a load fence to prevent speculative reads of e.g. data written
     * by the other cpu prior to it updating the index.
     */
    KKASSERT(curthread->td_critcount);
    wi = ip->ip_windex;
    cpu_lfence();
    ++mygd->gd_intr_nesting_level;

    /*
     * NOTE: xindex is only updated after we are sure the function has
     *	     finished execution.  Beware lwkt_process_ipiq() reentrancy!
     *	     The function may send an IPI which may block/drain.
     *
     * NOTE: Due to additional IPI operations that the callback function
     *	     may make, it is possible for both rindex and windex to advance
     *	     and thus for rindex to advance past our cached windex.
     *
     * NOTE: A load fence is required to prevent speculative loads prior
     *	     to the loading of ip_rindex.  Even though stores might be
     *	     ordered, loads are probably not.  A memory fence is required
     *	     to prevent reordering of the loads after the ip_rindex update.
     *
     * NOTE: Single pass only.  Returns non-zero if the queue is not empty
     *	     on return.
     */
    while (wi - (ri = ip->ip_rindex) > limit) {
	ri &= MAXCPUFIFO_MASK;
	cpu_lfence();
	copy_func = ip->ip_info[ri].func;
	copy_arg1 = ip->ip_info[ri].arg1;
	copy_arg2 = ip->ip_info[ri].arg2;
	cpu_mfence();
	++ip->ip_rindex;
	KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) ==
		 ((ri + 1) & MAXCPUFIFO_MASK));
	logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
#ifdef INVARIANTS
	if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) {
	    kprintf("cpu %d ipifunc %p %p %d (frame %p)\n",
		    mycpu->gd_cpuid,
		    copy_func, copy_arg1, copy_arg2,
#if defined(__x86_64__)
		    (frame ? (void *)frame->if_rip : NULL));
#else
		    NULL);
#endif
	}
#endif
	copy_func(copy_arg1, copy_arg2, frame);
	cpu_sfence();
	ip->ip_xindex = ip->ip_rindex;

#ifdef PANIC_DEBUG
	/*
	 * Simulate panics during the processing of an IPI
	 */
	if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
	    if (--panic_ipiq_count == 0) {
#ifdef DDB
		Debugger("PANIC_DEBUG");
#else
		panic("PANIC_DEBUG");
#endif
	    }
	}
#endif
    }
    --mygd->gd_intr_nesting_level;

    /*
     * Return non-zero if there is still more in the queue.  Don't worry
     * about fencing, we will get another interrupt if necessary.
     */
    return (ip->ip_rindex != ip->ip_windex);
}

static void
lwkt_sync_ipiq(void *arg)
{
    volatile cpumask_t *cpumask = arg;

    ATOMIC_CPUMASK_NANDBIT(*cpumask, mycpu->gd_cpuid);
    if (CPUMASK_TESTZERO(*cpumask))
	wakeup(cpumask);
}

void
lwkt_synchronize_ipiqs(const char *wmesg)
{
    volatile cpumask_t other_cpumask;

    other_cpumask = smp_active_mask;
    CPUMASK_ANDMASK(other_cpumask, mycpu->gd_other_cpus);
    lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq,
			__DEVOLATILE(void *, &other_cpumask));

    while (CPUMASK_TESTNZERO(other_cpumask)) {
	tsleep_interlock(&other_cpumask, 0);
	if (CPUMASK_TESTNZERO(other_cpumask))
	    tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
    }
}

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_interlock()	- Place specified cpus in a quiescent state.
 *				  The current cpu is placed in a hard critical
 *				  section.
 *
 * lwkt_cpusync_deinterlock()	- Execute cs_func on specified cpus, including
 *				  current cpu if specified, then return.
 */
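
/*
 * Usage sketch (illustrative only; my_sync_func and my_arg are
 * hypothetical).  Quiesce the cpus in the mask, perform some work while
 * they spin, then have each of them run the function on the way out:
 *
 *	struct lwkt_cpusync cs;
 *
 *	lwkt_cpusync_init(&cs, smp_active_mask, my_sync_func, my_arg);
 *	lwkt_cpusync_interlock(&cs);
 *		(cpus in the mask are now quiesced, spinning in remote2)
 *	lwkt_cpusync_deinterlock(&cs);
 *
 * lwkt_cpusync_simple() below wraps exactly this sequence.
 */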

void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg)
{
    struct lwkt_cpusync cs;

    lwkt_cpusync_init(&cs, mask, func, arg);
    lwkt_cpusync_interlock(&cs);
    lwkt_cpusync_deinterlock(&cs);
}

void
lwkt_cpusync_interlock(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;
    cpumask_t mask;

    /*
     * mask acknowledge (cs_mack): 0->mask for stage 1
     *
     * mack does not include the current cpu.
     */
    mask = cs->cs_mask;
    CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
    CPUMASK_ANDMASK(mask, smp_active_mask);
    CPUMASK_ASSZERO(cs->cs_mack);

    crit_enter_id("cpusync");
    if (CPUMASK_TESTNZERO(mask)) {
	DEBUG_PUSH_INFO("cpusync_interlock");
	++ipiq_stat(gd).ipiq_cscount;
	++gd->gd_curthread->td_cscount;
	lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs);
	logipiq2(sync_start, (long)CPUMASK_LOWMASK(mask));
	while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
	    lwkt_process_ipiq();
	    cpu_pause();
#ifdef _KERNEL_VIRTUAL
	    pthread_yield();
#endif
	}
	DEBUG_POP_INFO();
    }
}

/*
 * Interlocked cpus have executed remote1 and are polling in remote2.
 * To deinterlock we clear cs_mack and wait for the cpus to execute
 * the func and set their bit in cs_mack again.
 */
void
lwkt_cpusync_deinterlock(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;
    cpumask_t mask;

    /*
     * mask acknowledge (cs_mack): mack->0->mack for stage 2
     *
     * Clearing cpu bits for polling cpus in cs_mack will cause them to
     * execute stage 2, which executes the cs_func(cs_data) and then sets
     * their bit in cs_mack again.
     *
     * mack does not include the current cpu.
     */
    mask = cs->cs_mack;
    cpu_ccfence();
    CPUMASK_ASSZERO(cs->cs_mack);
    cpu_ccfence();
    if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
	cs->cs_func(cs->cs_data);
    if (CPUMASK_TESTNZERO(mask)) {
	DEBUG_PUSH_INFO("cpusync_deinterlock");
	while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
	    lwkt_process_ipiq();
	    cpu_pause();
#ifdef _KERNEL_VIRTUAL
	    pthread_yield();
#endif
	}
	DEBUG_POP_INFO();

	/*
	 * cpusyncq ipis may be left queued without the RQF flag set due to
	 * a non-zero td_cscount, so be sure to process any laggards after
	 * decrementing td_cscount.
	 */
	--gd->gd_curthread->td_cscount;
	lwkt_process_ipiq();
	logipiq2(sync_end, (long)CPUMASK_LOWMASK(mask));
    }
    crit_exit_id("cpusync");
}

/*
 * The quick version does not quiesce the target cpu(s) but instead executes
 * the function on the target cpu(s) and waits for all to acknowledge.  This
 * avoids spinning on the target cpus.
 *
 * This function is typically only used for kernel_pmap updates.  User pmaps
 * have to be quiesced.
 */
void
lwkt_cpusync_quick(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;
    cpumask_t mask;

    /*
     * stage-2 cs_mack only.
     */
    mask = cs->cs_mask;
    CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
    CPUMASK_ANDMASK(mask, smp_active_mask);
    CPUMASK_ASSZERO(cs->cs_mack);

    crit_enter_id("cpusync");
    if (CPUMASK_TESTNZERO(mask)) {
	DEBUG_PUSH_INFO("cpusync_interlock");
	++ipiq_stat(gd).ipiq_cscount;
	++gd->gd_curthread->td_cscount;
	lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote2, cs);
	logipiq2(sync_quick, (long)CPUMASK_LOWMASK(mask));
	while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
	    lwkt_process_ipiq();
	    cpu_pause();
#ifdef _KERNEL_VIRTUAL
	    pthread_yield();
#endif
	}

	/*
	 * cpusyncq ipis may be left queued without the RQF flag set due to
	 * a non-zero td_cscount, so be sure to process any laggards after
	 * decrementing td_cscount.
	 */
	DEBUG_POP_INFO();
	--gd->gd_curthread->td_cscount;
	lwkt_process_ipiq();
    }
    if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
	cs->cs_func(cs->cs_data);
    crit_exit_id("cpusync");
}

/*
 * Helper IPI remote messaging function.
 *
 * Called on the remote cpu when a new cpu synchronization request has been
 * sent to us.  Acknowledge stage 1 by setting our bit in cs_mack, then
 * enter the stage-2 polling loop (lwkt_cpusync_remote2).
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;

    ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
    lwkt_cpusync_remote2(cs);
}

/*
 * Helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't yet,
 * requeue our request so we spin on it.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;

    if (CPUMASK_TESTMASK(cs->cs_mack, gd->gd_cpumask) == 0) {
	if (cs->cs_func)
	    cs->cs_func(cs->cs_data);
	ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
	/* cs can be ripped out at this point */
    } else {
	lwkt_ipiq_t ip;
	int wi;

	cpu_pause();
#ifdef _KERNEL_VIRTUAL
	pthread_yield();
#endif
	cpu_lfence();

	/*
	 * Requeue our IPI to avoid a deep stack recursion.  If no other
	 * IPIs are pending we can just loop up, which should help VMs
	 * better-detect spin loops.
	 */
	ip = &gd->gd_cpusyncq;

	wi = ip->ip_windex & MAXCPUFIFO_MASK;
	ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
	ip->ip_info[wi].arg1 = cs;
	ip->ip_info[wi].arg2 = 0;
	cpu_sfence();
	KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
	++ip->ip_windex;
	if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
	    kprintf("cpu %d cm=%016jx %016jx f=%p\n",
		    gd->gd_cpuid,
		    (intmax_t)CPUMASK_LOWMASK(cs->cs_mask),
		    (intmax_t)CPUMASK_LOWMASK(cs->cs_mack),
		    cs->cs_func);
	}
    }
}

#define LWKT_IPIQ_NLATENCY	8
#define LWKT_IPIQ_NLATENCY_MASK	(LWKT_IPIQ_NLATENCY - 1)

struct lwkt_ipiq_latency_log {
    int		idx;	/* unmasked index */
    int		pad;
    uint64_t	latency[LWKT_IPIQ_NLATENCY];
};

static struct lwkt_ipiq_latency_log	lwkt_ipiq_latency_logs[MAXCPU];
static uint64_t save_tsc;

/*
 * IPI callback (already in a critical section)
 */
static void
lwkt_ipiq_latency_testfunc(void *arg __unused)
{
    uint64_t delta_tsc;
    struct globaldata *gd;
    struct lwkt_ipiq_latency_log *lat;

    /*
     * Get delta TSC (assume TSCs are synchronized) as quickly as
     * possible and then convert to nanoseconds.
     */
    delta_tsc = rdtsc_ordered() - save_tsc;
    delta_tsc = delta_tsc * 1000000000LU / tsc_frequency;

    /*
     * Record in our save array.
     */
    gd = mycpu;
    lat = &lwkt_ipiq_latency_logs[gd->gd_cpuid];
    lat->latency[lat->idx & LWKT_IPIQ_NLATENCY_MASK] = delta_tsc;
    ++lat->idx;
}

/*
 * Send an IPI from cpu0 to another cpu and measure the latency.
 *
 * NOTE: The machine must be idle for the test to run dependably, and it
 *	 is probably also a good idea not to be running powerd.
 *
 * NOTE: Callers should use 'usched :1 <command>' to lock themselves to
 *	 cpu 0.  See the 'ipitest' script in /usr/src/test/sysperf/ipitest.
 *	 A sysctl usage sketch appears at the end of this file.
 */
static int
lwkt_ipiq_latency_test(SYSCTL_HANDLER_ARGS)
{
    struct globaldata *gd;
    int cpu = 0, orig_cpu, error;

    error = sysctl_handle_int(oidp, &cpu, arg2, req);
    if (error || req->newptr == NULL)
	return error;

    if (cpu == 0)
	return 0;
    else if (cpu >= ncpus || cpu < 0)
	return EINVAL;

    orig_cpu = mycpuid;
    lwkt_migratecpu(0);

    gd = globaldata_find(cpu);

    save_tsc = rdtsc_ordered();
    lwkt_send_ipiq(gd, lwkt_ipiq_latency_testfunc, NULL);

    lwkt_migratecpu(orig_cpu);
    return 0;
}

SYSCTL_NODE(_debug, OID_AUTO, ipiq, CTLFLAG_RW, 0, "");
SYSCTL_PROC(_debug_ipiq, OID_AUTO, latency_test, CTLTYPE_INT | CTLFLAG_RW,
    NULL, 0, lwkt_ipiq_latency_test, "I",
    "ipi latency test, arg: remote cpuid");

static int
lwkt_ipiq_latency(SYSCTL_HANDLER_ARGS)
{
    struct lwkt_ipiq_latency_log *latency = arg1;
    uint64_t lat[LWKT_IPIQ_NLATENCY];
    int i;

    for (i = 0; i < LWKT_IPIQ_NLATENCY; ++i)
	lat[i] = latency->latency[i];

    return sysctl_handle_opaque(oidp, lat, sizeof(lat), req);
}

static void
lwkt_ipiq_latency_init(void *dummy __unused)
{
    int cpu;

    for (cpu = 0; cpu < ncpus; ++cpu) {
	char name[32];

	ksnprintf(name, sizeof(name), "latency%d", cpu);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_debug_ipiq),
	    OID_AUTO, name, CTLTYPE_OPAQUE | CTLFLAG_RD,
	    &lwkt_ipiq_latency_logs[cpu], 0, lwkt_ipiq_latency,
	    "LU", "8 latest ipi latency measurement results");
    }
}
SYSINIT(lwkt_ipiq_latency, SI_SUB_CONFIGURE, SI_ORDER_ANY,
    lwkt_ipiq_latency_init, NULL);
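
/*
 * Sysctl usage sketch (illustrative only; assumes the standard sysctl(8)
 * utility).  Trigger a measurement against cpu 2, then inspect the last
 * 8 nanosecond results recorded by cpu 2:
 *
 *	sysctl debug.ipiq.latency_test=2
 *	sysctl debug.ipiq.latency2
 */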