/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
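 *
 * The main entry points are lwkt_send_ipiq3() and its _passive, _nowait,
 * and _mask variants, which queue a (func, arg1, arg2) request on the
 * per-(source, target) cpu FIFO; lwkt_process_ipiq() and
 * lwkt_process_ipiq_frame(), which drain the FIFOs directed at the current
 * cpu; and the lwkt_cpusync_*() functions, which build cpu rendezvous
 * operations on top of the same queues.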
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/smp.h>
#include <machine/atomic.h>

#ifdef _KERNEL_VIRTUAL
#include <pthread.h>
#endif

struct ipiq_stats {
    __int64_t ipiq_count;       /* total calls to lwkt_send_ipiq*() */
    __int64_t ipiq_fifofull;    /* number of fifo full conditions detected */
    __int64_t ipiq_avoided;     /* interlock with target avoids cpu ipi */
    __int64_t ipiq_passive;     /* passive IPI messages */
    __int64_t ipiq_cscount;     /* number of cpu synchronizations */
} __cachealign;

static struct ipiq_stats ipiq_stats_percpu[MAXCPU];
#define ipiq_stat(gd)   ipiq_stats_percpu[(gd)->gd_cpuid]

static int ipiq_debug;          /* set to 1 for debug */
#ifdef PANIC_DEBUG
static int panic_ipiq_cpu = -1;
static int panic_ipiq_count = 100;
#endif

SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0,
    "");
#ifdef PANIC_DEBUG
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
#endif

#define IPIQ_STRING     "func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
#define IPIQ_ARGS       void *func, void *arg1, int arg2, int scpu, int dcpu

#if !defined(KTR_IPIQ)
#define KTR_IPIQ        KTR_ALL
#endif
KTR_INFO_MASTER(ipiq);
KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_quick, 9, "cpumask=%08lx", unsigned long mask);

#define logipiq(name, func, arg1, arg2, sgd, dgd)       \
        KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
#define logipiq2(name, arg)     \
        KTR_LOG(ipiq_ ## name, arg)

static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
                                  struct intrframe *frame);
static void lwkt_cpusync_remote1(lwkt_cpusync_t cs);
static void lwkt_cpusync_remote2(lwkt_cpusync_t cs);

#define IPIQ_SYSCTL(name)                               \
static int                                              \
sysctl_##name(SYSCTL_HANDLER_ARGS)                      \
{                                                       \
    __int64_t val = 0;                                  \
    int cpu, error;                                     \
                                                        \
    for (cpu = 0; cpu < ncpus; ++cpu)                   \
        val += ipiq_stats_percpu[cpu].name;             \
                                                        \
    error = sysctl_handle_quad(oidp, &val, 0, req);     \
    if (error || req->newptr == NULL)                   \
        return error;                                   \
                                                        \
    for (cpu = 0; cpu < ncpus; ++cpu)                   \
        ipiq_stats_percpu[cpu].name = val;              \
                                                        \
    return 0;                                           \
}

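/*
 * The IPIQ_SYSCTL() expansions below generate the handlers used by the
 * SYSCTL_PROC entries that follow.  A read returns the sum of the per-cpu
 * counters; a write stores the written value into every cpu's counter
 * (writing 0 resets the statistic).
 */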
IPIQ_SYSCTL(ipiq_count);
IPIQ_SYSCTL(ipiq_fifofull);
IPIQ_SYSCTL(ipiq_avoided);
IPIQ_SYSCTL(ipiq_passive);
IPIQ_SYSCTL(ipiq_cscount);

SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_count, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_count, "Q", "Number of IPI's sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_fifofull, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_fifofull, "Q",
    "Number of fifo full conditions detected");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_avoided, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_avoided, "Q",
    "Number of IPI's avoided by interlock with target cpu");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_passive, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_passive, "Q",
    "Number of passive IPI messages sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_cscount, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_cscount, "Q",
    "Number of cpu synchronizations");

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  Each FIFO is written only by its owning (source)
 * cpu and drained by the target cpu.
 *
 * If the FIFO fills up we have to enable interrupts to avoid an APIC
 * deadlock and process pending IPIQs while waiting for it to empty.
 * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * The actual hardware IPI is avoided if the target cpu is already processing
 * the queue from a prior IPI.  It is possible to pipeline IPI messages
 * very quickly between cpus due to the FIFO hysteresis.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
{
    lwkt_ipiq_t ip;
    int windex;
#ifdef _KERNEL_VIRTUAL
    int repeating = 0;
#endif
    struct globaldata *gd = mycpu;

    logipiq(send_norm, func, arg1, arg2, gd, target);

    if (target == gd) {
        func(arg1, arg2, NULL);
        logipiq(send_end, func, arg1, arg2, gd, target);
        return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
        panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_critcount);
    ++ipiq_stat(gd).ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * Do not allow the FIFO to become full.  Interrupts must be physically
     * enabled while we liveloop to avoid deadlocking the APIC.
     *
     * The target ipiq may have gotten filled up due to passive IPIs and thus
     * not be aware that its queue is too full, so be sure to issue an
     * ipiq interrupt to the target cpu.
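     *
     * The caller's interrupt enable state (eflags/rflags) is saved before
     * interrupts are enabled here and is restored once the FIFO has drained
     * below the hysteresis point.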
     */
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
#if defined(__i386__)
        unsigned int eflags = read_eflags();
#elif defined(__x86_64__)
        unsigned long rflags = read_rflags();
#endif

        cpu_enable_intr();
        ++ipiq_stat(gd).ipiq_fifofull;
        DEBUG_PUSH_INFO("send_ipiq3");
        while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
            if (atomic_poll_acquire_int(&target->gd_npoll)) {
                logipiq(cpu_send, func, arg1, arg2, gd, target);
                cpu_send_ipiq(target->gd_cpuid);
            }
            KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
            lwkt_process_ipiq();
            cpu_pause();
#ifdef _KERNEL_VIRTUAL
            if (repeating++ > 10)
                pthread_yield();
#endif
        }
        DEBUG_POP_INFO();
#if defined(__i386__)
        write_eflags(eflags);
#elif defined(__x86_64__)
        write_rflags(rflags);
#endif
    }

    /*
     * Queue the new message
     */
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_info[windex].func = func;
    ip->ip_info[windex].arg1 = arg1;
    ip->ip_info[windex].arg2 = arg2;
    cpu_sfence();
    ++ip->ip_windex;
    ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

    /*
     * signal the target cpu that there is work pending.
     */
    if (atomic_poll_acquire_int(&target->gd_npoll)) {
        logipiq(cpu_send, func, arg1, arg2, gd, target);
        cpu_send_ipiq(target->gd_cpuid);
    } else {
        ++ipiq_stat(gd).ipiq_avoided;
    }
    --gd->gd_intr_nesting_level;
    crit_exit();
    logipiq(send_end, func, arg1, arg2, gd, target);

    return(ip->ip_windex);
}

/*
 * Similar to lwkt_send_ipiq() but this function does not actually initiate
 * the IPI to the target cpu unless the FIFO has become too full, so it is
 * very fast.
 *
 * This function is used for non-critical IPI messages, such as memory
 * deallocations.  The queue will typically be flushed by the target cpu at
 * the next clock interrupt.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
                        void *arg1, int arg2)
{
    lwkt_ipiq_t ip;
    int windex;
#ifdef _KERNEL_VIRTUAL
    int repeating = 0;
#endif
    struct globaldata *gd = mycpu;

    KKASSERT(target != gd);
    crit_enter();
    ++gd->gd_intr_nesting_level;
    logipiq(send_pasv, func, arg1, arg2, gd, target);
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
        panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_critcount);
    ++ipiq_stat(gd).ipiq_count;
    ++ipiq_stat(gd).ipiq_passive;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * Do not allow the FIFO to become full.  Interrupts must be physically
     * enabled while we liveloop to avoid deadlocking the APIC.
     */
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
#if defined(__i386__)
        unsigned int eflags = read_eflags();
#elif defined(__x86_64__)
        unsigned long rflags = read_rflags();
#endif

        cpu_enable_intr();
        ++ipiq_stat(gd).ipiq_fifofull;
        DEBUG_PUSH_INFO("send_ipiq3_passive");
        while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
            if (atomic_poll_acquire_int(&target->gd_npoll)) {
                logipiq(cpu_send, func, arg1, arg2, gd, target);
                cpu_send_ipiq(target->gd_cpuid);
            }
            KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
            lwkt_process_ipiq();
            cpu_pause();
#ifdef _KERNEL_VIRTUAL
            if (repeating++ > 10)
                pthread_yield();
#endif
        }
        DEBUG_POP_INFO();
#if defined(__i386__)
        write_eflags(eflags);
#elif defined(__x86_64__)
        write_rflags(rflags);
#endif
    }

    /*
     * Queue the new message
     */
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_info[windex].func = func;
    ip->ip_info[windex].arg1 = arg1;
    ip->ip_info[windex].arg2 = arg2;
    cpu_sfence();
    ++ip->ip_windex;
    ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
    --gd->gd_intr_nesting_level;

    /*
     * Do not signal the target cpu, it will pick up the IPI when it next
     * polls (typically on the next tick).
     */
    crit_exit();
    logipiq(send_end, func, arg1, arg2, gd, target);

    return(ip->ip_windex);
}

/*
 * Send an IPI request without blocking; returns 0 on success, ENOENT on
 * failure.  The actual queueing of the hardware IPI may still force us
 * to spin and process incoming IPIs but that will eventually go away
 * when we've gotten rid of the other general IPIs.
 */
int
lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func,
                       void *arg1, int arg2)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    logipiq(send_nbio, func, arg1, arg2, gd, target);
    KKASSERT(curthread->td_critcount);
    if (target == gd) {
        func(arg1, arg2, NULL);
        logipiq(send_end, func, arg1, arg2, gd, target);
        return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
    ++ipiq_stat(gd).ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3) {
        logipiq(send_fail, func, arg1, arg2, gd, target);
        --gd->gd_intr_nesting_level;
        crit_exit();
        return(ENOENT);
    }
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_info[windex].func = func;
    ip->ip_info[windex].arg1 = arg1;
    ip->ip_info[windex].arg2 = arg2;
    cpu_sfence();
    ++ip->ip_windex;
    ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

    /*
     * This isn't a passive IPI, we still have to signal the target cpu.
     */
    if (atomic_poll_acquire_int(&target->gd_npoll)) {
        logipiq(cpu_send, func, arg1, arg2, gd, target);
        cpu_send_ipiq(target->gd_cpuid);
    } else {
        ++ipiq_stat(gd).ipiq_avoided;
    }
    --gd->gd_intr_nesting_level;
    crit_exit();

    logipiq(send_end, func, arg1, arg2, gd, target);
    return(0);
}

/*
 * Deprecated, used only by fast interrupt forwarding.
 */
int
lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
{
    return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
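 *
 * Returns the number of cpus the message was actually queued to.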
 */
int
lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2)
{
    int cpuid;
    int count = 0;

    CPUMASK_NANDMASK(mask, stopped_cpus);
    while (CPUMASK_TESTNZERO(mask)) {
        cpuid = BSFCPUMASK(mask);
        lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
        CPUMASK_NANDBIT(mask, cpuid);
        ++count;
    }
    return(count);
}

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    lwkt_ipiq_t ip;

    if (target != mycpu) {
        ip = &mycpu->gd_ipiq[target->gd_cpuid];
        if ((int)(ip->ip_xindex - seq) < 0) {
#if defined(__i386__)
            unsigned int eflags = read_eflags();
#elif defined(__x86_64__)
            unsigned long rflags = read_rflags();
#endif
            int64_t time_tgt = tsc_get_target(1000000000LL);
            int time_loops = 10;
            int benice = 0;
#ifdef _KERNEL_VIRTUAL
            int repeating = 0;
#endif

            cpu_enable_intr();
            DEBUG_PUSH_INFO("wait_ipiq");
            while ((int)(ip->ip_xindex - seq) < 0) {
                crit_enter();
                lwkt_process_ipiq();
                crit_exit();
#ifdef _KERNEL_VIRTUAL
                if (repeating++ > 10)
                    pthread_yield();
#endif

                /*
                 * IPIQs must be handled within 10 seconds and this code
                 * will warn after one second.
                 */
                if ((benice & 255) == 0 && tsc_test_target(time_tgt) > 0) {
                    kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
                            mycpu->gd_cpuid, target->gd_cpuid,
                            ip->ip_xindex - seq);
                    if (--time_loops == 0)
                        panic("LWKT_WAIT_IPIQ");
                    time_tgt = tsc_get_target(1000000000LL);
                }
                ++benice;

                /*
                 * xindex may be modified by another cpu, use a load fence
                 * to ensure that the loop does not use a speculative value
                 * (which may improve performance).
                 */
                cpu_pause();
                cpu_lfence();
            }
            DEBUG_POP_INFO();
#if defined(__i386__)
            write_eflags(eflags);
#elif defined(__x86_64__)
            write_rflags(rflags);
#endif
        }
    }
}

int
lwkt_seq_ipiq(globaldata_t target)
{
    lwkt_ipiq_t ip;

    ip = &mycpu->gd_ipiq[target->gd_cpuid];
    return(ip->ip_windex);
}

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_info[].func we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 *
 * When the current cpu is mastering a cpusync we do NOT internally loop
 * on the cpusyncq poll.  We also do not re-flag a pending ipi due to
 * the cpusyncq poll because this can cause doreti/splz to loop internally.
 * The cpusync master's own loop must be allowed to run to avoid a deadlock.
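 *
 * The frame variant, lwkt_process_ipiq_frame(), passes the interrupt frame
 * through to the functions it executes; lwkt_process_ipiq() passes NULL.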
 */
void
lwkt_process_ipiq(void)
{
    globaldata_t gd = mycpu;
    globaldata_t sgd;
    lwkt_ipiq_t ip;
    cpumask_t mask;
    int n;

    ++gd->gd_processing_ipiq;
again:
    cpu_lfence();
    mask = gd->gd_ipimask;
    ATOMIC_CPUMASK_NANDMASK(gd->gd_ipimask, mask);
    while (CPUMASK_TESTNZERO(mask)) {
        n = BSFCPUMASK(mask);
        if (n != gd->gd_cpuid) {
            sgd = globaldata_find(n);
            ip = sgd->gd_ipiq;
            if (ip != NULL) {
                while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
                    ;
            }
        }
        CPUMASK_NANDBIT(mask, n);
    }

    /*
     * Process pending cpusyncs.  If the current thread has an active
     * cpusync we only run the list once and do not re-flag, as the thread
     * itself is processing its interlock.
     */
    if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
        if (gd->gd_curthread->td_cscount == 0)
            goto again;
        /* need_ipiq(); do not reflag */
    }

    /*
     * Interlock to allow more IPI interrupts.  Recheck ipimask after
     * releasing gd_npoll.
     */
    if (CPUMASK_TESTNZERO(gd->gd_ipimask))
        goto again;
    atomic_poll_release_int(&gd->gd_npoll);
    cpu_mfence();
    if (CPUMASK_TESTNZERO(gd->gd_ipimask))
        goto again;
    --gd->gd_processing_ipiq;
}

void
lwkt_process_ipiq_frame(struct intrframe *frame)
{
    globaldata_t gd = mycpu;
    globaldata_t sgd;
    lwkt_ipiq_t ip;
    cpumask_t mask;
    int n;

again:
    cpu_lfence();
    mask = gd->gd_ipimask;
    ATOMIC_CPUMASK_NANDMASK(gd->gd_ipimask, mask);
    while (CPUMASK_TESTNZERO(mask)) {
        n = BSFCPUMASK(mask);
        if (n != gd->gd_cpuid) {
            sgd = globaldata_find(n);
            ip = sgd->gd_ipiq;
            if (ip != NULL) {
                while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], frame))
                    ;
            }
        }
        CPUMASK_NANDBIT(mask, n);
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
        if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) {
            if (gd->gd_curthread->td_cscount == 0)
                goto again;
            /* need_ipiq(); do not reflag */
        }
    }

    /*
     * Interlock to allow more IPI interrupts.  Recheck ipimask after
     * releasing gd_npoll.
     */
    if (CPUMASK_TESTNZERO(gd->gd_ipimask))
        goto again;
    atomic_poll_release_int(&gd->gd_npoll);
    cpu_mfence();
    if (CPUMASK_TESTNZERO(gd->gd_ipimask))
        goto again;
}

#if 0
static int iqticks[SMP_MAXCPU];
static int iqcount[SMP_MAXCPU];
#endif
#if 0
static int iqterm[SMP_MAXCPU];
#endif

static int
lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
                       struct intrframe *frame)
{
    globaldata_t mygd = mycpu;
    int ri;
    int wi;
    ipifunc3_t copy_func;
    void *copy_arg1;
    int copy_arg2;

#if 0
    if (iqticks[mygd->gd_cpuid] != ticks) {
        iqticks[mygd->gd_cpuid] = ticks;
        iqcount[mygd->gd_cpuid] = 0;
    }
    if (++iqcount[mygd->gd_cpuid] > 3000000) {
        kprintf("cpu %d ipiq maxed cscount %d spin %d\n",
                mygd->gd_cpuid,
                mygd->gd_curthread->td_cscount,
                mygd->gd_spinlocks);
        iqcount[mygd->gd_cpuid] = 0;
#if 0
        if (++iqterm[mygd->gd_cpuid] > 10)
            panic("cpu %d ipiq maxed", mygd->gd_cpuid);
#endif
        int i;
        for (i = 0; i < ncpus; ++i) {
            if (globaldata_find(i)->gd_infomsg)
                kprintf(" %s", globaldata_find(i)->gd_infomsg);
        }
        kprintf("\n");
    }
#endif

    /*
     * Clear the originating core from our ipimask; we will process all
     * incoming messages.
     *
     * Obtain the current write index, which is modified by a remote cpu.
     * Issue a load fence to prevent speculative reads of e.g. data written
     * by the other cpu prior to it updating the index.
     */
    KKASSERT(curthread->td_critcount);
    wi = ip->ip_windex;
    cpu_lfence();
    ++mygd->gd_intr_nesting_level;

    /*
     * NOTE: xindex is only updated after we are sure the function has
     *       finished execution.  Beware lwkt_process_ipiq() reentrancy!
     *       The function may send an IPI which may block/drain.
     *
     * NOTE: Due to additional IPI operations that the callback function
     *       may make, it is possible for both rindex and windex to advance
     *       and thus for rindex to advance past our cached windex.
     *
     * NOTE: A load fence is required to prevent speculative loads prior
     *       to the loading of ip_rindex.  Even though stores might be
     *       ordered, loads are probably not.  A memory fence is required
     *       to prevent reordering of the loads after the ip_rindex update.
     *
     * NOTE: Single pass only.  Returns non-zero if the queue is not empty
     *       on return.
     */
    while (wi - (ri = ip->ip_rindex) > 0) {
        ri &= MAXCPUFIFO_MASK;
        cpu_lfence();
        copy_func = ip->ip_info[ri].func;
        copy_arg1 = ip->ip_info[ri].arg1;
        copy_arg2 = ip->ip_info[ri].arg2;
        cpu_mfence();
        ++ip->ip_rindex;
        KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) ==
                 ((ri + 1) & MAXCPUFIFO_MASK));
        logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
#ifdef INVARIANTS
        if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) {
            kprintf("cpu %d ipifunc %p %p %d (frame %p)\n",
                    mycpu->gd_cpuid,
                    copy_func, copy_arg1, copy_arg2,
#if defined(__i386__)
                    (frame ? (void *)frame->if_eip : NULL));
#elif defined(__x86_64__)
                    (frame ? (void *)frame->if_rip : NULL));
#else
                    NULL);
#endif
        }
#endif
        copy_func(copy_arg1, copy_arg2, frame);
        cpu_sfence();
        ip->ip_xindex = ip->ip_rindex;

#ifdef PANIC_DEBUG
        /*
         * Simulate panics during the processing of an IPI
         */
        if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
            if (--panic_ipiq_count == 0) {
#ifdef DDB
                Debugger("PANIC_DEBUG");
#else
                panic("PANIC_DEBUG");
#endif
            }
        }
#endif
    }
    --mygd->gd_intr_nesting_level;

    /*
     * Return non-zero if there is still more in the queue.
     */
    cpu_lfence();
    return (ip->ip_rindex != ip->ip_windex);
}

static void
lwkt_sync_ipiq(void *arg)
{
    volatile cpumask_t *cpumask = arg;

    ATOMIC_CPUMASK_NANDBIT(*cpumask, mycpu->gd_cpuid);
    if (CPUMASK_TESTZERO(*cpumask))
        wakeup(cpumask);
}

void
lwkt_synchronize_ipiqs(const char *wmesg)
{
    volatile cpumask_t other_cpumask;

    other_cpumask = smp_active_mask;
    CPUMASK_ANDMASK(other_cpumask, mycpu->gd_other_cpus);
    lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq,
                        __DEVOLATILE(void *, &other_cpumask));

    while (CPUMASK_TESTNZERO(other_cpumask)) {
        tsleep_interlock(&other_cpumask, 0);
        if (CPUMASK_TESTNZERO(other_cpumask))
            tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
    }
}

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_interlock()     - Place specified cpus in a quiescent state.
 *                                The current cpu is placed in a hard critical
 *                                section.
 *
 * lwkt_cpusync_deinterlock()   - Execute cs_func on specified cpus, including
 *                                current cpu if specified, then return.
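 *
 * lwkt_cpusync_simple()        - Convenience wrapper which initializes a
 *                                cpusync structure and runs the interlock
 *                                and deinterlock stages back to back.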
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg)
{
    struct lwkt_cpusync cs;

    lwkt_cpusync_init(&cs, mask, func, arg);
    lwkt_cpusync_interlock(&cs);
    lwkt_cpusync_deinterlock(&cs);
}

void
lwkt_cpusync_interlock(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;
    cpumask_t mask;

    /*
     * mask acknowledge (cs_mack): 0->mask for stage 1
     *
     * mack does not include the current cpu.
     */
    mask = cs->cs_mask;
    CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
    CPUMASK_ANDMASK(mask, smp_active_mask);
    CPUMASK_ASSZERO(cs->cs_mack);

    crit_enter_id("cpusync");
    if (CPUMASK_TESTNZERO(mask)) {
        DEBUG_PUSH_INFO("cpusync_interlock");
        ++ipiq_stat(gd).ipiq_cscount;
        ++gd->gd_curthread->td_cscount;
        lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs);
        logipiq2(sync_start, (long)CPUMASK_LOWMASK(mask));
        while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
            lwkt_process_ipiq();
            cpu_pause();
#ifdef _KERNEL_VIRTUAL
            pthread_yield();
#endif
        }
        DEBUG_POP_INFO();
    }
}

/*
 * Interlocked cpus have executed remote1 and are polling in remote2.
 * To deinterlock we clear cs_mack and wait for the cpus to execute
 * the func and set their bit in cs_mack again.
 */
void
lwkt_cpusync_deinterlock(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;
    cpumask_t mask;

    /*
     * mask acknowledge (cs_mack): mack->0->mack for stage 2
     *
     * Clearing cpu bits for polling cpus in cs_mack will cause them to
     * execute stage 2, which executes the cs_func(cs_data) and then sets
     * their bit in cs_mack again.
     *
     * mack does not include the current cpu.
     */
    mask = cs->cs_mack;
    cpu_ccfence();
    CPUMASK_ASSZERO(cs->cs_mack);
    cpu_ccfence();
    if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
        cs->cs_func(cs->cs_data);
    if (CPUMASK_TESTNZERO(mask)) {
        DEBUG_PUSH_INFO("cpusync_deinterlock");
        while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
            lwkt_process_ipiq();
            cpu_pause();
#ifdef _KERNEL_VIRTUAL
            pthread_yield();
#endif
        }
        DEBUG_POP_INFO();
        /*
         * cpusyncq ipis may be left queued without the RQF flag set due to
         * a non-zero td_cscount, so be sure to process any laggards after
         * decrementing td_cscount.
         */
        --gd->gd_curthread->td_cscount;
        lwkt_process_ipiq();
        logipiq2(sync_end, (long)CPUMASK_LOWMASK(mask));
    }
    crit_exit_id("cpusync");
}

/*
 * The quick version does not quiesce the target cpu(s) but instead executes
 * the function on the target cpu(s) and waits for all to acknowledge.  This
 * avoids spinning on the target cpus.
 *
 * This function is typically only used for kernel_pmap updates.  User pmaps
 * have to be quiesced.
 */
void
lwkt_cpusync_quick(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;
    cpumask_t mask;

    /*
     * stage-2 cs_mack only.
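     *
     * The targets are sent lwkt_cpusync_remote2() directly, so each one
     * runs cs_func() and sets its bit in cs_mack without first parking in
     * the stage-1 interlock loop.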
     */
    mask = cs->cs_mask;
    CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
    CPUMASK_ANDMASK(mask, smp_active_mask);
    CPUMASK_ASSZERO(cs->cs_mack);

    crit_enter_id("cpusync");
    if (CPUMASK_TESTNZERO(mask)) {
        DEBUG_PUSH_INFO("cpusync_interlock");
        ++ipiq_stat(gd).ipiq_cscount;
        ++gd->gd_curthread->td_cscount;
        lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote2, cs);
        logipiq2(sync_quick, (long)CPUMASK_LOWMASK(mask));
        while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
            lwkt_process_ipiq();
            cpu_pause();
#ifdef _KERNEL_VIRTUAL
            pthread_yield();
#endif
        }

        /*
         * cpusyncq ipis may be left queued without the RQF flag set due to
         * a non-zero td_cscount, so be sure to process any laggards after
         * decrementing td_cscount.
         */
        DEBUG_POP_INFO();
        --gd->gd_curthread->td_cscount;
        lwkt_process_ipiq();
    }
    if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
        cs->cs_func(cs->cs_data);
    crit_exit_id("cpusync");
}

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Set our bit in cs_mack to acknowledge the interlock, then
 * requeue the request so we spin on it.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;

    ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
    lwkt_cpusync_remote2(cs);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t cs)
{
    globaldata_t gd = mycpu;

    if (CPUMASK_TESTMASK(cs->cs_mack, gd->gd_cpumask) == 0) {
        if (cs->cs_func)
            cs->cs_func(cs->cs_data);
        ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
        /* cs can be ripped out at this point */
    } else {
        lwkt_ipiq_t ip;
        int wi;

        cpu_pause();
#ifdef _KERNEL_VIRTUAL
        pthread_yield();
#endif
        cpu_lfence();

        /*
         * Requeue our IPI to avoid a deep stack recursion.  If no other
         * IPIs are pending we can just loop up, which should help VMs
         * better-detect spin loops.
         */
        ip = &gd->gd_cpusyncq;
#if 0
        if (ip->ip_rindex == ip->ip_windex) {
            __asm __volatile("cli");
            if (ip->ip_rindex == ip->ip_windex) {
                __asm __volatile("sti; hlt");
            } else {
                __asm __volatile("sti");
            }
        }
#endif

        wi = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
        ip->ip_info[wi].arg1 = cs;
        ip->ip_info[wi].arg2 = 0;
        cpu_sfence();
        KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
        ++ip->ip_windex;
        if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
            kprintf("cpu %d cm=%016jx %016jx f=%p\n",
                    gd->gd_cpuid,
                    (intmax_t)CPUMASK_LOWMASK(cs->cs_mask),
                    (intmax_t)CPUMASK_LOWMASK(cs->cs_mack),
                    cs->cs_func);
        }
    }
}
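
#if 0
/*
 * Illustrative usage sketch (not compiled).  The callback and usage
 * function names below are hypothetical and exist only for this example;
 * the APIs they call are the ones defined above.
 */
static void
example_ipi_callback(void *arg1, int arg2, struct intrframe *frame)
{
    /* runs on the target cpu from its IPI processing path */
}

static void
example_sync_callback(void *data)
{
    /* runs on every cpu named in the cpusync mask */
}

static void
example_usage(globaldata_t target)
{
    /*
     * Queue example_ipi_callback on the target cpu's ipiq and signal the
     * target with a hardware IPI unless it is already draining its queue.
     */
    lwkt_send_ipiq3(target, example_ipi_callback, NULL, 0);

    /*
     * Quiesce all other active cpus, run example_sync_callback on every
     * cpu in the mask (including this one), then release them.
     */
    lwkt_cpusync_simple(smp_active_mask, example_sync_callback, NULL);
}
#endif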