1 /* 2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * This module implements IPI message queueing and the MI portion of IPI 37 * message processing. 38 */ 39 40 #include "opt_ddb.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/kernel.h> 45 #include <sys/proc.h> 46 #include <sys/rtprio.h> 47 #include <sys/queue.h> 48 #include <sys/thread2.h> 49 #include <sys/sysctl.h> 50 #include <sys/ktr.h> 51 #include <sys/kthread.h> 52 #include <machine/cpu.h> 53 #include <sys/lock.h> 54 55 #include <vm/vm.h> 56 #include <vm/vm_param.h> 57 #include <vm/vm_kern.h> 58 #include <vm/vm_object.h> 59 #include <vm/vm_page.h> 60 #include <vm/vm_map.h> 61 #include <vm/vm_pager.h> 62 #include <vm/vm_extern.h> 63 #include <vm/vm_zone.h> 64 65 #include <machine/stdarg.h> 66 #include <machine/smp.h> 67 #include <machine/atomic.h> 68 69 #ifdef _KERNEL_VIRTUAL 70 #include <pthread.h> 71 #endif 72 73 struct ipiq_stats { 74 __int64_t ipiq_count; /* total calls to lwkt_send_ipiq*() */ 75 __int64_t ipiq_fifofull; /* number of fifo full conditions detected */ 76 __int64_t ipiq_avoided; /* interlock with target avoids cpu ipi */ 77 __int64_t ipiq_passive; /* passive IPI messages */ 78 __int64_t ipiq_cscount; /* number of cpu synchronizations */ 79 } __cachealign; 80 81 static struct ipiq_stats ipiq_stats_percpu[MAXCPU]; 82 #define ipiq_stat(gd) ipiq_stats_percpu[(gd)->gd_cpuid] 83 84 static int ipiq_debug; /* set to 1 for debug */ 85 #ifdef PANIC_DEBUG 86 static int panic_ipiq_cpu = -1; 87 static int panic_ipiq_count = 100; 88 #endif 89 90 SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0, 91 ""); 92 #ifdef PANIC_DEBUG 93 SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, ""); 94 SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, ""); 95 #endif 96 97 #define IPIQ_STRING "func=%p arg1=%p arg2=%d scpu=%d dcpu=%d" 98 #define IPIQ_ARGS void *func, void *arg1, int arg2, int scpu, int dcpu 99 100 #if !defined(KTR_IPIQ) 101 #define KTR_IPIQ KTR_ALL 102 #endif 103 KTR_INFO_MASTER(ipiq); 104 KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARGS); 105 KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARGS); 106 KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARGS); 107 KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARGS); 108 KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARGS); 109 KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08lx", unsigned long mask); 110 KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08lx", unsigned long mask); 111 KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARGS); 112 KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARGS); 113 114 #define logipiq(name, func, arg1, arg2, sgd, dgd) \ 115 KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid) 116 #define logipiq2(name, arg) \ 117 KTR_LOG(ipiq_ ## name, arg) 118 119 static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip, 120 struct intrframe *frame); 121 static void lwkt_cpusync_remote1(lwkt_cpusync_t cs); 122 static void lwkt_cpusync_remote2(lwkt_cpusync_t cs); 123 124 #define IPIQ_SYSCTL(name) \ 125 static int \ 126 sysctl_##name(SYSCTL_HANDLER_ARGS) \ 127 { \ 128 __int64_t val = 0; \ 129 int cpu, error; \ 130 \ 131 for (cpu = 0; cpu < ncpus; ++cpu) \ 132 val += ipiq_stats_percpu[cpu].name; \ 133 \ 134 error = sysctl_handle_quad(oidp, &val, 0, req); \ 135 if (error || req->newptr == NULL) \ 136 return error; \ 137 \ 138 for (cpu = 0; cpu < ncpus; ++cpu) \ 139 ipiq_stats_percpu[cpu].name = val; \ 140 \ 141 return 0; \ 142 } 143 144 IPIQ_SYSCTL(ipiq_count); 145 IPIQ_SYSCTL(ipiq_fifofull); 146 IPIQ_SYSCTL(ipiq_avoided); 147 IPIQ_SYSCTL(ipiq_passive); 148 IPIQ_SYSCTL(ipiq_cscount); 149 150 SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_count, (CTLTYPE_QUAD | CTLFLAG_RW), 151 0, 0, sysctl_ipiq_count, "Q", "Number of IPI's sent"); 152 SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_fifofull, (CTLTYPE_QUAD | CTLFLAG_RW), 153 0, 0, sysctl_ipiq_fifofull, "Q", 154 "Number of fifo full conditions detected"); 155 SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_avoided, (CTLTYPE_QUAD | CTLFLAG_RW), 156 0, 0, sysctl_ipiq_avoided, "Q", 157 "Number of IPI's avoided by interlock with target cpu"); 158 SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_passive, (CTLTYPE_QUAD | CTLFLAG_RW), 159 0, 0, sysctl_ipiq_passive, "Q", 160 "Number of passive IPI messages sent"); 161 SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_cscount, (CTLTYPE_QUAD | CTLFLAG_RW), 162 0, 0, sysctl_ipiq_cscount, "Q", 163 "Number of cpu synchronizations"); 164 165 /* 166 * Send a function execution request to another cpu. The request is queued 167 * on the cpu<->cpu ipiq matrix. Each cpu owns a unique ipiq FIFO for every 168 * possible target cpu. The FIFO can be written. 169 * 170 * If the FIFO fills up we have to enable interrupts to avoid an APIC 171 * deadlock and process pending IPIQs while waiting for it to empty. 172 * Otherwise we may soft-deadlock with another cpu whos FIFO is also full. 173 * 174 * We can safely bump gd_intr_nesting_level because our crit_exit() at the 175 * end will take care of any pending interrupts. 176 * 177 * The actual hardware IPI is avoided if the target cpu is already processing 178 * the queue from a prior IPI. It is possible to pipeline IPI messages 179 * very quickly between cpus due to the FIFO hysteresis. 180 * 181 * Need not be called from a critical section. 182 */ 183 int 184 lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2) 185 { 186 lwkt_ipiq_t ip; 187 int windex; 188 #ifdef _KERNEL_VIRTUAL 189 int repeating = 0; 190 #endif 191 struct globaldata *gd = mycpu; 192 193 logipiq(send_norm, func, arg1, arg2, gd, target); 194 195 if (target == gd) { 196 func(arg1, arg2, NULL); 197 logipiq(send_end, func, arg1, arg2, gd, target); 198 return(0); 199 } 200 crit_enter(); 201 ++gd->gd_intr_nesting_level; 202 #ifdef INVARIANTS 203 if (gd->gd_intr_nesting_level > 20) 204 panic("lwkt_send_ipiq: TOO HEAVILY NESTED!"); 205 #endif 206 KKASSERT(curthread->td_critcount); 207 ++ipiq_stat(gd).ipiq_count; 208 ip = &gd->gd_ipiq[target->gd_cpuid]; 209 210 /* 211 * Do not allow the FIFO to become full. Interrupts must be physically 212 * enabled while we liveloop to avoid deadlocking the APIC. 213 * 214 * The target ipiq may have gotten filled up due to passive IPIs and thus 215 * not be aware that its queue is too full, so be sure to issue an 216 * ipiq interrupt to the target cpu. 217 */ 218 if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) { 219 #if defined(__i386__) 220 unsigned int eflags = read_eflags(); 221 #elif defined(__x86_64__) 222 unsigned long rflags = read_rflags(); 223 #endif 224 225 cpu_enable_intr(); 226 ++ipiq_stat(gd).ipiq_fifofull; 227 DEBUG_PUSH_INFO("send_ipiq3"); 228 while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) { 229 if (atomic_poll_acquire_int(&target->gd_npoll)) { 230 logipiq(cpu_send, func, arg1, arg2, gd, target); 231 cpu_send_ipiq(target->gd_cpuid); 232 } 233 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1); 234 lwkt_process_ipiq(); 235 cpu_pause(); 236 #ifdef _KERNEL_VIRTUAL 237 if (repeating++ > 10) 238 pthread_yield(); 239 #endif 240 } 241 DEBUG_POP_INFO(); 242 #if defined(__i386__) 243 write_eflags(eflags); 244 #elif defined(__x86_64__) 245 write_rflags(rflags); 246 #endif 247 } 248 249 /* 250 * Queue the new message 251 */ 252 windex = ip->ip_windex & MAXCPUFIFO_MASK; 253 ip->ip_info[windex].func = func; 254 ip->ip_info[windex].arg1 = arg1; 255 ip->ip_info[windex].arg2 = arg2; 256 cpu_sfence(); 257 ++ip->ip_windex; 258 atomic_set_cpumask(&target->gd_ipimask, gd->gd_cpumask); 259 260 /* 261 * signal the target cpu that there is work pending. 262 */ 263 if (atomic_poll_acquire_int(&target->gd_npoll)) { 264 logipiq(cpu_send, func, arg1, arg2, gd, target); 265 cpu_send_ipiq(target->gd_cpuid); 266 } else { 267 ++ipiq_stat(gd).ipiq_avoided; 268 } 269 --gd->gd_intr_nesting_level; 270 crit_exit(); 271 logipiq(send_end, func, arg1, arg2, gd, target); 272 273 return(ip->ip_windex); 274 } 275 276 /* 277 * Similar to lwkt_send_ipiq() but this function does not actually initiate 278 * the IPI to the target cpu unless the FIFO has become too full, so it is 279 * very fast. 280 * 281 * This function is used for non-critical IPI messages, such as memory 282 * deallocations. The queue will typically be flushed by the target cpu at 283 * the next clock interrupt. 284 * 285 * Need not be called from a critical section. 286 */ 287 int 288 lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func, 289 void *arg1, int arg2) 290 { 291 lwkt_ipiq_t ip; 292 int windex; 293 #ifdef _KERNEL_VIRTUAL 294 int repeating = 0; 295 #endif 296 struct globaldata *gd = mycpu; 297 298 KKASSERT(target != gd); 299 crit_enter(); 300 ++gd->gd_intr_nesting_level; 301 logipiq(send_pasv, func, arg1, arg2, gd, target); 302 #ifdef INVARIANTS 303 if (gd->gd_intr_nesting_level > 20) 304 panic("lwkt_send_ipiq: TOO HEAVILY NESTED!"); 305 #endif 306 KKASSERT(curthread->td_critcount); 307 ++ipiq_stat(gd).ipiq_count; 308 ++ipiq_stat(gd).ipiq_passive; 309 ip = &gd->gd_ipiq[target->gd_cpuid]; 310 311 /* 312 * Do not allow the FIFO to become full. Interrupts must be physically 313 * enabled while we liveloop to avoid deadlocking the APIC. 314 */ 315 if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) { 316 #if defined(__i386__) 317 unsigned int eflags = read_eflags(); 318 #elif defined(__x86_64__) 319 unsigned long rflags = read_rflags(); 320 #endif 321 322 cpu_enable_intr(); 323 ++ipiq_stat(gd).ipiq_fifofull; 324 DEBUG_PUSH_INFO("send_ipiq3_passive"); 325 while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) { 326 if (atomic_poll_acquire_int(&target->gd_npoll)) { 327 logipiq(cpu_send, func, arg1, arg2, gd, target); 328 cpu_send_ipiq(target->gd_cpuid); 329 } 330 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1); 331 lwkt_process_ipiq(); 332 cpu_pause(); 333 #ifdef _KERNEL_VIRTUAL 334 if (repeating++ > 10) 335 pthread_yield(); 336 #endif 337 } 338 DEBUG_POP_INFO(); 339 #if defined(__i386__) 340 write_eflags(eflags); 341 #elif defined(__x86_64__) 342 write_rflags(rflags); 343 #endif 344 } 345 346 /* 347 * Queue the new message 348 */ 349 windex = ip->ip_windex & MAXCPUFIFO_MASK; 350 ip->ip_info[windex].func = func; 351 ip->ip_info[windex].arg1 = arg1; 352 ip->ip_info[windex].arg2 = arg2; 353 cpu_sfence(); 354 ++ip->ip_windex; 355 atomic_set_cpumask(&target->gd_ipimask, gd->gd_cpumask); 356 --gd->gd_intr_nesting_level; 357 358 /* 359 * Do not signal the target cpu, it will pick up the IPI when it next 360 * polls (typically on the next tick). 361 */ 362 crit_exit(); 363 logipiq(send_end, func, arg1, arg2, gd, target); 364 365 return(ip->ip_windex); 366 } 367 368 /* 369 * Send an IPI request without blocking, return 0 on success, ENOENT on 370 * failure. The actual queueing of the hardware IPI may still force us 371 * to spin and process incoming IPIs but that will eventually go away 372 * when we've gotten rid of the other general IPIs. 373 */ 374 int 375 lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func, 376 void *arg1, int arg2) 377 { 378 lwkt_ipiq_t ip; 379 int windex; 380 struct globaldata *gd = mycpu; 381 382 logipiq(send_nbio, func, arg1, arg2, gd, target); 383 KKASSERT(curthread->td_critcount); 384 if (target == gd) { 385 func(arg1, arg2, NULL); 386 logipiq(send_end, func, arg1, arg2, gd, target); 387 return(0); 388 } 389 crit_enter(); 390 ++gd->gd_intr_nesting_level; 391 ++ipiq_stat(gd).ipiq_count; 392 ip = &gd->gd_ipiq[target->gd_cpuid]; 393 394 if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3) { 395 logipiq(send_fail, func, arg1, arg2, gd, target); 396 --gd->gd_intr_nesting_level; 397 crit_exit(); 398 return(ENOENT); 399 } 400 windex = ip->ip_windex & MAXCPUFIFO_MASK; 401 ip->ip_info[windex].func = func; 402 ip->ip_info[windex].arg1 = arg1; 403 ip->ip_info[windex].arg2 = arg2; 404 cpu_sfence(); 405 ++ip->ip_windex; 406 atomic_set_cpumask(&target->gd_ipimask, gd->gd_cpumask); 407 408 /* 409 * This isn't a passive IPI, we still have to signal the target cpu. 410 */ 411 if (atomic_poll_acquire_int(&target->gd_npoll)) { 412 logipiq(cpu_send, func, arg1, arg2, gd, target); 413 cpu_send_ipiq(target->gd_cpuid); 414 } else { 415 ++ipiq_stat(gd).ipiq_avoided; 416 } 417 --gd->gd_intr_nesting_level; 418 crit_exit(); 419 420 logipiq(send_end, func, arg1, arg2, gd, target); 421 return(0); 422 } 423 424 /* 425 * deprecated, used only by fast int forwarding. 426 */ 427 int 428 lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2) 429 { 430 return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2)); 431 } 432 433 /* 434 * Send a message to several target cpus. Typically used for scheduling. 435 * The message will not be sent to stopped cpus. 436 */ 437 int 438 lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2) 439 { 440 int cpuid; 441 int count = 0; 442 443 mask &= ~stopped_cpus; 444 while (mask) { 445 cpuid = BSFCPUMASK(mask); 446 lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2); 447 mask &= ~CPUMASK(cpuid); 448 ++count; 449 } 450 return(count); 451 } 452 453 /* 454 * Wait for the remote cpu to finish processing a function. 455 * 456 * YYY we have to enable interrupts and process the IPIQ while waiting 457 * for it to empty or we may deadlock with another cpu. Create a CPU_*() 458 * function to do this! YYY we really should 'block' here. 459 * 460 * MUST be called from a critical section. This routine may be called 461 * from an interrupt (for example, if an interrupt wakes a foreign thread 462 * up). 463 */ 464 void 465 lwkt_wait_ipiq(globaldata_t target, int seq) 466 { 467 lwkt_ipiq_t ip; 468 469 if (target != mycpu) { 470 ip = &mycpu->gd_ipiq[target->gd_cpuid]; 471 if ((int)(ip->ip_xindex - seq) < 0) { 472 #if defined(__i386__) 473 unsigned int eflags = read_eflags(); 474 #elif defined(__x86_64__) 475 unsigned long rflags = read_rflags(); 476 #endif 477 int64_t time_tgt = tsc_get_target(1000000000LL); 478 int time_loops = 10; 479 int benice = 0; 480 #ifdef _KERNEL_VIRTUAL 481 int repeating = 0; 482 #endif 483 484 cpu_enable_intr(); 485 DEBUG_PUSH_INFO("wait_ipiq"); 486 while ((int)(ip->ip_xindex - seq) < 0) { 487 crit_enter(); 488 lwkt_process_ipiq(); 489 crit_exit(); 490 #ifdef _KERNEL_VIRTUAL 491 if (repeating++ > 10) 492 pthread_yield(); 493 #endif 494 495 /* 496 * IPIQs must be handled within 10 seconds and this code 497 * will warn after one second. 498 */ 499 if ((benice & 255) == 0 && tsc_test_target(time_tgt) > 0) { 500 kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", 501 mycpu->gd_cpuid, target->gd_cpuid, 502 ip->ip_xindex - seq); 503 if (--time_loops == 0) 504 panic("LWKT_WAIT_IPIQ"); 505 time_tgt = tsc_get_target(1000000000LL); 506 } 507 ++benice; 508 509 /* 510 * xindex may be modified by another cpu, use a load fence 511 * to ensure that the loop does not use a speculative value 512 * (which may improve performance). 513 */ 514 cpu_lfence(); 515 } 516 DEBUG_POP_INFO(); 517 #if defined(__i386__) 518 write_eflags(eflags); 519 #elif defined(__x86_64__) 520 write_rflags(rflags); 521 #endif 522 } 523 } 524 } 525 526 int 527 lwkt_seq_ipiq(globaldata_t target) 528 { 529 lwkt_ipiq_t ip; 530 531 ip = &mycpu->gd_ipiq[target->gd_cpuid]; 532 return(ip->ip_windex); 533 } 534 535 /* 536 * Called from IPI interrupt (like a fast interrupt), which has placed 537 * us in a critical section. The MP lock may or may not be held. 538 * May also be called from doreti or splz, or be reentrantly called 539 * indirectly through the ip_info[].func we run. 540 * 541 * There are two versions, one where no interrupt frame is available (when 542 * called from the send code and from splz, and one where an interrupt 543 * frame is available. 544 * 545 * When the current cpu is mastering a cpusync we do NOT internally loop 546 * on the cpusyncq poll. We also do not re-flag a pending ipi due to 547 * the cpusyncq poll because this can cause doreti/splz to loop internally. 548 * The cpusync master's own loop must be allowed to run to avoid a deadlock. 549 */ 550 void 551 lwkt_process_ipiq(void) 552 { 553 globaldata_t gd = mycpu; 554 globaldata_t sgd; 555 lwkt_ipiq_t ip; 556 cpumask_t mask; 557 int n; 558 559 ++gd->gd_processing_ipiq; 560 again: 561 cpu_lfence(); 562 mask = gd->gd_ipimask; 563 atomic_clear_cpumask(&gd->gd_ipimask, mask); 564 while (mask) { 565 n = BSFCPUMASK(mask); 566 if (n != gd->gd_cpuid) { 567 sgd = globaldata_find(n); 568 ip = sgd->gd_ipiq; 569 if (ip != NULL) { 570 while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL)) 571 ; 572 } 573 } 574 mask &= ~CPUMASK(n); 575 } 576 577 /* 578 * Process pending cpusyncs. If the current thread has a cpusync 579 * active cpusync we only run the list once and do not re-flag 580 * as the thread itself is processing its interlock. 581 */ 582 if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) { 583 if (gd->gd_curthread->td_cscount == 0) 584 goto again; 585 /* need_ipiq(); do not reflag */ 586 } 587 588 /* 589 * Interlock to allow more IPI interrupts. Recheck ipimask after 590 * releasing gd_npoll. 591 */ 592 if (gd->gd_ipimask) 593 goto again; 594 atomic_poll_release_int(&gd->gd_npoll); 595 cpu_mfence(); 596 if (gd->gd_ipimask) 597 goto again; 598 --gd->gd_processing_ipiq; 599 } 600 601 void 602 lwkt_process_ipiq_frame(struct intrframe *frame) 603 { 604 globaldata_t gd = mycpu; 605 globaldata_t sgd; 606 lwkt_ipiq_t ip; 607 cpumask_t mask; 608 int n; 609 610 again: 611 cpu_lfence(); 612 mask = gd->gd_ipimask; 613 atomic_clear_cpumask(&gd->gd_ipimask, mask); 614 while (mask) { 615 n = BSFCPUMASK(mask); 616 if (n != gd->gd_cpuid) { 617 sgd = globaldata_find(n); 618 ip = sgd->gd_ipiq; 619 if (ip != NULL) { 620 while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], frame)) 621 ; 622 } 623 } 624 mask &= ~CPUMASK(n); 625 } 626 if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) { 627 if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) { 628 if (gd->gd_curthread->td_cscount == 0) 629 goto again; 630 /* need_ipiq(); do not reflag */ 631 } 632 } 633 634 /* 635 * Interlock to allow more IPI interrupts. Recheck ipimask after 636 * releasing gd_npoll. 637 */ 638 if (gd->gd_ipimask) 639 goto again; 640 atomic_poll_release_int(&gd->gd_npoll); 641 cpu_mfence(); 642 if (gd->gd_ipimask) 643 goto again; 644 } 645 646 #if 0 647 static int iqticks[SMP_MAXCPU]; 648 static int iqcount[SMP_MAXCPU]; 649 #endif 650 #if 0 651 static int iqterm[SMP_MAXCPU]; 652 #endif 653 654 static int 655 lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip, 656 struct intrframe *frame) 657 { 658 globaldata_t mygd = mycpu; 659 int ri; 660 int wi; 661 ipifunc3_t copy_func; 662 void *copy_arg1; 663 int copy_arg2; 664 665 #if 0 666 if (iqticks[mygd->gd_cpuid] != ticks) { 667 iqticks[mygd->gd_cpuid] = ticks; 668 iqcount[mygd->gd_cpuid] = 0; 669 } 670 if (++iqcount[mygd->gd_cpuid] > 3000000) { 671 kprintf("cpu %d ipiq maxed cscount %d spin %d\n", 672 mygd->gd_cpuid, 673 mygd->gd_curthread->td_cscount, 674 mygd->gd_spinlocks); 675 iqcount[mygd->gd_cpuid] = 0; 676 #if 0 677 if (++iqterm[mygd->gd_cpuid] > 10) 678 panic("cpu %d ipiq maxed", mygd->gd_cpuid); 679 #endif 680 int i; 681 for (i = 0; i < ncpus; ++i) { 682 if (globaldata_find(i)->gd_infomsg) 683 kprintf(" %s", globaldata_find(i)->gd_infomsg); 684 } 685 kprintf("\n"); 686 } 687 #endif 688 689 /* 690 * Clear the originating core from our ipimask, we will process all 691 * incoming messages. 692 * 693 * Obtain the current write index, which is modified by a remote cpu. 694 * Issue a load fence to prevent speculative reads of e.g. data written 695 * by the other cpu prior to it updating the index. 696 */ 697 KKASSERT(curthread->td_critcount); 698 wi = ip->ip_windex; 699 cpu_lfence(); 700 ++mygd->gd_intr_nesting_level; 701 702 /* 703 * NOTE: xindex is only updated after we are sure the function has 704 * finished execution. Beware lwkt_process_ipiq() reentrancy! 705 * The function may send an IPI which may block/drain. 706 * 707 * NOTE: Due to additional IPI operations that the callback function 708 * may make, it is possible for both rindex and windex to advance and 709 * thus for rindex to advance passed our cached windex. 710 * 711 * NOTE: A load fence is required to prevent speculative loads prior 712 * to the loading of ip_rindex. Even though stores might be 713 * ordered, loads are probably not. A memory fence is required 714 * to prevent reordering of the loads after the ip_rindex update. 715 * 716 * NOTE: Single pass only. Returns non-zero if the queue is not empty 717 * on return. 718 */ 719 while (wi - (ri = ip->ip_rindex) > 0) { 720 ri &= MAXCPUFIFO_MASK; 721 cpu_lfence(); 722 copy_func = ip->ip_info[ri].func; 723 copy_arg1 = ip->ip_info[ri].arg1; 724 copy_arg2 = ip->ip_info[ri].arg2; 725 cpu_mfence(); 726 ++ip->ip_rindex; 727 KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) == 728 ((ri + 1) & MAXCPUFIFO_MASK)); 729 logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu); 730 #ifdef INVARIANTS 731 if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) { 732 kprintf("cpu %d ipifunc %p %p %d (frame %p)\n", 733 mycpu->gd_cpuid, 734 copy_func, copy_arg1, copy_arg2, 735 #if defined(__i386__) 736 (frame ? (void *)frame->if_eip : NULL)); 737 #elif defined(__x86_64__) 738 (frame ? (void *)frame->if_rip : NULL)); 739 #else 740 NULL); 741 #endif 742 } 743 #endif 744 copy_func(copy_arg1, copy_arg2, frame); 745 cpu_sfence(); 746 ip->ip_xindex = ip->ip_rindex; 747 748 #ifdef PANIC_DEBUG 749 /* 750 * Simulate panics during the processing of an IPI 751 */ 752 if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) { 753 if (--panic_ipiq_count == 0) { 754 #ifdef DDB 755 Debugger("PANIC_DEBUG"); 756 #else 757 panic("PANIC_DEBUG"); 758 #endif 759 } 760 } 761 #endif 762 } 763 --mygd->gd_intr_nesting_level; 764 765 /* 766 * Return non-zero if there is still more in the queue. 767 */ 768 cpu_lfence(); 769 return (ip->ip_rindex != ip->ip_windex); 770 } 771 772 static void 773 lwkt_sync_ipiq(void *arg) 774 { 775 volatile cpumask_t *cpumask = arg; 776 777 atomic_clear_cpumask(cpumask, mycpu->gd_cpumask); 778 if (*cpumask == 0) 779 wakeup(cpumask); 780 } 781 782 void 783 lwkt_synchronize_ipiqs(const char *wmesg) 784 { 785 volatile cpumask_t other_cpumask; 786 787 other_cpumask = mycpu->gd_other_cpus & smp_active_mask; 788 lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq, 789 __DEVOLATILE(void *, &other_cpumask)); 790 791 while (other_cpumask != 0) { 792 tsleep_interlock(&other_cpumask, 0); 793 if (other_cpumask != 0) 794 tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0); 795 } 796 } 797 798 /* 799 * CPU Synchronization Support 800 * 801 * lwkt_cpusync_interlock() - Place specified cpus in a quiescent state. 802 * The current cpu is placed in a hard critical 803 * section. 804 * 805 * lwkt_cpusync_deinterlock() - Execute cs_func on specified cpus, including 806 * current cpu if specified, then return. 807 */ 808 void 809 lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg) 810 { 811 struct lwkt_cpusync cs; 812 813 lwkt_cpusync_init(&cs, mask, func, arg); 814 lwkt_cpusync_interlock(&cs); 815 lwkt_cpusync_deinterlock(&cs); 816 } 817 818 819 void 820 lwkt_cpusync_interlock(lwkt_cpusync_t cs) 821 { 822 #if 0 823 const char *smsg = "SMPSYNL"; 824 #endif 825 globaldata_t gd = mycpu; 826 cpumask_t mask; 827 828 /* 829 * mask acknowledge (cs_mack): 0->mask for stage 1 830 * 831 * mack does not include the current cpu. 832 */ 833 mask = cs->cs_mask & gd->gd_other_cpus & smp_active_mask; 834 cs->cs_mack = 0; 835 crit_enter_id("cpusync"); 836 if (mask) { 837 DEBUG_PUSH_INFO("cpusync_interlock"); 838 ++ipiq_stat(gd).ipiq_cscount; 839 ++gd->gd_curthread->td_cscount; 840 lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs); 841 logipiq2(sync_start, (long)mask); 842 #if 0 843 if (gd->gd_curthread->td_wmesg == NULL) 844 gd->gd_curthread->td_wmesg = smsg; 845 #endif 846 while (cs->cs_mack != mask) { 847 lwkt_process_ipiq(); 848 cpu_pause(); 849 #ifdef _KERNEL_VIRTUAL 850 pthread_yield(); 851 #endif 852 } 853 #if 0 854 if (gd->gd_curthread->td_wmesg == smsg) 855 gd->gd_curthread->td_wmesg = NULL; 856 #endif 857 DEBUG_POP_INFO(); 858 } 859 } 860 861 /* 862 * Interlocked cpus have executed remote1 and are polling in remote2. 863 * To deinterlock we clear cs_mack and wait for the cpus to execute 864 * the func and set their bit in cs_mack again. 865 * 866 */ 867 void 868 lwkt_cpusync_deinterlock(lwkt_cpusync_t cs) 869 { 870 globaldata_t gd = mycpu; 871 #if 0 872 const char *smsg = "SMPSYNU"; 873 #endif 874 cpumask_t mask; 875 876 /* 877 * mask acknowledge (cs_mack): mack->0->mack for stage 2 878 * 879 * Clearing cpu bits for polling cpus in cs_mack will cause them to 880 * execute stage 2, which executes the cs_func(cs_data) and then sets 881 * their bit in cs_mack again. 882 * 883 * mack does not include the current cpu. 884 */ 885 mask = cs->cs_mack; 886 cpu_ccfence(); 887 cs->cs_mack = 0; 888 cpu_ccfence(); 889 if (cs->cs_func && (cs->cs_mask & gd->gd_cpumask)) 890 cs->cs_func(cs->cs_data); 891 if (mask) { 892 DEBUG_PUSH_INFO("cpusync_deinterlock"); 893 #if 0 894 if (gd->gd_curthread->td_wmesg == NULL) 895 gd->gd_curthread->td_wmesg = smsg; 896 #endif 897 while (cs->cs_mack != mask) { 898 lwkt_process_ipiq(); 899 cpu_pause(); 900 #ifdef _KERNEL_VIRTUAL 901 pthread_yield(); 902 #endif 903 } 904 #if 0 905 if (gd->gd_curthread->td_wmesg == smsg) 906 gd->gd_curthread->td_wmesg = NULL; 907 #endif 908 DEBUG_POP_INFO(); 909 /* 910 * cpusyncq ipis may be left queued without the RQF flag set due to 911 * a non-zero td_cscount, so be sure to process any laggards after 912 * decrementing td_cscount. 913 */ 914 --gd->gd_curthread->td_cscount; 915 lwkt_process_ipiq(); 916 logipiq2(sync_end, (long)mask); 917 } 918 crit_exit_id("cpusync"); 919 } 920 921 /* 922 * helper IPI remote messaging function. 923 * 924 * Called on remote cpu when a new cpu synchronization request has been 925 * sent to us. Execute the run function and adjust cs_count, then requeue 926 * the request so we spin on it. 927 */ 928 static void 929 lwkt_cpusync_remote1(lwkt_cpusync_t cs) 930 { 931 globaldata_t gd = mycpu; 932 933 atomic_set_cpumask(&cs->cs_mack, gd->gd_cpumask); 934 lwkt_cpusync_remote2(cs); 935 } 936 937 /* 938 * helper IPI remote messaging function. 939 * 940 * Poll for the originator telling us to finish. If it hasn't, requeue 941 * our request so we spin on it. 942 */ 943 static void 944 lwkt_cpusync_remote2(lwkt_cpusync_t cs) 945 { 946 globaldata_t gd = mycpu; 947 948 if ((cs->cs_mack & gd->gd_cpumask) == 0) { 949 if (cs->cs_func) 950 cs->cs_func(cs->cs_data); 951 atomic_set_cpumask(&cs->cs_mack, gd->gd_cpumask); 952 /* cs can be ripped out at this point */ 953 } else { 954 lwkt_ipiq_t ip; 955 int wi; 956 957 #ifdef _KERNEL_VIRTUAL 958 pthread_yield(); 959 #endif 960 ip = &gd->gd_cpusyncq; 961 wi = ip->ip_windex & MAXCPUFIFO_MASK; 962 ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2; 963 ip->ip_info[wi].arg1 = cs; 964 ip->ip_info[wi].arg2 = 0; 965 cpu_sfence(); 966 KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO); 967 ++ip->ip_windex; 968 if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) { 969 kprintf("cpu %d cm=%016jx %016jx f=%p\n", 970 gd->gd_cpuid, 971 (intmax_t)cs->cs_mask, (intmax_t)cs->cs_mack, 972 cs->cs_func); 973 } 974 } 975 } 976