/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.5 2004/03/08 03:03:54 dillon Exp $
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>
#include <sys/caps.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/ipl.h>
#include <machine/smp.h>
#include <machine/atomic.h>

#define THREAD_STACK    (UPAGES * PAGE_SIZE)

#else

#include <sys/stdint.h>
#include <libcaps/thread.h>
#include <sys/thread.h>
#include <sys/msgport.h>
#include <sys/errno.h>
#include <libcaps/globaldata.h>
#include <sys/thread2.h>
#include <sys/msgport2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <machine/cpufunc.h>
#include <machine/lock.h>
#include <machine/cpu.h>
#include <machine/atomic.h>

#endif

#ifdef SMP
static __int64_t ipiq_count;
static __int64_t ipiq_fifofull;
static __int64_t ipiq_cscount;
#endif

#ifdef _KERNEL

#ifdef SMP
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
#endif

#endif

#ifdef SMP

static int lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame);
static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  The FIFO can be written.
 *
 * YYY If the FIFO fills up we have to enable interrupts and process the
 * IPIQ while waiting for it to empty or we may deadlock with another cpu.
 * Create a CPU_*() function to do this!
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * Must be called from a critical section.
 */
int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    if (target == gd) {
        func(arg);
        return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
        panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * We always drain before the FIFO becomes full so it should never
     * become full.  We need to leave enough entries to deal with
     * reentrancy.
     */
    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO);
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
        unsigned int eflags = read_eflags();
        cpu_enable_intr();
        ++ipiq_fifofull;
        while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
            KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
            lwkt_process_ipiq();
        }
        write_eflags(eflags);
    }
    --gd->gd_intr_nesting_level;
    cpu_send_ipiq(target->gd_cpuid);    /* issues mem barrier if appropriate */
    crit_exit();
    return(ip->ip_windex);
}
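/*
 * Example (illustrative sketch only; 'remote_wakeup', 'othergd' and 'td'
 * are hypothetical): queueing a remote function call from within a
 * critical section.  Any void (*)(void *) function may be sent.
 *
 *    static void
 *    remote_wakeup(void *arg)
 *    {
 *        lwkt_schedule((thread_t)arg);
 *    }
 *
 *    crit_enter();
 *    seq = lwkt_send_ipiq(othergd, remote_wakeup, td);
 *    crit_exit();
 *
 * The returned windex snapshot ('seq') can later be handed to
 * lwkt_wait_ipiq() to wait for the target cpu to execute the function.
 */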
/*
 * Send an IPI request passively; return 0 on success and ENOENT on failure.
 * This routine does not recurse through lwkt_process_ipiq() nor does it
 * block trying to queue the actual IPI.  If we successfully queue the
 * message but fail to queue the IPI, we still count it as a success.
 * The occasional small race against a target cpu HLT is recovered at
 * the next clock interrupt.
 */
int
lwkt_send_ipiq_passive(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    if (target == gd) {
        func(arg);
        return(0);
    }
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO - 1) {
        return(ENOENT);
    }
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    /*
     * passive mode doesn't work yet :-(
     */
#if 1
    cpu_send_ipiq(target->gd_cpuid);
#else
    cpu_send_ipiq_passive(target->gd_cpuid);
#endif
    return(0);
}

/*
 * deprecated, used only by fast int forwarding.
 */
int
lwkt_send_ipiq_bycpu(int dcpu, ipifunc_t func, void *arg)
{
    return(lwkt_send_ipiq(globaldata_find(dcpu), func, arg));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 */
int
lwkt_send_ipiq_mask(u_int32_t mask, ipifunc_t func, void *arg)
{
    int cpuid;
    int count = 0;

    mask &= ~stopped_cpus;
    while (mask) {
        cpuid = bsfl(mask);
        lwkt_send_ipiq(globaldata_find(cpuid), func, arg);
        mask &= ~(1 << cpuid);
        ++count;
    }
    return(count);
}
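/*
 * Example (illustrative only): how the bsfl() loop above walks a mask.
 * For mask = 0x16 (cpus 1, 2 and 4):
 *
 *    0x16 -> cpuid 1, mask becomes 0x14
 *    0x14 -> cpuid 2, mask becomes 0x10
 *    0x10 -> cpuid 4, mask becomes 0
 *
 * One IPI is queued per set bit and the function returns 3.  A typical
 * (hypothetical) call excludes ourselves and inactive cpus:
 *
 *    lwkt_send_ipiq_mask(mycpu->gd_other_cpus & smp_active_mask,
 *                        some_func, some_arg);
 */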
/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    lwkt_ipiq_t ip;
    int maxc = 100000000;

    if (target != mycpu) {
        ip = &mycpu->gd_ipiq[target->gd_cpuid];
        if ((int)(ip->ip_xindex - seq) < 0) {
            unsigned int eflags = read_eflags();
            cpu_enable_intr();
            while ((int)(ip->ip_xindex - seq) < 0) {
                crit_enter();
                lwkt_process_ipiq();
                crit_exit();
                if (--maxc == 0)
                    printf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, target->gd_cpuid, ip->ip_xindex - seq);
                if (maxc < -1000000)
                    panic("LWKT_WAIT_IPIQ");
            }
            write_eflags(eflags);
        }
    }
}

int
lwkt_seq_ipiq(globaldata_t target)
{
    lwkt_ipiq_t ip;

    ip = &mycpu->gd_ipiq[target->gd_cpuid];
    return(ip->ip_windex);
}
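/*
 * Example (illustrative only): the (int)(ip->ip_xindex - seq) < 0 test
 * in lwkt_wait_ipiq() is wrap-safe because the indices are free-running.
 * With seq = 0x7fffffff and xindex = 0x80000001 the difference is 2, so
 * the cast yields a positive value and the wait terminates even though
 * xindex has passed INT_MAX.  A hypothetical caller pairs the routines:
 *
 *    seq = lwkt_send_ipiq(target, some_func, some_arg);
 *    lwkt_wait_ipiq(target, seq);
 */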
/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_func[] we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 */
void
lwkt_process_ipiq(void)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
        if (n != gd->gd_cpuid) {
            ip = globaldata_find(n)->gd_ipiq;
            if (ip != NULL) {
                while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], NULL))
                    ;
            }
        }
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
        if (lwkt_process_ipiq1(&gd->gd_cpusyncq, NULL)) {
            if (gd->gd_curthread->td_cscount == 0)
                goto again;
            need_ipiq();
        }
    }
}

#ifdef _KERNEL
void
lwkt_process_ipiq_frame(struct intrframe frame)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
        if (n != gd->gd_cpuid) {
            ip = globaldata_find(n)->gd_ipiq;
            if (ip != NULL) {
                while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], &frame))
                    ;
            }
        }
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
        if (lwkt_process_ipiq1(&gd->gd_cpusyncq, &frame)) {
            if (gd->gd_curthread->td_cscount == 0)
                goto again;
            need_ipiq();
        }
    }
}
#endif

static int
lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame)
{
    int ri;
    int wi = ip->ip_windex;
    /*
     * Note: xindex is only updated after we are sure the function has
     * finished execution.  Beware lwkt_process_ipiq() reentrancy!  The
     * function may send an IPI which may block/drain.
     */
    while ((ri = ip->ip_rindex) != wi) {
        ip->ip_rindex = ri + 1;
        ri &= MAXCPUFIFO_MASK;
        ip->ip_func[ri](ip->ip_arg[ri], frame);
        /* YYY memory barrier */
        ip->ip_xindex = ip->ip_rindex;
    }
    return(wi != ip->ip_windex);
}
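/*
 * Example (illustrative only; assumes MAXCPUFIFO were 16): with
 * rindex = 30 and windex = 33 the queue holds three entries, drained
 * from slots 30 & 15 = 14, then 15, then 0.  The indices themselves
 * never wrap back; only array accesses are masked.  If a function we
 * run re-enters the queueing code and advances ip_windex past the
 * snapshotted 'wi', the nonzero return value above makes the caller
 * loop and drain again.
 */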
#else

/*
 * !SMP dummy routines
 */

int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", target->gd_cpuid, func, arg);
    return(0); /* NOT REACHED */
}

void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    panic("lwkt_wait_ipiq: UP box! (%d,%d)", target->gd_cpuid, seq);
}

#endif

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_simple()
 *
 *    The function is executed synchronously before return on remote
 *    cpus.  A lwkt_cpusync_t pointer is passed as an argument.  The
 *    data can be accessed via arg->cs_data.
 *
 *    XXX should I just pass the data as an argument to be consistent?
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = func;
    cmd.cs_fin2_func = NULL;
    cmd.cs_data = data;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
        func(&cmd);
    lwkt_cpusync_finish(&cmd);
}

/*
 * lwkt_cpusync_fastdata()
 *
 *    The function is executed in tandem with return on remote cpus.
 *    The data is directly passed as an argument.  Do not pass pointers
 *    to temporary storage as the storage might have gone poof by the
 *    time the target cpu executes the function.
 *
 *    At the moment lwkt_cpusync is declared on the stack and we must
 *    wait for all remote cpus to ack in lwkt_cpusync_finish(), but as
 *    a future optimization we should be able to put a counter in the
 *    globaldata structure (if it is not otherwise being used) and just
 *    poke it and return without waiting. XXX
 */
void
lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = NULL;
    cmd.cs_fin2_func = func;
    cmd.cs_data = NULL;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
        func(data);
    lwkt_cpusync_finish(&cmd);
}
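/*
 * Example (illustrative only; 'invalidate_handler' and 'invalidate_desc'
 * are hypothetical): running a function on every active cpu, including
 * our own, and not returning until all of them have executed it:
 *
 *    static void
 *    invalidate_handler(lwkt_cpusync_t info)
 *    {
 *        do_local_invalidation(info->cs_data);
 *    }
 *
 *    lwkt_cpusync_simple(smp_active_mask, invalidate_handler,
 *                        &invalidate_desc);
 *
 * With lwkt_cpusync_fastdata() the handler instead receives the data
 * pointer directly, so it must not reference stack storage that could
 * be reused before the remote cpus run.
 */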
/*
 * lwkt_cpusync_start()
 *
 *    Start synchronization with a set of target cpus, return once they
 *    are known to be in a synchronization loop.  The target cpus will
 *    execute poll->cs_run_func() IN TANDEM WITH THE RETURN.
 *
 *    XXX future: add lwkt_cpusync_start_quick() and require a call to
 *    lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
 *    potentially absorb the IPI latency doing something useful.
 */
void
lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = 0;
    poll->cs_mask = mask;
#ifdef SMP
    poll->cs_maxcount = lwkt_send_ipiq_mask(
            mask & gd->gd_other_cpus & smp_active_mask,
            (ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & (1 << gd->gd_cpuid)) {
        if (poll->cs_run_func)
            poll->cs_run_func(poll);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
        ++ipiq_cscount;
        ++gd->gd_curthread->td_cscount;
        while (poll->cs_count != poll->cs_maxcount) {
            crit_enter();
            lwkt_process_ipiq();
            crit_exit();
        }
    }
#endif
}

void
lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;
#ifdef SMP
    int count;
#endif

    mask &= ~poll->cs_mask;
    poll->cs_mask |= mask;
#ifdef SMP
    count = lwkt_send_ipiq_mask(
            mask & gd->gd_other_cpus & smp_active_mask,
            (ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & (1 << gd->gd_cpuid)) {
        if (poll->cs_run_func)
            poll->cs_run_func(poll);
    }
#ifdef SMP
    poll->cs_maxcount += count;
    if (poll->cs_maxcount) {
        if (poll->cs_maxcount == count)
            ++gd->gd_curthread->td_cscount;
        while (poll->cs_count != poll->cs_maxcount) {
            crit_enter();
            lwkt_process_ipiq();
            crit_exit();
        }
    }
#endif
}

/*
 * Finish synchronization with a set of target cpus.  The target cpus will
 * execute cs_fin1_func(poll) prior to this function returning, and will
 * execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
 *
 * If cs_maxcount is non-zero then we are mastering a cpusync with one or
 * more remote cpus and must account for it in our thread structure.
 */
void
lwkt_cpusync_finish(lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = -1;
    if (poll->cs_mask & (1 << gd->gd_cpuid)) {
        if (poll->cs_fin1_func)
            poll->cs_fin1_func(poll);
        if (poll->cs_fin2_func)
            poll->cs_fin2_func(poll->cs_data);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
        while (poll->cs_count != -(poll->cs_maxcount + 1)) {
            crit_enter();
            lwkt_process_ipiq();
            crit_exit();
        }
        --gd->gd_curthread->td_cscount;
    }
#endif
}

#ifdef SMP

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Execute the run function and adjust cs_count, then requeue
 * the request so we spin on it.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t poll)
{
    atomic_add_int(&poll->cs_count, 1);
    if (poll->cs_run_func)
        poll->cs_run_func(poll);
    lwkt_cpusync_remote2(poll);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.  When the originator requests that we
 * finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
 * in tandem with the release.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t poll)
{
    if (poll->cs_count < 0) {
        cpusync_func2_t savef;
        void *saved;

        if (poll->cs_fin1_func)
            poll->cs_fin1_func(poll);
        if (poll->cs_fin2_func) {
            savef = poll->cs_fin2_func;
            saved = poll->cs_data;
            atomic_add_int(&poll->cs_count, -1);
            savef(saved);
        } else {
            atomic_add_int(&poll->cs_count, -1);
        }
    } else {
        globaldata_t gd = mycpu;
        lwkt_ipiq_t ip;
        int wi;

        ip = &gd->gd_cpusyncq;
        wi = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_func[wi] = (ipifunc2_t)lwkt_cpusync_remote2;
        ip->ip_arg[wi] = poll;
        ++ip->ip_windex;
    }
}

#endif
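/*
 * Example (illustrative walkthrough only): a cpusync mastered by one cpu
 * against two remote cpus.  lwkt_cpusync_start() sends two IPIs; each
 * remote's lwkt_cpusync_remote1() increments cs_count, so the master's
 * spin exits when cs_count == cs_maxcount == 2.  Each remote then keeps
 * requeueing lwkt_cpusync_remote2() on its own gd_cpusyncq, doing
 * nothing while cs_count >= 0.  When the master calls
 * lwkt_cpusync_finish() it sets cs_count = -1; each remote now sees a
 * negative count, runs the finish functions, and decrements cs_count,
 * so the master's final spin exits at cs_count == -(cs_maxcount + 1),
 * i.e. -3.
 */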