/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.8 2004/07/16 05:51:10 dillon Exp $
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>
#include <sys/caps.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/ipl.h>
#include <machine/smp.h>
#include <machine/atomic.h>

#define THREAD_STACK	(UPAGES * PAGE_SIZE)

#else

#include <sys/stdint.h>
#include <libcaps/thread.h>
#include <sys/thread.h>
#include <sys/msgport.h>
#include <sys/errno.h>
#include <libcaps/globaldata.h>
#include <machine/cpufunc.h>
#include <sys/thread2.h>
#include <sys/msgport2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <machine/lock.h>
#include <machine/cpu.h>
#include <machine/atomic.h>

#endif

#ifdef SMP
static __int64_t ipiq_count;	/* total IPI requests */
static __int64_t ipiq_fifofull;	/* times an IPIQ FIFO had to be drained */
static __int64_t ipiq_cscount;	/* cpusync operations mastered */
#endif

#ifdef _KERNEL

#ifdef SMP
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
#endif

#endif

#ifdef SMP

static int lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame);
static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  Only the owning (source) cpu writes to a FIFO;
 * only the target cpu reads from it.
 *
 * YYY If the FIFO fills up we have to enable interrupts and process the
 * IPIQ while waiting for it to empty or we may deadlock with another cpu.
 * Create a CPU_*() function to do this!
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * Must be called from a critical section.
 */
int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    if (target == gd) {
	func(arg);
	return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * We always drain before the FIFO becomes full so it should never
     * become full.  We need to leave enough entries to deal with
     * reentrancy.
     */
    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO);
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	unsigned int eflags = read_eflags();
	cpu_enable_intr();
	++ipiq_fifofull;
	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	    lwkt_process_ipiq();
	}
	write_eflags(eflags);
    }
    --gd->gd_intr_nesting_level;
    cpu_send_ipiq(target->gd_cpuid);	/* issues mem barrier if appropriate */
    crit_exit();
    return(ip->ip_windex);
}
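
/*
 * Usage sketch (illustrative only, not part of the original file; the
 * callback, counter, and target cpu below are hypothetical).  A caller
 * in a critical section queues a function on another cpu and receives
 * a sequence number that can later be handed to lwkt_wait_ipiq():
 *
 *	static int example_counter;
 *
 *	static void
 *	example_bump(void *arg)
 *	{
 *		atomic_add_int((int *)arg, 1);
 *	}
 *
 *	int seq;
 *
 *	crit_enter();
 *	seq = lwkt_send_ipiq(globaldata_find(1), example_bump,
 *			     &example_counter);
 *	crit_exit();
 */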

/*
 * Send an IPI request passively, return 0 on success and ENOENT on failure.
 * This routine does not recurse through lwkt_process_ipiq() nor does it
 * block trying to queue the actual IPI.  If we successfully queue the
 * message but fail to queue the IPI, we still count it as a success.
 * The occasional small race against a target cpu HLT is recovered at
 * the next clock interrupt.
 */
int
lwkt_send_ipiq_passive(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    if (target == gd) {
	func(arg);
	return(0);
    }
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO - 1) {
	return(ENOENT);
    }
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    /*
     * passive mode doesn't work yet :-(
     */
#if 1
    cpu_send_ipiq(target->gd_cpuid);
#else
    cpu_send_ipiq_passive(target->gd_cpuid);
#endif
    return(0);
}

/*
 * deprecated, used only by fast int forwarding.
 */
int
lwkt_send_ipiq_bycpu(int dcpu, ipifunc_t func, void *arg)
{
    return(lwkt_send_ipiq(globaldata_find(dcpu), func, arg));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 */
int
lwkt_send_ipiq_mask(u_int32_t mask, ipifunc_t func, void *arg)
{
    int cpuid;
    int count = 0;

    mask &= ~stopped_cpus;
    while (mask) {
	cpuid = bsfl(mask);
	lwkt_send_ipiq(globaldata_find(cpuid), func, arg);
	mask &= ~(1 << cpuid);
	++count;
    }
    return(count);
}

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    lwkt_ipiq_t ip;
    int maxc = 100000000;

    if (target != mycpu) {
	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	if ((int)(ip->ip_xindex - seq) < 0) {
	    unsigned int eflags = read_eflags();
	    cpu_enable_intr();
	    while ((int)(ip->ip_xindex - seq) < 0) {
		crit_enter();
		lwkt_process_ipiq();
		crit_exit();
		if (--maxc == 0)
		    printf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
			mycpu->gd_cpuid, target->gd_cpuid,
			ip->ip_xindex - seq);
		if (maxc < -1000000)
		    panic("LWKT_WAIT_IPIQ");
	    }
	    write_eflags(eflags);
	}
    }
}

int
lwkt_seq_ipiq(globaldata_t target)
{
    lwkt_ipiq_t ip;

    ip = &mycpu->gd_ipiq[target->gd_cpuid];
    return(ip->ip_windex);
}
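
/*
 * Usage sketch (illustrative; example_tick is hypothetical).  Pair the
 * sequence number returned by lwkt_send_ipiq() with lwkt_wait_ipiq()
 * to synchronize with remote execution, or fan a message out to a cpu
 * mask, from which stopped cpus are removed automatically:
 *
 *	globaldata_t target = globaldata_find(1);
 *	int seq;
 *
 *	crit_enter();
 *	seq = lwkt_send_ipiq(target, example_tick, NULL);
 *	lwkt_wait_ipiq(target, seq);
 *	lwkt_send_ipiq_mask(mycpu->gd_other_cpus, example_tick, NULL);
 *	crit_exit();
 */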

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_func[] we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 */
void
lwkt_process_ipiq(void)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
	if (n != gd->gd_cpuid) {
	    ip = globaldata_find(n)->gd_ipiq;
	    if (ip != NULL) {
		while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], NULL))
		    ;
	    }
	}
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq1(&gd->gd_cpusyncq, NULL)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	    need_ipiq();
	}
    }
}

#ifdef _KERNEL
void
lwkt_process_ipiq_frame(struct intrframe frame)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
	if (n != gd->gd_cpuid) {
	    ip = globaldata_find(n)->gd_ipiq;
	    if (ip != NULL) {
		while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], &frame))
		    ;
	    }
	}
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq1(&gd->gd_cpusyncq, &frame)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	    need_ipiq();
	}
    }
}
#endif

static int
lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame)
{
    int ri;
    int wi = ip->ip_windex;
    /*
     * Note: xindex is only updated after we are sure the function has
     * finished execution.  Beware lwkt_process_ipiq() reentrancy!  The
     * function may send an IPI which may block/drain.
     */
    while ((ri = ip->ip_rindex) != wi) {
	ip->ip_rindex = ri + 1;
	ri &= MAXCPUFIFO_MASK;
	ip->ip_func[ri](ip->ip_arg[ri], frame);
	/* YYY memory barrier */
	ip->ip_xindex = ip->ip_rindex;
    }
    return(wi != ip->ip_windex);
}

#else

/*
 * !SMP dummy routines
 */

int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", target->gd_cpuid, func, arg);
    return(0); /* NOT REACHED */
}

void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    panic("lwkt_wait_ipiq: UP box! (%d,%d)", target->gd_cpuid, seq);
}

#endif
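
/*
 * Index arithmetic note (an illustrative aside; the concrete values
 * below assume MAXCPUFIFO is a power of 2 with MAXCPUFIFO_MASK equal
 * to MAXCPUFIFO - 1).  ip_windex and ip_rindex increase monotonically
 * and are only masked when used as array subscripts, so the difference
 * (ip_windex - ip_rindex) counts the queued entries even after the
 * counters wrap around 2^32.  E.g. with MAXCPUFIFO == 32:
 *
 *	windex = 0xfffffffe, rindex = 0xfffffffa
 *	windex - rindex == 4			(4 entries queued)
 *	windex & MAXCPUFIFO_MASK == 30		(next slot to write)
 */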

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_simple()
 *
 *	The function is executed synchronously before return on remote
 *	cpus.  A lwkt_cpusync_t pointer is passed as an argument.  The
 *	data can be accessed via arg->cs_data.
 *
 *	XXX should I just pass the data as an argument to be consistent?
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = func;
    cmd.cs_fin2_func = NULL;
    cmd.cs_data = data;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
	func(&cmd);
    lwkt_cpusync_finish(&cmd);
}

/*
 * lwkt_cpusync_fastdata()
 *
 *	The function is executed in tandem with return on remote cpus.
 *	The data is passed directly as an argument.  Do not pass pointers
 *	to temporary storage as the storage might have gone poof by the
 *	time the target cpu executes the function.
 *
 *	At the moment lwkt_cpusync is declared on the stack and we must
 *	wait for all remote cpus to ack in lwkt_cpusync_finish(), but as
 *	a future optimization we should be able to put a counter in the
 *	globaldata structure (if it is not otherwise being used) and just
 *	poke it and return without waiting. XXX
 */
void
lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = NULL;
    cmd.cs_fin2_func = func;
    cmd.cs_data = data;		/* handed to func on the target cpus */
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
	func(data);
    lwkt_cpusync_finish(&cmd);
}

/*
 * lwkt_cpusync_start()
 *
 *	Start synchronization with a set of target cpus, return once they
 *	are known to be in a synchronization loop.  The target cpus will
 *	execute poll->cs_run_func() IN TANDEM WITH THE RETURN.
 *
 *	XXX future: add lwkt_cpusync_start_quick() and require a call to
 *	lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
 *	potentially absorb the IPI latency doing something useful.
 */
void
lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = 0;
    poll->cs_mask = mask;
#ifdef SMP
    poll->cs_maxcount = lwkt_send_ipiq_mask(
		mask & gd->gd_other_cpus & smp_active_mask,
		(ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & gd->gd_cpumask) {
	if (poll->cs_run_func)
	    poll->cs_run_func(poll);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
	++ipiq_cscount;
	++gd->gd_curthread->td_cscount;
	while (poll->cs_count != poll->cs_maxcount) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
    }
#endif
}

void
lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;
#ifdef SMP
    int count;
#endif

    mask &= ~poll->cs_mask;
    poll->cs_mask |= mask;
#ifdef SMP
    count = lwkt_send_ipiq_mask(
		mask & gd->gd_other_cpus & smp_active_mask,
		(ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & gd->gd_cpumask) {
	if (poll->cs_run_func)
	    poll->cs_run_func(poll);
    }
#ifdef SMP
    poll->cs_maxcount += count;
    if (poll->cs_maxcount) {
	if (poll->cs_maxcount == count)
	    ++gd->gd_curthread->td_cscount;
	while (poll->cs_count != poll->cs_maxcount) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
    }
#endif
}
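
/*
 * Usage sketch (illustrative; example_set_mode, example_mode, and the
 * local mode variable are hypothetical).  lwkt_cpusync_simple() wraps
 * the start/finish sequence: every cpu in the mask, including the
 * calling cpu, executes the function before the call returns:
 *
 *	static int example_mode;
 *
 *	static void
 *	example_set_mode(lwkt_cpusync_t poll)
 *	{
 *		example_mode = *(int *)poll->cs_data;
 *	}
 *
 *	int mode = 1;
 *
 *	lwkt_cpusync_simple((cpumask_t)-1, example_set_mode, &mode);
 */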

/*
 * Finish synchronization with a set of target cpus.  The target cpus will
 * execute cs_fin1_func(poll) prior to this function returning, and will
 * execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
 *
 * If cs_maxcount is non-zero then we are mastering a cpusync with one or
 * more remote cpus and must account for it in our thread structure.
 */
void
lwkt_cpusync_finish(lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = -1;
    if (poll->cs_mask & gd->gd_cpumask) {
	if (poll->cs_fin1_func)
	    poll->cs_fin1_func(poll);
	if (poll->cs_fin2_func)
	    poll->cs_fin2_func(poll->cs_data);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
	while (poll->cs_count != -(poll->cs_maxcount + 1)) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
	--gd->gd_curthread->td_cscount;
    }
#endif
}

#ifdef SMP

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Execute the run function and adjust cs_count, then requeue
 * the request so we spin on it.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t poll)
{
    atomic_add_int(&poll->cs_count, 1);
    if (poll->cs_run_func)
	poll->cs_run_func(poll);
    lwkt_cpusync_remote2(poll);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.  When the originator requests that we
 * finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
 * in tandem with the release.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t poll)
{
    if (poll->cs_count < 0) {
	cpusync_func2_t savef;
	void *saved;

	if (poll->cs_fin1_func)
	    poll->cs_fin1_func(poll);
	if (poll->cs_fin2_func) {
	    savef = poll->cs_fin2_func;
	    saved = poll->cs_data;
	    atomic_add_int(&poll->cs_count, -1);
	    savef(saved);
	} else {
	    atomic_add_int(&poll->cs_count, -1);
	}
    } else {
	globaldata_t gd = mycpu;
	lwkt_ipiq_t ip;
	int wi;

	ip = &gd->gd_cpusyncq;
	wi = ip->ip_windex & MAXCPUFIFO_MASK;
	ip->ip_func[wi] = (ipifunc2_t)lwkt_cpusync_remote2;
	ip->ip_arg[wi] = poll;
	++ip->ip_windex;
    }
}

#endif
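
/*
 * Usage sketch (illustrative; example_apply_flags and example_flags are
 * hypothetical).  lwkt_cpusync_fastdata() hands the data argument itself
 * to the remote function, so a value that fits in a pointer can be
 * passed safely even though the lwkt_cpusync structure lives on the
 * caller's stack:
 *
 *	static volatile int example_flags;
 *
 *	static void
 *	example_apply_flags(void *data)
 *	{
 *		example_flags = (int)(intptr_t)data;
 *	}
 *
 *	lwkt_cpusync_fastdata((cpumask_t)-1, example_apply_flags,
 *			      (void *)(intptr_t)0x5);
 */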