/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
 */

#include "opt_ntp.h"
#include "opt_pctrack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/kinfo.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/priv.h>
#include <sys/timex.h>
#include <sys/timepps.h>
#include <sys/upmap.h>
#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <sys/sysctl.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>

#include <machine/cpu.h>
#include <machine/limits.h>
#include <machine/smp.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/clock.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef DEBUG_PCTRACK
static void do_pctrack(struct intrframe *frame, int which);
#endif

static void initclocks (void *dummy);
SYSINIT(clocks, SI_BOOT2_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);

/*
 * Some of these don't belong here, but it's easiest to concentrate them.
 * Note that cpu_time counts in microseconds, but most userland programs
 * just compare relative times against the total by delta.
 */
struct kinfo_cputime cputime_percpu[MAXCPU];
#ifdef DEBUG_PCTRACK
struct kinfo_pcheader cputime_pcheader = { PCTRACK_SIZE, PCTRACK_ARYSIZE };
struct kinfo_pctrack cputime_pctrack[MAXCPU][PCTRACK_SIZE];
#endif

static int sniff_enable = 1;
static int sniff_target = -1;
SYSCTL_INT(_kern, OID_AUTO, sniff_enable, CTLFLAG_RW, &sniff_enable, 0, "");
SYSCTL_INT(_kern, OID_AUTO, sniff_target, CTLFLAG_RW, &sniff_target, 0, "");

static int
sysctl_cputime(SYSCTL_HANDLER_ARGS)
{
	int cpu, error = 0;
	int root_error;
	size_t size = sizeof(struct kinfo_cputime);
	struct kinfo_cputime tmp;

	/*
	 * NOTE: For security reasons, only root can sniff %rip
	 */
	root_error = priv_check_cred(curthread->td_ucred, PRIV_ROOT, 0);

	for (cpu = 0; cpu < ncpus; ++cpu) {
		tmp = cputime_percpu[cpu];
		if (root_error == 0) {
			tmp.cp_sample_pc =
				(int64_t)globaldata_find(cpu)->gd_sample_pc;
			tmp.cp_sample_sp =
				(int64_t)globaldata_find(cpu)->gd_sample_sp;
		}
		if ((error = SYSCTL_OUT(req, &tmp, size)) != 0)
			break;
	}

	if (root_error == 0) {
		if (sniff_enable) {
			int n = sniff_target;
			if (n < 0)
				smp_sniff();
			else if (n < ncpus)
				cpu_sniff(n);
		}
	}

	return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, cputime, (CTLTYPE_OPAQUE|CTLFLAG_RD), 0, 0,
	sysctl_cputime, "S,kinfo_cputime", "CPU time statistics");

static int
sysctl_cp_time(SYSCTL_HANDLER_ARGS)
{
	long cpu_states[CPUSTATES] = {0};
	int cpu, error = 0;
	size_t size = sizeof(cpu_states);

	for (cpu = 0; cpu < ncpus; ++cpu) {
		cpu_states[CP_USER] += cputime_percpu[cpu].cp_user;
		cpu_states[CP_NICE] += cputime_percpu[cpu].cp_nice;
		cpu_states[CP_SYS] += cputime_percpu[cpu].cp_sys;
		cpu_states[CP_INTR] += cputime_percpu[cpu].cp_intr;
		cpu_states[CP_IDLE] += cputime_percpu[cpu].cp_idle;
	}

	error = SYSCTL_OUT(req, cpu_states, size);

	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, cp_time, (CTLTYPE_LONG|CTLFLAG_RD), 0, 0,
	sysctl_cp_time, "LU", "CPU time statistics");

static int
sysctl_cp_times(SYSCTL_HANDLER_ARGS)
{
	long cpu_states[CPUSTATES] = {0};
	int cpu, error;
	size_t size = sizeof(cpu_states);

	for (error = 0, cpu = 0; error == 0 && cpu < ncpus; ++cpu) {
		cpu_states[CP_USER] = cputime_percpu[cpu].cp_user;
		cpu_states[CP_NICE] = cputime_percpu[cpu].cp_nice;
		cpu_states[CP_SYS] = cputime_percpu[cpu].cp_sys;
		cpu_states[CP_INTR] = cputime_percpu[cpu].cp_intr;
		cpu_states[CP_IDLE] = cputime_percpu[cpu].cp_idle;
		error = SYSCTL_OUT(req, cpu_states, size);
	}

	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, cp_times, (CTLTYPE_LONG|CTLFLAG_RD), 0, 0,
	sysctl_cp_times, "LU", "per-CPU time statistics");
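#if 0
/*
 * Illustrative userland sketch (not part of this file and not compiled
 * here): reading the kern.cp_time totals exported above via
 * sysctlbyname(3).  The CPUSTATES array size and CP_* indices are
 * assumed to come from <sys/resource.h> as on other BSDs.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	long states[CPUSTATES];
	size_t len = sizeof(states);

	if (sysctlbyname("kern.cp_time", states, &len, NULL, 0) < 0)
		return (1);
	printf("user %ld nice %ld sys %ld intr %ld idle %ld\n",
	       states[CP_USER], states[CP_NICE], states[CP_SYS],
	       states[CP_INTR], states[CP_IDLE]);
	return (0);
}
#endif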
/*
 * boottime is used to calculate the 'real' uptime.  Do not confuse this with
 * microuptime().  microtime() is not drift compensated.  The real uptime
 * with compensation is nanotime() - boottime.  boottime is recalculated
 * whenever the real time is set based on the compensated elapsed time
 * in seconds (gd->gd_time_seconds).
 *
 * The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic.
 * Slight adjustments to gd_cpuclock_base are made to phase-lock it to
 * the real time.
 *
 * WARNING! time_second can backstep on time corrections.  Also, unlike
 *	    time_second, time_uptime is not a "real" time_t (seconds
 *	    since the Epoch) but seconds since booting.
 */
struct timespec boottime;	/* boot time (realtime) for reference only */
time_t time_second;		/* read-only 'passive' realtime in seconds */
time_t time_uptime;		/* read-only 'passive' uptime in seconds */

/*
 * basetime is used to calculate the compensated real time of day.  The
 * basetime can be modified on a per-tick basis by the adjtime(),
 * ntp_adjtime(), and sysctl-based time correction APIs.
 *
 * Note that frequency corrections can also be made by adjusting
 * gd_cpuclock_base.
 *
 * basetime is a tail-chasing FIFO, updated only by cpu #0.  The FIFO is
 * used on both SMP and UP systems to avoid MP races between cpu's and
 * interrupt races on UP systems.
 */
struct hardtime {
	__uint32_t time_second;
	sysclock_t cpuclock_base;
};

#define BASETIME_ARYSIZE	16
#define BASETIME_ARYMASK	(BASETIME_ARYSIZE - 1)
static struct timespec basetime[BASETIME_ARYSIZE];
static struct hardtime hardtime[BASETIME_ARYSIZE];
static volatile int basetime_index;

static int
sysctl_get_basetime(SYSCTL_HANDLER_ARGS)
{
	struct timespec *bt;
	int error;
	int index;

	/*
	 * Because basetime data and index may be updated by another cpu,
	 * a load fence is required to ensure that the data we read has
	 * not been speculatively read relative to a possibly updated index.
	 */
	index = basetime_index;
	cpu_lfence();
	bt = &basetime[index];
	error = SYSCTL_OUT(req, bt, sizeof(*bt));
	return (error);
}

SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
    &boottime, timespec, "System boottime");
SYSCTL_PROC(_kern, OID_AUTO, basetime, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0,
    sysctl_get_basetime, "S,timespec", "System basetime");
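#if 0
/*
 * Illustrative sketch (not compiled): the lock-free pattern the basetime
 * FIFO relies on.  The writer (cpu #0) fills in the *next* slot, issues
 * a store fence, and only then publishes the new index; readers load the
 * index, issue a load fence, and only then read the slot.  A slot is not
 * rewritten until BASETIME_ARYSIZE - 1 further updates have occurred, so
 * a reader can never observe a torn timespec.
 */
static void
basetime_reader_sketch(struct timespec *out)
{
	int ni;

	ni = basetime_index;	/* snapshot the published index */
	cpu_lfence();		/* slot contents must not be read early */
	*out = basetime[ni];
}
#endif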
static void hardclock(systimer_t info, int, struct intrframe *frame);
static void statclock(systimer_t info, int, struct intrframe *frame);
static void schedclock(systimer_t info, int, struct intrframe *frame);
static void getnanotime_nbt(struct timespec *nbt, struct timespec *tsp);

int	ticks;			/* system master ticks at hz */
int	clocks_running;		/* tsleep/timeout clocks operational */
int64_t	nsec_adj;		/* ntpd per-tick adjustment in nsec << 32 */
int64_t	nsec_acc;		/* accumulator */
int	sched_ticks;		/* global schedule clock ticks */

/* NTPD time correction fields */
int64_t	ntp_tick_permanent;	/* per-tick adjustment in nsec << 32 */
int64_t	ntp_tick_acc;		/* accumulator for per-tick adjustment */
int64_t	ntp_delta;		/* one-time correction in nsec */
int64_t	ntp_big_delta = 1000000000;
int32_t	ntp_tick_delta;		/* current adjustment rate */
int32_t	ntp_default_tick_delta;	/* adjustment rate for ntp_delta */
time_t	ntp_leap_second;	/* time of next leap second */
int	ntp_leap_insert;	/* whether to insert or remove a second */
struct spinlock ntp_spin;

/*
 * Finish initializing clock frequencies and start all clocks running.
 */
/* ARGSUSED*/
static void
initclocks(void *dummy)
{
	/*psratio = profhz / stathz;*/
	spin_init(&ntp_spin, "ntp");
	initclocks_pcpu();
	clocks_running = 1;
	if (kpmap) {
		kpmap->tsc_freq = (uint64_t)tsc_frequency;
		kpmap->tick_freq = hz;
	}
}

/*
 * Called on a per-cpu basis from the idle thread bootstrap on each cpu
 * during SMP initialization.
 *
 * This routine is called concurrently during low-level SMP initialization
 * and may not block in any way.  Meaning, among other things, we can't
 * acquire any tokens.
 */
void
initclocks_pcpu(void)
{
	struct globaldata *gd = mycpu;

	crit_enter();
	if (gd->gd_cpuid == 0) {
		gd->gd_time_seconds = 1;
		gd->gd_cpuclock_base = sys_cputimer->count();
		hardtime[0].time_second = gd->gd_time_seconds;
		hardtime[0].cpuclock_base = gd->gd_cpuclock_base;
	} else {
		gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds;
		gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base;
	}

	systimer_intr_enable();

	crit_exit();
}

/*
 * This routine is called on just the BSP, just after SMP initialization
 * completes to finish initializing any clocks that might contend/block
 * (e.g. like on a token).  We can't do this in initclocks_pcpu() because
 * that function is called from the idle thread bootstrap for each cpu and
 * not allowed to block at all.
 */
static
void
initclocks_other(void *dummy)
{
	struct globaldata *ogd = mycpu;
	struct globaldata *gd;
	int n;

	for (n = 0; n < ncpus; ++n) {
		lwkt_setcpu_self(globaldata_find(n));
		gd = mycpu;

		/*
		 * Use a non-queued periodic systimer to prevent multiple
		 * ticks from building up if the sysclock jumps forward
		 * (8254 gets reset).  The sysclock will never jump backwards.
		 * Our time sync is based on the actual sysclock, not the
		 * ticks count.
		 *
		 * Install statclock before hardclock to prevent statclock
		 * from misinterpreting gd_flags for tick assignment when
		 * they overlap.
		 */
		systimer_init_periodic_nq(&gd->gd_statclock, statclock,
					  NULL, stathz);
		systimer_init_periodic_nq(&gd->gd_hardclock, hardclock,
					  NULL, hz);
		/* XXX correct the frequency for scheduler / estcpu tests */
		systimer_init_periodic_nq(&gd->gd_schedclock, schedclock,
					  NULL, ESTCPUFREQ);
	}
	lwkt_setcpu_self(ogd);
}
SYSINIT(clocks2, SI_BOOT2_POST_SMP, SI_ORDER_ANY, initclocks_other, NULL);

/*
 * This sets the current real time of day.  Timespecs are in seconds and
 * nanoseconds.  We do not mess with gd_time_seconds and gd_cpuclock_base,
 * instead we adjust basetime so basetime + gd_* results in the current
 * time of day.  This way the gd_* fields are guaranteed to represent
 * a monotonically increasing 'uptime' value.
 *
 * When set_timeofday() is called from userland, the system call forces it
 * onto cpu #0 since only cpu #0 can update basetime_index.
 */
void
set_timeofday(struct timespec *ts)
{
	struct timespec *nbt;
	int ni;

	/*
	 * XXX SMP / non-atomic basetime updates
	 */
	crit_enter();
	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	cpu_lfence();
	nbt = &basetime[ni];
	nanouptime(nbt);
	nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
	nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
	if (nbt->tv_nsec < 0) {
		nbt->tv_nsec += 1000000000;
		--nbt->tv_sec;
	}

	/*
	 * Note that basetime diverges from boottime as the clock drift is
	 * compensated for, so we cannot do away with boottime.  When setting
	 * the absolute time of day the drift is 0 (for an instant) and we
	 * can simply assign boottime to basetime.
	 *
	 * Note that nanouptime() is based on gd_time_seconds which is drift
	 * compensated up to a point (it is guaranteed to remain monotonically
	 * increasing).  gd_time_seconds is thus our best uptime guess and
	 * suitable for use in the boottime calculation.  It is already taken
	 * into account in the basetime calculation above.
	 */
	spin_lock(&ntp_spin);
	boottime.tv_sec = nbt->tv_sec;
	ntp_delta = 0;

	/*
	 * We now have a new basetime, make sure all other cpus have it,
	 * then update the index.
	 */
	cpu_sfence();
	basetime_index = ni;
	spin_unlock(&ntp_spin);

	crit_exit();
}
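/*
 * Worked example (illustrative numbers): if nanouptime() reports an uptime
 * of 1000.250000000s and set_timeofday() is asked for a realtime of
 * 1700000000.100000000s, the subtraction above (with the nanoseconds
 * borrow applied) leaves a basetime of 1699998999.850000000s.  Any later
 * uptime u then yields the realtime u + basetime, so the gd_* uptime
 * fields never have to move.
 */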
/*
 * Each cpu has its own hardclock, but we only increment ticks and softticks
 * on cpu #0.
 *
 * NOTE! systimer! the MP lock might not be held here.  We can only safely
 * manipulate objects owned by the current cpu.
 */
static void
hardclock(systimer_t info, int in_ipi, struct intrframe *frame)
{
	sysclock_t cputicks;
	struct proc *p;
	struct globaldata *gd = mycpu;

	if ((gd->gd_reqflags & RQF_IPIQ) == 0 && lwkt_need_ipiq_process(gd)) {
		/* Defer to doreti on passive IPIQ processing */
		need_ipiq();
	}

	/*
	 * We update the compensation base to calculate fine-grained time
	 * from the sys_cputimer on a per-cpu basis in order to avoid
	 * having to mess around with locks.  sys_cputimer is assumed to
	 * be consistent across all cpus.  CPU N copies the base state from
	 * CPU 0 using the same FIFO trick that we use for basetime (so we
	 * don't catch a CPU 0 update in the middle).
	 *
	 * Note that we never allow info->time (aka gd->gd_hardclock.time)
	 * to reverse index gd_cpuclock_base, but that it is possible for
	 * it to temporarily get behind in the seconds if something in the
	 * system locks interrupts for a long period of time.  Since periodic
	 * timers count events, though, everything should resynch again
	 * immediately.
	 */
	if (gd->gd_cpuid == 0) {
		int ni;

		cputicks = info->time - gd->gd_cpuclock_base;
		if (cputicks >= sys_cputimer->freq) {
			cputicks /= sys_cputimer->freq;
			if (cputicks != 0 && cputicks != 1)
				kprintf("Warning: hardclock missed > 1 sec\n");
			gd->gd_time_seconds += cputicks;
			gd->gd_cpuclock_base += sys_cputimer->freq * cputicks;
			/* uncorrected monotonic 1-sec gran */
			time_uptime += cputicks;
		}
		ni = (basetime_index + 1) & BASETIME_ARYMASK;
		hardtime[ni].time_second = gd->gd_time_seconds;
		hardtime[ni].cpuclock_base = gd->gd_cpuclock_base;
	} else {
		int ni;

		ni = basetime_index;
		cpu_lfence();
		gd->gd_time_seconds = hardtime[ni].time_second;
		gd->gd_cpuclock_base = hardtime[ni].cpuclock_base;
	}
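	/*
	 * Worked example (illustrative numbers): with a 10 MHz cputimer
	 * (freq = 10,000,000) and interrupts blocked for ~2.5 seconds, the
	 * delta above would be ~25,000,000 counts.  cputicks becomes 2, so
	 * 2 whole seconds are folded into gd_time_seconds and 20,000,000
	 * counts into gd_cpuclock_base; the remaining ~5,000,000 counts
	 * stay in the fine-grained delta, and the "missed > 1 sec" warning
	 * fires.
	 */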
	/*
	 * The system-wide ticks counter and NTP related timedelta/tickdelta
	 * adjustments only occur on cpu #0.  NTP adjustments are accomplished
	 * by updating basetime.
	 */
	if (gd->gd_cpuid == 0) {
		struct timespec *nbt;
		struct timespec nts;
		int leap;
		int ni;

		++ticks;

#if 0
		if (tco->tc_poll_pps)
			tco->tc_poll_pps(tco);
#endif

		/*
		 * Calculate the new basetime index.  We are in a critical
		 * section on cpu #0 and can safely play with basetime_index.
		 * Start with the current basetime and then make adjustments.
		 */
		ni = (basetime_index + 1) & BASETIME_ARYMASK;
		nbt = &basetime[ni];
		*nbt = basetime[basetime_index];

		/*
		 * ntp adjustments only occur on cpu 0 and are protected by
		 * ntp_spin.  This spinlock virtually never conflicts.
		 */
		spin_lock(&ntp_spin);

		/*
		 * Apply adjtime corrections.  (adjtime() API)
		 *
		 * adjtime() only runs on cpu #0 so our critical section is
		 * sufficient to access these variables.
		 */
		if (ntp_delta != 0) {
			nbt->tv_nsec += ntp_tick_delta;
			ntp_delta -= ntp_tick_delta;
			if ((ntp_delta > 0 && ntp_delta < ntp_tick_delta) ||
			    (ntp_delta < 0 && ntp_delta > ntp_tick_delta)) {
				ntp_tick_delta = ntp_delta;
			}
		}

		/*
		 * Apply permanent frequency corrections.  (sysctl API)
		 */
		if (ntp_tick_permanent != 0) {
			ntp_tick_acc += ntp_tick_permanent;
			if (ntp_tick_acc >= (1LL << 32)) {
				nbt->tv_nsec += ntp_tick_acc >> 32;
				ntp_tick_acc -= (ntp_tick_acc >> 32) << 32;
			} else if (ntp_tick_acc <= -(1LL << 32)) {
				/*
				 * Negate ntp_tick_acc to avoid shifting the
				 * sign bit.
				 */
				nbt->tv_nsec -= (-ntp_tick_acc) >> 32;
				ntp_tick_acc += ((-ntp_tick_acc) >> 32) << 32;
			}
		}
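		/*
		 * Worked example (illustrative numbers): ntp_tick_permanent
		 * holds nanoseconds-per-tick in signed 32.32 fixed point,
		 * so +1.5 nsec/tick is stored as 3 << 31.  Starting from an
		 * empty accumulator, tick 1 reaches 1.5 (fold 1 nsec, keep
		 * 0.5), tick 2 reaches 2.0 (fold 2 nsec, keep 0.0), and so
		 * on, applying exactly 3 nsec every 2 ticks with no drift.
		 */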
		if (nbt->tv_nsec >= 1000000000) {
			nbt->tv_sec++;
			nbt->tv_nsec -= 1000000000;
		} else if (nbt->tv_nsec < 0) {
			nbt->tv_sec--;
			nbt->tv_nsec += 1000000000;
		}

		/*
		 * Another per-tick compensation.  (for ntp_adjtime() API)
		 */
		if (nsec_adj != 0) {
			nsec_acc += nsec_adj;
			if (nsec_acc >= 0x100000000LL) {
				nbt->tv_nsec += nsec_acc >> 32;
				nsec_acc = (nsec_acc & 0xFFFFFFFFLL);
			} else if (nsec_acc <= -0x100000000LL) {
				nbt->tv_nsec -= -nsec_acc >> 32;
				nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL);
			}
			if (nbt->tv_nsec >= 1000000000) {
				nbt->tv_nsec -= 1000000000;
				++nbt->tv_sec;
			} else if (nbt->tv_nsec < 0) {
				nbt->tv_nsec += 1000000000;
				--nbt->tv_sec;
			}
		}
		spin_unlock(&ntp_spin);

		/************************************************************
		 *			LEAP SECOND CORRECTION		    *
		 ************************************************************
		 *
		 * Taking into account all the corrections made above, figure
		 * out the new real time.  If the seconds field has changed
		 * then apply any pending leap-second corrections.
		 */
		getnanotime_nbt(nbt, &nts);

		if (time_second != nts.tv_sec) {
			/*
			 * Apply leap second (sysctl API).  Adjust nts for
			 * changes so we do not have to call getnanotime_nbt
			 * again.
			 */
			if (ntp_leap_second) {
				if (ntp_leap_second == nts.tv_sec) {
					if (ntp_leap_insert) {
						nbt->tv_sec++;
						nts.tv_sec++;
					} else {
						nbt->tv_sec--;
						nts.tv_sec--;
					}
					ntp_leap_second--;
				}
			}

			/*
			 * Apply leap second (ntp_adjtime() API), calculate
			 * a new nsec_adj field.  ntp_update_second() returns
			 * nsec_adj as a per-second value but we need it as
			 * a per-tick value.
			 */
			leap = ntp_update_second(time_second, &nsec_adj);
			nsec_adj /= hz;
			nbt->tv_sec += leap;
			nts.tv_sec += leap;

			/*
			 * Update the time_second 'approximate time' global.
			 */
			time_second = nts.tv_sec;
		}

		/*
		 * Finally, our new basetime is ready to go live!
		 */
		cpu_sfence();
		basetime_index = ni;

		/*
		 * Update kpmap on each tick.  TS updates are integrated with
		 * fences and upticks allowing userland to read the data
		 * deterministically.
		 */
		if (kpmap) {
			int w;

			w = (kpmap->upticks + 1) & 1;
			getnanouptime(&kpmap->ts_uptime[w]);
			getnanotime(&kpmap->ts_realtime[w]);
			cpu_sfence();
			++kpmap->upticks;
			cpu_sfence();
		}
	}

	/*
	 * lwkt thread scheduler fair queueing
	 */
	lwkt_schedulerclock(curthread);

	/*
	 * softticks are handled for all cpus
	 */
	hardclock_softtick(gd);

	/*
	 * Rollup accumulated vmstats, copy-back for critical path checks.
	 */
	vmstats_rollup_cpu(gd);
	mycpu->gd_vmstats = vmstats;

	/*
	 * ITimer handling is per-tick, per-cpu.
	 *
	 * We must acquire the per-process token in order for ksignal()
	 * to be non-blocking.  For the moment this requires an AST fault,
	 * the ksignal() cannot be safely issued from this hard interrupt.
	 *
	 * XXX Even the trytoken here isn't right, and itimer operation in
	 *     a multi threaded environment is going to be weird at the
	 *     very least.
	 */
	if ((p = curproc) != NULL && lwkt_trytoken(&p->p_token)) {
		crit_enter_hard();
		if (p->p_upmap)
			++p->p_upmap->runticks;

		if (frame && CLKF_USERMODE(frame) &&
		    timevalisset(&p->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&p->p_timer[ITIMER_VIRTUAL], ustick) == 0) {
			p->p_flags |= P_SIGVTALRM;
			need_user_resched();
		}
		if (timevalisset(&p->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&p->p_timer[ITIMER_PROF], ustick) == 0) {
			p->p_flags |= P_SIGPROF;
			need_user_resched();
		}
		crit_exit_hard();
		lwkt_reltoken(&p->p_token);
	}
	setdelayed();
}

/*
 * The statistics clock typically runs at a 125Hz rate, and is intended
 * to be frequency offset from the hardclock (typ 100Hz).  It is per-cpu.
 *
 * NOTE! systimer! the MP lock might not be held here.  We can only safely
 * manipulate objects owned by the current cpu.
 *
 * The stats clock is responsible for grabbing a profiling sample.
 * Most of the statistics are only used by user-level statistics programs.
 * The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and
 * p->p_estcpu.
 *
 * Like the other clocks, the stat clock is called from what is effectively
 * a fast interrupt, so the context should be the thread/process that got
 * interrupted.
 */
static void
statclock(systimer_t info, int in_ipi, struct intrframe *frame)
{
#ifdef GPROF
	struct gmonparam *g;
	int i;
#endif
	globaldata_t gd = mycpu;
	thread_t td;
	struct proc *p;
	int bump;
	sysclock_t cv;
	sysclock_t scv;

	/*
	 * How big was our timeslice relative to the last time?  Calculate
	 * in microseconds.
	 *
	 * NOTE: Use of microuptime() is typically MPSAFE, but usually not
	 *	 during early boot.  Just use the systimer count to be nice
	 *	 to e.g. qemu.  The systimer has a better chance of being
	 *	 MPSAFE at early boot.
	 */
	cv = sys_cputimer->count();
	scv = gd->statint.gd_statcv;
	if (scv == 0) {
		bump = 1;
	} else {
		bump = (sys_cputimer->freq64_usec * (cv - scv)) >> 32;
		if (bump < 0)
			bump = 0;
		if (bump > 1000000)
			bump = 1000000;
	}
	gd->statint.gd_statcv = cv;
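	/*
	 * Worked example (illustrative numbers): freq64_usec is
	 * (1000000 << 32) / freq, i.e. microseconds-per-count in 32.32
	 * fixed point.  With a 10 MHz cputimer, a delta of 50,000 counts
	 * gives bump = (50000 * (2^32 / 10)) >> 32 = 5000 microseconds.
	 */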
#if 0
	stv = &gd->gd_stattv;
	if (stv->tv_sec == 0) {
		bump = 1;
	} else {
		bump = tv.tv_usec - stv->tv_usec +
		       (tv.tv_sec - stv->tv_sec) * 1000000;
		if (bump < 0)
			bump = 0;
		if (bump > 1000000)
			bump = 1000000;
	}
	*stv = tv;
#endif

	td = curthread;
	p = td->td_proc;

	if (frame && CLKF_USERMODE(frame)) {
		/*
		 * Came from userland, handle user time and deal with
		 * possible process.
		 */
		if (p && (p->p_flags & P_PROFIL))
			addupc_intr(p, CLKF_PC(frame), 1);
		td->td_uticks += bump;

		/*
		 * Charge the time as appropriate
		 */
		if (p && p->p_nice > NZERO)
			cpu_time.cp_nice += bump;
		else
			cpu_time.cp_user += bump;
	} else {
		int intr_nest = gd->gd_intr_nesting_level;

		if (in_ipi) {
			/*
			 * IPI processing code will bump gd_intr_nesting_level
			 * up by one, which breaks following CLKF_INTR testing,
			 * so we subtract it by one here.
			 */
			--intr_nest;
		}
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON && frame) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif

#define IS_INTR_RUNNING	((frame && CLKF_INTR(intr_nest)) || CLKF_INTR_TD(td))

		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 *
		 * XXX assume system if frame is NULL.  A NULL frame
		 * can occur if ipi processing is done from a crit_exit().
		 */
		if (IS_INTR_RUNNING) {
			/*
			 * If we interrupted an interrupt thread, well,
			 * count it as interrupt time.
			 */
			td->td_iticks += bump;
#ifdef DEBUG_PCTRACK
			if (frame)
				do_pctrack(frame, PCTRACK_INT);
#endif
			cpu_time.cp_intr += bump;
		} else if (gd->gd_flags & GDF_VIRTUSER) {
			/*
			 * The vkernel doesn't do a good job providing trap
			 * frames that we can test.  If the GDF_VIRTUSER
			 * flag is set we probably interrupted user mode.
			 *
			 * We also use this flag on the host when entering
			 * VMM mode.
			 */
			td->td_uticks += bump;

			/*
			 * Charge the time as appropriate
			 */
			if (p && p->p_nice > NZERO)
				cpu_time.cp_nice += bump;
			else
				cpu_time.cp_user += bump;
		} else {
			td->td_sticks += bump;
			if (td == &gd->gd_idlethread) {
				/*
				 * Token contention can cause us to mis-count
				 * a contended cpu as idle; the reqflags test
				 * below compensates, but it doesn't work
				 * properly for VKERNELs so we only apply it
				 * on a real kernel.
				 */
#ifdef _KERNEL_VIRTUAL
				cpu_time.cp_idle += bump;
#else
				if (mycpu->gd_reqflags & RQF_IDLECHECK_WK_MASK)
					cpu_time.cp_sys += bump;
				else
					cpu_time.cp_idle += bump;
#endif
			} else {
				/*
				 * System thread was running.
				 */
#ifdef DEBUG_PCTRACK
				if (frame)
					do_pctrack(frame, PCTRACK_SYS);
#endif
				cpu_time.cp_sys += bump;
			}
		}

#undef IS_INTR_RUNNING
	}
}

#ifdef DEBUG_PCTRACK
/*
 * Sample the PC when in the kernel or in an interrupt.  User code can
 * retrieve the information and generate a histogram or other output.
 */

static void
do_pctrack(struct intrframe *frame, int which)
{
	struct kinfo_pctrack *pctrack;

	pctrack = &cputime_pctrack[mycpu->gd_cpuid][which];
	pctrack->pc_array[pctrack->pc_index & PCTRACK_ARYMASK] =
		(void *)CLKF_PC(frame);
	++pctrack->pc_index;
}

static int
sysctl_pctrack(SYSCTL_HANDLER_ARGS)
{
	struct kinfo_pcheader head;
	int error;
	int cpu;
	int ntrack;

	head.pc_ntrack = PCTRACK_SIZE;
	head.pc_arysize = PCTRACK_ARYSIZE;

	if ((error = SYSCTL_OUT(req, &head, sizeof(head))) != 0)
		return (error);

	for (cpu = 0; cpu < ncpus; ++cpu) {
		for (ntrack = 0; ntrack < PCTRACK_SIZE; ++ntrack) {
			error = SYSCTL_OUT(req, &cputime_pctrack[cpu][ntrack],
					   sizeof(struct kinfo_pctrack));
			if (error)
				break;
		}
		if (error)
			break;
	}
	return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, pctrack, (CTLTYPE_OPAQUE|CTLFLAG_RD), 0, 0,
	sysctl_pctrack, "S,kinfo_pcheader", "CPU PC tracking");

#endif

/*
 * The scheduler clock typically runs at a 50Hz rate.  NOTE! systimer,
 * the MP lock might not be held.  We can safely manipulate parts of curproc
 * but that's about it.
 *
 * Each cpu has its own scheduler clock.
 */
static void
schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame)
{
	struct lwp *lp;
	struct rusage *ru;
	struct vmspace *vm;
	long rss;

	if ((lp = lwkt_preempted_proc()) != NULL) {
		/*
		 * Account for cpu time used and hit the scheduler.  Note
		 * that this call MUST BE MP SAFE, and the BGL IS NOT HELD
		 * HERE.
		 */
		++lp->lwp_cpticks;
		usched_schedulerclock(lp, info->periodic, info->time);
	} else {
		usched_schedulerclock(NULL, info->periodic, info->time);
	}
	if ((lp = curthread->td_lwp) != NULL) {
		/*
		 * Update resource usage integrals and maximums.
		 */
		if ((ru = &lp->lwp_proc->p_ru) &&
		    (vm = lp->lwp_proc->p_vmspace) != NULL) {
			ru->ru_ixrss += pgtok(vm->vm_tsize);
			ru->ru_idrss += pgtok(vm->vm_dsize);
			ru->ru_isrss += pgtok(vm->vm_ssize);
			if (lwkt_trytoken(&vm->vm_map.token)) {
				rss = pgtok(vmspace_resident_count(vm));
				if (ru->ru_maxrss < rss)
					ru->ru_maxrss = rss;
				lwkt_reltoken(&vm->vm_map.token);
			}
		}
	}
	/* Increment the global sched_ticks */
	if (mycpu->gd_cpuid == 0)
		++sched_ticks;
}

/*
 * Compute number of ticks for the specified amount of time.  The
 * return value is intended to be used in a clock interrupt timed
 * operation and guaranteed to meet or exceed the requested time.
 * If the representation overflows, return INT_MAX.  The minimum return
 * value is 1 tick and the function will average the calculation up.
 * If any value greater than 0 microseconds is supplied, a value
 * of at least 2 will be returned to ensure that a near-term clock
 * interrupt does not cause the timeout to occur (degenerately) early.
 *
 * Note that limit checks must take into account microseconds, which is
 * done simply by using the smaller signed long maximum instead of
 * the unsigned long maximum.
 *
 * If ints have 32 bits, then the maximum value for any timeout in
 * 10ms ticks is 248 days.
 */
int
tvtohz_high(struct timeval *tv)
{
	int ticks;
	long sec, usec;

	sec = tv->tv_sec;
	usec = tv->tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (usec > 0) {
			sec++;
			usec -= 1000000;
		}
		kprintf("tvtohz_high: negative time difference "
			"%ld sec %ld usec\n",
			sec, usec);
#endif
		ticks = 1;
	} else if (sec <= INT_MAX / hz) {
		ticks = (int)(sec * hz +
			      ((u_long)usec + (ustick - 1)) / ustick) + 1;
	} else {
		ticks = INT_MAX;
	}
	return (ticks);
}

int
tstohz_high(struct timespec *ts)
{
	int ticks;
	long sec, nsec;

	sec = ts->tv_sec;
	nsec = ts->tv_nsec;
	if (nsec < 0) {
		sec--;
		nsec += 1000000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (nsec > 0) {
			sec++;
			nsec -= 1000000000;
		}
		kprintf("tstohz_high: negative time difference "
			"%ld sec %ld nsec\n",
			sec, nsec);
#endif
		ticks = 1;
	} else if (sec <= INT_MAX / hz) {
		ticks = (int)(sec * hz +
			      ((u_long)nsec + (nstick - 1)) / nstick) + 1;
	} else {
		ticks = INT_MAX;
	}
	return (ticks);
}


/*
 * Compute number of ticks for the specified amount of time, erroring on
 * the side of it being too low to ensure that sleeping the returned number
 * of ticks will not result in a late return.
 *
 * The supplied timeval may not be negative and should be normalized.  A
 * return value of 0 is possible if the timeval converts to less than
 * 1 tick.
 *
 * If ints have 32 bits, then the maximum value for any timeout in
 * 10ms ticks is 248 days.
 */
int
tvtohz_low(struct timeval *tv)
{
	int ticks;
	long sec;

	sec = tv->tv_sec;
	if (sec <= INT_MAX / hz)
		ticks = (int)(sec * hz + (u_long)tv->tv_usec / ustick);
	else
		ticks = INT_MAX;
	return (ticks);
}

int
tstohz_low(struct timespec *ts)
{
	int ticks;
	long sec;

	sec = ts->tv_sec;
	if (sec <= INT_MAX / hz)
		ticks = (int)(sec * hz + (u_long)ts->tv_nsec / nstick);
	else
		ticks = INT_MAX;
	return (ticks);
}
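/*
 * Worked example (illustrative numbers): with hz = 100 (ustick = 10000),
 * a 10ms timeval converts as follows.  tvtohz_high() rounds the
 * microseconds up and adds 1, returning 2 ticks, so a tick boundary
 * arriving immediately after the call cannot expire the timeout early.
 * tvtohz_low() truncates and returns 1 tick, which may time out up to
 * one tick early.
 */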
/*
 * Start profiling on a process.
 *
 * Caller must hold p->p_token.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(struct proc *p)
{
	if ((p->p_flags & P_PROFIL) == 0) {
		p->p_flags |= P_PROFIL;
#if 0	/* XXX */
		if (++profprocs == 1 && stathz != 0) {
			crit_enter();
			psdiv = psratio;
			setstatclockrate(profhz);
			crit_exit();
		}
#endif
	}
}

/*
 * Stop profiling on a process.
 *
 * caller must hold p->p_token
 */
void
stopprofclock(struct proc *p)
{
	if (p->p_flags & P_PROFIL) {
		p->p_flags &= ~P_PROFIL;
#if 0	/* XXX */
		if (--profprocs == 0 && stathz != 0) {
			crit_enter();
			psdiv = 1;
			setstatclockrate(stathz);
			crit_exit();
		}
#endif
	}
}

/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
{
	struct kinfo_clockinfo clkinfo;
	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.ci_hz = hz;
	clkinfo.ci_tick = ustick;
	clkinfo.ci_tickadj = ntp_default_tick_delta / 1000;
	clkinfo.ci_profhz = profhz;
	clkinfo.ci_stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
	0, 0, sysctl_kern_clockrate, "S,clockinfo", "");

/*
 * We have eight functions for looking at the clock, four for
 * microseconds and four for nanoseconds.  For each there is a fast
 * but less precise version "get{nano|micro}[up]time" which will
 * return a time which is up to 1/HZ previous to the call, whereas
 * the raw version "{nano|micro}[up]time" will return a timestamp
 * which is as precise as possible.  The "up" variants return the
 * time relative to system boot, these are well suited for time
 * interval measurements.
 *
 * Each cpu independently maintains the current time of day, so all
 * we need to do to protect ourselves from changes is to do a loop
 * check on the seconds field changing out from under us.
 *
 * The system timer maintains a 32 bit count and due to various issues
 * it is possible for the calculated delta to occasionally exceed
 * sys_cputimer->freq.  If this occurs the sys_cputimer->freq64_nsec
 * multiplication can easily overflow, so we deal with the case.  For
 * uniformity we deal with the case in the usec case too.
 *
 * All the [get][micro,nano][time,uptime]() routines are MPSAFE.
 */
void
getmicrouptime(struct timeval *tvp)
{
	struct globaldata *gd = mycpu;
	sysclock_t delta;

	do {
		tvp->tv_sec = gd->gd_time_seconds;
		delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	} while (tvp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tvp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	if (tvp->tv_usec >= 1000000) {
		tvp->tv_usec -= 1000000;
		++tvp->tv_sec;
	}
}

void
getnanouptime(struct timespec *tsp)
{
	struct globaldata *gd = mycpu;
	sysclock_t delta;

	do {
		tsp->tv_sec = gd->gd_time_seconds;
		delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	} while (tsp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tsp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
}

void
microuptime(struct timeval *tvp)
{
	struct globaldata *gd = mycpu;
	sysclock_t delta;

	do {
		tvp->tv_sec = gd->gd_time_seconds;
		delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	} while (tvp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tvp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
}

void
nanouptime(struct timespec *tsp)
{
	struct globaldata *gd = mycpu;
	sysclock_t delta;

	do {
		tsp->tv_sec = gd->gd_time_seconds;
		delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	} while (tsp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tsp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
}
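#if 0
/*
 * Illustrative sketch (not compiled): choosing between the fast and the
 * precise uptime variants described above.  getnanouptime() only reads
 * state cached by the last hardclock and may lag by up to 1/hz;
 * nanouptime() queries the cputimer hardware for full precision at
 * somewhat higher cost.
 */
static void
uptime_precision_sketch(void)
{
	struct timespec coarse, fine;

	getnanouptime(&coarse);	/* cheap, tick-granular timestamp */
	nanouptime(&fine);	/* reads sys_cputimer->count() */
	/* fine is >= coarse, ahead by at most about one tick */
}
#endif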
/*
 * realtime routines
 */
void
getmicrotime(struct timeval *tvp)
{
	struct globaldata *gd = mycpu;
	struct timespec *bt;
	sysclock_t delta;

	do {
		tvp->tv_sec = gd->gd_time_seconds;
		delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	} while (tvp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tvp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;

	bt = &basetime[basetime_index];
	cpu_lfence();
	tvp->tv_sec += bt->tv_sec;
	tvp->tv_usec += bt->tv_nsec / 1000;
	while (tvp->tv_usec >= 1000000) {
		tvp->tv_usec -= 1000000;
		++tvp->tv_sec;
	}
}

void
getnanotime(struct timespec *tsp)
{
	struct globaldata *gd = mycpu;
	struct timespec *bt;
	sysclock_t delta;

	do {
		tsp->tv_sec = gd->gd_time_seconds;
		delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	} while (tsp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tsp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;

	bt = &basetime[basetime_index];
	cpu_lfence();
	tsp->tv_sec += bt->tv_sec;
	tsp->tv_nsec += bt->tv_nsec;
	while (tsp->tv_nsec >= 1000000000) {
		tsp->tv_nsec -= 1000000000;
		++tsp->tv_sec;
	}
}

static void
getnanotime_nbt(struct timespec *nbt, struct timespec *tsp)
{
	struct globaldata *gd = mycpu;
	sysclock_t delta;

	do {
		tsp->tv_sec = gd->gd_time_seconds;
		delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	} while (tsp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tsp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;

	tsp->tv_sec += nbt->tv_sec;
	tsp->tv_nsec += nbt->tv_nsec;
	while (tsp->tv_nsec >= 1000000000) {
		tsp->tv_nsec -= 1000000000;
		++tsp->tv_sec;
	}
}


void
microtime(struct timeval *tvp)
{
	struct globaldata *gd = mycpu;
	struct timespec *bt;
	sysclock_t delta;

	do {
		tvp->tv_sec = gd->gd_time_seconds;
		delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	} while (tvp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tvp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;

	bt = &basetime[basetime_index];
	cpu_lfence();
	tvp->tv_sec += bt->tv_sec;
	tvp->tv_usec += bt->tv_nsec / 1000;
	while (tvp->tv_usec >= 1000000) {
		tvp->tv_usec -= 1000000;
		++tvp->tv_sec;
	}
}
void
nanotime(struct timespec *tsp)
{
	struct globaldata *gd = mycpu;
	struct timespec *bt;
	sysclock_t delta;

	do {
		tsp->tv_sec = gd->gd_time_seconds;
		delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	} while (tsp->tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		tsp->tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;

	bt = &basetime[basetime_index];
	cpu_lfence();
	tsp->tv_sec += bt->tv_sec;
	tsp->tv_nsec += bt->tv_nsec;
	while (tsp->tv_nsec >= 1000000000) {
		tsp->tv_nsec -= 1000000000;
		++tsp->tv_sec;
	}
}

/*
 * Get an approximate time_t.  It does not have to be accurate.  This
 * function is called only from KTR and can be called with the system in
 * any state so do not use a critical section or other complex operation
 * here.
 *
 * NOTE: This is not exactly synchronized with real time.  To do that we
 *	 would have to do what microtime does and check for a nanoseconds
 *	 overflow.
 */
time_t
get_approximate_time_t(void)
{
	struct globaldata *gd = mycpu;
	struct timespec *bt;

	bt = &basetime[basetime_index];
	return(gd->gd_time_seconds + bt->tv_sec);
}

int
pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
{
	pps_params_t *app;
	struct pps_fetch_args *fapi;
#ifdef PPS_SYNC
	struct pps_kcbind_args *kapi;
#endif

	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		fapi = (struct pps_fetch_args *)data;
		if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec)
			return (EOPNOTSUPP);
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		fapi->pps_info_buf = pps->ppsinfo;
		return (0);
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		kapi = (struct pps_kcbind_args *)data;
		/* XXX Only root should be able to do this */
		if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (kapi->kernel_consumer != PPS_KC_HARDPPS)
			return (EINVAL);
		if (kapi->edge & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = kapi->edge;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (ENOTTY);
	}
}

void
pps_init(struct pps_state *pps)
{
	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}

void
pps_event(struct pps_state *pps, sysclock_t count, int event)
{
	struct globaldata *gd;
	struct timespec *tsp;
	struct timespec *osp;
	struct timespec *bt;
	struct timespec ts;
	sysclock_t *pcount;
#ifdef PPS_SYNC
	sysclock_t tcount;
#endif
	sysclock_t delta;
	pps_seq_t *pseq;
	int foff;
#ifdef PPS_SYNC
	int fhard;
#endif
	int ni;

	gd = mycpu;

	/* Things would be easier with arrays... */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/* Nothing really happened */
	if (*pcount == count)
		return;

	*pcount = count;

	do {
		ts.tv_sec = gd->gd_time_seconds;
		delta = count - gd->gd_cpuclock_base;
	} while (ts.tv_sec != gd->gd_time_seconds);

	if (delta >= sys_cputimer->freq) {
		ts.tv_sec += delta / sys_cputimer->freq;
		delta %= sys_cputimer->freq;
	}
	ts.tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	ni = basetime_index;
	cpu_lfence();
	bt = &basetime[ni];
	ts.tv_sec += bt->tv_sec;
	ts.tv_nsec += bt->tv_nsec;
	while (ts.tv_nsec >= 1000000000) {
		ts.tv_nsec -= 1000000000;
		++ts.tv_sec;
	}

	(*pseq)++;
	*tsp = ts;

	if (foff) {
		timespecadd(tsp, osp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}
#ifdef PPS_SYNC
	if (fhard) {
		/* magic, at its best... */
		tcount = count - pps->ppscount[2];
		pps->ppscount[2] = count;
		if (tcount >= sys_cputimer->freq) {
			delta = (1000000000 * (tcount / sys_cputimer->freq) +
				 sys_cputimer->freq64_nsec *
				 (tcount % sys_cputimer->freq)) >> 32;
		} else {
			delta = (sys_cputimer->freq64_nsec * tcount) >> 32;
		}
		hardpps(tsp, delta);
	}
#endif
}

/*
 * Return the tsc target value for a delay of (ns).
 *
 * Returns -1 if the TSC is not supported.
 */
int64_t
tsc_get_target(int ns)
{
#if defined(_RDTSC_SUPPORTED_)
	if (cpu_feature & CPUID_TSC) {
		return (rdtsc() + tsc_frequency * ns / (int64_t)1000000000);
	}
#endif
	return(-1);
}

/*
 * Compare the tsc against the passed target
 *
 * Returns +1 if the target has been reached
 * Returns  0 if the target has not yet been reached
 * Returns -1 if the TSC is not supported.
 *
 * Typical use:		while (tsc_test_target(target) == 0) { ...poll... }
 */
int
tsc_test_target(int64_t target)
{
#if defined(_RDTSC_SUPPORTED_)
	if (cpu_feature & CPUID_TSC) {
		if ((int64_t)(target - rdtsc()) <= 0)
			return(1);
		return(0);
	}
#endif
	return(-1);
}

/*
 * Delay the specified number of nanoseconds using the tsc.  This function
 * returns immediately if the TSC is not supported.  At least one cpu_pause()
 * will be issued.
 */
void
tsc_delay(int ns)
{
	int64_t clk;

	clk = tsc_get_target(ns);
	cpu_pause();
	while (tsc_test_target(clk) == 0)
		cpu_pause();
}
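#if 0
/*
 * Illustrative sketch (not compiled): bounding a register poll with the
 * tsc helpers above.  The ready-bit register is hypothetical; the target
 * is captured once and the loop then spins politely until either the
 * device is ready or roughly 10 microseconds have elapsed.
 */
static int
poll_ready_sketch(volatile uint32_t *reg)
{
	int64_t target = tsc_get_target(10000);	/* 10,000ns budget */

	while ((*reg & 1) == 0) {		/* hypothetical ready bit */
		if (tsc_test_target(target) != 0)
			return (EBUSY);	/* budget exhausted, or no TSC */
		cpu_pause();
	}
	return (0);
}
#endif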