1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 30 * $FreeBSD: src/sys/kern/kern_time.c,v 1.68.2.1 2002/10/01 08:00:41 bde Exp $ 31 */ 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/buf.h> 36 #include <sys/sysmsg.h> 37 #include <sys/resourcevar.h> 38 #include <sys/signalvar.h> 39 #include <sys/kernel.h> 40 #include <sys/sysent.h> 41 #include <sys/proc.h> 42 #include <sys/caps.h> 43 #include <sys/time.h> 44 #include <sys/vnode.h> 45 #include <sys/sysctl.h> 46 #include <sys/kern_syscall.h> 47 #include <sys/upmap.h> 48 #include <vm/vm.h> 49 #include <vm/vm_extern.h> 50 51 #include <sys/msgport2.h> 52 #include <sys/spinlock2.h> 53 #include <sys/thread2.h> 54 55 extern struct spinlock ntp_spin; 56 57 #define CPUCLOCK_BIT 0x80000000 58 #define CPUCLOCK_ID_MASK ~CPUCLOCK_BIT 59 #define CPUCLOCK2LWPID(clock_id) (((clockid_t)(clock_id) >> 32) & CPUCLOCK_ID_MASK) 60 #define CPUCLOCK2PID(clock_id) ((clock_id) & CPUCLOCK_ID_MASK) 61 #define MAKE_CPUCLOCK(pid, lwp_id) ((clockid_t)(lwp_id) << 32 | (pid) | CPUCLOCK_BIT) 62 63 struct timezone tz; 64 65 /* 66 * Time of day and interval timer support. 67 * 68 * These routines provide the kernel entry points to get and set 69 * the time-of-day and per-process interval timers. Subroutines 70 * here provide support for adding and subtracting timeval structures 71 * and decrementing interval timers, optionally reloading the interval 72 * timers when they expire. 73 */ 74 75 static int settime(struct timeval *); 76 static void timevalfix(struct timeval *); 77 static void realitexpire(void *arg); 78 79 static int sysctl_gettimeofday_quick(SYSCTL_HANDLER_ARGS); 80 81 82 /* 83 * Nanosleep tries very hard to sleep for a precisely requested time 84 * interval, down to 1uS. The administrator can impose a minimum delay 85 * and a delay below which we hard-loop instead of initiate a timer 86 * interrupt and sleep. 87 * 88 * For machines under high loads it might be beneficial to increase min_us 89 * to e.g. 1000uS (1ms) so spining processes sleep meaningfully. 90 */ 91 static int nanosleep_min_us = 10; 92 static int nanosleep_hard_us = 100; 93 static int gettimeofday_quick = 0; 94 SYSCTL_INT(_kern, OID_AUTO, nanosleep_min_us, CTLFLAG_RW, 95 &nanosleep_min_us, 0, ""); 96 SYSCTL_INT(_kern, OID_AUTO, nanosleep_hard_us, CTLFLAG_RW, 97 &nanosleep_hard_us, 0, ""); 98 SYSCTL_PROC(_kern, OID_AUTO, gettimeofday_quick, CTLTYPE_INT | CTLFLAG_RW, 99 0, 0, sysctl_gettimeofday_quick, "I", "Quick mode gettimeofday"); 100 101 static struct lock masterclock_lock = LOCK_INITIALIZER("mstrclk", 0, 0); 102 103 static int 104 settime(struct timeval *tv) 105 { 106 struct timeval delta, tv1, tv2; 107 static struct timeval maxtime, laststep; 108 struct timespec ts; 109 int origcpu; 110 111 if ((origcpu = mycpu->gd_cpuid) != 0) 112 lwkt_setcpu_self(globaldata_find(0)); 113 114 crit_enter(); 115 microtime(&tv1); 116 delta = *tv; 117 timevalsub(&delta, &tv1); 118 119 /* 120 * If the system is secure, we do not allow the time to be 121 * set to a value earlier than 1 second less than the highest 122 * time we have yet seen. The worst a miscreant can do in 123 * this circumstance is "freeze" time. He couldn't go 124 * back to the past. 125 * 126 * We similarly do not allow the clock to be stepped more 127 * than one second, nor more than once per second. This allows 128 * a miscreant to make the clock march double-time, but no worse. 129 */ 130 if (securelevel > 1) { 131 if (delta.tv_sec < 0 || delta.tv_usec < 0) { 132 /* 133 * Update maxtime to latest time we've seen. 134 */ 135 if (tv1.tv_sec > maxtime.tv_sec) 136 maxtime = tv1; 137 tv2 = *tv; 138 timevalsub(&tv2, &maxtime); 139 if (tv2.tv_sec < -1) { 140 tv->tv_sec = maxtime.tv_sec - 1; 141 kprintf("Time adjustment clamped to -1 second\n"); 142 } 143 } else { 144 if (tv1.tv_sec == laststep.tv_sec) { 145 crit_exit(); 146 return (EPERM); 147 } 148 if (delta.tv_sec > 1) { 149 tv->tv_sec = tv1.tv_sec + 1; 150 kprintf("Time adjustment clamped to +1 second\n"); 151 } 152 laststep = *tv; 153 } 154 } 155 156 ts.tv_sec = tv->tv_sec; 157 ts.tv_nsec = tv->tv_usec * 1000; 158 set_timeofday(&ts); 159 crit_exit(); 160 161 if (origcpu != 0) 162 lwkt_setcpu_self(globaldata_find(origcpu)); 163 164 resettodr(); 165 return (0); 166 } 167 168 static void 169 get_process_cputime(struct proc *p, struct timespec *ats) 170 { 171 struct rusage ru; 172 173 lwkt_gettoken(&p->p_token); 174 calcru_proc(p, &ru); 175 lwkt_reltoken(&p->p_token); 176 timevaladd(&ru.ru_utime, &ru.ru_stime); 177 TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats); 178 } 179 180 static void 181 get_process_usertime(struct proc *p, struct timespec *ats) 182 { 183 struct rusage ru; 184 185 lwkt_gettoken(&p->p_token); 186 calcru_proc(p, &ru); 187 lwkt_reltoken(&p->p_token); 188 TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats); 189 } 190 191 static void 192 get_thread_cputime(struct thread *td, struct timespec *ats) 193 { 194 struct timeval sys, user; 195 196 calcru(td->td_lwp, &user, &sys); 197 timevaladd(&user, &sys); 198 TIMEVAL_TO_TIMESPEC(&user, ats); 199 } 200 201 /* 202 * MPSAFE 203 */ 204 int 205 kern_clock_gettime(clockid_t clock_id, struct timespec *ats) 206 { 207 struct proc *p; 208 struct lwp *lp; 209 lwpid_t lwp_id; 210 211 p = curproc; 212 switch(clock_id) { 213 case CLOCK_REALTIME: 214 case CLOCK_REALTIME_PRECISE: 215 nanotime(ats); 216 break; 217 case CLOCK_REALTIME_FAST: 218 getnanotime(ats); 219 break; 220 case CLOCK_MONOTONIC: 221 case CLOCK_MONOTONIC_PRECISE: 222 case CLOCK_UPTIME: 223 case CLOCK_UPTIME_PRECISE: 224 nanouptime(ats); 225 break; 226 case CLOCK_MONOTONIC_FAST: 227 case CLOCK_UPTIME_FAST: 228 getnanouptime(ats); 229 break; 230 case CLOCK_VIRTUAL: 231 get_process_usertime(p, ats); 232 break; 233 case CLOCK_PROF: 234 case CLOCK_PROCESS_CPUTIME_ID: 235 get_process_cputime(p, ats); 236 break; 237 case CLOCK_SECOND: 238 ats->tv_sec = time_second; 239 ats->tv_nsec = 0; 240 break; 241 case CLOCK_THREAD_CPUTIME_ID: 242 get_thread_cputime(curthread, ats); 243 break; 244 default: 245 if ((clock_id & CPUCLOCK_BIT) == 0) 246 return (EINVAL); 247 if ((p = pfind(CPUCLOCK2PID(clock_id))) == NULL) 248 return (EINVAL); 249 lwp_id = CPUCLOCK2LWPID(clock_id); 250 if (lwp_id == 0) { 251 get_process_cputime(p, ats); 252 } else { 253 lwkt_gettoken(&p->p_token); 254 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id); 255 if (lp == NULL) { 256 lwkt_reltoken(&p->p_token); 257 PRELE(p); 258 return (EINVAL); 259 } 260 get_thread_cputime(lp->lwp_thread, ats); 261 lwkt_reltoken(&p->p_token); 262 } 263 PRELE(p); 264 } 265 return (0); 266 } 267 268 /* 269 * MPSAFE 270 */ 271 int 272 sys_clock_gettime(struct sysmsg *sysmsg, const struct clock_gettime_args *uap) 273 { 274 struct timespec ats; 275 int error; 276 277 error = kern_clock_gettime(uap->clock_id, &ats); 278 if (error == 0) 279 error = copyout(&ats, uap->tp, sizeof(ats)); 280 281 return (error); 282 } 283 284 int 285 kern_clock_settime(clockid_t clock_id, struct timespec *ats) 286 { 287 struct timeval atv; 288 int error; 289 290 if ((error = caps_priv_check_self(SYSCAP_NOSETTIME)) != 0) 291 return (error); 292 if (clock_id != CLOCK_REALTIME) 293 return (EINVAL); 294 if (ats->tv_sec < 0 || ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000) 295 return (EINVAL); 296 297 lockmgr(&masterclock_lock, LK_EXCLUSIVE); 298 TIMESPEC_TO_TIMEVAL(&atv, ats); 299 error = settime(&atv); 300 lockmgr(&masterclock_lock, LK_RELEASE); 301 302 return (error); 303 } 304 305 /* 306 * MPALMOSTSAFE 307 */ 308 int 309 sys_clock_settime(struct sysmsg *sysmsg, const struct clock_settime_args *uap) 310 { 311 struct timespec ats; 312 int error; 313 314 if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0) 315 return (error); 316 317 error = kern_clock_settime(uap->clock_id, &ats); 318 319 return (error); 320 } 321 322 /* 323 * MPSAFE 324 */ 325 int 326 kern_clock_getres(clockid_t clock_id, struct timespec *ts) 327 { 328 ts->tv_sec = 0; 329 330 switch (clock_id) { 331 case CLOCK_REALTIME: 332 case CLOCK_REALTIME_FAST: 333 case CLOCK_REALTIME_PRECISE: 334 case CLOCK_MONOTONIC: 335 case CLOCK_MONOTONIC_FAST: 336 case CLOCK_MONOTONIC_PRECISE: 337 case CLOCK_UPTIME: 338 case CLOCK_UPTIME_FAST: 339 case CLOCK_UPTIME_PRECISE: 340 /* 341 * Minimum reportable resolution is 1ns. Rounding is 342 * otherwise unimportant. 343 */ 344 ts->tv_nsec = 999999999 / sys_cputimer->freq + 1; 345 break; 346 case CLOCK_VIRTUAL: 347 case CLOCK_PROF: 348 /* Accurately round up here because we can do so cheaply. */ 349 ts->tv_nsec = howmany(1000000000, hz); 350 break; 351 case CLOCK_SECOND: 352 ts->tv_sec = 1; 353 ts->tv_nsec = 0; 354 break; 355 case CLOCK_THREAD_CPUTIME_ID: 356 case CLOCK_PROCESS_CPUTIME_ID: 357 ts->tv_nsec = 1000; 358 break; 359 default: 360 if ((clock_id & CPUCLOCK_BIT) == CPUCLOCK_BIT) { 361 pid_t pid = CPUCLOCK2PID(clock_id); 362 if (pid < 2 || pid > PID_MAX) 363 return (EINVAL); 364 ts->tv_nsec = 1000; 365 } else { 366 return (EINVAL); 367 } 368 } 369 370 return (0); 371 } 372 373 /* 374 * MPSAFE 375 */ 376 int 377 sys_clock_getres(struct sysmsg *sysmsg, const struct clock_getres_args *uap) 378 { 379 int error; 380 struct timespec ts; 381 382 error = kern_clock_getres(uap->clock_id, &ts); 383 if (error == 0) 384 error = copyout(&ts, uap->tp, sizeof(ts)); 385 386 return (error); 387 } 388 389 static int 390 kern_getcpuclockid(pid_t pid, lwpid_t lwp_id, clockid_t *clock_id) 391 { 392 struct proc *p; 393 int error = 0; 394 395 if (pid == 0) { 396 p = curproc; 397 pid = p->p_pid; 398 PHOLD(p); 399 } else { 400 p = pfind(pid); 401 if (p == NULL) 402 return (ESRCH); 403 } 404 /* lwp_id can be 0 when called by clock_getcpuclockid() */ 405 if (lwp_id < 0) { 406 error = EINVAL; 407 goto out; 408 } 409 lwkt_gettoken(&p->p_token); 410 if (lwp_id > 0 && 411 lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id) == NULL) { 412 lwkt_reltoken(&p->p_token); 413 error = ESRCH; 414 goto out; 415 } 416 *clock_id = MAKE_CPUCLOCK(pid, lwp_id); 417 lwkt_reltoken(&p->p_token); 418 out: 419 PRELE(p); 420 return (error); 421 } 422 423 int 424 sys_getcpuclockid(struct sysmsg *sysmsg, const struct getcpuclockid_args *uap) 425 { 426 clockid_t clk_id; 427 int error; 428 429 error = kern_getcpuclockid(uap->pid, uap->lwp_id, &clk_id); 430 if (error == 0) 431 error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t)); 432 433 return (error); 434 } 435 436 /* 437 * clock_nanosleep1() 438 * 439 * This is a general helper function for clock_nanosleep() and 440 * nanosleep() (aka sleep(), aka usleep()). 441 * 442 * If there is less than one tick's worth of time left and 443 * we haven't done a yield, or the remaining microseconds is 444 * ridiculously low, do a yield. This avoids having 445 * to deal with systimer overheads when the system is under 446 * heavy loads. If we have done a yield already then use 447 * a systimer and an uninterruptable thread wait. 448 * 449 * If there is more than a tick's worth of time left, 450 * calculate the baseline ticks and use an interruptable 451 * tsleep, then handle the fine-grained delay on the next 452 * loop. This usually results in two sleeps occuring, a long one 453 * and a short one. 454 * 455 * MPSAFE 456 */ 457 static void 458 ns1_systimer(systimer_t info, int in_ipi __unused, 459 struct intrframe *frame __unused) 460 { 461 lwkt_schedule(info->data); 462 } 463 464 int 465 clock_nanosleep1(clockid_t clock_id, int flags, 466 struct timespec *rqt, struct timespec *rmt) 467 { 468 static int nanowait; 469 struct timespec ts_cur, ts_tgt, ts_int; 470 struct timeval tv; 471 bool is_abs; 472 int error, error2; 473 474 if ((flags & ~(TIMER_RELTIME | TIMER_ABSTIME)) != 0) 475 return (EINVAL); 476 if (rqt->tv_sec < 0 || rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) 477 return (EINVAL); 478 if (rqt->tv_sec == 0 && rqt->tv_nsec == 0) 479 return (0); 480 481 switch (clock_id) { 482 case CLOCK_REALTIME: 483 case CLOCK_REALTIME_FAST: 484 case CLOCK_REALTIME_PRECISE: 485 case CLOCK_SECOND: 486 case CLOCK_MONOTONIC: 487 case CLOCK_MONOTONIC_FAST: 488 case CLOCK_MONOTONIC_PRECISE: 489 case CLOCK_UPTIME: 490 case CLOCK_UPTIME_FAST: 491 case CLOCK_UPTIME_PRECISE: 492 is_abs = (flags & TIMER_ABSTIME) != 0; 493 break; 494 case CLOCK_VIRTUAL: 495 case CLOCK_PROF: 496 case CLOCK_PROCESS_CPUTIME_ID: 497 return (ENOTSUP); 498 case CLOCK_THREAD_CPUTIME_ID: 499 default: 500 return (EINVAL); 501 } 502 503 error = kern_clock_gettime(clock_id, &ts_cur); 504 if (error) 505 return (error); 506 507 if (is_abs) { 508 if (timespeccmp(&ts_cur, rqt, >=)) 509 return (0); 510 511 ts_tgt = *rqt; /* target timestamp */ 512 timespecsub(&ts_tgt, &ts_cur, &ts_int); /* sleep interval */ 513 } else { 514 ts_int = *rqt; /* sleep interval */ 515 timespecadd(&ts_cur, &ts_int, &ts_tgt); /* target timestamp */ 516 } 517 518 for (;;) { 519 int ticks; 520 struct systimer info; 521 thread_t td; 522 523 timespecsub(&ts_tgt, &ts_cur, &ts_int); 524 TIMESPEC_TO_TIMEVAL(&tv, &ts_int); 525 ticks = tv.tv_usec / ustick; /* approximate */ 526 527 if (tv.tv_sec == 0 && ticks == 0) { 528 td = curthread; 529 if (tv.tv_usec > 0 && tv.tv_usec < nanosleep_min_us) 530 tv.tv_usec = nanosleep_min_us; 531 if (tv.tv_usec < nanosleep_hard_us) { 532 lwkt_user_yield(); 533 cpu_pause(); 534 } else { 535 crit_enter_quick(td); 536 systimer_init_oneshot(&info, ns1_systimer, 537 td, tv.tv_usec); 538 lwkt_deschedule_self(td); 539 crit_exit_quick(td); 540 lwkt_switch(); 541 systimer_del(&info); /* make sure it's gone */ 542 } 543 error = iscaught(td->td_lwp); 544 } else if (tv.tv_sec == 0) { 545 error = tsleep(&nanowait, PCATCH, "nanslp", ticks); 546 } else { 547 ticks = tvtohz_low(&tv); /* also handles overflow */ 548 error = tsleep(&nanowait, PCATCH, "nanslp", ticks); 549 } 550 551 error2 = kern_clock_gettime(clock_id, &ts_cur); 552 if (error2) 553 return (error2); 554 555 if (error && error != EWOULDBLOCK) { 556 if (error == ERESTART) 557 error = EINTR; 558 if (rmt != NULL && !is_abs) { 559 timespecsub(&ts_tgt, &ts_cur, &ts_int); 560 if (ts_int.tv_sec < 0) 561 timespecclear(&ts_int); 562 *rmt = ts_int; 563 } 564 return (error); 565 } 566 if (timespeccmp(&ts_cur, &ts_tgt, >=)) 567 return (0); 568 } 569 } 570 571 int 572 nanosleep1(struct timespec *rqt, struct timespec *rmt) 573 { 574 return clock_nanosleep1(CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt); 575 } 576 577 /* 578 * MPSAFE 579 */ 580 int 581 sys_clock_nanosleep(struct sysmsg *sysmsg, 582 const struct clock_nanosleep_args *uap) 583 { 584 int error; 585 bool is_abs; 586 struct timespec rqt; 587 struct timespec rmt; 588 589 is_abs = (uap->flags & TIMER_ABSTIME) != 0; 590 591 error = copyin(uap->rqtp, &rqt, sizeof(rqt)); 592 if (error) { 593 sysmsg->sysmsg_result = error; 594 return (0); 595 } 596 597 bzero(&rmt, sizeof(rmt)); 598 error = clock_nanosleep1(uap->clock_id, uap->flags, &rqt, &rmt); 599 600 /* 601 * copyout the residual if nanosleep was interrupted. 602 */ 603 if (error == EINTR && uap->rmtp != NULL && !is_abs) { 604 int error2; 605 606 error2 = copyout(&rmt, uap->rmtp, sizeof(rmt)); 607 if (error2) 608 error = error2; 609 } 610 611 sysmsg->sysmsg_result = error; 612 return (0); 613 } 614 615 /* 616 * MPSAFE 617 */ 618 int 619 sys_nanosleep(struct sysmsg *sysmsg, const struct nanosleep_args *uap) 620 { 621 int error; 622 struct timespec rqt; 623 struct timespec rmt; 624 625 error = copyin(uap->rqtp, &rqt, sizeof(rqt)); 626 if (error) 627 return (error); 628 629 bzero(&rmt, sizeof(rmt)); 630 error = nanosleep1(&rqt, &rmt); 631 632 /* 633 * copyout the residual if nanosleep was interrupted. 634 */ 635 if (error == EINTR && uap->rmtp != NULL) { 636 int error2; 637 638 error2 = copyout(&rmt, uap->rmtp, sizeof(rmt)); 639 if (error2) 640 error = error2; 641 } 642 return (error); 643 } 644 645 /* 646 * The gettimeofday() system call is supposed to return a fine-grained 647 * realtime stamp. However, acquiring a fine-grained stamp can create a 648 * bottleneck when multiple cpu cores are trying to accessing e.g. the 649 * HPET hardware timer all at the same time, so we have a sysctl that 650 * allows its behavior to be changed to a more coarse-grained timestamp 651 * which does not have to access a hardware timer. 652 */ 653 int 654 sys_gettimeofday(struct sysmsg *sysmsg, const struct gettimeofday_args *uap) 655 { 656 struct timeval atv; 657 int error = 0; 658 659 if (uap->tp) { 660 if (gettimeofday_quick) 661 getmicrotime(&atv); 662 else 663 microtime(&atv); 664 if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp, 665 sizeof (atv)))) 666 return (error); 667 } 668 if (uap->tzp) 669 error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, 670 sizeof (tz)); 671 return (error); 672 } 673 674 /* 675 * MPALMOSTSAFE 676 */ 677 int 678 sys_settimeofday(struct sysmsg *sysmsg, const struct settimeofday_args *uap) 679 { 680 struct timeval atv; 681 struct timezone atz; 682 int error; 683 684 if ((error = caps_priv_check_self(SYSCAP_NOSETTIME))) 685 return (error); 686 /* 687 * Verify all parameters before changing time. 688 * 689 * XXX: We do not allow the time to be set to 0.0, which also by 690 * happy coincidence works around a pkgsrc bulk build bug. 691 */ 692 if (uap->tv) { 693 if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv, 694 sizeof(atv)))) 695 return (error); 696 if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) 697 return (EINVAL); 698 if (atv.tv_sec == 0 && atv.tv_usec == 0) 699 return (EINVAL); 700 } 701 if (uap->tzp && 702 (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) 703 return (error); 704 705 lockmgr(&masterclock_lock, LK_EXCLUSIVE); 706 if (uap->tv && (error = settime(&atv))) { 707 lockmgr(&masterclock_lock, LK_RELEASE); 708 return (error); 709 } 710 lockmgr(&masterclock_lock, LK_RELEASE); 711 712 if (uap->tzp) 713 tz = atz; 714 return (0); 715 } 716 717 /* 718 * WARNING! Run with ntp_spin held 719 */ 720 static void 721 kern_adjtime_common(void) 722 { 723 if ((ntp_delta >= 0 && ntp_delta < ntp_default_tick_delta) || 724 (ntp_delta < 0 && ntp_delta > -ntp_default_tick_delta)) 725 ntp_tick_delta = ntp_delta; 726 else if (ntp_delta > ntp_big_delta) 727 ntp_tick_delta = 10 * ntp_default_tick_delta; 728 else if (ntp_delta < -ntp_big_delta) 729 ntp_tick_delta = -10 * ntp_default_tick_delta; 730 else if (ntp_delta > 0) 731 ntp_tick_delta = ntp_default_tick_delta; 732 else 733 ntp_tick_delta = -ntp_default_tick_delta; 734 } 735 736 void 737 kern_adjtime(int64_t delta, int64_t *odelta) 738 { 739 spin_lock(&ntp_spin); 740 *odelta = ntp_delta; 741 ntp_delta = delta; 742 kern_adjtime_common(); 743 spin_unlock(&ntp_spin); 744 } 745 746 static void 747 kern_get_ntp_delta(int64_t *delta) 748 { 749 *delta = ntp_delta; 750 } 751 752 void 753 kern_reladjtime(int64_t delta) 754 { 755 spin_lock(&ntp_spin); 756 ntp_delta += delta; 757 kern_adjtime_common(); 758 spin_unlock(&ntp_spin); 759 } 760 761 static void 762 kern_adjfreq(int64_t rate) 763 { 764 spin_lock(&ntp_spin); 765 ntp_tick_permanent = rate; 766 spin_unlock(&ntp_spin); 767 } 768 769 /* 770 * MPALMOSTSAFE 771 */ 772 int 773 sys_adjtime(struct sysmsg *sysmsg, const struct adjtime_args *uap) 774 { 775 struct timeval atv; 776 int64_t ndelta, odelta; 777 int error; 778 779 if ((error = caps_priv_check_self(SYSCAP_NOSETTIME))) 780 return (error); 781 error = copyin(uap->delta, &atv, sizeof(struct timeval)); 782 if (error) 783 return (error); 784 785 /* 786 * Compute the total correction and the rate at which to apply it. 787 * Round the adjustment down to a whole multiple of the per-tick 788 * delta, so that after some number of incremental changes in 789 * hardclock(), tickdelta will become zero, lest the correction 790 * overshoot and start taking us away from the desired final time. 791 */ 792 ndelta = (int64_t)atv.tv_sec * 1000000000 + atv.tv_usec * 1000; 793 kern_adjtime(ndelta, &odelta); 794 795 if (uap->olddelta) { 796 atv.tv_sec = odelta / 1000000000; 797 atv.tv_usec = odelta % 1000000000 / 1000; 798 copyout(&atv, uap->olddelta, sizeof(struct timeval)); 799 } 800 return (0); 801 } 802 803 static int 804 sysctl_adjtime(SYSCTL_HANDLER_ARGS) 805 { 806 int64_t delta; 807 int error; 808 809 if (req->newptr != NULL) { 810 if (caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) 811 return (EPERM); 812 error = SYSCTL_IN(req, &delta, sizeof(delta)); 813 if (error) 814 return (error); 815 kern_reladjtime(delta); 816 } 817 818 if (req->oldptr) 819 kern_get_ntp_delta(&delta); 820 error = SYSCTL_OUT(req, &delta, sizeof(delta)); 821 return (error); 822 } 823 824 /* 825 * delta is in nanoseconds. 826 */ 827 static int 828 sysctl_delta(SYSCTL_HANDLER_ARGS) 829 { 830 int64_t delta, old_delta; 831 int error; 832 833 if (req->newptr != NULL) { 834 if (caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) 835 return (EPERM); 836 error = SYSCTL_IN(req, &delta, sizeof(delta)); 837 if (error) 838 return (error); 839 kern_adjtime(delta, &old_delta); 840 } 841 842 if (req->oldptr != NULL) 843 kern_get_ntp_delta(&old_delta); 844 error = SYSCTL_OUT(req, &old_delta, sizeof(old_delta)); 845 return (error); 846 } 847 848 /* 849 * frequency is in nanoseconds per second shifted left 32. 850 * kern_adjfreq() needs it in nanoseconds per tick shifted left 32. 851 */ 852 static int 853 sysctl_adjfreq(SYSCTL_HANDLER_ARGS) 854 { 855 int64_t freqdelta; 856 int error; 857 858 if (req->newptr != NULL) { 859 if (caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) 860 return (EPERM); 861 error = SYSCTL_IN(req, &freqdelta, sizeof(freqdelta)); 862 if (error) 863 return (error); 864 865 freqdelta /= hz; 866 kern_adjfreq(freqdelta); 867 } 868 869 if (req->oldptr != NULL) 870 freqdelta = ntp_tick_permanent * hz; 871 error = SYSCTL_OUT(req, &freqdelta, sizeof(freqdelta)); 872 if (error) 873 return (error); 874 875 return (0); 876 } 877 878 SYSCTL_NODE(_kern, OID_AUTO, ntp, CTLFLAG_RW, 0, "NTP related controls"); 879 SYSCTL_PROC(_kern_ntp, OID_AUTO, permanent, 880 CTLTYPE_QUAD|CTLFLAG_RW, 0, 0, 881 sysctl_adjfreq, "Q", "permanent correction per second"); 882 SYSCTL_PROC(_kern_ntp, OID_AUTO, delta, 883 CTLTYPE_QUAD|CTLFLAG_RW, 0, 0, 884 sysctl_delta, "Q", "one-time delta"); 885 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, big_delta, CTLFLAG_RD, 886 &ntp_big_delta, sizeof(ntp_big_delta), "Q", 887 "threshold for fast adjustment"); 888 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, tick_delta, CTLFLAG_RD, 889 &ntp_tick_delta, sizeof(ntp_tick_delta), "LU", 890 "per-tick adjustment"); 891 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, default_tick_delta, CTLFLAG_RD, 892 &ntp_default_tick_delta, sizeof(ntp_default_tick_delta), "LU", 893 "default per-tick adjustment"); 894 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, next_leap_second, CTLFLAG_RW, 895 &ntp_leap_second, sizeof(ntp_leap_second), "LU", 896 "next leap second"); 897 SYSCTL_INT(_kern_ntp, OID_AUTO, insert_leap_second, CTLFLAG_RW, 898 &ntp_leap_insert, 0, "insert or remove leap second"); 899 SYSCTL_PROC(_kern_ntp, OID_AUTO, adjust, 900 CTLTYPE_QUAD|CTLFLAG_RW, 0, 0, 901 sysctl_adjtime, "Q", "relative adjust for delta"); 902 903 /* 904 * Get value of an interval timer. The process virtual and 905 * profiling virtual time timers are kept in the p_stats area, since 906 * they can be swapped out. These are kept internally in the 907 * way they are specified externally: in time until they expire. 908 * 909 * The real time interval timer is kept in the process table slot 910 * for the process, and its value (it_value) is kept as an 911 * absolute time rather than as a delta, so that it is easy to keep 912 * periodic real-time signals from drifting. 913 * 914 * Virtual time timers are processed in the hardclock() routine of 915 * kern_clock.c. The real time timer is processed by a timeout 916 * routine, called from the softclock() routine. Since a callout 917 * may be delayed in real time due to interrupt processing in the system, 918 * it is possible for the real time timeout routine (realitexpire, given below), 919 * to be delayed in real time past when it is supposed to occur. It 920 * does not suffice, therefore, to reload the real timer .it_value from the 921 * real time timers .it_interval. Rather, we compute the next time in 922 * absolute time the timer should go off. 923 * 924 * MPALMOSTSAFE 925 */ 926 int 927 sys_getitimer(struct sysmsg *sysmsg, const struct getitimer_args *uap) 928 { 929 struct proc *p = curproc; 930 struct timeval ctv; 931 struct itimerval aitv; 932 933 if (uap->which > ITIMER_PROF) 934 return (EINVAL); 935 lwkt_gettoken(&p->p_token); 936 if (uap->which == ITIMER_REAL) { 937 /* 938 * Convert from absolute to relative time in .it_value 939 * part of real time timer. If time for real time timer 940 * has passed return 0, else return difference between 941 * current time and time for the timer to go off. 942 */ 943 aitv = p->p_realtimer; 944 if (timevalisset(&aitv.it_value)) { 945 getmicrouptime(&ctv); 946 if (timevalcmp(&aitv.it_value, &ctv, <)) 947 timevalclear(&aitv.it_value); 948 else 949 timevalsub(&aitv.it_value, &ctv); 950 } 951 } else { 952 aitv = p->p_timer[uap->which]; 953 } 954 lwkt_reltoken(&p->p_token); 955 return (copyout(&aitv, uap->itv, sizeof (struct itimerval))); 956 } 957 958 /* 959 * MPALMOSTSAFE 960 */ 961 int 962 sys_setitimer(struct sysmsg *sysmsg, const struct setitimer_args *uap) 963 { 964 struct itimerval aitv; 965 struct timeval ctv; 966 struct itimerval *itvp; 967 struct proc *p = curproc; 968 struct getitimer_args gitargs; 969 int error; 970 971 if (uap->which > ITIMER_PROF) 972 return (EINVAL); 973 itvp = uap->itv; 974 if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, 975 sizeof(struct itimerval)))) 976 return (error); 977 978 if (uap->oitv) { 979 gitargs.which = uap->which; 980 gitargs.itv = uap->oitv; 981 error = sys_getitimer(sysmsg, &gitargs); 982 if (error) 983 return error; 984 } 985 if (itvp == NULL) 986 return (0); 987 if (itimerfix(&aitv.it_value)) 988 return (EINVAL); 989 if (!timevalisset(&aitv.it_value)) 990 timevalclear(&aitv.it_interval); 991 else if (itimerfix(&aitv.it_interval)) 992 return (EINVAL); 993 lwkt_gettoken(&p->p_token); 994 if (uap->which == ITIMER_REAL) { 995 if (timevalisset(&p->p_realtimer.it_value)) 996 callout_cancel(&p->p_ithandle); 997 if (timevalisset(&aitv.it_value)) 998 callout_reset(&p->p_ithandle, 999 tvtohz_high(&aitv.it_value), realitexpire, p); 1000 getmicrouptime(&ctv); 1001 timevaladd(&aitv.it_value, &ctv); 1002 p->p_realtimer = aitv; 1003 } else { 1004 p->p_timer[uap->which] = aitv; 1005 switch(uap->which) { 1006 case ITIMER_VIRTUAL: 1007 p->p_flags &= ~P_SIGVTALRM; 1008 break; 1009 case ITIMER_PROF: 1010 p->p_flags &= ~P_SIGPROF; 1011 break; 1012 } 1013 } 1014 lwkt_reltoken(&p->p_token); 1015 return (0); 1016 } 1017 1018 /* 1019 * Real interval timer expired: 1020 * send process whose timer expired an alarm signal. 1021 * If time is not set up to reload, then just return. 1022 * Else compute next time timer should go off which is > current time. 1023 * This is where delay in processing this timeout causes multiple 1024 * SIGALRM calls to be compressed into one. 1025 * tvtohz_high() always adds 1 to allow for the time until the next clock 1026 * interrupt being strictly less than 1 clock tick, but we don't want 1027 * that here since we want to appear to be in sync with the clock 1028 * interrupt even when we're delayed. 1029 */ 1030 static 1031 void 1032 realitexpire(void *arg) 1033 { 1034 struct proc *p; 1035 struct timeval ctv, ntv; 1036 1037 p = (struct proc *)arg; 1038 PHOLD(p); 1039 lwkt_gettoken(&p->p_token); 1040 ksignal(p, SIGALRM); 1041 if (!timevalisset(&p->p_realtimer.it_interval)) { 1042 timevalclear(&p->p_realtimer.it_value); 1043 goto done; 1044 } 1045 for (;;) { 1046 timevaladd(&p->p_realtimer.it_value, 1047 &p->p_realtimer.it_interval); 1048 getmicrouptime(&ctv); 1049 if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { 1050 ntv = p->p_realtimer.it_value; 1051 timevalsub(&ntv, &ctv); 1052 callout_reset(&p->p_ithandle, tvtohz_low(&ntv), 1053 realitexpire, p); 1054 goto done; 1055 } 1056 } 1057 done: 1058 lwkt_reltoken(&p->p_token); 1059 PRELE(p); 1060 } 1061 1062 /* 1063 * Used to validate itimer timeouts and utimes*() timespecs. 1064 */ 1065 int 1066 itimerfix(struct timeval *tv) 1067 { 1068 if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) 1069 return (EINVAL); 1070 if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < ustick) 1071 tv->tv_usec = ustick; 1072 return (0); 1073 } 1074 1075 /* 1076 * Used to validate timeouts and utimes*() timespecs. 1077 */ 1078 int 1079 itimespecfix(struct timespec *ts) 1080 { 1081 if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000ULL) 1082 return (EINVAL); 1083 if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < nstick) 1084 ts->tv_nsec = nstick; 1085 return (0); 1086 } 1087 1088 /* 1089 * Decrement an interval timer by a specified number 1090 * of microseconds, which must be less than a second, 1091 * i.e. < 1000000. If the timer expires, then reload 1092 * it. In this case, carry over (usec - old value) to 1093 * reduce the value reloaded into the timer so that 1094 * the timer does not drift. This routine assumes 1095 * that it is called in a context where the timers 1096 * on which it is operating cannot change in value. 1097 */ 1098 int 1099 itimerdecr(struct itimerval *itp, int usec) 1100 { 1101 1102 if (itp->it_value.tv_usec < usec) { 1103 if (itp->it_value.tv_sec == 0) { 1104 /* expired, and already in next interval */ 1105 usec -= itp->it_value.tv_usec; 1106 goto expire; 1107 } 1108 itp->it_value.tv_usec += 1000000; 1109 itp->it_value.tv_sec--; 1110 } 1111 itp->it_value.tv_usec -= usec; 1112 usec = 0; 1113 if (timevalisset(&itp->it_value)) 1114 return (1); 1115 /* expired, exactly at end of interval */ 1116 expire: 1117 if (timevalisset(&itp->it_interval)) { 1118 itp->it_value = itp->it_interval; 1119 itp->it_value.tv_usec -= usec; 1120 if (itp->it_value.tv_usec < 0) { 1121 itp->it_value.tv_usec += 1000000; 1122 itp->it_value.tv_sec--; 1123 } 1124 } else 1125 itp->it_value.tv_usec = 0; /* sec is already 0 */ 1126 return (0); 1127 } 1128 1129 /* 1130 * Add and subtract routines for timevals. 1131 * N.B.: subtract routine doesn't deal with 1132 * results which are before the beginning, 1133 * it just gets very confused in this case. 1134 * Caveat emptor. 1135 */ 1136 void 1137 timevaladd(struct timeval *t1, const struct timeval *t2) 1138 { 1139 1140 t1->tv_sec += t2->tv_sec; 1141 t1->tv_usec += t2->tv_usec; 1142 timevalfix(t1); 1143 } 1144 1145 void 1146 timevalsub(struct timeval *t1, const struct timeval *t2) 1147 { 1148 1149 t1->tv_sec -= t2->tv_sec; 1150 t1->tv_usec -= t2->tv_usec; 1151 timevalfix(t1); 1152 } 1153 1154 static void 1155 timevalfix(struct timeval *t1) 1156 { 1157 1158 if (t1->tv_usec < 0) { 1159 t1->tv_sec--; 1160 t1->tv_usec += 1000000; 1161 } 1162 if (t1->tv_usec >= 1000000) { 1163 t1->tv_sec++; 1164 t1->tv_usec -= 1000000; 1165 } 1166 } 1167 1168 /* 1169 * ratecheck(): simple time-based rate-limit checking. 1170 */ 1171 int 1172 ratecheck(struct timeval *lasttime, const struct timeval *mininterval) 1173 { 1174 struct timeval tv, delta; 1175 int rv = 0; 1176 1177 getmicrouptime(&tv); /* NB: 10ms precision */ 1178 delta = tv; 1179 timevalsub(&delta, lasttime); 1180 1181 /* 1182 * check for 0,0 is so that the message will be seen at least once, 1183 * even if interval is huge. 1184 */ 1185 if (timevalcmp(&delta, mininterval, >=) || 1186 (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { 1187 *lasttime = tv; 1188 rv = 1; 1189 } 1190 1191 return (rv); 1192 } 1193 1194 /* 1195 * ppsratecheck(): packets (or events) per second limitation. 1196 * 1197 * Return 0 if the limit is to be enforced (e.g. the caller 1198 * should drop a packet because of the rate limitation). 1199 * 1200 * maxpps of 0 always causes zero to be returned. maxpps of -1 1201 * always causes 1 to be returned; this effectively defeats rate 1202 * limiting. 1203 * 1204 * Note that we maintain the struct timeval for compatibility 1205 * with other bsd systems. We reuse the storage and just monitor 1206 * clock ticks for minimal overhead. 1207 */ 1208 int 1209 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) 1210 { 1211 int now; 1212 1213 /* 1214 * Reset the last time and counter if this is the first call 1215 * or more than a second has passed since the last update of 1216 * lasttime. 1217 */ 1218 now = ticks; 1219 if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) { 1220 lasttime->tv_sec = now; 1221 *curpps = 1; 1222 return (maxpps != 0); 1223 } else { 1224 (*curpps)++; /* NB: ignore potential overflow */ 1225 return (maxpps < 0 || *curpps < maxpps); 1226 } 1227 } 1228 1229 static int 1230 sysctl_gettimeofday_quick(SYSCTL_HANDLER_ARGS) 1231 { 1232 int error; 1233 int gtod; 1234 1235 gtod = gettimeofday_quick; 1236 error = sysctl_handle_int(oidp, >od, 0, req); 1237 if (error || req->newptr == NULL) 1238 return error; 1239 gettimeofday_quick = gtod; 1240 if (kpmap) 1241 kpmap->fast_gtod = gtod; 1242 return 0; 1243 } 1244