xref: /dragonfly/sys/kern/kern_time.c (revision bbb35c81)
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_time.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/kern_time.c,v 1.68.2.1 2002/10/01 08:00:41 bde Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysmsg.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>
#include <sys/kern_syscall.h>
#include <sys/upmap.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>

#include <sys/msgport2.h>
#include <sys/spinlock2.h>
#include <sys/thread2.h>

extern struct spinlock ntp_spin;

#define CPUCLOCK_BIT			0x80000000
#define	CPUCLOCK_ID_MASK		~CPUCLOCK_BIT
#define	CPUCLOCK2LWPID(clock_id)	(((clockid_t)(clock_id) >> 32) & CPUCLOCK_ID_MASK)
#define	CPUCLOCK2PID(clock_id)		((clock_id) & CPUCLOCK_ID_MASK)
#define MAKE_CPUCLOCK(pid, lwp_id)	((clockid_t)(lwp_id) << 32 | (pid) | CPUCLOCK_BIT)
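
/*
 * Layout sketch of the CPUCLOCK encoding above: bits 63..32 hold the
 * lwpid, and bits 31..0 hold the pid with CPUCLOCK_BIT (bit 31) set to
 * distinguish these ids from the predefined CLOCK_* constants.  For
 * example (hypothetical values), MAKE_CPUCLOCK(1234, 2) yields
 * 0x2800004D2: lwpid 2 in the high word and pid 1234 (0x4D2) plus
 * CPUCLOCK_BIT in the low word; CPUCLOCK2PID() and CPUCLOCK2LWPID()
 * mask the pieces back out.
 */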

struct timezone tz;

/*
 * Time of day and interval timer support.
 *
 * These routines provide the kernel entry points to get and set
 * the time-of-day and per-process interval timers.  Subroutines
 * here provide support for adding and subtracting timeval structures
 * and decrementing interval timers, optionally reloading the interval
 * timers when they expire.
 */

static int	settime(struct timeval *);
static void	timevalfix(struct timeval *);
static void	realitexpire(void *arg);

static int sysctl_gettimeofday_quick(SYSCTL_HANDLER_ARGS);

/*
 * Nanosleep tries very hard to sleep for a precisely requested time
 * interval, down to 1us.  The administrator can impose a minimum delay
 * and a delay below which we hard-loop instead of initiating a timer
 * interrupt and sleeping.
 *
 * For machines under high loads it might be beneficial to increase min_us
 * to e.g. 1000us (1ms) so spinning processes sleep meaningfully.
 */
static int     nanosleep_min_us = 10;
static int     nanosleep_hard_us = 100;
static int     gettimeofday_quick = 0;
SYSCTL_INT(_kern, OID_AUTO, nanosleep_min_us, CTLFLAG_RW,
	   &nanosleep_min_us, 0, "");
SYSCTL_INT(_kern, OID_AUTO, nanosleep_hard_us, CTLFLAG_RW,
	   &nanosleep_hard_us, 0, "");
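
/*
 * Tuning sketch (values illustrative, not defaults): on a heavily
 * loaded machine the knobs above could be raised so very short sleeps
 * still give up the cpu in a meaningful way, e.g.:
 *
 *	sysctl kern.nanosleep_min_us=1000
 *	sysctl kern.nanosleep_hard_us=5000
 */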
SYSCTL_PROC(_kern, OID_AUTO, gettimeofday_quick, CTLTYPE_INT | CTLFLAG_RW,
	   0, 0, sysctl_gettimeofday_quick, "I", "Quick mode gettimeofday");

static struct lock masterclock_lock = LOCK_INITIALIZER("mstrclk", 0, 0);

static int
settime(struct timeval *tv)
{
	struct timeval delta, tv1, tv2;
	static struct timeval maxtime, laststep;
	struct timespec ts;
	int origcpu;

	if ((origcpu = mycpu->gd_cpuid) != 0)
		lwkt_setcpu_self(globaldata_find(0));

	crit_enter();
	microtime(&tv1);
	delta = *tv;
	timevalsub(&delta, &tv1);

	/*
	 * If the system is secure, we do not allow the time to be
	 * set to a value earlier than 1 second less than the highest
	 * time we have yet seen. The worst a miscreant can do in
	 * this circumstance is "freeze" time. He couldn't go
	 * back to the past.
	 *
	 * We similarly do not allow the clock to be stepped more
	 * than one second, nor more than once per second. This allows
	 * a miscreant to make the clock march double-time, but no worse.
	 */
	if (securelevel > 1) {
		if (delta.tv_sec < 0 || delta.tv_usec < 0) {
			/*
			 * Update maxtime to latest time we've seen.
			 */
			if (tv1.tv_sec > maxtime.tv_sec)
				maxtime = tv1;
			tv2 = *tv;
			timevalsub(&tv2, &maxtime);
			if (tv2.tv_sec < -1) {
				tv->tv_sec = maxtime.tv_sec - 1;
				kprintf("Time adjustment clamped to -1 second\n");
			}
		} else {
			if (tv1.tv_sec == laststep.tv_sec) {
				crit_exit();
				return (EPERM);
			}
			if (delta.tv_sec > 1) {
				tv->tv_sec = tv1.tv_sec + 1;
				kprintf("Time adjustment clamped to +1 second\n");
			}
			laststep = *tv;
		}
	}

	ts.tv_sec = tv->tv_sec;
	ts.tv_nsec = tv->tv_usec * 1000;
	set_timeofday(&ts);
	crit_exit();

	if (origcpu != 0)
		lwkt_setcpu_self(globaldata_find(origcpu));

	resettodr();
	return (0);
}

static void
get_process_cputime(struct proc *p, struct timespec *ats)
{
	struct rusage ru;

	lwkt_gettoken(&p->p_token);
	calcru_proc(p, &ru);
	lwkt_reltoken(&p->p_token);
	timevaladd(&ru.ru_utime, &ru.ru_stime);
	TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats);
}

static void
get_process_usertime(struct proc *p, struct timespec *ats)
{
	struct rusage ru;

	lwkt_gettoken(&p->p_token);
	calcru_proc(p, &ru);
	lwkt_reltoken(&p->p_token);
	TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats);
}

static void
get_thread_cputime(struct thread *td, struct timespec *ats)
{
	struct timeval sys, user;

	calcru(td->td_lwp, &user, &sys);
	timevaladd(&user, &sys);
	TIMEVAL_TO_TIMESPEC(&user, ats);
}

/*
 * MPSAFE
 */
int
kern_clock_gettime(clockid_t clock_id, struct timespec *ats)
{
	struct proc *p;
	struct lwp *lp;
	lwpid_t lwp_id;

	p = curproc;
	switch(clock_id) {
	case CLOCK_REALTIME:
	case CLOCK_REALTIME_PRECISE:
		nanotime(ats);
		break;
	case CLOCK_REALTIME_FAST:
		getnanotime(ats);
		break;
	case CLOCK_MONOTONIC:
	case CLOCK_MONOTONIC_PRECISE:
	case CLOCK_UPTIME:
	case CLOCK_UPTIME_PRECISE:
		nanouptime(ats);
		break;
	case CLOCK_MONOTONIC_FAST:
	case CLOCK_UPTIME_FAST:
		getnanouptime(ats);
		break;
	case CLOCK_VIRTUAL:
		get_process_usertime(p, ats);
		break;
	case CLOCK_PROF:
	case CLOCK_PROCESS_CPUTIME_ID:
		get_process_cputime(p, ats);
		break;
	case CLOCK_SECOND:
		ats->tv_sec = time_second;
		ats->tv_nsec = 0;
		break;
	case CLOCK_THREAD_CPUTIME_ID:
		get_thread_cputime(curthread, ats);
		break;
	default:
		if ((clock_id & CPUCLOCK_BIT) == 0)
			return (EINVAL);
		if ((p = pfind(CPUCLOCK2PID(clock_id))) == NULL)
			return (EINVAL);
		lwp_id = CPUCLOCK2LWPID(clock_id);
		if (lwp_id == 0) {
			get_process_cputime(p, ats);
		} else {
			lwkt_gettoken(&p->p_token);
			lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id);
			if (lp == NULL) {
				lwkt_reltoken(&p->p_token);
				PRELE(p);
				return (EINVAL);
			}
			get_thread_cputime(lp->lwp_thread, ats);
			lwkt_reltoken(&p->p_token);
		}
		PRELE(p);
	}
	return (0);
}

/*
 * MPSAFE
 */
int
sys_clock_gettime(struct sysmsg *sysmsg, const struct clock_gettime_args *uap)
{
	struct timespec ats;
	int error;

	error = kern_clock_gettime(uap->clock_id, &ats);
	if (error == 0)
		error = copyout(&ats, uap->tp, sizeof(ats));

	return (error);
}

int
kern_clock_settime(clockid_t clock_id, struct timespec *ats)
{
	struct thread *td = curthread;
	struct timeval atv;
	int error;

	if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
		return (error);
	if (clock_id != CLOCK_REALTIME)
		return (EINVAL);
	if (ats->tv_sec < 0 || ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
		return (EINVAL);

	lockmgr(&masterclock_lock, LK_EXCLUSIVE);
	TIMESPEC_TO_TIMEVAL(&atv, ats);
	error = settime(&atv);
	lockmgr(&masterclock_lock, LK_RELEASE);

	return (error);
}

/*
 * MPALMOSTSAFE
 */
int
sys_clock_settime(struct sysmsg *sysmsg, const struct clock_settime_args *uap)
{
	struct timespec ats;
	int error;

	if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
		return (error);

	error = kern_clock_settime(uap->clock_id, &ats);

	return (error);
}

/*
 * MPSAFE
 */
int
kern_clock_getres(clockid_t clock_id, struct timespec *ts)
{
	ts->tv_sec = 0;

	switch (clock_id) {
	case CLOCK_REALTIME:
	case CLOCK_REALTIME_FAST:
	case CLOCK_REALTIME_PRECISE:
	case CLOCK_MONOTONIC:
	case CLOCK_MONOTONIC_FAST:
	case CLOCK_MONOTONIC_PRECISE:
	case CLOCK_UPTIME:
	case CLOCK_UPTIME_FAST:
	case CLOCK_UPTIME_PRECISE:
		/*
		 * Minimum reportable resolution is 1ns.  Rounding is
		 * otherwise unimportant.
		 */
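		/*
		 * Worked example (hypothetical timer): with sys_cputimer
		 * running at 10 MHz, 999999999 / 10000000 + 1 == 100,
		 * so a 100ns resolution is reported.
		 */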
		ts->tv_nsec = 999999999 / sys_cputimer->freq + 1;
		break;
	case CLOCK_VIRTUAL:
	case CLOCK_PROF:
		/* Accurately round up here because we can do so cheaply. */
		ts->tv_nsec = howmany(1000000000, hz);
		break;
	case CLOCK_SECOND:
		ts->tv_sec = 1;
		ts->tv_nsec = 0;
		break;
	case CLOCK_THREAD_CPUTIME_ID:
	case CLOCK_PROCESS_CPUTIME_ID:
		ts->tv_nsec = 1000;
		break;
	default:
		if ((clock_id & CPUCLOCK_BIT) == CPUCLOCK_BIT) {
			pid_t pid = CPUCLOCK2PID(clock_id);
			if (pid < 2 || pid > PID_MAX)
				return (EINVAL);
			ts->tv_nsec = 1000;
		} else {
			return (EINVAL);
		}
	}

	return (0);
}

/*
 * MPSAFE
 */
int
sys_clock_getres(struct sysmsg *sysmsg, const struct clock_getres_args *uap)
{
	int error;
	struct timespec ts;

	error = kern_clock_getres(uap->clock_id, &ts);
	if (error == 0)
		error = copyout(&ts, uap->tp, sizeof(ts));

	return (error);
}

static int
kern_getcpuclockid(pid_t pid, lwpid_t lwp_id, clockid_t *clock_id)
{
	struct proc *p;
	int error = 0;

	if (pid == 0) {
		p = curproc;
		pid = p->p_pid;
		PHOLD(p);
	} else {
		p = pfind(pid);
		if (p == NULL)
			return (ESRCH);
	}
	/* lwp_id can be 0 when called by clock_getcpuclockid() */
	if (lwp_id < 0) {
		error = EINVAL;
		goto out;
	}
	lwkt_gettoken(&p->p_token);
	if (lwp_id > 0 &&
	    lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id) == NULL) {
		lwkt_reltoken(&p->p_token);
		error = ESRCH;
		goto out;
	}
	*clock_id = MAKE_CPUCLOCK(pid, lwp_id);
	lwkt_reltoken(&p->p_token);
out:
	PRELE(p);
	return (error);
}

int
sys_getcpuclockid(struct sysmsg *sysmsg, const struct getcpuclockid_args *uap)
{
	clockid_t clk_id;
	int error;

	error = kern_getcpuclockid(uap->pid, uap->lwp_id, &clk_id);
	if (error == 0)
		error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));

	return (error);
}

/*
 * clock_nanosleep1()
 *
 *	This is a general helper function for clock_nanosleep() and
 *	nanosleep() (aka sleep(), aka usleep()).
 *
 *	If there is less than one tick's worth of time left and
 *	we haven't done a yield, or the remaining microseconds is
 *	ridiculously low, do a yield.  This avoids having
 *	to deal with systimer overheads when the system is under
 *	heavy loads.  If we have done a yield already then use
 *	a systimer and an uninterruptible thread wait.
 *
 *	If there is more than a tick's worth of time left,
 *	calculate the baseline ticks and use an interruptible
 *	tsleep, then handle the fine-grained delay on the next
 *	loop.  This usually results in two sleeps occurring, a long one
 *	and a short one.
 *
 * MPSAFE
 */
static void
ns1_systimer(systimer_t info, int in_ipi __unused,
    struct intrframe *frame __unused)
{
	lwkt_schedule(info->data);
}

int
clock_nanosleep1(clockid_t clock_id, int flags,
    struct timespec *rqt, struct timespec *rmt)
{
	static int nanowait;
	struct timespec ts_cur, ts_tgt, ts_int;
	struct timeval tv;
	bool is_abs;
	int error, error2;

	if ((flags & ~(TIMER_RELTIME | TIMER_ABSTIME)) != 0)
		return (EINVAL);
	if (rqt->tv_sec < 0 || rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
		return (EINVAL);
	if (rqt->tv_sec == 0 && rqt->tv_nsec == 0)
		return (0);

	switch (clock_id) {
	case CLOCK_REALTIME:
	case CLOCK_REALTIME_FAST:
	case CLOCK_REALTIME_PRECISE:
	case CLOCK_SECOND:
	case CLOCK_MONOTONIC:
	case CLOCK_MONOTONIC_FAST:
	case CLOCK_MONOTONIC_PRECISE:
	case CLOCK_UPTIME:
	case CLOCK_UPTIME_FAST:
	case CLOCK_UPTIME_PRECISE:
		is_abs = (flags & TIMER_ABSTIME) != 0;
		break;
	case CLOCK_VIRTUAL:
	case CLOCK_PROF:
	case CLOCK_PROCESS_CPUTIME_ID:
		return (ENOTSUP);
	case CLOCK_THREAD_CPUTIME_ID:
	default:
		return (EINVAL);
	}

	error = kern_clock_gettime(clock_id, &ts_cur);
	if (error)
		return (error);

	if (is_abs) {
		if (timespeccmp(&ts_cur, rqt, >=))
			return (0);

		ts_tgt = *rqt; /* target timestamp */
		timespecsub(&ts_tgt, &ts_cur, &ts_int); /* sleep interval */
	} else {
		ts_int = *rqt; /* sleep interval */
		timespecadd(&ts_cur, &ts_int, &ts_tgt); /* target timestamp */
	}

	for (;;) {
		int ticks;
		struct systimer info;
		thread_t td;

		timespecsub(&ts_tgt, &ts_cur, &ts_int);
		TIMESPEC_TO_TIMEVAL(&tv, &ts_int);
		ticks = tv.tv_usec / ustick; /* approximate */

		if (tv.tv_sec == 0 && ticks == 0) {
			td = curthread;
			if (tv.tv_usec > 0 && tv.tv_usec < nanosleep_min_us)
				tv.tv_usec = nanosleep_min_us;
			if (tv.tv_usec < nanosleep_hard_us) {
				lwkt_user_yield();
				cpu_pause();
			} else {
				crit_enter_quick(td);
				systimer_init_oneshot(&info, ns1_systimer,
						td, tv.tv_usec);
				lwkt_deschedule_self(td);
				crit_exit_quick(td);
				lwkt_switch();
				systimer_del(&info); /* make sure it's gone */
			}
			error = iscaught(td->td_lwp);
		} else if (tv.tv_sec == 0) {
			error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
		} else {
			ticks = tvtohz_low(&tv); /* also handles overflow */
			error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
		}

		error2 = kern_clock_gettime(clock_id, &ts_cur);
		if (error2)
			return (error2);

		if (error && error != EWOULDBLOCK) {
			if (error == ERESTART)
				error = EINTR;
			if (rmt != NULL && !is_abs) {
				timespecsub(&ts_tgt, &ts_cur, &ts_int);
				if (ts_int.tv_sec < 0)
					timespecclear(&ts_int);
				*rmt = ts_int;
			}
			return (error);
		}
		if (timespeccmp(&ts_cur, &ts_tgt, >=))
			return (0);
	}
}

int
nanosleep1(struct timespec *rqt, struct timespec *rmt)
{
	return clock_nanosleep1(CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt);
}
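
/*
 * Userland usage sketch (not part of this file): the POSIX entry point
 * serviced by sys_clock_nanosleep() below is typically driven with an
 * absolute deadline so signal interruptions do not accumulate drift;
 * clock_nanosleep() returns the error number directly:
 *
 *	struct timespec deadline;
 *
 *	clock_gettime(CLOCK_MONOTONIC, &deadline);
 *	deadline.tv_sec += 1;
 *	while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
 *	    &deadline, NULL) == EINTR)
 *		;	(retry to the same absolute deadline)
 */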

/*
 * MPSAFE
 */
int
sys_clock_nanosleep(struct sysmsg *sysmsg,
    const struct clock_nanosleep_args *uap)
{
	int error;
	bool is_abs;
	struct timespec rqt;
	struct timespec rmt;

	is_abs = (uap->flags & TIMER_ABSTIME) != 0;

	error = copyin(uap->rqtp, &rqt, sizeof(rqt));
	if (error) {
		sysmsg->sysmsg_result = error;
		return (0);
	}

	bzero(&rmt, sizeof(rmt));
	error = clock_nanosleep1(uap->clock_id, uap->flags, &rqt, &rmt);

	/*
	 * copyout the residual if nanosleep was interrupted.
	 */
	if (error == EINTR && uap->rmtp != NULL && !is_abs) {
		int error2;

		error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
		if (error2)
			error = error2;
	}

	sysmsg->sysmsg_result = error;
	return (0);
}

/*
 * MPSAFE
 */
int
sys_nanosleep(struct sysmsg *sysmsg, const struct nanosleep_args *uap)
{
	int error;
	struct timespec rqt;
	struct timespec rmt;

	error = copyin(uap->rqtp, &rqt, sizeof(rqt));
	if (error)
		return (error);

	bzero(&rmt, sizeof(rmt));
	error = nanosleep1(&rqt, &rmt);

	/*
	 * copyout the residual if nanosleep was interrupted.
	 */
	if (error == EINTR && uap->rmtp != NULL) {
		int error2;

		error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
		if (error2)
			error = error2;
	}
	return (error);
}

/*
 * The gettimeofday() system call is supposed to return a fine-grained
 * realtime stamp.  However, acquiring a fine-grained stamp can create a
 * bottleneck when multiple cpu cores try to access e.g. the
 * HPET hardware timer all at the same time, so we have a sysctl that
 * allows its behavior to be changed to a more coarse-grained timestamp
 * which does not have to access a hardware timer.
 */
int
sys_gettimeofday(struct sysmsg *sysmsg, const struct gettimeofday_args *uap)
{
	struct timeval atv;
	int error = 0;

	if (uap->tp) {
		if (gettimeofday_quick)
			getmicrotime(&atv);
		else
			microtime(&atv);
		if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp,
		    sizeof (atv))))
			return (error);
	}
	if (uap->tzp)
		error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
		    sizeof (tz));
	return (error);
}
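
/*
 * Administration sketch: quick mode is toggled at runtime via the
 * sysctl declared near the top of this file, e.g.
 * "sysctl kern.gettimeofday_quick=1"; the handler at the bottom of
 * this file also mirrors the setting into kpmap->fast_gtod for
 * userland fast-path consumers.
 */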

/*
 * MPALMOSTSAFE
 */
int
sys_settimeofday(struct sysmsg *sysmsg, const struct settimeofday_args *uap)
{
	struct thread *td = curthread;
	struct timeval atv;
	struct timezone atz;
	int error;

	if ((error = priv_check(td, PRIV_SETTIMEOFDAY)))
		return (error);
	/*
	 * Verify all parameters before changing time.
	 *
	 * XXX: We do not allow the time to be set to 0.0, which also by
	 *	happy coincidence works around a pkgsrc bulk build bug.
	 */
	if (uap->tv) {
		if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
		    sizeof(atv))))
			return (error);
		if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
			return (EINVAL);
		if (atv.tv_sec == 0 && atv.tv_usec == 0)
			return (EINVAL);
	}
	if (uap->tzp &&
	    (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
		return (error);

	lockmgr(&masterclock_lock, LK_EXCLUSIVE);
	if (uap->tv && (error = settime(&atv))) {
		lockmgr(&masterclock_lock, LK_RELEASE);
		return (error);
	}
	lockmgr(&masterclock_lock, LK_RELEASE);

	if (uap->tzp)
		tz = atz;
	return (0);
}

/*
 * WARNING! Run with ntp_spin held
 */
static void
kern_adjtime_common(void)
{
	if ((ntp_delta >= 0 && ntp_delta < ntp_default_tick_delta) ||
	    (ntp_delta < 0 && ntp_delta > -ntp_default_tick_delta))
		ntp_tick_delta = ntp_delta;
	else if (ntp_delta > ntp_big_delta)
		ntp_tick_delta = 10 * ntp_default_tick_delta;
	else if (ntp_delta < -ntp_big_delta)
		ntp_tick_delta = -10 * ntp_default_tick_delta;
	else if (ntp_delta > 0)
		ntp_tick_delta = ntp_default_tick_delta;
	else
		ntp_tick_delta = -ntp_default_tick_delta;
}

void
kern_adjtime(int64_t delta, int64_t *odelta)
{
	spin_lock(&ntp_spin);
	*odelta = ntp_delta;
	ntp_delta = delta;
	kern_adjtime_common();
	spin_unlock(&ntp_spin);
}

static void
kern_get_ntp_delta(int64_t *delta)
{
	*delta = ntp_delta;
}

void
kern_reladjtime(int64_t delta)
{
	spin_lock(&ntp_spin);
	ntp_delta += delta;
	kern_adjtime_common();
	spin_unlock(&ntp_spin);
}

static void
kern_adjfreq(int64_t rate)
{
	spin_lock(&ntp_spin);
	ntp_tick_permanent = rate;
	spin_unlock(&ntp_spin);
}

/*
 * MPALMOSTSAFE
 */
int
sys_adjtime(struct sysmsg *sysmsg, const struct adjtime_args *uap)
{
	struct thread *td = curthread;
	struct timeval atv;
	int64_t ndelta, odelta;
	int error;

	if ((error = priv_check(td, PRIV_ADJTIME)))
		return (error);
	error = copyin(uap->delta, &atv, sizeof(struct timeval));
	if (error)
		return (error);

	/*
	 * Compute the total correction and the rate at which to apply it.
	 * Round the adjustment down to a whole multiple of the per-tick
	 * delta, so that after some number of incremental changes in
	 * hardclock(), tickdelta will become zero, lest the correction
	 * overshoot and start taking us away from the desired final time.
	 */
	ndelta = (int64_t)atv.tv_sec * 1000000000 + atv.tv_usec * 1000;
	kern_adjtime(ndelta, &odelta);

	if (uap->olddelta) {
		atv.tv_sec = odelta / 1000000000;
		atv.tv_usec = odelta % 1000000000 / 1000;
		copyout(&atv, uap->olddelta, sizeof(struct timeval));
	}
	return (0);
}

static int
sysctl_adjtime(SYSCTL_HANDLER_ARGS)
{
	int64_t delta;
	int error;

	if (req->newptr != NULL) {
		if (priv_check(curthread, PRIV_ROOT))
			return (EPERM);
		error = SYSCTL_IN(req, &delta, sizeof(delta));
		if (error)
			return (error);
		kern_reladjtime(delta);
	}

	if (req->oldptr)
		kern_get_ntp_delta(&delta);
	error = SYSCTL_OUT(req, &delta, sizeof(delta));
	return (error);
}

/*
 * delta is in nanoseconds.
 */
static int
sysctl_delta(SYSCTL_HANDLER_ARGS)
{
	int64_t delta, old_delta;
	int error;

	if (req->newptr != NULL) {
		if (priv_check(curthread, PRIV_ROOT))
			return (EPERM);
		error = SYSCTL_IN(req, &delta, sizeof(delta));
		if (error)
			return (error);
		kern_adjtime(delta, &old_delta);
	}

	if (req->oldptr != NULL)
		kern_get_ntp_delta(&old_delta);
	error = SYSCTL_OUT(req, &old_delta, sizeof(old_delta));
	return (error);
}

/*
 * frequency is in nanoseconds per second shifted left 32.
 * kern_adjfreq() needs it in nanoseconds per tick shifted left 32.
 */
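/*
 * Worked example (hz value illustrative): to slew the clock by 1 ppm,
 * i.e. 1000 nanoseconds per second, userland writes 1000 << 32 to the
 * kern.ntp.permanent sysctl declared below; with hz = 100 the division
 * in the handler stores (1000 << 32) / 100 == 10 << 32, i.e. 10ns per
 * tick, in ntp_tick_permanent.
 */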
static int
sysctl_adjfreq(SYSCTL_HANDLER_ARGS)
{
	int64_t freqdelta;
	int error;

	if (req->newptr != NULL) {
		if (priv_check(curthread, PRIV_ROOT))
			return (EPERM);
		error = SYSCTL_IN(req, &freqdelta, sizeof(freqdelta));
		if (error)
			return (error);

		freqdelta /= hz;
		kern_adjfreq(freqdelta);
	}

	if (req->oldptr != NULL)
		freqdelta = ntp_tick_permanent * hz;
	error = SYSCTL_OUT(req, &freqdelta, sizeof(freqdelta));
	if (error)
		return (error);

	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, ntp, CTLFLAG_RW, 0, "NTP related controls");
SYSCTL_PROC(_kern_ntp, OID_AUTO, permanent,
    CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
    sysctl_adjfreq, "Q", "permanent correction per second");
SYSCTL_PROC(_kern_ntp, OID_AUTO, delta,
    CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
    sysctl_delta, "Q", "one-time delta");
SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, big_delta, CTLFLAG_RD,
    &ntp_big_delta, sizeof(ntp_big_delta), "Q",
    "threshold for fast adjustment");
SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, tick_delta, CTLFLAG_RD,
    &ntp_tick_delta, sizeof(ntp_tick_delta), "LU",
    "per-tick adjustment");
SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, default_tick_delta, CTLFLAG_RD,
    &ntp_default_tick_delta, sizeof(ntp_default_tick_delta), "LU",
    "default per-tick adjustment");
SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, next_leap_second, CTLFLAG_RW,
    &ntp_leap_second, sizeof(ntp_leap_second), "LU",
    "next leap second");
SYSCTL_INT(_kern_ntp, OID_AUTO, insert_leap_second, CTLFLAG_RW,
    &ntp_leap_insert, 0, "insert or remove leap second");
SYSCTL_PROC(_kern_ntp, OID_AUTO, adjust,
    CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
    sysctl_adjtime, "Q", "relative adjust for delta");

/*
 * Get value of an interval timer.  The process virtual and
 * profiling virtual time timers are kept in the p_stats area, since
 * they can be swapped out.  These are kept internally in the
 * way they are specified externally: in time until they expire.
 *
 * The real time interval timer is kept in the process table slot
 * for the process, and its value (it_value) is kept as an
 * absolute time rather than as a delta, so that it is easy to keep
 * periodic real-time signals from drifting.
 *
 * Virtual time timers are processed in the hardclock() routine of
 * kern_clock.c.  The real time timer is processed by a timeout
 * routine, called from the softclock() routine.  Since a callout
 * may be delayed in real time due to interrupt processing in the system,
 * it is possible for the real time timeout routine (realitexpire, given
 * below) to be delayed in real time past when it is supposed to occur.
 * It does not suffice, therefore, to reload the real timer .it_value
 * from the real time timer's .it_interval.  Rather, we compute the next
 * time in absolute time the timer should go off.
 *
 * MPALMOSTSAFE
 */
int
sys_getitimer(struct sysmsg *sysmsg, const struct getitimer_args *uap)
{
	struct proc *p = curproc;
	struct timeval ctv;
	struct itimerval aitv;

	if (uap->which > ITIMER_PROF)
		return (EINVAL);
	lwkt_gettoken(&p->p_token);
	if (uap->which == ITIMER_REAL) {
		/*
		 * Convert from absolute to relative time in the .it_value
		 * part of the real time timer.  If the time for the real
		 * time timer has passed, return 0; else return the
		 * difference between the current time and the time the
		 * timer is set to go off.
		 */
		aitv = p->p_realtimer;
		if (timevalisset(&aitv.it_value)) {
			getmicrouptime(&ctv);
			if (timevalcmp(&aitv.it_value, &ctv, <))
				timevalclear(&aitv.it_value);
			else
				timevalsub(&aitv.it_value, &ctv);
		}
	} else {
		aitv = p->p_timer[uap->which];
	}
	lwkt_reltoken(&p->p_token);
	return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
}

/*
 * MPALMOSTSAFE
 */
int
sys_setitimer(struct sysmsg *sysmsg, const struct setitimer_args *uap)
{
	struct itimerval aitv;
	struct timeval ctv;
	struct itimerval *itvp;
	struct proc *p = curproc;
	struct getitimer_args gitargs;
	int error;

	if (uap->which > ITIMER_PROF)
		return (EINVAL);
	itvp = uap->itv;
	if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
	    sizeof(struct itimerval))))
		return (error);

	if (uap->oitv) {
		gitargs.which = uap->which;
		gitargs.itv = uap->oitv;
		error = sys_getitimer(sysmsg, &gitargs);
		if (error)
			return error;
	}
	if (itvp == NULL)
		return (0);
	if (itimerfix(&aitv.it_value))
		return (EINVAL);
	if (!timevalisset(&aitv.it_value))
		timevalclear(&aitv.it_interval);
	else if (itimerfix(&aitv.it_interval))
		return (EINVAL);
	lwkt_gettoken(&p->p_token);
	if (uap->which == ITIMER_REAL) {
		if (timevalisset(&p->p_realtimer.it_value))
			callout_cancel(&p->p_ithandle);
		if (timevalisset(&aitv.it_value))
			callout_reset(&p->p_ithandle,
			    tvtohz_high(&aitv.it_value), realitexpire, p);
		getmicrouptime(&ctv);
		timevaladd(&aitv.it_value, &ctv);
		p->p_realtimer = aitv;
	} else {
		p->p_timer[uap->which] = aitv;
		switch(uap->which) {
		case ITIMER_VIRTUAL:
			p->p_flags &= ~P_SIGVTALRM;
			break;
		case ITIMER_PROF:
			p->p_flags &= ~P_SIGPROF;
			break;
		}
	}
	lwkt_reltoken(&p->p_token);
	return (0);
}
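
/*
 * Userland usage sketch (not part of this file): arming a periodic
 * 10ms real-time interval timer, which delivers SIGALRM via
 * realitexpire() below each time it fires:
 *
 *	struct itimerval itv;
 *
 *	itv.it_value.tv_sec = itv.it_interval.tv_sec = 0;
 *	itv.it_value.tv_usec = itv.it_interval.tv_usec = 10000;
 *	setitimer(ITIMER_REAL, &itv, NULL);
 */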

/*
 * Real interval timer expired:
 * send the process whose timer expired an alarm signal.
 * If the timer is not set up to reload, just return.
 * Else compute the next time the timer should go off, which is > current
 * time.  This is where a delay in processing this timeout causes multiple
 * SIGALRM calls to be compressed into one.
 * tvtohz_high() always adds 1 to account for the time until the next
 * clock interrupt being strictly less than 1 clock tick, but we don't
 * want that here since we want to appear to be in sync with the clock
 * interrupt even when we're delayed.
 */
static
void
realitexpire(void *arg)
{
	struct proc *p;
	struct timeval ctv, ntv;

	p = (struct proc *)arg;
	PHOLD(p);
	lwkt_gettoken(&p->p_token);
	ksignal(p, SIGALRM);
	if (!timevalisset(&p->p_realtimer.it_interval)) {
		timevalclear(&p->p_realtimer.it_value);
		goto done;
	}
	for (;;) {
		timevaladd(&p->p_realtimer.it_value,
			   &p->p_realtimer.it_interval);
		getmicrouptime(&ctv);
		if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
			ntv = p->p_realtimer.it_value;
			timevalsub(&ntv, &ctv);
			callout_reset(&p->p_ithandle, tvtohz_low(&ntv),
				      realitexpire, p);
			goto done;
		}
	}
done:
	lwkt_reltoken(&p->p_token);
	PRELE(p);
}

/*
 * Used to validate itimer timeouts and utimes*() timespecs.
 */
int
itimerfix(struct timeval *tv)
{
	if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
		return (EINVAL);
	if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < ustick)
		tv->tv_usec = ustick;
	return (0);
}

/*
 * Used to validate timeouts and utimes*() timespecs.
 */
int
itimespecfix(struct timespec *ts)
{
	if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000ULL)
		return (EINVAL);
	if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < nstick)
		ts->tv_nsec = nstick;
	return (0);
}

/*
 * Decrement an interval timer by a specified number
 * of microseconds, which must be less than a second,
 * i.e. < 1000000.  If the timer expires, then reload
 * it.  In this case, carry over (usec - old value) to
 * reduce the value reloaded into the timer so that
 * the timer does not drift.  This routine assumes
 * that it is called in a context where the timers
 * on which it is operating cannot change in value.
 */
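/*
 * Worked example: with it_value = {0, 500} and usec = 1000 the timer
 * expires 500us into the decrement; the remaining 500us of usec are
 * carried into the reload from it_interval so a periodic timer does
 * not drift.
 */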
int
itimerdecr(struct itimerval *itp, int usec)
{

	if (itp->it_value.tv_usec < usec) {
		if (itp->it_value.tv_sec == 0) {
			/* expired, and already in next interval */
			usec -= itp->it_value.tv_usec;
			goto expire;
		}
		itp->it_value.tv_usec += 1000000;
		itp->it_value.tv_sec--;
	}
	itp->it_value.tv_usec -= usec;
	usec = 0;
	if (timevalisset(&itp->it_value))
		return (1);
	/* expired, exactly at end of interval */
expire:
	if (timevalisset(&itp->it_interval)) {
		itp->it_value = itp->it_interval;
		itp->it_value.tv_usec -= usec;
		if (itp->it_value.tv_usec < 0) {
			itp->it_value.tv_usec += 1000000;
			itp->it_value.tv_sec--;
		}
	} else
		itp->it_value.tv_usec = 0;		/* sec is already 0 */
	return (0);
}

/*
 * Add and subtract routines for timevals.
 * N.B.: subtract routine doesn't deal with
 * results which are before the beginning,
 * it just gets very confused in this case.
 * Caveat emptor.
 */
void
timevaladd(struct timeval *t1, const struct timeval *t2)
{

	t1->tv_sec += t2->tv_sec;
	t1->tv_usec += t2->tv_usec;
	timevalfix(t1);
}

void
timevalsub(struct timeval *t1, const struct timeval *t2)
{

	t1->tv_sec -= t2->tv_sec;
	t1->tv_usec -= t2->tv_usec;
	timevalfix(t1);
}

static void
timevalfix(struct timeval *t1)
{

	if (t1->tv_usec < 0) {
		t1->tv_sec--;
		t1->tv_usec += 1000000;
	}
	if (t1->tv_usec >= 1000000) {
		t1->tv_sec++;
		t1->tv_usec -= 1000000;
	}
}

/*
 * ratecheck(): simple time-based rate-limit checking.
 */
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
	struct timeval tv, delta;
	int rv = 0;

	getmicrouptime(&tv);		/* NB: 10ms precision */
	delta = tv;
	timevalsub(&delta, lasttime);

	/*
	 * The check for 0,0 is so that the message will be seen at least
	 * once, even if the interval is huge.
	 */
	if (timevalcmp(&delta, mininterval, >=) ||
	    (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
		*lasttime = tv;
		rv = 1;
	}

	return (rv);
}
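
/*
 * Usage sketch (illustrative): limiting a diagnostic to once per
 * minute, while still printing the very first occurrence:
 *
 *	static struct timeval lasttime;
 *	static const struct timeval interval = { 60, 0 };
 *
 *	if (ratecheck(&lasttime, &interval))
 *		kprintf("something went wrong\n");
 */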

/*
 * ppsratecheck(): packets (or events) per second limitation.
 *
 * Return 0 if the limit is to be enforced (e.g. the caller
 * should drop a packet because of the rate limitation).
 *
 * maxpps of 0 always causes zero to be returned.  maxpps of -1
 * always causes 1 to be returned; this effectively defeats rate
 * limiting.
 *
 * Note that we maintain the struct timeval for compatibility
 * with other bsd systems.  We reuse the storage and just monitor
 * clock ticks for minimal overhead.
 */
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
	int now;

	/*
	 * Reset the last time and counter if this is the first call
	 * or more than a second has passed since the last update of
	 * lasttime.
	 */
	now = ticks;
	if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
		lasttime->tv_sec = now;
		*curpps = 1;
		return (maxpps != 0);
	} else {
		(*curpps)++;		/* NB: ignore potential overflow */
		return (maxpps < 0 || *curpps < maxpps);
	}
}
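
/*
 * Usage sketch (illustrative): dropping events beyond 10 per second:
 *
 *	static struct timeval lasttime;
 *	static int curpps;
 *
 *	if (!ppsratecheck(&lasttime, &curpps, 10))
 *		return;		(drop: over the 10 pps budget)
 */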

static int
sysctl_gettimeofday_quick(SYSCTL_HANDLER_ARGS)
{
	int error;
	int gtod;

	gtod = gettimeofday_quick;
	error = sysctl_handle_int(oidp, &gtod, 0, req);
	if (error || req->newptr == NULL)
		return error;
	gettimeofday_quick = gtod;
	if (kpmap)
		kpmap->fast_gtod = gtod;
	return 0;
}
1247