1 /* Windows NT Clock Routines
2  *
3  * Created by Sven Dietrich  sven@inter-yacht.com
4  *
5  * New interpolation scheme by Dave Hart <davehart@davehart.com> in
6  * February 2009 overcomes 500us-1ms inherent jitter with the older
7  * scheme, first identified by Peter Rosin (nee Ekberg)
8  * <peda@lysator.liu.se> in 2003 [Bug 216].
9  *
10  * Note:  The Windows port of ntpd uses the C99-snprintf replacement for
11  * (v)snprintf(), also used by msyslog(), which does not understand the
12  * printf format specifier %I64d, only the more common %lld.  With the
13  * minimum supported compiler raised to Visual C++ 2005 in ntp-dev in
14  * August 2011, all MS C runtime routines also understand %lld and %llu.
15  */
16 
17 
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <sys/resource.h>	/* our private version */
23 
24 #if defined(_MSC_VER) && _MSC_VER >= 1400	/* VS 2005 */
25 #include <intrin.h>				/* for __rdtsc() */
26 #endif
27 
28 #ifdef HAVE_PPSAPI
29 #include <timepps.h>
30 /*
31  * ports/winnt/include/timepps.h defines EOPNOTSUPP for compatibility
32  * with PPSAPI on other platforms.  ports/winnt/include/isc/net.h has
33  * #define EOPNOTSUPP WSAEOPNOTSUPP, so to avoid a macro redefinition
34  * warning undefine it.
35  */
36 #undef EOPNOTSUPP
37 #endif	/* HAVE_PPSAPI */
38 
39 #include "ntp_stdlib.h"
40 #include "ntp_unixtime.h"
41 #include "ntp_timer.h"
42 #include "ntp_assert.h"
43 #include "ntp_leapsec.h"
44 #include "clockstuff.h"
45 #include "ntservice.h"
46 #include "ntpd.h"
47 #include "ntpd-opts.h"
48 
49 extern double sys_residual;	/* residual from previous adjustment */
50 
51 /*
52  * Include code to possibly modify the MM timer while the service is active.
53  */
54 
55 /*
56  * Whether or not MM timer modifications takes place is still controlled
57  * by the variable below which is initialized by a default value but
58  * might be changed depending on a command line switch.
59  */
60 static int modify_mm_timer = MM_TIMER_LORES;
61 
62 #define MM_TIMER_INTV   1  /* the interval we'd want to set the MM timer to [ms] */
63 
64 static UINT wTimerRes;
65 
66 BOOL init_randfile();
67 
68 static long last_Adj = 0;
69 
70 #define LS_CORR_INTV_SECS  2   /* seconds to apply leap second correction */
71 #define LS_CORR_INTV   ( 1000ul * LS_CORR_INTV_SECS )
72 #define LS_CORR_LIMIT  ( 250ul )  // quarter second
73 
74 typedef union ft_ull {
75 	FILETIME ft;
76 	ULONGLONG ull;
77 	LONGLONG ll;
78 	LARGE_INTEGER li;
79 } FT_ULL;
80 
81 /* leap second stuff */
82 static FT_ULL ls_ft;
83 static DWORD ls_time_adjustment;
84 
85 static BOOL winnt_time_initialized = FALSE;
86 static BOOL winnt_use_interpolation = FALSE;
87 static unsigned clock_thread_id;
88 
89 
90 void WINAPI GetInterpTimeAsFileTime(LPFILETIME pft);
91 static void StartClockThread(void);
92 static void tune_ctr_freq(LONGLONG, LONGLONG);
93 void StopClockThread(void);
94 void atexit_revert_mm_timer(void);
95 void win_time_stepped(void);
96 
97 static HANDLE clock_thread = NULL;
98 static HANDLE TimerThreadExitRequest = NULL;
99 
100 /*
101  * interp_time estimates time in 100ns units
102  * based on a performance counter value given.
103  * The 2nd parameter indicates if this is
104  * part of a current time-of-day calculation.
105  */
106 ULONGLONG interp_time(ULONGLONG, BOOL);
107 
108 /*
109  * add_counter_time_pair is called by the
110  * high priority clock thread with a new
111  * sample.
112  */
113 void add_counter_time_pair(ULONGLONG, LONGLONG);
114 
115 /*
116  * globals used by the above two functions to
117  * implement the counter/time history
118  */
119 #define BASELINES_TOT	256
120 #define BASELINES_USED	64
121 
122 static volatile int	newest_baseline = 0;
123 static volatile int	newest_baseline_gen = 0;
124 static ULONGLONG	baseline_counts[BASELINES_TOT] = {0};
125 static LONGLONG		baseline_times[BASELINES_TOT] = {0};
126 
127 #define CLOCK_BACK_THRESHOLD	100	/* < 10us unremarkable */
128 static ULONGLONG	clock_backward_max = CLOCK_BACK_THRESHOLD;
129 static int		clock_backward_count;
130 
131 /**
132  * A flag set on Windows versions which ignore small time adjustments.
133  *
134  * Windows Vista and Windows 7 ignore TimeAdjustment less than 16.
135  * @note Has to be checked for Windows Server 2008/2012 and Windows 8.
136  * Ref: http://support.microsoft.com/kb/2537623, bug #2328
137  */
138 static BOOL os_ignores_small_adjustment;
139 
140 /*
141  * clockperiod is the period used for SetSystemTimeAdjustment
142  * slewing calculations but does not necessarily correspond
143  * to the precision of the OS clock.  Prior to Windows Vista
144  * (6.0) the two were identical.  In 100ns units.
145  */
146 static DWORD clockperiod;
147 
148 /*
149  * os_clock_precision is the observed precision of the OS
150  * clock, meaning the increment between discrete values. This
151  * is currently calculated once at startup.  100ns units.
152  */
153 static ULONGLONG os_clock_precision;
154 
155 /*
156  * NomPerfCtrFreq is from QueryPerformanceFrequency and is the
157  * number of performance counter beats per second.  PerfCtrFreq
158  * starts from NomPerfCtrFreq but is maintained using a sliding
159  * window average based on actual performance counter behavior,
160  * to allow us to better tolerate powersaving measures that
161  * alter the effective frequency of the processor cycle counter
162  * (TSC) which sometimes underlies QueryPerformanceCounter.
163  *
164  * Note that the OS is unlikely to be so subtle in its internal
165  * scheduling of waitable timers, presumably done using the
166  * performance counter.  Therefore our calculations for
167  * interpolated time should be based on PerfCtrFreq but our
168  * calculations for SetWaitableTimer should assume the OS will
169  * convert from FILETIME 100ns units to performance counter
170  * beats using the nominal frequency.
171  */
172 
173 volatile ULONGLONG PerfCtrFreq = 0;
174 	 ULONGLONG NomPerfCtrFreq = 0;
175 
176 /*
177  * If we're using RDTSC beating at the same rate as
178  * QueryPerformanceCounter, there is a systemic
179  * offset we need to account for when using
180  * counterstamps from serialpps.sys, which are
181  * always from QPC (actually KeQueryPerformanceCounter).
182  */
183 static LONGLONG QPC_offset = 0;
184 
185 /*
186  * Substitute RDTSC for QueryPerformanceCounter()?
187  */
188 static int use_pcc = -1;
189 
190 /*
191  * Restrict threads that call QPC/RDTSC to one CPU?
192  */
193 static int lock_interp_threads = -1;
194 
195 /*
196  * ppm_per_adjust_unit is parts per million effect on the OS
 * clock per slewing adjustment unit per second.
198  */
199 static DOUBLE ppm_per_adjust_unit;
200 
201 /*
202  * wintickadj emulates the functionality provided by unix tickadj,
203  * providing a baseline clock correction if needed to get the
204  * clock within a few hundred PPM of correct frequency.
205  */
206 static long wintickadj;
207 
208 static void	choose_interp_counter(void);
209 static int	is_qpc_built_on_pcc(void);
210 
211 /*
212  * performance counter frequency observations
213  */
214 #define TUNE_CTR_DEPTH		3	/* running avg depth */
215 
216 static HANDLE		ctr_freq_timer = INVALID_HANDLE_VALUE;
217 static ULONGLONG	tune_ctr_freq_max_interval;
218 static unsigned		tune_ctr_period;
219 void start_ctr_freq_timer(ULONGLONG now_time);
220 void reset_ctr_freq_timer(ULONGLONG when, ULONGLONG now);
221 void reset_ctr_freq_timer_abs(ULONGLONG when);
222 
223 /* round a Windows time to the next bottom of the second */
224 
225 #define ROUND_TO_NEXT_SEC_BOTTOM(t)	\
226 do {	\
227 	(t) += 3 * HECTONANOSECONDS / 2 - 1;	\
228 	(t) /= HECTONANOSECONDS;	\
229 	(t) *= HECTONANOSECONDS;	\
230 	(t) -= HECTONANOSECONDS / 2;	\
231 } while (0)
232 
233 /*
234  * NT native time format is 100's of nanoseconds since 1601-01-01.
235  * Helpers for converting between "hectonanoseconds" and the
236  * performance counter scale from which interpolated time is
237  * derived.
238  */
239 #define HNS2PERF(hns)	((hns) * PerfCtrFreq / HECTONANOSECONDS)
240 #define PERF2HNS(ctr)	((ctr) * HECTONANOSECONDS / PerfCtrFreq)
241 
242 
#if defined(_MSC_VER) && _MSC_VER >= 1400	/* VS 2005 */
/* read the processor cycle counter (TSC) via the compiler intrinsic */
#define	get_pcc()	__rdtsc()
#else
/*
 * something like this can be used for a compiler without __rdtsc()
 */
ULONGLONG __forceinline
get_pcc(void)
{
	/* RDTSC returns in EDX:EAX, same as C compiler */
	/* NOTE: there is deliberately no C "return" statement: the
	 * RDTSC instruction leaves the 64-bit result in EDX:EAX,
	 * which is exactly where MSVC expects a 64-bit return value,
	 * so the value falls through as the function result. */
	__asm {
		RDTSC
	}
}
#endif
258 
259 
260 /*
261  * perf_ctr() returns the current performance counter value,
262  * from QueryPerformanceCounter or RDTSC.
263  */
264 ULONGLONG WINAPI
perf_ctr(void)265 perf_ctr(void)
266 {
267 	FT_ULL ft;
268 
269 	if (use_pcc)
270 		return get_pcc();
271 	else {
272 		QueryPerformanceCounter(&ft.li);
273 		return ft.ull;
274 	}
275 }
276 
277 
278 /*
279  * init_small_adjustment
280  *
281  * Set variable os_ignores_small_adjustment
282  *
283  */
init_small_adjustment(void)284 static void init_small_adjustment(void)
285 {
286 	OSVERSIONINFO vi;
287 	memset(&vi, 0, sizeof(vi));
288 	vi.dwOSVersionInfoSize = sizeof(vi);
289 
290 	if (!GetVersionEx(&vi)) {
291 		msyslog(LOG_WARNING, "GetVersionEx failed with error code %d.", GetLastError());
292 		os_ignores_small_adjustment = FALSE;
293 		return;
294 	}
295 
296 	if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 1) {
297 		// Windows 7 and Windows Server 2008 R2
298 		//
299 		// Windows 7 is documented as affected.
300 		// Windows Server 2008 R2 is assumed affected.
301 		os_ignores_small_adjustment = TRUE;
302 	} else if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 0) {
303 		// Windows Vista and Windows Server 2008
304 		//
305 		// Windows Vista is documented as affected.
306 		// Windows Server 2008 is assumed affected.
307 		os_ignores_small_adjustment = TRUE;
308 	} else {
309 		os_ignores_small_adjustment = FALSE;
310 	}
311 }
312 
313 
314 /*
315  * choose_interp_counter - select between QueryPerformanceCounter and
316  *			   the x86 processor cycle counter (TSC).
317  */
318 static void
choose_interp_counter(void)319 choose_interp_counter(void)
320 {
321 	const char *	ntpd_pcc_freq_text;
322 	int		qpc_built_on_pcc;
323 
324 	/*
325 	 * Regardless of whether we actually use RDTSC, first determine
326 	 * if QueryPerformanceCounter is built on it, so that we can
327 	 * decide whether it's prudent to lock QPC-consuming threads to
328 	 * a particular CPU.
329 	 */
330 	qpc_built_on_pcc = is_qpc_built_on_pcc();
331 	lock_interp_threads = qpc_built_on_pcc;
332 
333 	/*
334 	 * It's time to make some more permanent knobs,
335 	 * but for right now the RDTSC aka PCC dance on x86 is:
336 	 *
337 	 * 1.  With none of these variables defined, only QPC
338 	 *     is used because there is no reliable way to
339 	 *     detect counter frequency variation after ntpd
340 	 *     startup implemented.
341 	 * 2.  We need a better knob, but for now if you know
342 	 *     your RDTSC / CPU frequency is invariant, set
343 	 *     NTPD_PCC and assuming your QPC is based on the
344 	 *     PCC as well, RDTSC will be substituted.
345 	 * 3.  More forcefully, you can jam in a desired exact
346 	 *     processor frequency, expressed in cycles per
347 	 *     second by setting NTPD_PCC_FREQ=398125000, for
348 	 *     example, if yor actual known CPU frequency is
349 	 *     398.125 MHz, and NTPD_PCC doesn't work because
350 	 *     QueryPerformanceCounter is implemented using
351 	 *     another counter.  It is very easy to make ntpd
352 	 *     fall down if the NTPD_PCC_FREQ value isn't very
353 	 *     close to the observed RDTSC units per second.
354 	 *
355 	 * Items 2 and 3 could probably best be combined into one
356 	 * new windows-specific command line switch such as
357 	 *   ntpd --pcc
358 	 * or
359 	 *   ntpd --pcc=398125000
360 	 *
361 	 * They are currently tied to Windows because that is
362 	 * the only ntpd port with its own interpolation, and
363 	 * to x86/x64 because no one has ported the Windows
364 	 * ntpd port to the sole remaining alternative, Intel
365 	 * Itanium.
366 	 */
367 	if (HAVE_OPT(PCCFREQ))
368 		ntpd_pcc_freq_text = OPT_ARG(PCCFREQ);
369 	else
370 		ntpd_pcc_freq_text = getenv("NTPD_PCC_FREQ");
371 
372 	if (!HAVE_OPT(USEPCC)
373 	    && NULL == ntpd_pcc_freq_text
374 	    && NULL == getenv("NTPD_PCC")) {
375 		use_pcc = 0;
376 		return;
377 	}
378 
379 	if (!qpc_built_on_pcc && NULL == ntpd_pcc_freq_text) {
380 		use_pcc = 0;
381 		return;
382 	}
383 
384 	use_pcc = 1;
385 	if (ntpd_pcc_freq_text != NULL)
386 		sscanf(ntpd_pcc_freq_text,
387 		       "%llu",
388 		       &NomPerfCtrFreq);
389 
390 	NLOG(NLOG_CLOCKINFO)
391 		msyslog(LOG_INFO,
392 			"using processor cycle counter "
393 			"%.3f MHz",
394 			NomPerfCtrFreq / 1e6);
395 	return;
396 }
397 
398 
399 /*
400  * is_qpc_built_on_pcc - test if QueryPerformanceCounter runs at the
401  *			 same rate as the processor cycle counter (TSC).
402  */
403 static int
is_qpc_built_on_pcc(void)404 is_qpc_built_on_pcc(void)
405 {
406 	LONGLONG	offset;
407 	FT_ULL		ft1;
408 	FT_ULL		ft2;
409 	FT_ULL		ft3;
410 	FT_ULL		ft4;
411 	FT_ULL		ft5;
412 
413 	REQUIRE(NomPerfCtrFreq != 0);
414 
415 	QueryPerformanceCounter(&ft1.li);
416 	ft2.ull = get_pcc();
417 	Sleep(1);
418 	QueryPerformanceCounter(&ft3.li);
419 	Sleep(1);
420 	ft4.ull = get_pcc();
421 	Sleep(1);
422 	QueryPerformanceCounter(&ft5.li);
423 
424 	offset = ft2.ull - ft1.ull;
425 	ft3.ull += offset;
426 	ft5.ull += offset;
427 
428 	if (ft2.ull <= ft3.ull &&
429 	    ft3.ull <= ft4.ull &&
430 	    ft4.ull <= ft5.ull) {
431 
432 		QPC_offset = offset;
433 		return TRUE;
434 	}
435 
436 	return FALSE;
437 }
438 
439 
440 /*
441  * Request Multimedia Timer
442  */
443 void
set_mm_timer(int timerres)444 set_mm_timer(
445 	int timerres
446 	)
447 {
448 	modify_mm_timer = timerres;
449 }
450 
451 /*
452  * adj_systime - called once every second to discipline system clock.
453  * Normally, the offset passed in (parameter now) is in the range
454  * [-NTP_MAXFREQ, NTP_MAXFREQ].  However, at EVNT_NSET, a much larger
455  * slew is requested if the initial offset is less than the step
456  * threshold, in the range [-step, step] where step is the step
457  * threshold, 128 msec by default.  For the remainder of the frequency
458  * training interval, adj_systime is called with 0 offset each second
459  * and slew the large offset at 500 PPM (500 usec/sec).
460  * Returns 1 if okay, 0 if trouble.
461  */
int
adj_systime(
	double now		/* requested offset correction, seconds */
	)
{
        /* ntp time scale origin as ticks since 1601-01-01 */
        static const ULONGLONG HNS_JAN_1900 = 94354848000000000ull;

	static DWORD ls_start_tick; /* start of slew in 1ms ticks */

	static double	adjtime_carry;	/* excess beyond NTP_MAXFREQ, slewed later */
	double		dtemp;
	u_char		isneg;
	BOOL		rc;
	long		TimeAdjustment;
	SYSTEMTIME	st;
	DWORD		ls_elapsed;
	FT_ULL		curr_ft;
        leap_result_t   lsi;

	/*
	 * Add the residual from the previous adjustment to the new
	 * adjustment, bound and round.
	 */
	dtemp = adjtime_carry + sys_residual + now;
	adjtime_carry = 0.;
	sys_residual = 0.;
	if (dtemp < 0) {
		isneg = TRUE;
		dtemp = -dtemp;
	} else {
		isneg = FALSE;
	}

	/* clamp the per-second slew to NTP_MAXFREQ; the excess is
	 * carried into the next call via adjtime_carry */
	if (dtemp > NTP_MAXFREQ) {
		adjtime_carry = dtemp - NTP_MAXFREQ;
		dtemp = NTP_MAXFREQ;
	}

	if (isneg) {
		dtemp = -dtemp;
		adjtime_carry = -adjtime_carry;
	}

	/* seconds -> microseconds */
	dtemp = dtemp * 1e6;

	/*
	 * dtemp is in micro seconds. NT uses 100 ns units,
	 * so a unit change in TimeAdjustment corresponds
	 * to slewing 10 ppm on a 100 Hz system. Calculate
	 * the number of 100ns units to add, using OS tick
	 * frequency as per suggestion from Harry Pyle,
	 * and leave the remainder in dtemp
	 */
	TimeAdjustment = (long)(dtemp / ppm_per_adjust_unit +
				((isneg)
				     ? -0.5
				     : 0.5));

	if (os_ignores_small_adjustment) {
		/*
		 * As the OS ignores adjustments smaller than 16, we need to
		 * leave these small adjustments in sys_residual, causing
		 * the small values to be averaged over time.
		 */
		if (TimeAdjustment > -16 && TimeAdjustment < 16) {
			TimeAdjustment = 0;
		}
	}

	/* whatever the integral adjustment doesn't cover stays in
	 * dtemp and becomes the new sys_residual below */
	dtemp -= TimeAdjustment * ppm_per_adjust_unit;


	/* If a piping-hot close leap second is pending for the end
         * of this day, determine the UTC time stamp when the transition
         * must take place. (Calculated in the current leap era!)
	 */
	if (leapsec >= LSPROX_ALERT) {
                if (0 == ls_ft.ull && leapsec_frame(&lsi)) {
                        if (lsi.tai_diff > 0) {
                                /* A leap second insert is scheduled at the end
                                 * of the day. Since we have not yet computed the
                                 * time stamp, do it now. Signal electric mode
                                 * for this insert. We start processing 1 second early
				 * because we want to slew over 2 seconds.
                                 */
                                ls_ft.ull = lsi.ttime.Q_s * HECTONANOSECONDS
                                          + HNS_JAN_1900;
                                FileTimeToSystemTime(&ls_ft.ft, &st);
			        msyslog(LOG_NOTICE,
				        "Detected positive leap second announcement "
				        "for %04d-%02d-%02d %02d:%02d:%02d UTC",
				        st.wYear, st.wMonth, st.wDay,
				        st.wHour, st.wMinute, st.wSecond);
				/* slew starts with last second before insertion!
				 * And we have to tell the core that we deal with it.
				 */
                                ls_ft.ull -= (HECTONANOSECONDS + HECTONANOSECONDS/2);
                                leapsec_electric(TRUE);
                        } else if (lsi.tai_diff < 0) {
                                /* Do not handle negative leap seconds here. If this
                                 * happens, let the system step.
                                 */
                                leapsec_electric(FALSE);
                        }
                }
        } else {
                /* The leap second announcement is gone. Happens primarily after
                 * the leap transition, but can also be due to a clock step.
                 * Disarm the leap second, but only if there is one scheduled
                 * and not currently in progress!
                 */
		if (ls_ft.ull != 0 && ls_time_adjustment == 0) {
			ls_ft.ull = 0;
			msyslog(LOG_NOTICE, "Leap second announcement disarmed");
		}
	}

	/*
	 * If the time stamp for the next leap second has been set
	 * then check if the leap second must be handled. We use
	 * free-running milliseconds from 'GetTickCount()', which
	 * is documented as not affected by clock and/or speed
	 * adjustments.
	 */
	if (ls_ft.ull != 0) {
		if (0 == ls_time_adjustment) { /* has not yet been scheduled */
	 		GetSystemTimeAsFileTime(&curr_ft.ft);
			if (curr_ft.ull >= ls_ft.ull) {
				ls_ft.ull = _UI64_MAX; /* guard against second schedule */
				/* spread one full clockperiod over
				 * LS_CORR_INTV_SECS seconds of slew */
				ls_time_adjustment = clockperiod / LS_CORR_INTV_SECS;
				ls_start_tick = GetTickCount();
				msyslog(LOG_NOTICE, "Started leap second insertion.");
			}
			ls_elapsed = 0;
		} else {  /* leap sec adjustment has been scheduled previously */
			ls_elapsed = GetTickCount() - ls_start_tick;
		}

		if (ls_time_adjustment != 0) {  /* leap second adjustment is currently active */
			if (ls_elapsed > (LS_CORR_INTV - LS_CORR_LIMIT)) {
				ls_time_adjustment = 0;  /* leap second adjustment done */
				msyslog(LOG_NOTICE, "Finished leap second insertion.");
			}

			/*
			 * NOTE: While the system time is slewed during the leap second
			 * the interpolation function which is based on the performance
			 * counter does not account for the slew.
			 */
			TimeAdjustment -= ls_time_adjustment;
		}
	}


	/* remainder too small to express goes back into sys_residual */
	sys_residual = dtemp / 1e6;
	DPRINTF(3, ("adj_systime: %.9f -> %.9f residual %.9f",
		    now, 1e-6 * (TimeAdjustment * ppm_per_adjust_unit),
		    sys_residual));
	if (0. == adjtime_carry)
		DPRINTF(3, ("\n"));
	else
		DPRINTF(3, (" adjtime %.9f\n", adjtime_carry));

	/* only adjust the clock if adjustment changes */
	TimeAdjustment += wintickadj;
	if (last_Adj != TimeAdjustment) {
		last_Adj = TimeAdjustment;
		DPRINTF(2, ("SetSystemTimeAdjustment(%+ld)\n", TimeAdjustment));
		rc = SetSystemTimeAdjustment(clockperiod + TimeAdjustment, FALSE);
		if (!rc)
			msyslog(LOG_ERR, "Can't adjust time: %m");
	} else {
		rc = TRUE;
	}

	/* 1 (TRUE) if okay, 0 (FALSE) if SetSystemTimeAdjustment failed */
	return rc;
}
640 
641 
/*
 * init_winnt_time - one-time Windows clock initialization: install
 * console/event handlers, acquire the SE_SYSTEMTIME privilege, read
 * the OS slewing state, optionally raise the multimedia timer
 * resolution, measure clock precision, and decide whether to use
 * our own interpolation or the OS clock directly.
 */
void
init_winnt_time(void)
{
	static const char settod[] = "settimeofday=\"SetSystemTime\"";
	char szMsgPath[MAX_PATH+1];
	HANDLE hToken = INVALID_HANDLE_VALUE;
	TOKEN_PRIVILEGES tkp;
	TIMECAPS tc;
	BOOL noslew;
	DWORD adjclockperiod;
	LARGE_INTEGER Freq;
	FT_ULL initial_hectonanosecs;
	FT_ULL next_hectonanosecs;
	double adjppm;
	double rawadj;
	char * pch;

	/* idempotent: only the first call does any work */
	if (winnt_time_initialized)
		return;

	/* Set up the Console Handler */
	if (!SetConsoleCtrlHandler(OnConsoleEvent, TRUE)) {
		msyslog(LOG_ERR, "Can't set console control handler: %m");
	}

	/* Set the Event-ID message-file name. */
	if (!GetModuleFileName(NULL, szMsgPath, sizeof(szMsgPath))) {
		msyslog(LOG_ERR, "GetModuleFileName(PGM_EXE_FILE) failed: %m");
		exit(1);
	}

	/* Initialize random file before OpenSSL checks */
	if (!init_randfile())
		msyslog(LOG_ERR, "Unable to initialize .rnd file");

#pragma warning(push)
#pragma warning(disable: 4127) /* conditional expression is constant */

#ifdef DEBUG
	if (SIZEOF_TIME_T != sizeof(time_t)
	    || SIZEOF_INT != sizeof(int)
	    || SIZEOF_SIGNED_CHAR != sizeof(char)) {
		msyslog(LOG_ERR, "config.h SIZEOF_* macros wrong, fatal");
		exit(1);
	}
#endif

#pragma warning(pop)

	/* detect OS versions that ignore small time adjustments */
	init_small_adjustment();
        leapsec_electric(TRUE);

	/*
	 * Get privileges needed for fiddling with the clock
	 */

	/* get the current process token handle */
	if (!OpenProcessToken(
		GetCurrentProcess(),
		TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
		&hToken)) {
		msyslog(LOG_ERR, "OpenProcessToken failed: %m");
		exit(-1);
	}
	/* get the LUID for system-time privilege. */
	LookupPrivilegeValue(NULL, SE_SYSTEMTIME_NAME, &tkp.Privileges[0].Luid);
	tkp.PrivilegeCount = 1;  /* one privilege to set */
	tkp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;

	/* get set-time privilege for this process. */
	AdjustTokenPrivileges(hToken, FALSE, &tkp, 0,
	 	(PTOKEN_PRIVILEGES) NULL, 0);

	/* cannot use return value of AdjustTokenPrivileges. */
	/* (success does not indicate all privileges were set) */
	if (GetLastError() != ERROR_SUCCESS) {
		msyslog(LOG_ERR, "AdjustTokenPrivileges failed: %m");
	 	/* later set time call will probably fail */
	}

	CloseHandle(hToken);
	hToken = INVALID_HANDLE_VALUE;

	/*
	 * Say how we're setting the time of day
	 */
	set_sys_var(settod, sizeof(settod), RO);

	/*
	 * ntpd on Windows has always raised its priority, without
	 * requiring -N as on Unix.  Since Windows ntpd doesn't share
	 * the history of unix ntpd of once having no -N and therefore
	 * needing to be invoked under nice, there is no reason to
	 * bring it in line with the Unix version in this regard.
	 * Instsrv assumes ntpd is invoked with no arguments, and
	 * upgrading users would be negatively surprised by the
	 * poor timekeeping if they failed to add -N as part of
	 * upgrading were we to correct this platform difference.
	 */
	if (-1 == setpriority(PRIO_PROCESS, 0, NTP_PRIO))
		exit(-1);

	/* Determine the existing system time slewing */
	if (!GetSystemTimeAdjustment(&adjclockperiod, &clockperiod, &noslew)) {
		msyslog(LOG_ERR, "GetSystemTimeAdjustment failed: %m");
		exit(-1);
	}

	/*
	 * If there is no slewing before ntpd, adjclockperiod and clockperiod
	 * will be equal.  Any difference is carried into adj_systime's first
	 * pass as the previous adjustment.
	 */
	last_Adj = adjclockperiod - clockperiod;

	if (last_Adj)
		msyslog(LOG_INFO,
			"Clock interrupt period %.3f msec "
			"(startup slew %.1f usec/period)",
			clockperiod / 1e4,
			last_Adj / 10.);
	else
		msyslog(LOG_INFO,
			"Clock interrupt period %.3f msec",
			clockperiod / 1e4);

	/*
	 * Calculate the time adjustment resulting from incrementing
	 * units per tick by 1 unit for 1 second
	 */
	ppm_per_adjust_unit = 1e6 / clockperiod;

	/* optional tickadj-style baseline frequency correction */
	pch = getenv("NTPD_TICKADJ_PPM");
	if (pch != NULL && 1 == sscanf(pch, "%lf", &adjppm)) {
		rawadj = adjppm / ppm_per_adjust_unit;
		/* round half away from zero */
		rawadj += (rawadj < 0)
			      ? -0.5
			      : 0.5;
		wintickadj = (long)rawadj;
		msyslog(LOG_INFO,
			"Using NTPD_TICKADJ_PPM %+g ppm (%+ld)",
			adjppm, wintickadj);
	}

	/* get the performance counter ticks per second */
	if (!QueryPerformanceFrequency(&Freq) || !Freq.QuadPart) {
		msyslog(LOG_ERR, "QueryPerformanceFrequency failed: %m");
		exit(-1);
	}

	NomPerfCtrFreq = PerfCtrFreq = Freq.QuadPart;
	msyslog(LOG_INFO,
		"Performance counter frequency %.3f MHz",
		PerfCtrFreq / 1e6);

	/*
	 * With a precise system clock, our interpolation decision is
	 * a slam dunk.
	 */
	if (NULL != pGetSystemTimePreciseAsFileTime) {
		winnt_use_interpolation = FALSE;
		winnt_time_initialized = TRUE;

		return;
	}

	/*
	 * Implement any multimedia timer manipulation requested via -M
	 * option.  This is rumored to be unneeded on Win8 with the
	 * introduction of the precise (interpolated) system clock.
	 */
	if (modify_mm_timer) {
		if (timeGetDevCaps(&tc, sizeof(tc)) == TIMERR_NOERROR) {
			/* clamp our desired interval to the hardware's range */
			wTimerRes = min(max(tc.wPeriodMin, MM_TIMER_INTV), tc.wPeriodMax);
			timeBeginPeriod(wTimerRes);
			atexit(atexit_revert_mm_timer);

			msyslog(LOG_INFO, "MM timer resolution: %u..%u msec, set to %u msec",
				tc.wPeriodMin, tc.wPeriodMax, wTimerRes );

			/* Pause briefly before measuring the clock precision, see [Bug 2790] */
			Sleep( 33 );

		} else {
			msyslog(LOG_ERR, "Multimedia timer unavailable");
		}
	}

	/*
	 * Spin on GetSystemTimeAsFileTime to determine its
	 * granularity.  Prior to Windows Vista this is
	 * typically the same as the clock period.
	 */
	GetSystemTimeAsFileTime(&initial_hectonanosecs.ft);
	do {
		GetSystemTimeAsFileTime(&next_hectonanosecs.ft);
	} while (initial_hectonanosecs.ull == next_hectonanosecs.ull);

	os_clock_precision = next_hectonanosecs.ull -
		initial_hectonanosecs.ull;

	msyslog(LOG_INFO,
		"Windows clock precision %.3f msec, min. slew %.3f ppm/s",
		os_clock_precision / 1e4, ppm_per_adjust_unit);

	winnt_time_initialized = TRUE;

	choose_interp_counter();

	/* use the OS clock directly when forced via env var, or when
	 * its precision beats 4 msec (unless overridden) */
	if (getenv("NTPD_USE_SYSTEM_CLOCK") ||
	    (os_clock_precision < 4 * 10000 &&
	     !getenv("NTPD_USE_INTERP_DANGEROUS"))) {
		msyslog(LOG_INFO, "using Windows clock directly");
	} else {
		winnt_use_interpolation = TRUE;
		get_sys_time_as_filetime = GetInterpTimeAsFileTime;
		StartClockThread();
	}
}
861 
862 
863 void
atexit_revert_mm_timer(void)864 atexit_revert_mm_timer(void)
865 {
866 	timeEndPeriod(wTimerRes);
867 	DPRINTF(1, ("MM timer resolution reset\n"));
868 }
869 
870 
871 void
reset_winnt_time(void)872 reset_winnt_time(void)
873 {
874 	SYSTEMTIME st;
875 
876 	/*
877 	 * If we're in the 2-second slew right after a leap second,
878 	 * we don't want to continue that extreme slew, in that case
879 	 * disable our slewing and return clock discipline to the
880 	 * kernel.  Similarly if we are not yet synchronized,
881 	 * our current slew may not be a good ongoing trim.
882 	 * Otherwise, our leave in place the last SetSystemTimeAdjustment
883 	 * as an ongoing frequency correction, better than nothing.
884 	 * TODO:
885 	 * Verify this will not call SetSystemTimeAdjustment if
886 	 * ntpd is running in ntpdate mode.
887 	 */
888 	if (sys_leap == LEAP_NOTINSYNC || ls_time_adjustment != 0)
889 		SetSystemTimeAdjustment(0, TRUE);
890 
891 	/*
892 	 * Read the current system time, and write it back to
893 	 * force CMOS update, only if we are exiting because
894 	 * the computer is shutting down and we are already
895 	 * synchronized.
896 	 */
897 	 if (ntservice_systemisshuttingdown() && sys_leap != LEAP_NOTINSYNC) {
898 		GetSystemTime(&st);
899 		SetSystemTime(&st);
900 		NLOG(NLOG_SYSEVENT | NLOG_CLOCKINFO)
901 			msyslog(LOG_NOTICE, "system is shutting down, CMOS time reset.");
902 	}
903 }
904 
905 
906 /*
907  * GetSystemTimeAsFileTime() interface clone is used by getclock() in ntpd.
908  */
909 
910 void WINAPI
GetInterpTimeAsFileTime(LPFILETIME pft)911 GetInterpTimeAsFileTime(
912 	LPFILETIME pft
913 	)
914 {
915 	static ULONGLONG last_interp_time;
916 	FT_ULL now_time;
917 	FT_ULL now_count;
918 	ULONGLONG clock_backward;
919 
920 	/*
921 	 * Mark a mark ASAP.  The latency to here should be reasonably
922 	 * deterministic
923 	 */
924 
925 	now_count.ull = perf_ctr();
926 	now_time.ull = interp_time(now_count.ull, TRUE);
927 
928 	if (last_interp_time <= now_time.ull) {
929 		last_interp_time = now_time.ull;
930 	} else {
931 		clock_backward = last_interp_time - now_time.ull;
932 		if (clock_backward > clock_backward_max) {
933 			clock_backward_max = clock_backward;
934 			clock_backward_count++;
935 		}
936 		now_time.ull = last_interp_time;
937 	}
938 	*pft = now_time.ft;
939 
940 	return;
941 }
942 
943 
944 /*
945  * TimerApcFunction is invoked on the high-priority clock
946  * thread to capture a new  baseline system time and
947  * performance counter correlation every 43 msec (64Hz
948  * OS clock precision).
949  */
950 static void CALLBACK
TimerApcFunction(LPVOID lpArgToCompletionRoutine,DWORD dwTimerLowValue,DWORD dwTimerHighValue)951 TimerApcFunction(
952 	LPVOID lpArgToCompletionRoutine,
953 	DWORD dwTimerLowValue,
954 	DWORD dwTimerHighValue
955 	)
956 {
957 	static BOOL		ctr_freq_timer_started = FALSE;
958 	static ULONGLONG	prev_count;
959 	ULONGLONG		now_time;
960 	FT_ULL			now_count;
961 
962 	/* Grab the counter first of all */
963 	now_count.ull = perf_ctr();
964 
965 	now_time = (((ULONGLONG)dwTimerHighValue << 32) |
966 				dwTimerLowValue);
967 
968 	/*
969 	 * Save this correlation in the history.
970 	 */
971 	add_counter_time_pair(now_count.ull, now_time);
972 
973 	/*
974 	 * Once we're synchronized start the counter frequency
975 	 * tuning timer.
976 	 */
977 	if (INVALID_HANDLE_VALUE == ctr_freq_timer &&
978 	    LEAP_NOTINSYNC != sys_leap)
979 		start_ctr_freq_timer(now_time);
980 }
981 
982 
983 unsigned WINAPI
ClockThread(void * arg)984 ClockThread(
985 	void *arg
986 	)
987 {
988 	LARGE_INTEGER	DueTime;
989 	HANDLE		timer;
990 	double		HZ;
991 	double		TimerHz;
992 	DWORD		timer_period_msec;
993 	DWORD		res;
994 	char		*ntpd_int_int_text;
995 
996 	UNUSED_ARG(arg);
997 
998 	timer = CreateWaitableTimer(NULL, FALSE, NULL);
999 
1000 	ntpd_int_int_text = getenv("NTPD_INT_INT");
1001 
1002 	HZ = (double)HECTONANOSECONDS / clockperiod;
1003 
1004 	if (HZ > 63 && HZ < 65) {
1005 		timer_period_msec = 43;
1006 	} else if (HZ > 98 && HZ < 102) {
1007 		timer_period_msec = 27;
1008 		if (NULL == ntpd_int_int_text)
1009 			msyslog(LOG_WARNING,
1010 				"%.3f Hz system clock may benefit from "
1011 				"custom NTPD_INT_INT env var timer interval "
1012 				"override between approx. 20 and 50 msecs.",
1013 				HZ);
1014 	} else {
1015 		timer_period_msec = (DWORD)(0.5 + (2.752 * clockperiod / 10000));
1016 		if (NULL == ntpd_int_int_text)
1017 			msyslog(LOG_WARNING,
1018 				"unfamiliar %.3f Hz system clock may benefit "
1019 				"from custom NTPD_INT_INT env var timer "
1020 				"interval override between approx. 20 and 50 "
1021 				"msecs.",
1022 				HZ);
1023 	}
1024 
1025 	if (ntpd_int_int_text != NULL) {
1026 		timer_period_msec = atoi(ntpd_int_int_text);
1027 		timer_period_msec = max(9, timer_period_msec);
1028 		msyslog(LOG_NOTICE,
1029 			"using NTPD_INT_INT env var override %u",
1030 			timer_period_msec);
1031 	}
1032 
1033 	TimerHz = 1e3 / timer_period_msec;
1034 	msyslog(LOG_NOTICE, "HZ %.3f using %u msec timer %.3f Hz %d deep",
1035 		HZ,
1036 		timer_period_msec,
1037 		TimerHz,
1038 		BASELINES_USED);
1039 
1040 	/* negative DueTime means relative to now */
1041 	DueTime.QuadPart = -(int)timer_period_msec;
1042 
1043 	SetWaitableTimer(
1044 		timer,
1045 		&DueTime,		/* first fire */
1046 		timer_period_msec,	/* period thereafter */
1047 		TimerApcFunction,	/* callback routine */
1048 		&timer,			/* context for callback */
1049 		FALSE);			/* do not interfere with power saving */
1050 
1051 	/*
1052 	 * The clock thread spends the rest of its life in the TimerApcFunction
1053 	 * and ctr_freq_timer_fired timer APC callbacks, which can only occur
1054 	 * while this thread is in an alertable wait.  Note the Ex on
1055 	 * WaitForSingleObjectEx and TRUE for fAlertable.  The wait will return
1056 	 * after each APC callback in which case we simply wait again.  We will
1057 	 * break out of the loop when StopClockThread signals our exit event.
1058 	 */
1059 	do res = WaitForSingleObjectEx(
1060 			TimerThreadExitRequest,
1061 			INFINITE,
1062 			TRUE);
1063 	while (WAIT_OBJECT_0 != res);
1064 
1065 	CloseHandle(timer);
1066 
1067 	if (ctr_freq_timer != INVALID_HANDLE_VALUE) {
1068 		CloseHandle(ctr_freq_timer);
1069 		ctr_freq_timer = INVALID_HANDLE_VALUE;
1070 	}
1071 
1072 	return 0;
1073 }
1074 
1075 
/*
 * StartClockThread - seed the baseline history with the current
 * time/counter pair, then launch the high-priority clock thread
 * that keeps the correlation history fresh.
 */
static void
StartClockThread(void)
{
	static BOOL done_once = FALSE;	/* one-shot guard for process-wide setup */
	FT_ULL StartTime;

	/* init variables with the time now */
	GetSystemTimeAsFileTime(&StartTime.ft);
	baseline_times[0] = StartTime.ull;
	baseline_counts[0] = perf_ctr();

	/* init sync objects */
	TimerThreadExitRequest = CreateEvent(NULL, FALSE, FALSE, NULL);

	/* created suspended so priority/affinity are set before it runs */
	clock_thread =
		(HANDLE)_beginthreadex(
			NULL,
			0,
			ClockThread,
			NULL,
			CREATE_SUSPENDED,
			&clock_thread_id);

	if (clock_thread != NULL) {
		/* remember the thread priority is only within the process class */
		if (!SetThreadPriority(clock_thread, THREAD_PRIORITY_TIME_CRITICAL)) {
			DPRINTF(1, ("Error setting thread priority\n"));
		}

		lock_thread_to_processor(clock_thread);
		ResumeThread(clock_thread);

		if (FALSE == done_once) {
			done_once = TRUE;
			/*
			 * Keep this (calling) thread on the same CPU as the
			 * clock thread so QPC readings are comparable, and
			 * ensure the clock thread is stopped at exit.
			 */
			lock_thread_to_processor(GetCurrentThread());
			atexit( StopClockThread );
		}

		/*
		 * Give the clock thread time to fill its counter/time
		 * sample buffer.  This will underfill the buffer a
		 * bit for sample periods over 43 msec.
		 */
		Sleep(BASELINES_USED * 43);
	}
}
1122 
1123 
1124 void
StopClockThread(void)1125 StopClockThread(void)
1126 {
1127 	/*
1128 	 * if the clock thread exit()s this routine
1129 	 * will be called on the clock thread and
1130 	 * we need not (and can't) use the normal
1131 	 * TimerThreadExitRequest event.
1132 	 */
1133 	if (GetCurrentThreadId() != clock_thread_id) {
1134 
1135 		if (!SetEvent(TimerThreadExitRequest) ||
1136 		    WaitForSingleObject(clock_thread, 2 * 1000) !=
1137 		    WAIT_OBJECT_0) {
1138 			msyslog(LOG_ERR, "Failed to stop clock thread.");
1139 		}
1140 	}
1141 	CloseHandle(TimerThreadExitRequest);
1142 	TimerThreadExitRequest = NULL;
1143 	CloseHandle(clock_thread);
1144 	clock_thread = NULL;
1145 }
1146 
1147 
1148 void
lock_thread_to_processor(HANDLE thread)1149 lock_thread_to_processor(HANDLE thread)
1150 {
1151 	static	DWORD_PTR	ProcessAffinityMask;
1152 	static	DWORD_PTR	ThreadAffinityMask;
1153 	DWORD_PTR		SystemAffinityMask;
1154 	char			*cputext;
1155 	unsigned int		cpu;
1156 
1157 	if ( ! winnt_time_initialized) {
1158 		DPRINTF(1, ("init_winnt_time() must be called before "
1159 				"lock_thread_to_processor(), exiting\n"));
1160 		exit(-1);
1161 	}
1162 
1163 	if (!winnt_use_interpolation)
1164 		return;
1165 
1166 	if (-1 == lock_interp_threads) {
1167 		DPRINTF(1, ("choose_interp_counter() is not called "
1168 			    "before lock_thread_to_processor()\n"));
1169 		exit(-1);
1170 	} else if (!lock_interp_threads)
1171 		return;
1172 
1173 	/*
1174 	 * Calculate the ThreadAffinityMask we'll use once on the
1175 	 * first invocation.
1176 	 */
1177 	if (!ProcessAffinityMask) {
1178 
1179 		/*
1180 		 * Choose which processor to nail the main and clock threads to.
1181 		 * If we have more than one, we simply choose the 2nd.
1182 		 * Randomly choosing from 2 to n would be better, but in
1183 		 * either case with clock and network interrupts more likely
1184 		 * to be serviced by the first procecssor, let's stay away
1185 		 * from it.  QueryPerformanceCounter is not necessarily
1186 		 * consistent across CPUs, hence the need to nail the two
1187 		 * threads involved in QPC-based interpolation to the same
1188 		 * CPU.
1189 		 */
1190 
1191 		GetProcessAffinityMask(
1192 			GetCurrentProcess(),
1193 			&ProcessAffinityMask,
1194 			&SystemAffinityMask);
1195 
1196 		/*
1197 		 * respect NTPD_CPU environment variable if present
1198 		 * for testing.  NTPD_CPU=0 means use all CPUs, 1-64
1199 		 * means lock threads involved in interpolation to
1200 		 * that CPU.  Default to 2nd if more than 1.
1201 		 */
1202 
1203 		cpu = 2;
1204 		cputext = getenv("NTPD_CPU");
1205 		if (cputext) {
1206 			cpu = (unsigned int) atoi(cputext);
1207 			cpu = min((8 * sizeof(DWORD_PTR)), cpu);
1208 		}
1209 
1210 		/*
1211 		 * Clear all bits except the 2nd.  If we have only one proc
1212 		 * that leaves ThreadAffinityMask zeroed and we won't bother
1213 		 * with SetThreadAffinityMask.
1214 		 */
1215 
1216 		ThreadAffinityMask = (0 == cpu) ? 0 : (1 << (cpu - 1));
1217 
1218 		if (ThreadAffinityMask &&
1219 			!(ThreadAffinityMask & ProcessAffinityMask))
1220 
1221 			DPRINTF(1, ("Selected CPU %u (mask %x) is outside "
1222 					"process mask %x, using all CPUs.\n",
1223 					cpu, ThreadAffinityMask,
1224 					ProcessAffinityMask));
1225 		else
1226 			DPRINTF(1, ("Wiring to processor %u (0 means all) "
1227 					"affinity mask %x\n",
1228 					cpu, ThreadAffinityMask));
1229 
1230 		ThreadAffinityMask &= ProcessAffinityMask;
1231 	}
1232 
1233 	if (ThreadAffinityMask &&
1234 	    !SetThreadAffinityMask(thread, ThreadAffinityMask))
1235 		msyslog(LOG_ERR,
1236 			"Unable to wire thread to mask %x: %m",
1237 			ThreadAffinityMask);
1238 }
1239 
1240 
1241 #ifdef HAVE_PPSAPI
1242 static inline void ntp_timestamp_from_counter(l_fp *, ULONGLONG,
1243 					      ULONGLONG);
1244 
/*
 * helper routine for serial PPS which returns QueryPerformanceCounter
 * timestamp and needs to interpolate it to an NTP timestamp.
 */
void
pps_ntp_timestamp_from_counter(
	ntp_fp_t	*result,
	ULONGLONG	Timestamp,
	ULONGLONG	Counterstamp
	)
{
	/*
	 * convert between equivalent l_fp and PPSAPI ntp_fp_t
	 * (the two types are assumed layout-compatible, so a
	 * pointer cast suffices -- no copying is performed)
	 */
	ntp_timestamp_from_counter(
		(l_fp *)result,
		Timestamp,
		Counterstamp);
}
1264 
1265 
/*
 * ntp_timestamp_from_counter - convert a performance counter capture
 * (plus a fallback FILETIME) into an NTP l_fp timestamp.
 *
 * Three strategies, best first:
 *  1. interpolate from the baseline history (winnt_use_interpolation);
 *  2. read GetSystemTimePreciseAsFileTime and back it up by the time
 *     elapsed since the counter was captured;
 *  3. use the driver-supplied system-time Timestamp as-is.
 */
static inline
void
ntp_timestamp_from_counter(
	l_fp *result,
	ULONGLONG Timestamp,
	ULONGLONG Counterstamp
	)
{
	FT_ULL		Now;
	FT_ULL		Ctr;
	LONGLONG	CtrDelta;
	double		seconds;
	ULONGLONG	InterpTimestamp;

	if (winnt_use_interpolation) {
		if (0 == Counterstamp) {
			/* no counter capture: nothing to interpolate from */
			DPRINTF(1, ("ntp_timestamp_from_counter rejecting 0 counter.\n"));
			ZERO(*result);
			return;
		}

		InterpTimestamp = interp_time(Counterstamp + QPC_offset, FALSE);
	} else {  /* ! winnt_use_interpolation */
		if (NULL != pGetSystemTimePreciseAsFileTime &&
		    0 != Counterstamp) {
			/*
			 * Back the precise current time up by the interval
			 * since the counter was captured.
			 */
			QueryPerformanceCounter(&Ctr.li);
			(*pGetSystemTimePreciseAsFileTime)(&Now.ft);
			CtrDelta = Ctr.ull - Counterstamp;
			seconds = (double)CtrDelta / PerfCtrFreq;
			InterpTimestamp = Now.ull -
			    (ULONGLONG)(seconds * HECTONANOSECONDS);
		} else {
			/* have to simply use the driver's system time timestamp */
			InterpTimestamp = Timestamp;
			/* NOTE(review): Now is filled here but never used
			   in this branch -- looks vestigial; confirm. */
			GetSystemTimeAsFileTime(&Now.ft);
		}
	}

	/* convert from 100ns units to NTP fixed point format */

	InterpTimestamp -= FILETIME_1970;
	result->l_ui = JAN_1970 + (u_int32)(InterpTimestamp / HECTONANOSECONDS);
	result->l_uf = (u_int32)((InterpTimestamp % HECTONANOSECONDS) *
				 (ULONGLONG)FRAC / HECTONANOSECONDS);
}
1311 #endif  /* HAVE_PPSAPI */
1312 
1313 
void
win_time_stepped(void)
{
	/*
	 * called back by ntp_set_tod after the system
	 * time has been stepped (set).
	 *
	 * We normally prevent the reported time from going backwards
	 * but need to allow it in this case.
	 */
	if (FALSE == winnt_use_interpolation)
		return;


	/*
	 * Restart the clock thread to get a new baseline
	 * time/counter correlation.
	 */
	StopClockThread();

	/*
	 * newest_baseline_gen is a generation counter
	 * incremented once each time newest_baseline
	 * is reset.  Bumping it invalidates the
	 * per-thread caches kept by interp_time().
	 */
	newest_baseline_gen++;

	/* discard backward-clock stats and the entire baseline history */
	clock_backward_max = CLOCK_BACK_THRESHOLD;
	clock_backward_count = 0;
	newest_baseline = 0;
	ZERO(baseline_counts);
	ZERO(baseline_times);

	StartClockThread();
}
1349 
1350 
1351 /*
1352  * log2ull - log base 2 of a unsigned 64-bit number
1353  */
1354 int
log2ull(ULONGLONG n)1355 log2ull(
1356 	ULONGLONG n
1357 	)
1358 {
1359 	const ULONGLONG one = 1;
1360 	int log = 0;
1361 
1362 	if (n >= one<<32) { n >>= 32; log += 32; }
1363 	if (n >= one<<16) { n >>= 16; log += 16; }
1364 	if (n >= one<< 8) { n >>=  8; log +=  8; }
1365 	if (n >= one<< 4) { n >>=  4; log +=  4; }
1366 	if (n >= one<< 2) { n >>=  2; log +=  2; }
1367 	if (n >= one<< 1) {	      log +=  1; }
1368 
1369 	return (n) ? log : (-1);
1370 }
1371 
1372 
/*
 * ctr_freq_timer_fired is called once a few seconds before
 * tune_ctr_period seconds have elapsed, to reset the timer
 * and hopefully minimize error due to the system using the
 * nominal performance counter frequency to set the timer
 * internally, which is typically dozens of PPM from the
 * actual performance counter rate.  A few seconds later
 * it is called again to observe the counter and estimate the
 * counter frequency.
 */
static void CALLBACK
ctr_freq_timer_fired(
	LPVOID arg,		/* timer context, unused here */
	DWORD dwTimeLow,
	DWORD dwTimeHigh
	)
{
	static	FT_ULL		begin_time = {0};	/* start of current observation period */
	static	FT_ULL		begin_count = {0};	/* perf counter at period start */
	static	ULONGLONG	next_period_time = 0;	/* when the period ends */
	static	ULONGLONG	report_systemtime = 0;	/* next backward-clock report time */
	const	ULONGLONG	five_minutes = 5ui64 * 60 * HECTONANOSECONDS;
	FT_ULL			now_time;
	FT_ULL			now_count;

	/*
	 * First invocation: record the baseline pair and schedule
	 * the end of the first observation period.
	 */
	if (!begin_time.ull) {
		begin_count.ull = perf_ctr();
		begin_time.ft.dwLowDateTime = dwTimeLow;
		begin_time.ft.dwHighDateTime = dwTimeHigh;

		/*
		 * adapt perf ctr observation interval to the
		 * counter frequency
		 */
		tune_ctr_period = 22680 / log2ull(NomPerfCtrFreq);

		/*
		 * reset timer 2s before period ends to minimize
		 * error from OS timer routines using nominal
		 * performance frequency internally.
		 */
		tune_ctr_freq_max_interval = tune_ctr_period - 2;

		next_period_time = begin_time.ull +
			(ULONGLONG)tune_ctr_period * HECTONANOSECONDS;

		ROUND_TO_NEXT_SEC_BOTTOM(next_period_time);

		reset_ctr_freq_timer(next_period_time, begin_time.ull);

		return;
	}

	now_time.ft.dwLowDateTime = dwTimeLow;
	now_time.ft.dwHighDateTime = dwTimeHigh;

	/*
	 * Period complete: hand the counter and time deltas to the
	 * frequency tuner and start the next period from here.
	 */
	if (now_time.ull >= next_period_time) {
		now_count.ull = perf_ctr();
		tune_ctr_freq(
			now_count.ull - begin_count.ull,
			now_time.ull - begin_time.ull);
		next_period_time += (ULONGLONG)tune_ctr_period * HECTONANOSECONDS;
		begin_count.ull = now_count.ull;
		begin_time.ull = now_time.ull;
	}

	/*
	 * Log clock backward events no more often than 5 minutes.
	 */
	if (!report_systemtime) {
		report_systemtime = now_time.ull + five_minutes;
	} else if (report_systemtime <= now_time.ull) {
		report_systemtime +=  five_minutes;
		if (clock_backward_count) {
			msyslog(LOG_WARNING,
				"clock would have gone backward %d times, "
				"max %.1f usec",
				clock_backward_count,
				clock_backward_max / 10.);

			clock_backward_max = CLOCK_BACK_THRESHOLD;
			clock_backward_count = 0;
		}
	}
	reset_ctr_freq_timer(next_period_time, now_time.ull);
}
1459 
1460 
1461 void
reset_ctr_freq_timer_abs(ULONGLONG when)1462 reset_ctr_freq_timer_abs(
1463 	ULONGLONG when
1464 	)
1465 {
1466 	FT_ULL	fire_time;
1467 
1468 	fire_time.ull = when;
1469 	SetWaitableTimer(
1470 		ctr_freq_timer,
1471 		&fire_time.li,		/* first fire */
1472 		0,			/* not periodic */
1473 		ctr_freq_timer_fired,	/* callback routine */
1474 		NULL,			/* context for callback */
1475 		FALSE);			/* do not interfere with power saving */
1476 }
1477 
1478 
1479 void
reset_ctr_freq_timer(ULONGLONG when,ULONGLONG now)1480 reset_ctr_freq_timer(
1481 	ULONGLONG when,
1482 	ULONGLONG now
1483 	)
1484 {
1485 	if (when - now >
1486 	    (tune_ctr_freq_max_interval * HECTONANOSECONDS + HECTONANOSECONDS))
1487 		when = now + tune_ctr_freq_max_interval * HECTONANOSECONDS;
1488 
1489 	reset_ctr_freq_timer_abs(when);
1490 }
1491 
1492 
1493 void
start_ctr_freq_timer(ULONGLONG now_time)1494 start_ctr_freq_timer(
1495 	ULONGLONG now_time
1496 	)
1497 {
1498 	ULONGLONG when;
1499 
1500 	ctr_freq_timer = CreateWaitableTimer(NULL, FALSE, NULL);
1501 	when = now_time;
1502 	ROUND_TO_NEXT_SEC_BOTTOM(when);
1503 
1504 	reset_ctr_freq_timer_abs(when);
1505 }
1506 
1507 
/*
 * tune_ctr_freq is called once per tune_ctr_period seconds
 * with a counter difference and time difference.
 */
void
tune_ctr_freq(
	LONGLONG ctr_delta,
	LONGLONG time_delta
	)
{
	static unsigned count = 0;		/* believed samples accepted */
	static unsigned dispcount = 0;		/* samples since last report */
	static unsigned report_at_count = 0;	/* report interval (~daily) */
	static int disbelieved = 0;		/* rejected samples since last report */
	static int i = 0;			/* diffs[] ring-buffer index */
	static double nom_freq = 0;		/* nominal counter freq in MHz */
	static LONGLONG diffs[TUNE_CTR_DEPTH] = {0};	/* recent offsets from nominal */
	static LONGLONG sum = 0;		/* running sum of diffs[] */
	char ctr_freq_eq[64];
	LONGLONG delta;
	LONGLONG deltadiff;
	ULONGLONG ObsPerfCtrFreq;
	double freq;
	double this_freq;
	BOOL isneg;

	/* one-time initialization */
	if (!report_at_count) {
		report_at_count = 24 * 60 * 60 / tune_ctr_period;
		nom_freq = NomPerfCtrFreq / 1e6;
	}

	/* delta is the per-second observed frequency this time */
	delta = (LONGLONG)((double)ctr_delta * HECTONANOSECONDS /
			   time_delta);

	/* disbelieve any delta more than +/- 976 PPM from nominal */
	deltadiff = delta - NomPerfCtrFreq;
	if (0 > deltadiff) {
		isneg = TRUE;
		deltadiff = -deltadiff;	/* work with the magnitude below */
	} else {
		isneg = FALSE;
	}

	/* 1/1024 of nominal is roughly 976 PPM */
	if ((ULONGLONG)deltadiff > (NomPerfCtrFreq / 1024)) {
		disbelieved++;
		dispcount++;
#ifdef DEBUG
		msyslog(LOG_DEBUG, "ctr delta %s%lld exceeds limit %llu",
				   (isneg) ? "-" : "",
				   deltadiff,
				   NomPerfCtrFreq / 1024);
#endif
	} else {

		/*
		 * collect average over TUNE_CTR_DEPTH samples
		 * for our PerfCtrFreq trimming.
		 */

		if (isneg)
			deltadiff = -deltadiff;	/* restore the sign stripped above */
		sum -= diffs[i];	/* drop the sample being overwritten */
		diffs[i] = deltadiff;
		sum += deltadiff;
		i = (i + 1) % COUNTOF(diffs);
		count++;
		dispcount++;
	}

	this_freq = delta / 1e6;

	/* smoothed observed frequency: nominal plus mean recent offset */
	ObsPerfCtrFreq = NomPerfCtrFreq + (sum / COUNTOF(diffs));

#if 1	/* #if 0 to disable changing freq used */
	/* get rid of ObsPerfCtrFreq when removing the #ifdef */
	PerfCtrFreq = ObsPerfCtrFreq;
#endif
	freq = PerfCtrFreq / 1e6;

	/*
	 * make the performance counter's frequency error from its
	 * nominal rate, expressed in PPM, available via ntpq as
	 * system variable "ctr_frequency".  This is consistent with
	 * "frequency" which is the system clock drift in PPM.
	 */
	snprintf(ctr_freq_eq, sizeof(ctr_freq_eq), "ctr_frequency=%.2f",
		 1e6 * (freq - nom_freq) / nom_freq);
	set_sys_var(ctr_freq_eq, strlen(ctr_freq_eq) + 1, RO | DEF);

	/*
	 * report observed ctr freq each time the estimate used during
	 * startup moves toward the observed freq from the nominal.
	 */

	if (count > COUNTOF(diffs) &&
	    /* (count % COUNTOF(diffs)) && */	/* enables reporting each */
	    dispcount < report_at_count)	/* TUNE_CTR_DEPTH samples */
		return;

	NLOG(NLOG_CLOCKINFO)
		if (count <= COUNTOF(diffs))
			/* moving to observed freq. from nominal (startup) */
			msyslog(LOG_INFO,
				(freq > 100)
				   ? "ctr %.3f MHz %+6.2f PPM using %.3f MHz %+6.2f PPM"
				   : "ctr %.6f MHz %+6.2f PPM using %.6f MHz %+6.2f PPM",
				this_freq,
				1e6 * (this_freq - nom_freq) / nom_freq,
				freq,
				1e6 * (freq - nom_freq) / nom_freq);
		else
			/* steady state */
			msyslog(LOG_INFO,
				(freq > 100)
				   ? "ctr %.3f MHz %+.2f PPM"
				   : "ctr %.6f MHz %+.2f PPM",
				freq,
				1e6 * (freq - nom_freq) / nom_freq);

	if (disbelieved) {
		msyslog(LOG_ERR,
			"%d ctr samples exceed +/- 976 PPM range gate",
			disbelieved);
		disbelieved = 0;
	}

	dispcount = 0;
}
1638 
1639 
1640 /*
1641  * add_counter_time_pair is called by the
1642  * high priority clock thread with each new
1643  * baseline counter/time correlation.
1644  */
1645 void
add_counter_time_pair(ULONGLONG ctr,LONGLONG time)1646 add_counter_time_pair(
1647 	ULONGLONG ctr,
1648 	LONGLONG time
1649 	)
1650 {
1651 	int i;
1652 
1653 	i = (newest_baseline + 1) % BASELINES_TOT;
1654 
1655 	baseline_counts[i] = ctr;
1656 	baseline_times[i] = time;
1657 
1658 	newest_baseline = i;
1659 }
1660 
1661 
/*
 * interp_time estimates NT time in 100ns units
 * based on a performance counter value given.
 * This must tolerate recent historical counters
 * as well as current.  When current is FALSE
 * we can't assume ctr is the latest/highest
 * seen.
 */
ULONGLONG
interp_time(
	ULONGLONG ctr,
	BOOL current
	)
{
	/* per-thread cache of the most recent baseline selection */
	static __declspec(thread) int		last_newest = -1;
	static __declspec(thread) int		last_newest_gen;
	static __declspec(thread) int		best_index;	/* winning baseline slot */
	ULONGLONG	this_ctr;
	LONGLONG	this_time;
	LONGLONG	latest_time;
	LONGLONG	ctr_diff;
	int		i;
	int		i_gen;
	int		c;

	/*
	 * Use the system time (roughly synchronised to the tick, and
	 * extrapolated using the system performance counter.
	 *
	 * Cache the results per thread and only repeat the
	 * calculation when new data has arrived.
	 */
	i = newest_baseline;
	i_gen = newest_baseline_gen;

	/* fast path: no new baseline since our last call on this thread */
	if (last_newest == i && last_newest_gen == i_gen) {
		this_time = baseline_times[best_index];
		ctr_diff = ctr - baseline_counts[best_index];
		this_time += (LONGLONG)PERF2HNS((double)ctr_diff);

		return this_time;
	}

	last_newest = i;
	last_newest_gen = i_gen;

	latest_time = 0;

	/*
	 * Run through the history calculating the interpolated
	 * time based on each counter/time correlation in turn,
	 * and believe the latest one.  This is akin to the NTP
	 * protocol minimum delay clock filter.  Errors due to
	 * counter/time correlations with stale time are all
	 * negative.
	 */
	for (c = 0; c < BASELINES_USED; c++) {
		 if (baseline_times[i]) {	/* skip never-filled slots */
			this_time = baseline_times[i];
			this_ctr = baseline_counts[i];

			ctr_diff = ctr - this_ctr;

			if (current && ctr_diff < 0) {
				/*
				 * The performance counter apparently went
				 * backwards without rolling over.  It might
				 * be nice to complain but we don't want
				 * to do it repeatedly.
				 */
				ctr_diff = 0;
			}

			this_time += (LONGLONG)PERF2HNS((double)ctr_diff);

			if (this_time > latest_time) {
				latest_time = this_time;
				best_index = i;
			}
		}
		/* walk backwards through the ring buffer */
		i = i ? (i - 1) : (BASELINES_TOT - 1);
	}

	return latest_time;
}
1747