1 /* Windows NT Clock Routines
2 *
3 * Created by Sven Dietrich sven@inter-yacht.com
4 *
5 * New interpolation scheme by Dave Hart <davehart@davehart.com> in
6 * February 2009 overcomes 500us-1ms inherent jitter with the older
7 * scheme, first identified by Peter Rosin (nee Ekberg)
8 * <peda@lysator.liu.se> in 2003 [Bug 216].
9 *
10 * Note: The Windows port of ntpd uses the C99-snprintf replacement for
11 * (v)snprintf(), also used by msyslog(), which does not understand the
12 * printf format specifier %I64d, only the more common %lld. With the
13 * minimum supported compiler raised to Visual C++ 2005 in ntp-dev in
14 * August 2011, all MS C runtime routines also understand %lld and %llu.
15 */
16
17
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <sys/resource.h> /* our private version */
23
24 #if defined(_MSC_VER) && _MSC_VER >= 1400 /* VS 2005 */
25 #include <intrin.h> /* for __rdtsc() */
26 #endif
27
28 #ifdef HAVE_PPSAPI
29 #include <timepps.h>
30 /*
31 * ports/winnt/include/timepps.h defines EOPNOTSUPP for compatibility
32 * with PPSAPI on other platforms. ports/winnt/include/isc/net.h has
33 * #define EOPNOTSUPP WSAEOPNOTSUPP, so to avoid a macro redefinition
34 * warning undefine it.
35 */
36 #undef EOPNOTSUPP
37 #endif /* HAVE_PPSAPI */
38
39 #include "ntp_stdlib.h"
40 #include "ntp_unixtime.h"
41 #include "ntp_timer.h"
42 #include "ntp_assert.h"
43 #include "ntp_leapsec.h"
44 #include "clockstuff.h"
45 #include "ntservice.h"
46 #include "ntpd.h"
47 #include "ntpd-opts.h"
48
49 extern double sys_residual; /* residual from previous adjustment */
50
51 /*
52 * Include code to possibly modify the MM timer while the service is active.
53 */
54
55 /*
 * Whether or not MM timer modifications take place is still controlled
57 * by the variable below which is initialized by a default value but
58 * might be changed depending on a command line switch.
59 */
60 static int modify_mm_timer = MM_TIMER_LORES;
61
62 #define MM_TIMER_INTV 1 /* the interval we'd want to set the MM timer to [ms] */
63
64 static UINT wTimerRes;
65
66 BOOL init_randfile();
67
68 static long last_Adj = 0;
69
70 #define LS_CORR_INTV_SECS 2 /* seconds to apply leap second correction */
71 #define LS_CORR_INTV ( 1000ul * LS_CORR_INTV_SECS )
72 #define LS_CORR_LIMIT ( 250ul ) // quarter second
73
74 typedef union ft_ull {
75 FILETIME ft;
76 ULONGLONG ull;
77 LONGLONG ll;
78 LARGE_INTEGER li;
79 } FT_ULL;
80
81 /* leap second stuff */
82 static FT_ULL ls_ft;
83 static DWORD ls_time_adjustment;
84
85 static BOOL winnt_time_initialized = FALSE;
86 static BOOL winnt_use_interpolation = FALSE;
87 static unsigned clock_thread_id;
88
89
90 void WINAPI GetInterpTimeAsFileTime(LPFILETIME pft);
91 static void StartClockThread(void);
92 static void tune_ctr_freq(LONGLONG, LONGLONG);
93 void StopClockThread(void);
94 void atexit_revert_mm_timer(void);
95 void win_time_stepped(void);
96
97 static HANDLE clock_thread = NULL;
98 static HANDLE TimerThreadExitRequest = NULL;
99
100 /*
101 * interp_time estimates time in 100ns units
102 * based on a performance counter value given.
103 * The 2nd parameter indicates if this is
104 * part of a current time-of-day calculation.
105 */
106 ULONGLONG interp_time(ULONGLONG, BOOL);
107
108 /*
109 * add_counter_time_pair is called by the
110 * high priority clock thread with a new
111 * sample.
112 */
113 void add_counter_time_pair(ULONGLONG, LONGLONG);
114
115 /*
116 * globals used by the above two functions to
117 * implement the counter/time history
118 */
119 #define BASELINES_TOT 256
120 #define BASELINES_USED 64
121
122 static volatile int newest_baseline = 0;
123 static volatile int newest_baseline_gen = 0;
124 static ULONGLONG baseline_counts[BASELINES_TOT] = {0};
125 static LONGLONG baseline_times[BASELINES_TOT] = {0};
126
127 #define CLOCK_BACK_THRESHOLD 100 /* < 10us unremarkable */
128 static ULONGLONG clock_backward_max = CLOCK_BACK_THRESHOLD;
129 static int clock_backward_count;
130
131 /**
132 * A flag set on Windows versions which ignore small time adjustments.
133 *
134 * Windows Vista and Windows 7 ignore TimeAdjustment less than 16.
135 * @note Has to be checked for Windows Server 2008/2012 and Windows 8.
136 * Ref: http://support.microsoft.com/kb/2537623, bug #2328
137 */
138 static BOOL os_ignores_small_adjustment;
139
140 /*
141 * clockperiod is the period used for SetSystemTimeAdjustment
142 * slewing calculations but does not necessarily correspond
143 * to the precision of the OS clock. Prior to Windows Vista
144 * (6.0) the two were identical. In 100ns units.
145 */
146 static DWORD clockperiod;
147
148 /*
149 * os_clock_precision is the observed precision of the OS
150 * clock, meaning the increment between discrete values. This
151 * is currently calculated once at startup. 100ns units.
152 */
153 static ULONGLONG os_clock_precision;
154
155 /*
156 * NomPerfCtrFreq is from QueryPerformanceFrequency and is the
157 * number of performance counter beats per second. PerfCtrFreq
158 * starts from NomPerfCtrFreq but is maintained using a sliding
159 * window average based on actual performance counter behavior,
160 * to allow us to better tolerate powersaving measures that
161 * alter the effective frequency of the processor cycle counter
162 * (TSC) which sometimes underlies QueryPerformanceCounter.
163 *
164 * Note that the OS is unlikely to be so subtle in its internal
165 * scheduling of waitable timers, presumably done using the
166 * performance counter. Therefore our calculations for
167 * interpolated time should be based on PerfCtrFreq but our
168 * calculations for SetWaitableTimer should assume the OS will
169 * convert from FILETIME 100ns units to performance counter
170 * beats using the nominal frequency.
171 */
172
173 volatile ULONGLONG PerfCtrFreq = 0;
174 ULONGLONG NomPerfCtrFreq = 0;
175
176 /*
177 * If we're using RDTSC beating at the same rate as
178 * QueryPerformanceCounter, there is a systemic
179 * offset we need to account for when using
180 * counterstamps from serialpps.sys, which are
181 * always from QPC (actually KeQueryPerformanceCounter).
182 */
183 static LONGLONG QPC_offset = 0;
184
185 /*
186 * Substitute RDTSC for QueryPerformanceCounter()?
187 */
188 static int use_pcc = -1;
189
190 /*
191 * Restrict threads that call QPC/RDTSC to one CPU?
192 */
193 static int lock_interp_threads = -1;
194
195 /*
196 * ppm_per_adjust_unit is parts per million effect on the OS
 * clock per slewing adjustment unit per second. Perhaps.
198 */
199 static DOUBLE ppm_per_adjust_unit;
200
201 /*
202 * wintickadj emulates the functionality provided by unix tickadj,
203 * providing a baseline clock correction if needed to get the
204 * clock within a few hundred PPM of correct frequency.
205 */
206 static long wintickadj;
207
208 static void choose_interp_counter(void);
209 static int is_qpc_built_on_pcc(void);
210
211 /*
212 * performance counter frequency observations
213 */
214 #define TUNE_CTR_DEPTH 3 /* running avg depth */
215
216 static HANDLE ctr_freq_timer = INVALID_HANDLE_VALUE;
217 static ULONGLONG tune_ctr_freq_max_interval;
218 static unsigned tune_ctr_period;
219 void start_ctr_freq_timer(ULONGLONG now_time);
220 void reset_ctr_freq_timer(ULONGLONG when, ULONGLONG now);
221 void reset_ctr_freq_timer_abs(ULONGLONG when);
222
223 /* round a Windows time to the next bottom of the second */
224
225 #define ROUND_TO_NEXT_SEC_BOTTOM(t) \
226 do { \
227 (t) += 3 * HECTONANOSECONDS / 2 - 1; \
228 (t) /= HECTONANOSECONDS; \
229 (t) *= HECTONANOSECONDS; \
230 (t) -= HECTONANOSECONDS / 2; \
231 } while (0)
232
233 /*
234 * NT native time format is 100's of nanoseconds since 1601-01-01.
235 * Helpers for converting between "hectonanoseconds" and the
236 * performance counter scale from which interpolated time is
237 * derived.
238 */
239 #define HNS2PERF(hns) ((hns) * PerfCtrFreq / HECTONANOSECONDS)
240 #define PERF2HNS(ctr) ((ctr) * HECTONANOSECONDS / PerfCtrFreq)
241
242
243 #if defined(_MSC_VER) && _MSC_VER >= 1400 /* VS 2005 */
244 #define get_pcc() __rdtsc()
245 #else
246 /*
247 * something like this can be used for a compiler without __rdtsc()
248 */
249 ULONGLONG __forceinline
get_pcc(void)250 get_pcc(void)
251 {
252 /* RDTSC returns in EDX:EAX, same as C compiler */
253 __asm {
254 RDTSC
255 }
256 }
257 #endif
258
259
260 /*
261 * perf_ctr() returns the current performance counter value,
262 * from QueryPerformanceCounter or RDTSC.
263 */
264 ULONGLONG WINAPI
perf_ctr(void)265 perf_ctr(void)
266 {
267 FT_ULL ft;
268
269 if (use_pcc)
270 return get_pcc();
271 else {
272 QueryPerformanceCounter(&ft.li);
273 return ft.ull;
274 }
275 }
276
277
278 /*
279 * init_small_adjustment
280 *
281 * Set variable os_ignores_small_adjustment
282 *
283 */
init_small_adjustment(void)284 static void init_small_adjustment(void)
285 {
286 OSVERSIONINFO vi;
287 memset(&vi, 0, sizeof(vi));
288 vi.dwOSVersionInfoSize = sizeof(vi);
289
290 if (!GetVersionEx(&vi)) {
291 msyslog(LOG_WARNING, "GetVersionEx failed with error code %d.", GetLastError());
292 os_ignores_small_adjustment = FALSE;
293 return;
294 }
295
296 if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 1) {
297 // Windows 7 and Windows Server 2008 R2
298 //
299 // Windows 7 is documented as affected.
300 // Windows Server 2008 R2 is assumed affected.
301 os_ignores_small_adjustment = TRUE;
302 } else if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 0) {
303 // Windows Vista and Windows Server 2008
304 //
305 // Windows Vista is documented as affected.
306 // Windows Server 2008 is assumed affected.
307 os_ignores_small_adjustment = TRUE;
308 } else {
309 os_ignores_small_adjustment = FALSE;
310 }
311 }
312
313
314 /*
315 * choose_interp_counter - select between QueryPerformanceCounter and
316 * the x86 processor cycle counter (TSC).
317 */
318 static void
choose_interp_counter(void)319 choose_interp_counter(void)
320 {
321 const char * ntpd_pcc_freq_text;
322 int qpc_built_on_pcc;
323
324 /*
325 * Regardless of whether we actually use RDTSC, first determine
326 * if QueryPerformanceCounter is built on it, so that we can
327 * decide whether it's prudent to lock QPC-consuming threads to
328 * a particular CPU.
329 */
330 qpc_built_on_pcc = is_qpc_built_on_pcc();
331 lock_interp_threads = qpc_built_on_pcc;
332
333 /*
334 * It's time to make some more permanent knobs,
335 * but for right now the RDTSC aka PCC dance on x86 is:
336 *
337 * 1. With none of these variables defined, only QPC
338 * is used because there is no reliable way to
339 * detect counter frequency variation after ntpd
340 * startup implemented.
341 * 2. We need a better knob, but for now if you know
342 * your RDTSC / CPU frequency is invariant, set
343 * NTPD_PCC and assuming your QPC is based on the
344 * PCC as well, RDTSC will be substituted.
345 * 3. More forcefully, you can jam in a desired exact
346 * processor frequency, expressed in cycles per
347 * second by setting NTPD_PCC_FREQ=398125000, for
348 * example, if yor actual known CPU frequency is
349 * 398.125 MHz, and NTPD_PCC doesn't work because
350 * QueryPerformanceCounter is implemented using
351 * another counter. It is very easy to make ntpd
352 * fall down if the NTPD_PCC_FREQ value isn't very
353 * close to the observed RDTSC units per second.
354 *
355 * Items 2 and 3 could probably best be combined into one
356 * new windows-specific command line switch such as
357 * ntpd --pcc
358 * or
359 * ntpd --pcc=398125000
360 *
361 * They are currently tied to Windows because that is
362 * the only ntpd port with its own interpolation, and
363 * to x86/x64 because no one has ported the Windows
364 * ntpd port to the sole remaining alternative, Intel
365 * Itanium.
366 */
367 if (HAVE_OPT(PCCFREQ))
368 ntpd_pcc_freq_text = OPT_ARG(PCCFREQ);
369 else
370 ntpd_pcc_freq_text = getenv("NTPD_PCC_FREQ");
371
372 if (!HAVE_OPT(USEPCC)
373 && NULL == ntpd_pcc_freq_text
374 && NULL == getenv("NTPD_PCC")) {
375 use_pcc = 0;
376 return;
377 }
378
379 if (!qpc_built_on_pcc && NULL == ntpd_pcc_freq_text) {
380 use_pcc = 0;
381 return;
382 }
383
384 use_pcc = 1;
385 if (ntpd_pcc_freq_text != NULL)
386 sscanf(ntpd_pcc_freq_text,
387 "%llu",
388 &NomPerfCtrFreq);
389
390 NLOG(NLOG_CLOCKINFO)
391 msyslog(LOG_INFO,
392 "using processor cycle counter "
393 "%.3f MHz",
394 NomPerfCtrFreq / 1e6);
395 return;
396 }
397
398
399 /*
400 * is_qpc_built_on_pcc - test if QueryPerformanceCounter runs at the
401 * same rate as the processor cycle counter (TSC).
402 */
403 static int
is_qpc_built_on_pcc(void)404 is_qpc_built_on_pcc(void)
405 {
406 LONGLONG offset;
407 FT_ULL ft1;
408 FT_ULL ft2;
409 FT_ULL ft3;
410 FT_ULL ft4;
411 FT_ULL ft5;
412
413 REQUIRE(NomPerfCtrFreq != 0);
414
415 QueryPerformanceCounter(&ft1.li);
416 ft2.ull = get_pcc();
417 Sleep(1);
418 QueryPerformanceCounter(&ft3.li);
419 Sleep(1);
420 ft4.ull = get_pcc();
421 Sleep(1);
422 QueryPerformanceCounter(&ft5.li);
423
424 offset = ft2.ull - ft1.ull;
425 ft3.ull += offset;
426 ft5.ull += offset;
427
428 if (ft2.ull <= ft3.ull &&
429 ft3.ull <= ft4.ull &&
430 ft4.ull <= ft5.ull) {
431
432 QPC_offset = offset;
433 return TRUE;
434 }
435
436 return FALSE;
437 }
438
439
440 /*
441 * Request Multimedia Timer
442 */
443 void
set_mm_timer(int timerres)444 set_mm_timer(
445 int timerres
446 )
447 {
448 modify_mm_timer = timerres;
449 }
450
451 /*
452 * adj_systime - called once every second to discipline system clock.
453 * Normally, the offset passed in (parameter now) is in the range
454 * [-NTP_MAXFREQ, NTP_MAXFREQ]. However, at EVNT_NSET, a much larger
455 * slew is requested if the initial offset is less than the step
456 * threshold, in the range [-step, step] where step is the step
457 * threshold, 128 msec by default. For the remainder of the frequency
458 * training interval, adj_systime is called with 0 offset each second
459 * and slew the large offset at 500 PPM (500 usec/sec).
460 * Returns 1 if okay, 0 if trouble.
461 */
462 int
adj_systime(double now)463 adj_systime(
464 double now
465 )
466 {
467 /* ntp time scale origin as ticks since 1601-01-01 */
468 static const ULONGLONG HNS_JAN_1900 = 94354848000000000ull;
469
470 static DWORD ls_start_tick; /* start of slew in 1ms ticks */
471
472 static double adjtime_carry;
473 double dtemp;
474 u_char isneg;
475 BOOL rc;
476 long TimeAdjustment;
477 SYSTEMTIME st;
478 DWORD ls_elapsed;
479 FT_ULL curr_ft;
480 leap_result_t lsi;
481
482 /*
483 * Add the residual from the previous adjustment to the new
484 * adjustment, bound and round.
485 */
486 dtemp = adjtime_carry + sys_residual + now;
487 adjtime_carry = 0.;
488 sys_residual = 0.;
489 if (dtemp < 0) {
490 isneg = TRUE;
491 dtemp = -dtemp;
492 } else {
493 isneg = FALSE;
494 }
495
496 if (dtemp > NTP_MAXFREQ) {
497 adjtime_carry = dtemp - NTP_MAXFREQ;
498 dtemp = NTP_MAXFREQ;
499 }
500
501 if (isneg) {
502 dtemp = -dtemp;
503 adjtime_carry = -adjtime_carry;
504 }
505
506 dtemp = dtemp * 1e6;
507
508 /*
509 * dtemp is in micro seconds. NT uses 100 ns units,
510 * so a unit change in TimeAdjustment corresponds
511 * to slewing 10 ppm on a 100 Hz system. Calculate
512 * the number of 100ns units to add, using OS tick
513 * frequency as per suggestion from Harry Pyle,
514 * and leave the remainder in dtemp
515 */
516 TimeAdjustment = (long)(dtemp / ppm_per_adjust_unit +
517 ((isneg)
518 ? -0.5
519 : 0.5));
520
521 if (os_ignores_small_adjustment) {
522 /*
523 * As the OS ignores adjustments smaller than 16, we need to
524 * leave these small adjustments in sys_residual, causing
525 * the small values to be averaged over time.
526 */
527 if (TimeAdjustment > -16 && TimeAdjustment < 16) {
528 TimeAdjustment = 0;
529 }
530 }
531
532 dtemp -= TimeAdjustment * ppm_per_adjust_unit;
533
534
535 /* If a piping-hot close leap second is pending for the end
536 * of this day, determine the UTC time stamp when the transition
537 * must take place. (Calculated in the current leap era!)
538 */
539 if (leapsec >= LSPROX_ALERT) {
540 if (0 == ls_ft.ull && leapsec_frame(&lsi)) {
541 if (lsi.tai_diff > 0) {
542 /* A leap second insert is scheduled at the end
543 * of the day. Since we have not yet computed the
544 * time stamp, do it now. Signal electric mode
545 * for this insert. We start processing 1 second early
546 * because we want to slew over 2 seconds.
547 */
548 ls_ft.ull = lsi.ttime.Q_s * HECTONANOSECONDS
549 + HNS_JAN_1900;
550 FileTimeToSystemTime(&ls_ft.ft, &st);
551 msyslog(LOG_NOTICE,
552 "Detected positive leap second announcement "
553 "for %04d-%02d-%02d %02d:%02d:%02d UTC",
554 st.wYear, st.wMonth, st.wDay,
555 st.wHour, st.wMinute, st.wSecond);
556 /* slew starts with last second before insertion!
557 * And we have to tell the core that we deal with it.
558 */
559 ls_ft.ull -= (HECTONANOSECONDS + HECTONANOSECONDS/2);
560 leapsec_electric(TRUE);
561 } else if (lsi.tai_diff < 0) {
562 /* Do not handle negative leap seconds here. If this
563 * happens, let the system step.
564 */
565 leapsec_electric(FALSE);
566 }
567 }
568 } else {
569 /* The leap second announcement is gone. Happens primarily after
570 * the leap transition, but can also be due to a clock step.
571 * Disarm the leap second, but only if there is one scheduled
572 * and not currently in progress!
573 */
574 if (ls_ft.ull != 0 && ls_time_adjustment == 0) {
575 ls_ft.ull = 0;
576 msyslog(LOG_NOTICE, "Leap second announcement disarmed");
577 }
578 }
579
580 /*
581 * If the time stamp for the next leap second has been set
582 * then check if the leap second must be handled. We use
583 * free-running milliseconds from 'GetTickCount()', which
584 * is documented as not affected by clock and/or speed
585 * adjustments.
586 */
587 if (ls_ft.ull != 0) {
588 if (0 == ls_time_adjustment) { /* has not yet been scheduled */
589 GetSystemTimeAsFileTime(&curr_ft.ft);
590 if (curr_ft.ull >= ls_ft.ull) {
591 ls_ft.ull = _UI64_MAX; /* guard against second schedule */
592 ls_time_adjustment = clockperiod / LS_CORR_INTV_SECS;
593 ls_start_tick = GetTickCount();
594 msyslog(LOG_NOTICE, "Started leap second insertion.");
595 }
596 ls_elapsed = 0;
597 } else { /* leap sec adjustment has been scheduled previously */
598 ls_elapsed = GetTickCount() - ls_start_tick;
599 }
600
601 if (ls_time_adjustment != 0) { /* leap second adjustment is currently active */
602 if (ls_elapsed > (LS_CORR_INTV - LS_CORR_LIMIT)) {
603 ls_time_adjustment = 0; /* leap second adjustment done */
604 msyslog(LOG_NOTICE, "Finished leap second insertion.");
605 }
606
607 /*
608 * NOTE: While the system time is slewed during the leap second
609 * the interpolation function which is based on the performance
610 * counter does not account for the slew.
611 */
612 TimeAdjustment -= ls_time_adjustment;
613 }
614 }
615
616
617 sys_residual = dtemp / 1e6;
618 DPRINTF(3, ("adj_systime: %.9f -> %.9f residual %.9f",
619 now, 1e-6 * (TimeAdjustment * ppm_per_adjust_unit),
620 sys_residual));
621 if (0. == adjtime_carry)
622 DPRINTF(3, ("\n"));
623 else
624 DPRINTF(3, (" adjtime %.9f\n", adjtime_carry));
625
626 /* only adjust the clock if adjustment changes */
627 TimeAdjustment += wintickadj;
628 if (last_Adj != TimeAdjustment) {
629 last_Adj = TimeAdjustment;
630 DPRINTF(2, ("SetSystemTimeAdjustment(%+ld)\n", TimeAdjustment));
631 rc = SetSystemTimeAdjustment(clockperiod + TimeAdjustment, FALSE);
632 if (!rc)
633 msyslog(LOG_ERR, "Can't adjust time: %m");
634 } else {
635 rc = TRUE;
636 }
637
638 return rc;
639 }
640
641
642 void
init_winnt_time(void)643 init_winnt_time(void)
644 {
645 static const char settod[] = "settimeofday=\"SetSystemTime\"";
646 char szMsgPath[MAX_PATH+1];
647 HANDLE hToken = INVALID_HANDLE_VALUE;
648 TOKEN_PRIVILEGES tkp;
649 TIMECAPS tc;
650 BOOL noslew;
651 DWORD adjclockperiod;
652 LARGE_INTEGER Freq;
653 FT_ULL initial_hectonanosecs;
654 FT_ULL next_hectonanosecs;
655 double adjppm;
656 double rawadj;
657 char * pch;
658
659 if (winnt_time_initialized)
660 return;
661
662 /* Set up the Console Handler */
663 if (!SetConsoleCtrlHandler(OnConsoleEvent, TRUE)) {
664 msyslog(LOG_ERR, "Can't set console control handler: %m");
665 }
666
667 /* Set the Event-ID message-file name. */
668 if (!GetModuleFileName(NULL, szMsgPath, sizeof(szMsgPath))) {
669 msyslog(LOG_ERR, "GetModuleFileName(PGM_EXE_FILE) failed: %m");
670 exit(1);
671 }
672
673 /* Initialize random file before OpenSSL checks */
674 if (!init_randfile())
675 msyslog(LOG_ERR, "Unable to initialize .rnd file");
676
677 #pragma warning(push)
678 #pragma warning(disable: 4127) /* conditional expression is constant */
679
680 #ifdef DEBUG
681 if (SIZEOF_TIME_T != sizeof(time_t)
682 || SIZEOF_INT != sizeof(int)
683 || SIZEOF_SIGNED_CHAR != sizeof(char)) {
684 msyslog(LOG_ERR, "config.h SIZEOF_* macros wrong, fatal");
685 exit(1);
686 }
687 #endif
688
689 #pragma warning(pop)
690
691 init_small_adjustment();
692 leapsec_electric(TRUE);
693
694 /*
695 * Get privileges needed for fiddling with the clock
696 */
697
698 /* get the current process token handle */
699 if (!OpenProcessToken(
700 GetCurrentProcess(),
701 TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
702 &hToken)) {
703 msyslog(LOG_ERR, "OpenProcessToken failed: %m");
704 exit(-1);
705 }
706 /* get the LUID for system-time privilege. */
707 LookupPrivilegeValue(NULL, SE_SYSTEMTIME_NAME, &tkp.Privileges[0].Luid);
708 tkp.PrivilegeCount = 1; /* one privilege to set */
709 tkp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
710
711 /* get set-time privilege for this process. */
712 AdjustTokenPrivileges(hToken, FALSE, &tkp, 0,
713 (PTOKEN_PRIVILEGES) NULL, 0);
714
715 /* cannot use return value of AdjustTokenPrivileges. */
716 /* (success does not indicate all privileges were set) */
717 if (GetLastError() != ERROR_SUCCESS) {
718 msyslog(LOG_ERR, "AdjustTokenPrivileges failed: %m");
719 /* later set time call will probably fail */
720 }
721
722 CloseHandle(hToken);
723 hToken = INVALID_HANDLE_VALUE;
724
725 /*
726 * Say how we're setting the time of day
727 */
728 set_sys_var(settod, sizeof(settod), RO);
729
730 /*
731 * ntpd on Windows has always raised its priority, without
732 * requiring -N as on Unix. Since Windows ntpd doesn't share
733 * the history of unix ntpd of once having no -N and therefore
734 * needing to be invoked under nice, there is no reason to
735 * bring it in line with the Unix version in this regard.
736 * Instsrv assumes ntpd is invoked with no arguments, and
737 * upgrading users would be negatively surprised by the
738 * poor timekeeping if they failed to add -N as part of
739 * upgrading were we to correct this platform difference.
740 */
741 if (-1 == setpriority(PRIO_PROCESS, 0, NTP_PRIO))
742 exit(-1);
743
744 /* Determine the existing system time slewing */
745 if (!GetSystemTimeAdjustment(&adjclockperiod, &clockperiod, &noslew)) {
746 msyslog(LOG_ERR, "GetSystemTimeAdjustment failed: %m");
747 exit(-1);
748 }
749
750 /*
751 * If there is no slewing before ntpd, adjclockperiod and clockperiod
752 * will be equal. Any difference is carried into adj_systime's first
753 * pass as the previous adjustment.
754 */
755 last_Adj = adjclockperiod - clockperiod;
756
757 if (last_Adj)
758 msyslog(LOG_INFO,
759 "Clock interrupt period %.3f msec "
760 "(startup slew %.1f usec/period)",
761 clockperiod / 1e4,
762 last_Adj / 10.);
763 else
764 msyslog(LOG_INFO,
765 "Clock interrupt period %.3f msec",
766 clockperiod / 1e4);
767
768 /*
769 * Calculate the time adjustment resulting from incrementing
770 * units per tick by 1 unit for 1 second
771 */
772 ppm_per_adjust_unit = 1e6 / clockperiod;
773
774 pch = getenv("NTPD_TICKADJ_PPM");
775 if (pch != NULL && 1 == sscanf(pch, "%lf", &adjppm)) {
776 rawadj = adjppm / ppm_per_adjust_unit;
777 rawadj += (rawadj < 0)
778 ? -0.5
779 : 0.5;
780 wintickadj = (long)rawadj;
781 msyslog(LOG_INFO,
782 "Using NTPD_TICKADJ_PPM %+g ppm (%+ld)",
783 adjppm, wintickadj);
784 }
785
786 /* get the performance counter ticks per second */
787 if (!QueryPerformanceFrequency(&Freq) || !Freq.QuadPart) {
788 msyslog(LOG_ERR, "QueryPerformanceFrequency failed: %m");
789 exit(-1);
790 }
791
792 NomPerfCtrFreq = PerfCtrFreq = Freq.QuadPart;
793 msyslog(LOG_INFO,
794 "Performance counter frequency %.3f MHz",
795 PerfCtrFreq / 1e6);
796
797 /*
798 * With a precise system clock, our interpolation decision is
799 * a slam dunk.
800 */
801 if (NULL != pGetSystemTimePreciseAsFileTime) {
802 winnt_use_interpolation = FALSE;
803 winnt_time_initialized = TRUE;
804
805 return;
806 }
807
808 /*
809 * Implement any multimedia timer manipulation requested via -M
810 * option. This is rumored to be unneeded on Win8 with the
811 * introduction of the precise (interpolated) system clock.
812 */
813 if (modify_mm_timer) {
814 if (timeGetDevCaps(&tc, sizeof(tc)) == TIMERR_NOERROR) {
815 wTimerRes = min(max(tc.wPeriodMin, MM_TIMER_INTV), tc.wPeriodMax);
816 timeBeginPeriod(wTimerRes);
817 atexit(atexit_revert_mm_timer);
818
819 msyslog(LOG_INFO, "MM timer resolution: %u..%u msec, set to %u msec",
820 tc.wPeriodMin, tc.wPeriodMax, wTimerRes );
821
822 /* Pause briefly before measuring the clock precision, see [Bug 2790] */
823 Sleep( 33 );
824
825 } else {
826 msyslog(LOG_ERR, "Multimedia timer unavailable");
827 }
828 }
829
830 /*
831 * Spin on GetSystemTimeAsFileTime to determine its
832 * granularity. Prior to Windows Vista this is
833 * typically the same as the clock period.
834 */
835 GetSystemTimeAsFileTime(&initial_hectonanosecs.ft);
836 do {
837 GetSystemTimeAsFileTime(&next_hectonanosecs.ft);
838 } while (initial_hectonanosecs.ull == next_hectonanosecs.ull);
839
840 os_clock_precision = next_hectonanosecs.ull -
841 initial_hectonanosecs.ull;
842
843 msyslog(LOG_INFO,
844 "Windows clock precision %.3f msec, min. slew %.3f ppm/s",
845 os_clock_precision / 1e4, ppm_per_adjust_unit);
846
847 winnt_time_initialized = TRUE;
848
849 choose_interp_counter();
850
851 if (getenv("NTPD_USE_SYSTEM_CLOCK") ||
852 (os_clock_precision < 4 * 10000 &&
853 !getenv("NTPD_USE_INTERP_DANGEROUS"))) {
854 msyslog(LOG_INFO, "using Windows clock directly");
855 } else {
856 winnt_use_interpolation = TRUE;
857 get_sys_time_as_filetime = GetInterpTimeAsFileTime;
858 StartClockThread();
859 }
860 }
861
862
863 void
atexit_revert_mm_timer(void)864 atexit_revert_mm_timer(void)
865 {
866 timeEndPeriod(wTimerRes);
867 DPRINTF(1, ("MM timer resolution reset\n"));
868 }
869
870
871 void
reset_winnt_time(void)872 reset_winnt_time(void)
873 {
874 SYSTEMTIME st;
875
876 /*
877 * If we're in the 2-second slew right after a leap second,
878 * we don't want to continue that extreme slew, in that case
879 * disable our slewing and return clock discipline to the
880 * kernel. Similarly if we are not yet synchronized,
881 * our current slew may not be a good ongoing trim.
882 * Otherwise, our leave in place the last SetSystemTimeAdjustment
883 * as an ongoing frequency correction, better than nothing.
884 * TODO:
885 * Verify this will not call SetSystemTimeAdjustment if
886 * ntpd is running in ntpdate mode.
887 */
888 if (sys_leap == LEAP_NOTINSYNC || ls_time_adjustment != 0)
889 SetSystemTimeAdjustment(0, TRUE);
890
891 /*
892 * Read the current system time, and write it back to
893 * force CMOS update, only if we are exiting because
894 * the computer is shutting down and we are already
895 * synchronized.
896 */
897 if (ntservice_systemisshuttingdown() && sys_leap != LEAP_NOTINSYNC) {
898 GetSystemTime(&st);
899 SetSystemTime(&st);
900 NLOG(NLOG_SYSEVENT | NLOG_CLOCKINFO)
901 msyslog(LOG_NOTICE, "system is shutting down, CMOS time reset.");
902 }
903 }
904
905
906 /*
907 * GetSystemTimeAsFileTime() interface clone is used by getclock() in ntpd.
908 */
909
910 void WINAPI
GetInterpTimeAsFileTime(LPFILETIME pft)911 GetInterpTimeAsFileTime(
912 LPFILETIME pft
913 )
914 {
915 static ULONGLONG last_interp_time;
916 FT_ULL now_time;
917 FT_ULL now_count;
918 ULONGLONG clock_backward;
919
920 /*
921 * Mark a mark ASAP. The latency to here should be reasonably
922 * deterministic
923 */
924
925 now_count.ull = perf_ctr();
926 now_time.ull = interp_time(now_count.ull, TRUE);
927
928 if (last_interp_time <= now_time.ull) {
929 last_interp_time = now_time.ull;
930 } else {
931 clock_backward = last_interp_time - now_time.ull;
932 if (clock_backward > clock_backward_max) {
933 clock_backward_max = clock_backward;
934 clock_backward_count++;
935 }
936 now_time.ull = last_interp_time;
937 }
938 *pft = now_time.ft;
939
940 return;
941 }
942
943
944 /*
945 * TimerApcFunction is invoked on the high-priority clock
946 * thread to capture a new baseline system time and
947 * performance counter correlation every 43 msec (64Hz
948 * OS clock precision).
949 */
950 static void CALLBACK
TimerApcFunction(LPVOID lpArgToCompletionRoutine,DWORD dwTimerLowValue,DWORD dwTimerHighValue)951 TimerApcFunction(
952 LPVOID lpArgToCompletionRoutine,
953 DWORD dwTimerLowValue,
954 DWORD dwTimerHighValue
955 )
956 {
957 static BOOL ctr_freq_timer_started = FALSE;
958 static ULONGLONG prev_count;
959 ULONGLONG now_time;
960 FT_ULL now_count;
961
962 /* Grab the counter first of all */
963 now_count.ull = perf_ctr();
964
965 now_time = (((ULONGLONG)dwTimerHighValue << 32) |
966 dwTimerLowValue);
967
968 /*
969 * Save this correlation in the history.
970 */
971 add_counter_time_pair(now_count.ull, now_time);
972
973 /*
974 * Once we're synchronized start the counter frequency
975 * tuning timer.
976 */
977 if (INVALID_HANDLE_VALUE == ctr_freq_timer &&
978 LEAP_NOTINSYNC != sys_leap)
979 start_ctr_freq_timer(now_time);
980 }
981
982
983 unsigned WINAPI
ClockThread(void * arg)984 ClockThread(
985 void *arg
986 )
987 {
988 LARGE_INTEGER DueTime;
989 HANDLE timer;
990 double HZ;
991 double TimerHz;
992 DWORD timer_period_msec;
993 DWORD res;
994 char *ntpd_int_int_text;
995
996 UNUSED_ARG(arg);
997
998 timer = CreateWaitableTimer(NULL, FALSE, NULL);
999
1000 ntpd_int_int_text = getenv("NTPD_INT_INT");
1001
1002 HZ = (double)HECTONANOSECONDS / clockperiod;
1003
1004 if (HZ > 63 && HZ < 65) {
1005 timer_period_msec = 43;
1006 } else if (HZ > 98 && HZ < 102) {
1007 timer_period_msec = 27;
1008 if (NULL == ntpd_int_int_text)
1009 msyslog(LOG_WARNING,
1010 "%.3f Hz system clock may benefit from "
1011 "custom NTPD_INT_INT env var timer interval "
1012 "override between approx. 20 and 50 msecs.",
1013 HZ);
1014 } else {
1015 timer_period_msec = (DWORD)(0.5 + (2.752 * clockperiod / 10000));
1016 if (NULL == ntpd_int_int_text)
1017 msyslog(LOG_WARNING,
1018 "unfamiliar %.3f Hz system clock may benefit "
1019 "from custom NTPD_INT_INT env var timer "
1020 "interval override between approx. 20 and 50 "
1021 "msecs.",
1022 HZ);
1023 }
1024
1025 if (ntpd_int_int_text != NULL) {
1026 timer_period_msec = atoi(ntpd_int_int_text);
1027 timer_period_msec = max(9, timer_period_msec);
1028 msyslog(LOG_NOTICE,
1029 "using NTPD_INT_INT env var override %u",
1030 timer_period_msec);
1031 }
1032
1033 TimerHz = 1e3 / timer_period_msec;
1034 msyslog(LOG_NOTICE, "HZ %.3f using %u msec timer %.3f Hz %d deep",
1035 HZ,
1036 timer_period_msec,
1037 TimerHz,
1038 BASELINES_USED);
1039
1040 /* negative DueTime means relative to now */
1041 DueTime.QuadPart = -(int)timer_period_msec;
1042
1043 SetWaitableTimer(
1044 timer,
1045 &DueTime, /* first fire */
1046 timer_period_msec, /* period thereafter */
1047 TimerApcFunction, /* callback routine */
1048 &timer, /* context for callback */
1049 FALSE); /* do not interfere with power saving */
1050
1051 /*
1052 * The clock thread spends the rest of its life in the TimerApcFunction
1053 * and ctr_freq_timer_fired timer APC callbacks, which can only occur
1054 * while this thread is in an alertable wait. Note the Ex on
1055 * WaitForSingleObjectEx and TRUE for fAlertable. The wait will return
1056 * after each APC callback in which case we simply wait again. We will
1057 * break out of the loop when StopClockThread signals our exit event.
1058 */
1059 do res = WaitForSingleObjectEx(
1060 TimerThreadExitRequest,
1061 INFINITE,
1062 TRUE);
1063 while (WAIT_OBJECT_0 != res);
1064
1065 CloseHandle(timer);
1066
1067 if (ctr_freq_timer != INVALID_HANDLE_VALUE) {
1068 CloseHandle(ctr_freq_timer);
1069 ctr_freq_timer = INVALID_HANDLE_VALUE;
1070 }
1071
1072 return 0;
1073 }
1074
1075
1076 static void
StartClockThread(void)1077 StartClockThread(void)
1078 {
1079 static BOOL done_once = FALSE;
1080 FT_ULL StartTime;
1081
1082 /* init variables with the time now */
1083 GetSystemTimeAsFileTime(&StartTime.ft);
1084 baseline_times[0] = StartTime.ull;
1085 baseline_counts[0] = perf_ctr();
1086
1087 /* init sync objects */
1088 TimerThreadExitRequest = CreateEvent(NULL, FALSE, FALSE, NULL);
1089
1090 clock_thread =
1091 (HANDLE)_beginthreadex(
1092 NULL,
1093 0,
1094 ClockThread,
1095 NULL,
1096 CREATE_SUSPENDED,
1097 &clock_thread_id);
1098
1099 if (clock_thread != NULL) {
1100 /* remember the thread priority is only within the process class */
1101 if (!SetThreadPriority(clock_thread, THREAD_PRIORITY_TIME_CRITICAL)) {
1102 DPRINTF(1, ("Error setting thread priority\n"));
1103 }
1104
1105 lock_thread_to_processor(clock_thread);
1106 ResumeThread(clock_thread);
1107
1108 if (FALSE == done_once) {
1109 done_once = TRUE;
1110 lock_thread_to_processor(GetCurrentThread());
1111 atexit( StopClockThread );
1112 }
1113
1114 /*
1115 * Give the clock thread time to fill its counter/time
1116 * sample buffer. This will underfill the buffer a
1117 * bit for sample periods over 43 msec.
1118 */
1119 Sleep(BASELINES_USED * 43);
1120 }
1121 }
1122
1123
1124 void
StopClockThread(void)1125 StopClockThread(void)
1126 {
1127 /*
1128 * if the clock thread exit()s this routine
1129 * will be called on the clock thread and
1130 * we need not (and can't) use the normal
1131 * TimerThreadExitRequest event.
1132 */
1133 if (GetCurrentThreadId() != clock_thread_id) {
1134
1135 if (!SetEvent(TimerThreadExitRequest) ||
1136 WaitForSingleObject(clock_thread, 2 * 1000) !=
1137 WAIT_OBJECT_0) {
1138 msyslog(LOG_ERR, "Failed to stop clock thread.");
1139 }
1140 }
1141 CloseHandle(TimerThreadExitRequest);
1142 TimerThreadExitRequest = NULL;
1143 CloseHandle(clock_thread);
1144 clock_thread = NULL;
1145 }
1146
1147
1148 void
lock_thread_to_processor(HANDLE thread)1149 lock_thread_to_processor(HANDLE thread)
1150 {
1151 static DWORD_PTR ProcessAffinityMask;
1152 static DWORD_PTR ThreadAffinityMask;
1153 DWORD_PTR SystemAffinityMask;
1154 char *cputext;
1155 unsigned int cpu;
1156
1157 if ( ! winnt_time_initialized) {
1158 DPRINTF(1, ("init_winnt_time() must be called before "
1159 "lock_thread_to_processor(), exiting\n"));
1160 exit(-1);
1161 }
1162
1163 if (!winnt_use_interpolation)
1164 return;
1165
1166 if (-1 == lock_interp_threads) {
1167 DPRINTF(1, ("choose_interp_counter() is not called "
1168 "before lock_thread_to_processor()\n"));
1169 exit(-1);
1170 } else if (!lock_interp_threads)
1171 return;
1172
1173 /*
1174 * Calculate the ThreadAffinityMask we'll use once on the
1175 * first invocation.
1176 */
1177 if (!ProcessAffinityMask) {
1178
1179 /*
1180 * Choose which processor to nail the main and clock threads to.
1181 * If we have more than one, we simply choose the 2nd.
1182 * Randomly choosing from 2 to n would be better, but in
1183 * either case with clock and network interrupts more likely
1184 * to be serviced by the first procecssor, let's stay away
1185 * from it. QueryPerformanceCounter is not necessarily
1186 * consistent across CPUs, hence the need to nail the two
1187 * threads involved in QPC-based interpolation to the same
1188 * CPU.
1189 */
1190
1191 GetProcessAffinityMask(
1192 GetCurrentProcess(),
1193 &ProcessAffinityMask,
1194 &SystemAffinityMask);
1195
1196 /*
1197 * respect NTPD_CPU environment variable if present
1198 * for testing. NTPD_CPU=0 means use all CPUs, 1-64
1199 * means lock threads involved in interpolation to
1200 * that CPU. Default to 2nd if more than 1.
1201 */
1202
1203 cpu = 2;
1204 cputext = getenv("NTPD_CPU");
1205 if (cputext) {
1206 cpu = (unsigned int) atoi(cputext);
1207 cpu = min((8 * sizeof(DWORD_PTR)), cpu);
1208 }
1209
1210 /*
1211 * Clear all bits except the 2nd. If we have only one proc
1212 * that leaves ThreadAffinityMask zeroed and we won't bother
1213 * with SetThreadAffinityMask.
1214 */
1215
1216 ThreadAffinityMask = (0 == cpu) ? 0 : (1 << (cpu - 1));
1217
1218 if (ThreadAffinityMask &&
1219 !(ThreadAffinityMask & ProcessAffinityMask))
1220
1221 DPRINTF(1, ("Selected CPU %u (mask %x) is outside "
1222 "process mask %x, using all CPUs.\n",
1223 cpu, ThreadAffinityMask,
1224 ProcessAffinityMask));
1225 else
1226 DPRINTF(1, ("Wiring to processor %u (0 means all) "
1227 "affinity mask %x\n",
1228 cpu, ThreadAffinityMask));
1229
1230 ThreadAffinityMask &= ProcessAffinityMask;
1231 }
1232
1233 if (ThreadAffinityMask &&
1234 !SetThreadAffinityMask(thread, ThreadAffinityMask))
1235 msyslog(LOG_ERR,
1236 "Unable to wire thread to mask %x: %m",
1237 ThreadAffinityMask);
1238 }
1239
1240
1241 #ifdef HAVE_PPSAPI
1242 static inline void ntp_timestamp_from_counter(l_fp *, ULONGLONG,
1243 ULONGLONG);
1244
1245 /*
1246 * helper routine for serial PPS which returns QueryPerformanceCounter
1247 * timestamp and needs to interpolate it to an NTP timestamp.
1248 */
1249 void
pps_ntp_timestamp_from_counter(ntp_fp_t * result,ULONGLONG Timestamp,ULONGLONG Counterstamp)1250 pps_ntp_timestamp_from_counter(
1251 ntp_fp_t *result,
1252 ULONGLONG Timestamp,
1253 ULONGLONG Counterstamp
1254 )
1255 {
1256 /*
1257 * convert between equivalent l_fp and PPSAPI ntp_fp_t
1258 */
1259 ntp_timestamp_from_counter(
1260 (l_fp *)result,
1261 Timestamp,
1262 Counterstamp);
1263 }
1264
1265
/*
 * ntp_timestamp_from_counter - convert a performance counter stamp
 * (and fallback FILETIME stamp) into an l_fp NTP timestamp.
 *
 * result       receives the NTP time
 * Timestamp    system time in 100ns units since 1601 (FILETIME epoch),
 *              used only when no better source is available
 * Counterstamp QueryPerformanceCounter value at the event; 0 means
 *              no counter was captured
 */
static inline
void
ntp_timestamp_from_counter(
	l_fp *result,
	ULONGLONG Timestamp,
	ULONGLONG Counterstamp
	)
{
	FT_ULL Now;
	FT_ULL Ctr;
	LONGLONG CtrDelta;
	double seconds;
	ULONGLONG InterpTimestamp;

	if (winnt_use_interpolation) {
		/* a zero counter stamp cannot be interpolated; return zero */
		if (0 == Counterstamp) {
			DPRINTF(1, ("ntp_timestamp_from_counter rejecting 0 counter.\n"));
			ZERO(*result);
			return;
		}

		/*
		 * interpolate using the baseline history; FALSE because
		 * Counterstamp may be older than the latest counter seen.
		 * NOTE(review): QPC_offset is applied here -- presumably
		 * a correction between PPS and interpolation counter
		 * domains; confirm against its definition elsewhere.
		 */
		InterpTimestamp = interp_time(Counterstamp + QPC_offset, FALSE);
	} else {	/* ! winnt_use_interpolation */
		if (NULL != pGetSystemTimePreciseAsFileTime &&
		    0 != Counterstamp) {
			/*
			 * Sample the counter and the precise system time
			 * back to back, then back-date the current precise
			 * time by the counter ticks elapsed since the event.
			 */
			QueryPerformanceCounter(&Ctr.li);
			(*pGetSystemTimePreciseAsFileTime)(&Now.ft);
			CtrDelta = Ctr.ull - Counterstamp;
			seconds = (double)CtrDelta / PerfCtrFreq;
			InterpTimestamp = Now.ull -
			    (ULONGLONG)(seconds * HECTONANOSECONDS);
		} else {
			/* have to simply use the driver's system time timestamp */
			InterpTimestamp = Timestamp;
			GetSystemTimeAsFileTime(&Now.ft);
		}
	}

	/* convert from 100ns units since 1601 to NTP fixed point format */

	InterpTimestamp -= FILETIME_1970;
	result->l_ui = JAN_1970 + (u_int32)(InterpTimestamp / HECTONANOSECONDS);
	result->l_uf = (u_int32)((InterpTimestamp % HECTONANOSECONDS) *
				 (ULONGLONG)FRAC / HECTONANOSECONDS);
}
1311 #endif /* HAVE_PPSAPI */
1312
1313
1314 void
win_time_stepped(void)1315 win_time_stepped(void)
1316 {
1317 /*
1318 * called back by ntp_set_tod after the system
1319 * time has been stepped (set).
1320 *
1321 * We normally prevent the reported time from going backwards
1322 * but need to allow it in this case.
1323 */
1324 if (FALSE == winnt_use_interpolation)
1325 return;
1326
1327
1328 /*
1329 * Restart the clock thread to get a new baseline
1330 * time/counter correlation.
1331 */
1332 StopClockThread();
1333
1334 /*
1335 * newest_baseline_gen is a generation counter
1336 * incremented once each time newest_baseline
1337 * is reset.
1338 */
1339 newest_baseline_gen++;
1340
1341 clock_backward_max = CLOCK_BACK_THRESHOLD;
1342 clock_backward_count = 0;
1343 newest_baseline = 0;
1344 ZERO(baseline_counts);
1345 ZERO(baseline_times);
1346
1347 StartClockThread();
1348 }
1349
1350
1351 /*
1352 * log2ull - log base 2 of a unsigned 64-bit number
1353 */
1354 int
log2ull(ULONGLONG n)1355 log2ull(
1356 ULONGLONG n
1357 )
1358 {
1359 const ULONGLONG one = 1;
1360 int log = 0;
1361
1362 if (n >= one<<32) { n >>= 32; log += 32; }
1363 if (n >= one<<16) { n >>= 16; log += 16; }
1364 if (n >= one<< 8) { n >>= 8; log += 8; }
1365 if (n >= one<< 4) { n >>= 4; log += 4; }
1366 if (n >= one<< 2) { n >>= 2; log += 2; }
1367 if (n >= one<< 1) { log += 1; }
1368
1369 return (n) ? log : (-1);
1370 }
1371
1372
1373 /*
1374 * ctr_freq_timer_fired is called once a few seconds before
1375 * tune_ctr_period seconds have elapsed, to reset the timer
1376 * and hopefully minimize error due to the system using the
1377 * nominal performance counter frequency to set the timer
1378 * internally, which is typically dozens of PPM from the
1379 * actual performance counter rate. A few seconds later
1380 * it is called again to observe the counter and estimate the
1381 * counter frequency.
1382 */
static void CALLBACK
ctr_freq_timer_fired(
	LPVOID arg,
	DWORD dwTimeLow,
	DWORD dwTimeHigh
	)
{
	/* counter/time pair at the start of the current observation period */
	static FT_ULL begin_time = {0};
	static FT_ULL begin_count = {0};
	/* absolute 100ns time at which the current period ends */
	static ULONGLONG next_period_time = 0;
	/* next time clock-backward events may be logged */
	static ULONGLONG report_systemtime = 0;
	const ULONGLONG five_minutes = 5ui64 * 60 * HECTONANOSECONDS;
	FT_ULL now_time;
	FT_ULL now_count;

	/* first invocation: record the baseline and set up the period */
	if (!begin_time.ull) {
		begin_count.ull = perf_ctr();
		begin_time.ft.dwLowDateTime = dwTimeLow;
		begin_time.ft.dwHighDateTime = dwTimeHigh;

		/*
		 * adapt perf ctr observation interval to the
		 * counter frequency
		 */
		tune_ctr_period = 22680 / log2ull(NomPerfCtrFreq);

		/*
		 * reset timer 2s before period ends to minimize
		 * error from OS timer routines using nominal
		 * performance frequency internally.
		 */
		tune_ctr_freq_max_interval = tune_ctr_period - 2;

		next_period_time = begin_time.ull +
			(ULONGLONG)tune_ctr_period * HECTONANOSECONDS;

		ROUND_TO_NEXT_SEC_BOTTOM(next_period_time);

		reset_ctr_freq_timer(next_period_time, begin_time.ull);

		return;
	}

	now_time.ft.dwLowDateTime = dwTimeLow;
	now_time.ft.dwHighDateTime = dwTimeHigh;

	/*
	 * period complete: feed the counter and time deltas to
	 * tune_ctr_freq() and start the next period from here.
	 */
	if (now_time.ull >= next_period_time) {
		now_count.ull = perf_ctr();
		tune_ctr_freq(
			now_count.ull - begin_count.ull,
			now_time.ull - begin_time.ull);
		next_period_time += (ULONGLONG)tune_ctr_period * HECTONANOSECONDS;
		begin_count.ull = now_count.ull;
		begin_time.ull = now_time.ull;
	}

	/*
	 * Log clock backward events no more often than 5 minutes.
	 */
	if (!report_systemtime) {
		report_systemtime = now_time.ull + five_minutes;
	} else if (report_systemtime <= now_time.ull) {
		report_systemtime += five_minutes;
		if (clock_backward_count) {
			msyslog(LOG_WARNING,
				"clock would have gone backward %d times, "
				"max %.1f usec",
				clock_backward_count,
				clock_backward_max / 10.);

			/* reset statistics for the next reporting window */
			clock_backward_max = CLOCK_BACK_THRESHOLD;
			clock_backward_count = 0;
		}
	}
	/* re-arm for the next firing (possibly mid-period, see above) */
	reset_ctr_freq_timer(next_period_time, now_time.ull);
}
1459
1460
1461 void
reset_ctr_freq_timer_abs(ULONGLONG when)1462 reset_ctr_freq_timer_abs(
1463 ULONGLONG when
1464 )
1465 {
1466 FT_ULL fire_time;
1467
1468 fire_time.ull = when;
1469 SetWaitableTimer(
1470 ctr_freq_timer,
1471 &fire_time.li, /* first fire */
1472 0, /* not periodic */
1473 ctr_freq_timer_fired, /* callback routine */
1474 NULL, /* context for callback */
1475 FALSE); /* do not interfere with power saving */
1476 }
1477
1478
1479 void
reset_ctr_freq_timer(ULONGLONG when,ULONGLONG now)1480 reset_ctr_freq_timer(
1481 ULONGLONG when,
1482 ULONGLONG now
1483 )
1484 {
1485 if (when - now >
1486 (tune_ctr_freq_max_interval * HECTONANOSECONDS + HECTONANOSECONDS))
1487 when = now + tune_ctr_freq_max_interval * HECTONANOSECONDS;
1488
1489 reset_ctr_freq_timer_abs(when);
1490 }
1491
1492
1493 void
start_ctr_freq_timer(ULONGLONG now_time)1494 start_ctr_freq_timer(
1495 ULONGLONG now_time
1496 )
1497 {
1498 ULONGLONG when;
1499
1500 ctr_freq_timer = CreateWaitableTimer(NULL, FALSE, NULL);
1501 when = now_time;
1502 ROUND_TO_NEXT_SEC_BOTTOM(when);
1503
1504 reset_ctr_freq_timer_abs(when);
1505 }
1506
1507
1508 /*
1509 * tune_ctr_freq is called once per tune_ctr_period seconds
1510 * with a counter difference and time difference.
1511 */
void
tune_ctr_freq(
	LONGLONG ctr_delta,
	LONGLONG time_delta
	)
{
	static unsigned count = 0;		/* accepted samples so far */
	static unsigned dispcount = 0;		/* samples since last report */
	static unsigned report_at_count = 0;	/* report every ~24h of samples */
	static int disbelieved = 0;		/* out-of-range samples pending report */
	static int i = 0;			/* ring index into diffs[] */
	static double nom_freq = 0;		/* nominal counter freq in MHz */
	static LONGLONG diffs[TUNE_CTR_DEPTH] = {0};	/* recent freq offsets */
	static LONGLONG sum = 0;		/* running sum of diffs[] */
	char ctr_freq_eq[64];
	LONGLONG delta;
	LONGLONG deltadiff;
	ULONGLONG ObsPerfCtrFreq;
	double freq;
	double this_freq;
	BOOL isneg;

	/* one-time initialization */
	if (!report_at_count) {
		report_at_count = 24 * 60 * 60 / tune_ctr_period;
		nom_freq = NomPerfCtrFreq / 1e6;
	}

	/* delta is the per-second observed frequency this time */
	delta = (LONGLONG)((double)ctr_delta * HECTONANOSECONDS /
			   time_delta);

	/* disbelieve any delta more than +/- 976 PPM from nominal */
	/* (976 PPM because the gate is NomPerfCtrFreq / 1024) */
	deltadiff = delta - NomPerfCtrFreq;
	if (0 > deltadiff) {
		isneg = TRUE;
		deltadiff = -deltadiff;
	} else {
		isneg = FALSE;
	}

	if ((ULONGLONG)deltadiff > (NomPerfCtrFreq / 1024)) {
		disbelieved++;
		dispcount++;
#ifdef DEBUG
		msyslog(LOG_DEBUG, "ctr delta %s%lld exceeds limit %llu",
			(isneg) ? "-" : "",
			deltadiff,
			NomPerfCtrFreq / 1024);
#endif
	} else {

		/*
		 * collect average over TUNE_CTR_DEPTH samples
		 * for our PerfCtrFreq trimming.
		 */

		/* restore the sign removed for the range check above */
		if (isneg)
			deltadiff = -deltadiff;
		sum -= diffs[i];
		diffs[i] = deltadiff;
		sum += deltadiff;
		i = (i + 1) % COUNTOF(diffs);
		count++;
		dispcount++;
	}

	this_freq = delta / 1e6;

	/* averaged observed frequency = nominal + mean offset */
	ObsPerfCtrFreq = NomPerfCtrFreq + (sum / COUNTOF(diffs));

#if 1	/* #if 0 to disable changing freq used */
	/* get rid of ObsPerfCtrFreq when removing the #ifdef */
	PerfCtrFreq = ObsPerfCtrFreq;
#endif
	freq = PerfCtrFreq / 1e6;

	/*
	 * make the performance counter's frequency error from its
	 * nominal rate, expressed in PPM, available via ntpq as
	 * system variable "ctr_frequency".  This is consistent with
	 * "frequency" which is the system clock drift in PPM.
	 */
	snprintf(ctr_freq_eq, sizeof(ctr_freq_eq), "ctr_frequency=%.2f",
		 1e6 * (freq - nom_freq) / nom_freq);
	set_sys_var(ctr_freq_eq, strlen(ctr_freq_eq) + 1, RO | DEF);

	/*
	 * report observed ctr freq each time the estimate used during
	 * startup moves toward the observed freq from the nominal.
	 */

	if (count > COUNTOF(diffs) &&
	    /* (count % COUNTOF(diffs)) && */	/* enables reporting each */
	    dispcount < report_at_count)	/* TUNE_CTR_DEPTH samples */
		return;

	NLOG(NLOG_CLOCKINFO)
		if (count <= COUNTOF(diffs))
			/* moving to observed freq. from nominal (startup) */
			msyslog(LOG_INFO,
				(freq > 100)
				    ? "ctr %.3f MHz %+6.2f PPM using %.3f MHz %+6.2f PPM"
				    : "ctr %.6f MHz %+6.2f PPM using %.6f MHz %+6.2f PPM",
				this_freq,
				1e6 * (this_freq - nom_freq) / nom_freq,
				freq,
				1e6 * (freq - nom_freq) / nom_freq);
		else
			/* steady state */
			msyslog(LOG_INFO,
				(freq > 100)
				    ? "ctr %.3f MHz %+.2f PPM"
				    : "ctr %.6f MHz %+.2f PPM",
				freq,
				1e6 * (freq - nom_freq) / nom_freq);

	if (disbelieved) {
		msyslog(LOG_ERR,
			"%d ctr samples exceed +/- 976 PPM range gate",
			disbelieved);
		disbelieved = 0;
	}

	dispcount = 0;
}
1638
1639
1640 /*
1641 * add_counter_time_pair is called by the
1642 * high priority clock thread with each new
1643 * baseline counter/time correlation.
1644 */
1645 void
add_counter_time_pair(ULONGLONG ctr,LONGLONG time)1646 add_counter_time_pair(
1647 ULONGLONG ctr,
1648 LONGLONG time
1649 )
1650 {
1651 int i;
1652
1653 i = (newest_baseline + 1) % BASELINES_TOT;
1654
1655 baseline_counts[i] = ctr;
1656 baseline_times[i] = time;
1657
1658 newest_baseline = i;
1659 }
1660
1661
1662 /*
1663 * interp_time estimates NT time in 100ns units
1664 * based on a performance counter value given.
1665 * This must tolerate recent historical counters
1666 * as well as current. When current is FALSE
1667 * we can't assume ctr is the latest/highest
1668 * seen.
1669 */
ULONGLONG
interp_time(
	ULONGLONG ctr,
	BOOL current
	)
{
	/* per-thread cache of the best correlation found last time */
	static __declspec(thread) int last_newest = -1;
	static __declspec(thread) int last_newest_gen;
	static __declspec(thread) int best_index;
	ULONGLONG this_ctr;
	LONGLONG this_time;
	LONGLONG latest_time;
	LONGLONG ctr_diff;
	int i;
	int i_gen;
	int c;

	/*
	 * Use the system time (roughly synchronised to the tick, and
	 * extrapolated using the system performance counter.
	 *
	 * Cache the results per thread and only repeat the
	 * calculation when new data has arrived.
	 */
	i = newest_baseline;
	i_gen = newest_baseline_gen;

	/*
	 * Fast path: no new baseline and no reset since last call, so
	 * extrapolate from the previously selected best correlation.
	 */
	if (last_newest == i && last_newest_gen == i_gen) {
		this_time = baseline_times[best_index];
		ctr_diff = ctr - baseline_counts[best_index];
		this_time += (LONGLONG)PERF2HNS((double)ctr_diff);

		return this_time;
	}

	last_newest = i;
	last_newest_gen = i_gen;

	latest_time = 0;

	/*
	 * Run through the history calculating the interpolated
	 * time based on each counter/time correlation in turn,
	 * and believe the latest one.  This is akin to the NTP
	 * protocol minimum delay clock filter.  Errors due to
	 * counter/time correlations with stale time are all
	 * negative.
	 */
	for (c = 0; c < BASELINES_USED; c++) {
		/* zero time marks an unused slot */
		if (baseline_times[i]) {
			this_time = baseline_times[i];
			this_ctr = baseline_counts[i];

			ctr_diff = ctr - this_ctr;

			if (current && ctr_diff < 0) {
				/*
				 * The performance counter apparently went
				 * backwards without rolling over.  It might
				 * be nice to complain but we don't want
				 * to do it repeatedly.
				 */
				ctr_diff = 0;
			}

			this_time += (LONGLONG)PERF2HNS((double)ctr_diff);

			if (this_time > latest_time) {
				latest_time = this_time;
				best_index = i;
			}
		}
		/* walk backwards through the circular history */
		i = i ? (i - 1) : (BASELINES_TOT - 1);
	}

	return latest_time;
}
1747