1 /*- 2 * Copyright (c) 2009 Adrian Chadd 3 * Copyright (c) 2012 Spectra Logic Corporation 4 * Copyright (c) 2014 Bryan Venteicher 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/bus.h> 35 #include <sys/clock.h> 36 #include <sys/conf.h> 37 #include <sys/fcntl.h> 38 #include <sys/limits.h> 39 #include <sys/mman.h> 40 #include <sys/proc.h> 41 #include <sys/smp.h> 42 #include <sys/sysctl.h> 43 #include <sys/vdso.h> 44 45 #include <vm/vm.h> 46 #include <vm/pmap.h> 47 48 #include <machine/atomic.h> 49 #include <machine/cpufunc.h> 50 #include <machine/md_var.h> 51 #include <machine/pvclock.h> 52 53 /* 54 * Last system time. This is used to guarantee a monotonically non-decreasing 55 * clock for the kernel codepath and approximate the same for the vDSO codepath. 56 * In theory, this should be unnecessary absent hypervisor bug(s) and/or what 57 * should be rare cases where TSC jitter may still be visible despite the 58 * hypervisor's best efforts. 59 */ 60 static volatile uint64_t pvclock_last_systime; 61 62 static uint64_t pvclock_getsystime(struct pvclock *pvc); 63 static void pvclock_read_time_info( 64 struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags); 65 static void pvclock_read_wall_clock(struct pvclock_wall_clock *wc, 66 struct timespec *ts); 67 static u_int pvclock_tc_get_timecount(struct timecounter *tc); 68 static uint32_t pvclock_tc_vdso_timehands( 69 struct vdso_timehands *vdso_th, struct timecounter *tc); 70 #ifdef COMPAT_FREEBSD32 71 static uint32_t pvclock_tc_vdso_timehands32( 72 struct vdso_timehands32 *vdso_th, struct timecounter *tc); 73 #endif 74 75 static d_open_t pvclock_cdev_open; 76 static d_mmap_t pvclock_cdev_mmap; 77 78 static struct cdevsw pvclock_cdev_cdevsw = { 79 .d_version = D_VERSION, 80 .d_name = PVCLOCK_CDEVNAME, 81 .d_open = pvclock_cdev_open, 82 .d_mmap = pvclock_cdev_mmap, 83 }; 84 85 void 86 pvclock_resume(void) 87 { 88 atomic_store_rel_64(&pvclock_last_systime, 0); 89 } 90 91 uint64_t 92 pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) 93 { 94 uint64_t freq; 95 96 freq = (1000000000ULL << 32) / ti->tsc_to_system_mul; 97 if (ti->tsc_shift < 0) 98 freq <<= -ti->tsc_shift; 99 else 100 freq >>= ti->tsc_shift; 101 return (freq); 102 } 103 104 static void 105 pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, 106 uint64_t *ns, uint8_t *flags) 107 { 108 uint64_t delta; 109 uint32_t version; 110 111 do { 112 version = atomic_load_acq_32(&ti->version); 113 delta = rdtsc_ordered() - ti->tsc_timestamp; 114 *ns = ti->system_time + pvclock_scale_delta(delta, 115 ti->tsc_to_system_mul, ti->tsc_shift); 116 *flags = ti->flags; 117 atomic_thread_fence_acq(); 118 } while ((ti->version & 1) != 0 || ti->version != version); 119 } 120 121 static void 122 pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts) 123 { 124 uint32_t version; 125 126 do { 127 version = atomic_load_acq_32(&wc->version); 128 ts->tv_sec = wc->sec; 129 ts->tv_nsec = wc->nsec; 130 atomic_thread_fence_acq(); 131 } while ((wc->version & 1) != 0 || wc->version != version); 132 } 133 134 static uint64_t 135 pvclock_getsystime(struct pvclock *pvc) 136 { 137 uint64_t now, last, ret; 138 uint8_t flags; 139 140 critical_enter(); 141 pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags); 142 ret = now; 143 if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) { 144 last = atomic_load_acq_64(&pvclock_last_systime); 145 do { 146 if (last > now) { 147 ret = last; 148 break; 149 } 150 } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last, 151 now)); 152 } 153 critical_exit(); 154 return (ret); 155 } 156 157 /* 158 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' 159 * has been migrated to the 'struct pvclock' API. 160 */ 161 uint64_t 162 pvclock_get_timecount(struct pvclock_vcpu_time_info *ti) 163 { 164 uint64_t now, last, ret; 165 uint8_t flags; 166 167 pvclock_read_time_info(ti, &now, &flags); 168 ret = now; 169 if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) { 170 last = atomic_load_acq_64(&pvclock_last_systime); 171 do { 172 if (last > now) { 173 ret = last; 174 break; 175 } 176 } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last, 177 now)); 178 } 179 return (ret); 180 } 181 182 /* 183 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' 184 * has been migrated to the 'struct pvclock' API. 185 */ 186 void 187 pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) 188 { 189 pvclock_read_wall_clock(wc, ts); 190 } 191 192 static int 193 pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 194 { 195 if (oflags & FWRITE) 196 return (EPERM); 197 return (0); 198 } 199 200 static int 201 pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, 202 int nprot, vm_memattr_t *memattr) 203 { 204 if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info)) 205 return (EINVAL); 206 if (PROT_EXTRACT(nprot) != PROT_READ) 207 return (EACCES); 208 *paddr = vtophys((uintptr_t)dev->si_drv1 + offset); 209 *memattr = VM_MEMATTR_DEFAULT; 210 return (0); 211 } 212 213 static u_int 214 pvclock_tc_get_timecount(struct timecounter *tc) 215 { 216 struct pvclock *pvc = tc->tc_priv; 217 218 return (pvclock_getsystime(pvc) & UINT_MAX); 219 } 220 221 static uint32_t 222 pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th, 223 struct timecounter *tc) 224 { 225 struct pvclock *pvc = tc->tc_priv; 226 227 if (pvc->cdev == NULL) 228 return (0); 229 230 vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; 231 vdso_th->th_x86_shift = 0; 232 vdso_th->th_x86_hpet_idx = 0; 233 vdso_th->th_x86_pvc_last_systime = 234 atomic_load_acq_64(&pvclock_last_systime); 235 vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && 236 pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; 237 bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); 238 return ((amd_feature & AMDID_RDTSCP) != 0 || 239 ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 && 240 pvc->vdso_enable_without_rdtscp)); 241 } 242 243 #ifdef COMPAT_FREEBSD32 244 static uint32_t 245 pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th, 246 struct timecounter *tc) 247 { 248 struct pvclock *pvc = tc->tc_priv; 249 250 if (pvc->cdev == NULL) 251 return (0); 252 253 vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; 254 vdso_th->th_x86_shift = 0; 255 vdso_th->th_x86_hpet_idx = 0; 256 vdso_th->th_x86_pvc_last_systime = 257 atomic_load_acq_64(&pvclock_last_systime); 258 vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && 259 pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; 260 bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); 261 return ((amd_feature & AMDID_RDTSCP) != 0 || 262 ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 && 263 pvc->vdso_enable_without_rdtscp)); 264 } 265 #endif 266 267 void 268 pvclock_gettime(struct pvclock *pvc, struct timespec *ts) 269 { 270 struct timespec system_ts; 271 uint64_t system_ns; 272 273 pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts); 274 system_ns = pvclock_getsystime(pvc); 275 system_ts.tv_sec = system_ns / 1000000000ULL; 276 system_ts.tv_nsec = system_ns % 1000000000ULL; 277 timespecadd(ts, &system_ts, ts); 278 } 279 280 void 281 pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name, 282 int tc_quality, u_int tc_flags) 283 { 284 struct make_dev_args mda; 285 int err; 286 287 KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0, 288 ("Specified time info page(s) address is not page-aligned.")); 289 290 /* Set up vDSO stable-flag suppression test facility: */ 291 pvc->vdso_force_unstable = false; 292 SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), 293 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, 294 "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0, 295 "Forcibly deassert stable flag in vDSO codepath"); 296 297 /* 298 * Make it possible to use the vDSO page even when the hypervisor does 299 * not support the rdtscp instruction. This is disabled by default for 300 * compatibility with old libc. 301 */ 302 pvc->vdso_enable_without_rdtscp = false; 303 SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), 304 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, 305 "vdso_enable_without_rdtscp", CTLFLAG_RWTUN, 306 &pvc->vdso_enable_without_rdtscp, 0, 307 "Allow the use of a vDSO when rdtscp is not available"); 308 309 /* Set up timecounter and timecounter-supporting members: */ 310 pvc->tc.tc_get_timecount = pvclock_tc_get_timecount; 311 pvc->tc.tc_poll_pps = NULL; 312 pvc->tc.tc_counter_mask = ~0U; 313 pvc->tc.tc_frequency = 1000000000ULL; 314 pvc->tc.tc_name = tc_name; 315 pvc->tc.tc_quality = tc_quality; 316 pvc->tc.tc_flags = tc_flags; 317 pvc->tc.tc_priv = pvc; 318 pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands; 319 #ifdef COMPAT_FREEBSD32 320 pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32; 321 #endif 322 323 /* Set up cdev for userspace mmapping of vCPU 0 time info page: */ 324 make_dev_args_init(&mda); 325 mda.mda_devsw = &pvclock_cdev_cdevsw; 326 mda.mda_uid = UID_ROOT; 327 mda.mda_gid = GID_WHEEL; 328 mda.mda_mode = 0444; 329 mda.mda_si_drv1 = pvc->timeinfos; 330 err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME); 331 if (err != 0) { 332 device_printf(dev, "Could not create /dev/%s, error %d. Fast " 333 "time of day will be unavailable for this timecounter.\n", 334 PVCLOCK_CDEVNAME, err); 335 KASSERT(pvc->cdev == NULL, 336 ("Failed make_dev_s() unexpectedly inited cdev.")); 337 } 338 339 /* Register timecounter: */ 340 tc_init(&pvc->tc); 341 342 /* 343 * Register wallclock: 344 * The RTC registration API expects a resolution in microseconds; 345 * pvclock's 1ns resolution is rounded up to 1us. 346 */ 347 clock_register(dev, 1); 348 } 349 350 int 351 pvclock_destroy(struct pvclock *pvc) 352 { 353 /* 354 * Not currently possible since there is no teardown counterpart of 355 * 'tc_init()'. 356 */ 357 return (EBUSY); 358 } 359