/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
 * may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number of
 * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
 * have any way to gather data other than that collected by the Xen trace
 * buffers. Further, it does not fit into the DTrace model (see "Interacting
 * with DTrace" below.)
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *               +---------+
 *   +------+    |         |
 *   | CPUn |--->| BUFFERn |
 *   +------+    |         |
 *               +---------+- tbuf.va + (tbuf.size * n)
 *               :         :
 *               +---------+
 *   +------+    |         |
 *   | CPU1 |--->| BUFFER1 |
 *   +------+    |         |
 *               +---------+- tbuf.va + tbuf.size
 *   +------+    |         |
 *   | CPU0 |--->| BUFFER0 |
 *   +------+    |         |
 *               +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata consists of a producer/consumer pair of pointers into the
 * buffer that point to the next record to be written and the next record to
 * be read, respectively.
 *
 * A trace record can be in one of two forms, depending on whether the TSC is
 * included. The record header indicates whether or not the TSC field is
 * present.
 *
 * 1. Trace record without TSC:
 *	+------------------------------------------------------------+
 *	| HEADER(uint32_t) |             DATA FIELDS                  |
 *	+------------------------------------------------------------+
 *
 * 2. Trace record with TSC:
 *	+--------------------------------------------------------------+
 *	| HEADER(uint32_t) | TSC(uint64_t) |        DATA FIELDS         |
 *	+--------------------------------------------------------------+
 *
 * Where,
 *
 * HEADER bit field:
 *	+--------------------------------------------------------------+
 *	| C |  NDATA  |                    EVENT                        |
 *	+--------------------------------------------------------------+
 *	 31  30     28 27                                              0
 *
 * EVENT: Event ID.
 * NDATA: Number of populated data fields.
 * C: TSC included.
 *
 * DATA FIELDS:
 *	+--------------------------------------------------------------+
 *	| D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | . . . | D7(uint32_t) |
 *	+--------------------------------------------------------------+
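 *
 * For example, the HEADER word could be unpacked as follows (an
 * illustrative sketch only; the driver itself relies on the bit-fields
 * of struct t_rec declared in <xen/public/trace.h>):
 *
 *	event           = header & 0x0fffffff;	EVENT, bits 0-27
 *	n_data          = (header >> 28) & 0x7;	NDATA, bits 28-30
 *	cycles_included = (header >> 31) & 0x1;	C, bit 31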
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the
 * event. Because records are collected periodically rather than at the moment
 * the event occurs, probe firings are asynchronous. This is the only sensible
 * way to implement this form of provider, but it means that questions such as
 * "what is the current CPU?" and, more generally, arbitrary questions about
 * the context surrounding a probe firing are not meaningful. Consumers should
 * therefore not attempt to infer anything beyond what is supplied via the
 * probe arguments.
 */
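
/*
 * For example (illustrative only), a consumer that confines itself to the
 * probe arguments could total the time each domain spends on a physical CPU:
 *
 *	xdt:sched::off-cpu
 *	{
 *		@oncpu[arg0] = sum(arg2);
 *	}
 *
 * where arg0 is the domid and arg2 the time the vCPU spent on the CPU,
 * exactly as they are handed to dtrace_probe() below.
 */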

#include <sys/xpv_user.h>

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>

#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define	IS_IDLE_DOM(domid)	((domid) == 0x7FFFU)

/* Macros to extract the domid and cpuid from a HVM trace data field */
#define	HVM_DOMID(d)	((d) >> 16)
#define	HVM_VCPUID(d)	((d) & 0xFFFF)

/* Flags for shadow page table events */
#define	SH_GUEST_32	0x000
#define	SH_GUEST_PAE	0x100
#define	SH_GUEST_64	0x200

#define	XDT_PROBE5(event, arg0, arg1, arg2, arg3, arg4) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);		\
}

#define	XDT_PROBE4(event, arg0, arg1, arg2, arg3) \
	XDT_PROBE5(event, arg0, arg1, arg2, arg3, 0)

#define	XDT_PROBE3(event, arg0, arg1, arg2) \
	XDT_PROBE5(event, arg0, arg1, arg2, 0, 0)

#define	XDT_PROBE2(event, arg0, arg1) \
	XDT_PROBE5(event, arg0, arg1, 0, 0, 0)

#define	XDT_PROBE1(event, arg0) \
	XDT_PROBE5(event, arg0, 0, 0, 0, 0)

#define	XDT_PROBE0(event) \
	XDT_PROBE5(event, 0, 0, 0, 0, 0)

/* Probe classes */
#define	XDT_SCHED	0
#define	XDT_MEM		1
#define	XDT_HVM		2
#define	XDT_GEN		3
#define	XDT_PV		4
#define	XDT_SHADOW	5
#define	XDT_PM		6
#define	XDT_NCLASSES	7

/* Probe events */
#define	XDT_EVT_INVALID			(-(int)1)
#define	XDT_SCHED_OFF_CPU		0
#define	XDT_SCHED_ON_CPU		1
#define	XDT_SCHED_IDLE_OFF_CPU		2
#define	XDT_SCHED_IDLE_ON_CPU		3
#define	XDT_SCHED_BLOCK			4
#define	XDT_SCHED_SLEEP			5
#define	XDT_SCHED_WAKE			6
#define	XDT_SCHED_YIELD			7
#define	XDT_SCHED_SHUTDOWN_POWEROFF	8
#define	XDT_SCHED_SHUTDOWN_REBOOT	9
#define	XDT_SCHED_SHUTDOWN_SUSPEND	10
#define	XDT_SCHED_SHUTDOWN_CRASH	11
#define	XDT_MEM_PAGE_GRANT_MAP		12
#define	XDT_MEM_PAGE_GRANT_UNMAP	13
#define	XDT_MEM_PAGE_GRANT_TRANSFER	14
#define	XDT_HVM_VMENTRY			15
#define	XDT_HVM_VMEXIT			16
#define	XDT_TRC_LOST_RECORDS		17
#define	XDT_SCHED_ADD_VCPU		18
#define	XDT_SCHED_REM_VCPU		19	/* unused */
#define	XDT_SCHED_CTL			20	/* unused */
#define	XDT_SCHED_ADJDOM		21
#define	XDT_SCHED_S_TIMER_FN		22	/* unused */
#define	XDT_SCHED_T_TIMER_FN		23	/* unused */
#define	XDT_SCHED_DOM_TIMER_FN		24	/* unused */
#define	XDT_PV_HYPERCALL		25
#define	XDT_PV_TRAP			26
#define	XDT_PV_PAGE_FAULT		27
#define	XDT_PV_FORCED_INVALID_OP	28
#define	XDT_PV_EMULATE_PRIVOP		29
#define	XDT_PV_EMULATE_4GB		30	/* unused (32-bit HV only) */
#define	XDT_PV_MATH_STATE_RESTORE	31
#define	XDT_PV_PAGING_FIXUP		32
#define	XDT_PV_DT_MAPPING_FAULT		33
#define	XDT_PV_PTWR_EMULATION		34
#define	XDT_HVM_PF_XEN			35
#define	XDT_HVM_PF_INJECT		36
#define	XDT_HVM_EXC_INJECT		37
#define	XDT_HVM_VIRQ_INJECT		38
#define	XDT_HVM_VIRQ_REINJECT		39
#define	XDT_HVM_IO_READ			40	/* unused */
#define	XDT_HVM_IO_WRITE		41	/* unused */
#define	XDT_HVM_CR_READ			42
#define	XDT_HVM_CR_WRITE		43
#define	XDT_HVM_DR_READ			44	/* unused */
#define	XDT_HVM_DR_WRITE		45	/* unused */
#define	XDT_HVM_MSR_READ		46
#define	XDT_HVM_MSR_WRITE		47
#define	XDT_HVM_CPUID			48
#define	XDT_HVM_INTR			49
#define	XDT_HVM_INTR_WINDOW		50
#define	XDT_HVM_NMI			51
#define	XDT_HVM_SMI			52
#define	XDT_HVM_VMMCALL			53
#define	XDT_HVM_HLT			54
#define	XDT_HVM_INVLPG			55
#define	XDT_HVM_MCE			56
#define	XDT_HVM_IOPORT_READ		57
#define	XDT_HVM_IOPORT_WRITE		58
#define	XDT_HVM_CLTS			59
#define	XDT_HVM_LMSW			60
#define	XDT_HVM_IOMEM_READ		61
#define	XDT_HVM_IOMEM_WRITE		62
#define	XDT_SHADOW_NOT_SHADOW			63
#define	XDT_SHADOW_FAST_PROPAGATE		64
#define	XDT_SHADOW_FAST_MMIO			65
#define	XDT_SHADOW_FALSE_FAST_PATH		66
#define	XDT_SHADOW_MMIO				67
#define	XDT_SHADOW_FIXUP			68
#define	XDT_SHADOW_DOMF_DYING			69
#define	XDT_SHADOW_EMULATE			70
#define	XDT_SHADOW_EMULATE_UNSHADOW_USER	71
#define	XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ	72
#define	XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED	73
#define	XDT_SHADOW_WRMAP_BF			74
#define	XDT_SHADOW_PREALLOC_UNPIN		75
#define	XDT_SHADOW_RESYNC_FULL			76
#define	XDT_SHADOW_RESYNC_ONLY			77
#define	XDT_PM_FREQ_CHANGE		78
#define	XDT_PM_IDLE_ENTRY		79
#define	XDT_PM_IDLE_EXIT		80
#define	XDT_SCHED_RUNSTATE_CHANGE	81
#define	XDT_SCHED_CONTINUE_RUNNING	82
#define	XDT_NEVENTS			83
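
/*
 * The event IDs above index both xdt_prid[] and xdt_probemap[] below, so
 * they must stay in step with XDT_NEVENTS and with the xdt_probe table.
 */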

typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id */
	uint_t		class;		/* probe class */
} xdt_probe_t;

typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;

typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
	ulong_t cur_domid;		/* current dom */
	ulong_t cur_vcpuid;		/* current vcpuid */
	int	curinfo_valid;		/* info is valid */
} xdt_schedinfo_t;

static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;

static size_t tbuf_data_size;

static char *xdt_stats[] = {
	"dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *	set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;	/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *	xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;	/* trace buffer poll interval */

/*
 * Another tunable variable: the maximum number of records to process
 * in one scan. If it is 0 (e.g. not set in /etc/system), it will
 * be set to ncpu * (bufsize / max_rec_size).
 *
 * Having an upper limit avoids a situation where the scan would loop
 * endlessly in case the hypervisor adds records quicker than we
 * can process them. It's better to drop records than to loop, obviously.
 */
uint_t xdt_max_recs = 0;

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;	/* total number of phys CPUs */
static uint32_t cur_trace_mask;	/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo; /* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];	/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];	/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

/*
 * These provide context when probes fire. They can be accessed from an
 * xdt DTrace probe (as `xdt_curdom, etc.). It's ok for these to be
 * global, and not per-cpu, as probes are run strictly in sequence as
 * the trace buffers are processed.
 */
uint_t xdt_curdom, xdt_curvcpu, xdt_curpcpu;
uint64_t xdt_timestamp;
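
/*
 * For example (illustrative only), a consumer could use one of these
 * globals as a predicate to restrict a probe body to a single domain:
 *
 *	xdt:hvm::vmexit
 *	/`xdt_curdom == 1/
 *	{
 *		@exits[arg0] = count();
 *	}
 */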

static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
	    XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },
	{ "sched", "add", XDT_SCHED_ADD_VCPU, XDT_SCHED },
	{ "sched", "runstate-change", XDT_SCHED_RUNSTATE_CHANGE, XDT_SCHED },
	{ "sched", "continue-running", XDT_SCHED_CONTINUE_RUNNING, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

	{ "pv", "hypercall", XDT_PV_HYPERCALL, XDT_PV },
	{ "pv", "trap", XDT_PV_TRAP, XDT_PV },
	{ "pv", "page-fault", XDT_PV_PAGE_FAULT, XDT_PV },
	{ "pv", "forced-invalid-op", XDT_PV_FORCED_INVALID_OP, XDT_PV },
	{ "pv", "emulate-priv-op", XDT_PV_EMULATE_PRIVOP, XDT_PV },
	{ "pv", "math-state-restore", XDT_PV_MATH_STATE_RESTORE, XDT_PV },
	{ "pv", "paging-fixup", XDT_PV_PAGING_FIXUP, XDT_PV },
	{ "pv", "dt-mapping-fault", XDT_PV_DT_MAPPING_FAULT, XDT_PV },
	{ "pv", "pte-write-emul", XDT_PV_PTWR_EMULATION, XDT_PV },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },
	{ "hvm", "pagefault-xen", XDT_HVM_PF_XEN, XDT_HVM },
	{ "hvm", "pagefault-inject", XDT_HVM_PF_INJECT, XDT_HVM },
	{ "hvm", "exception-inject", XDT_HVM_EXC_INJECT, XDT_HVM },
	{ "hvm", "virq-inject", XDT_HVM_VIRQ_INJECT, XDT_HVM },
	{ "hvm", "cr-read", XDT_HVM_CR_READ, XDT_HVM },
	{ "hvm", "cr-write", XDT_HVM_CR_WRITE, XDT_HVM },
	{ "hvm", "msr-read", XDT_HVM_MSR_READ, XDT_HVM },
	{ "hvm", "msr-write", XDT_HVM_MSR_WRITE, XDT_HVM },
	{ "hvm", "cpuid", XDT_HVM_CPUID, XDT_HVM },
	{ "hvm", "intr", XDT_HVM_INTR, XDT_HVM },
	{ "hvm", "intr-window", XDT_HVM_INTR_WINDOW, XDT_HVM },
	{ "hvm", "nmi", XDT_HVM_NMI, XDT_HVM },
	{ "hvm", "smi", XDT_HVM_SMI, XDT_HVM },
	{ "hvm", "vmmcall", XDT_HVM_VMMCALL, XDT_HVM },
	{ "hvm", "hlt", XDT_HVM_HLT, XDT_HVM },
	{ "hvm", "invlpg", XDT_HVM_INVLPG, XDT_HVM },
	{ "hvm", "mce", XDT_HVM_MCE, XDT_HVM },
	{ "hvm", "pio-read", XDT_HVM_IOPORT_READ, XDT_HVM },
	{ "hvm", "pio-write", XDT_HVM_IOPORT_WRITE, XDT_HVM },
	{ "hvm", "mmio-read", XDT_HVM_IOMEM_READ, XDT_HVM },
	{ "hvm", "mmio-write", XDT_HVM_IOMEM_WRITE, XDT_HVM },
	{ "hvm", "clts", XDT_HVM_CLTS, XDT_HVM },
	{ "hvm", "lmsw", XDT_HVM_LMSW, XDT_HVM },

	{ "shadow", "fault-not-shadow", XDT_SHADOW_NOT_SHADOW, XDT_SHADOW },
	{ "shadow", "fast-propagate", XDT_SHADOW_FAST_PROPAGATE, XDT_SHADOW },
	{ "shadow", "fast-mmio", XDT_SHADOW_FAST_MMIO, XDT_SHADOW },
	{ "shadow", "false-fast-path", XDT_SHADOW_FALSE_FAST_PATH,
	    XDT_SHADOW },
	{ "shadow", "mmio", XDT_SHADOW_MMIO, XDT_SHADOW },
	{ "shadow", "fixup", XDT_SHADOW_FIXUP, XDT_SHADOW },
	{ "shadow", "domf-dying", XDT_SHADOW_DOMF_DYING, XDT_SHADOW },
	{ "shadow", "emulate", XDT_SHADOW_EMULATE, XDT_SHADOW },
	{ "shadow", "emulate-unshadow-user", XDT_SHADOW_EMULATE_UNSHADOW_USER,
	    XDT_SHADOW },
	{ "shadow", "emulate-unshadow-evtinj",
	    XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, XDT_SHADOW },
	{ "shadow", "emulate-unshadow-unhandled",
	    XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, XDT_SHADOW },
	{ "shadow", "wrmap-bf", XDT_SHADOW_WRMAP_BF, XDT_SHADOW },
	{ "shadow", "prealloc-unpin", XDT_SHADOW_PREALLOC_UNPIN, XDT_SHADOW },
	{ "shadow", "resync-full", XDT_SHADOW_RESYNC_FULL, XDT_SHADOW },
	{ "shadow", "resync-only", XDT_SHADOW_RESYNC_ONLY, XDT_SHADOW },

	{ "pm", "freq-change", XDT_PM_FREQ_CHANGE, XDT_PM },
	{ "pm", "idle-entry", XDT_PM_IDLE_ENTRY, XDT_PM },
	{ "pm", "idle-exit", XDT_PM_IDLE_EXIT, XDT_PM },

	/* Trace buffer related probes */
	{ "trace", "records-lost", XDT_TRC_LOST_RECORDS, XDT_GEN },

	{ NULL }
};
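
/*
 * The full set of probes published by this provider can be listed from
 * dom0 with, for example:
 *
 *	# dtrace -l -P xdt
 */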

static inline uint32_t
xdt_nr_active_probes()
{
	int i;
	uint32_t tot = 0;

	for (i = 0; i < XDT_NCLASSES; i++)
		tot += xdt_classinfo[i].cnt;

	return (tot);
}

static void
xdt_init_trace_masks(void)
{
	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
	xdt_classinfo[XDT_GEN].trc_mask = TRC_GEN;
	xdt_classinfo[XDT_PV].trc_mask = TRC_PV;
	xdt_classinfo[XDT_SHADOW].trc_mask = TRC_SHADOW;
	xdt_classinfo[XDT_PM].trc_mask = TRC_PM;
}

static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdt_stats.
	 */
	(knp++)->value.ui64 = tbuf.stat_dropped_recs;

	return (0);
}

static void
xdt_kstat_init(void)
{
	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
	char **cp = xdt_stats;
	kstat_named_t *knp;

	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
		return;

	xdt_kstats->ks_update = xdt_kstat_update;

	knp = xdt_kstats->ks_data;
	while (nstats > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstats--;
	}

	kstat_install(xdt_kstats);
}

static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
	xen_sysctl_t op;
	int xerr;

	op.cmd = XEN_SYSCTL_tbuf_op;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	op.u.tbuf_op = *tbuf_op;

	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
		return (xen_xlate_errcode(xerr));

	*tbuf_op = op.u.tbuf_op;
	return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
	x86pte_t pte;
	caddr_t const sva = va;
	caddr_t const eva = va + len;
	int xerr;

	ASSERT(mfn != MFN_INVALID);
	ASSERT(va != NULL);
	ASSERT(IS_PAGEALIGNED(len));

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Ask the HAT to load a throwaway mapping to page zero, then
		 * overwrite it with the hypervisor mapping. It gets removed
		 * later via hat_unload().
		 */
		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
		    PROT_READ | HAT_UNORDERED_OK,
		    HAT_LOAD_NOCONSIST | HAT_LOAD);
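
		/*
		 * The replacement PTE is constructed by hand: PT_FOREIGN
		 * together with DOMID_XEN in the hypercall below tells the
		 * hypervisor that the target MFN is owned by Xen itself
		 * (the trace buffers) rather than by this domain.
		 */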
		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
		    | PT_FOREIGN | PT_WRITABLE;

		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

		if (xerr != 0) {
			/* unmap pages loaded so far */
			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
			    (uintptr_t)sva;
			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
			return (xen_xlate_errcode(xerr));
		}

		mfn++;
	}

	return (0);
}

static int
xdt_attach_trace_buffers(void)
{
	xen_sysctl_tbuf_op_t tbuf_op;
	size_t len;
	int err;
	uint_t i;

	/*
	 * Xen does not support trace buffer re-sizing. If the buffers
	 * have already been allocated we just use them as is.
	 */
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
		return (err);

	if (tbuf_op.size == 0) {
		/* set trace buffer size */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_size;
		tbuf_op.size = xdt_tbuf_pages;
		(void) xdt_sysctl_tbuf(&tbuf_op);

		/* get trace buffer info */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
			return (err);

		if (tbuf_op.size == 0) {
			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
			return (ENOBUFS);
		}
	}

	tbuf.size = tbuf_op.size;
	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
	tbuf.cnt = xdt_ncpus;

	ASSERT(tbuf.start_mfn != MFN_INVALID);
	ASSERT(tbuf.cnt > 0);

	len = tbuf.size * tbuf.cnt;
	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
		vmem_free(heap_arena, tbuf.va, len);
		tbuf.va = NULL;
		return (err);
	}

	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
	    KM_SLEEP);
	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
	    KM_SLEEP);

	for (i = 0; i < tbuf.cnt; i++) {
		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
		tbuf.meta[i] = cpu_buf;
		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
		    sizeof (struct t_buf));

		/* throw away stale trace records */
		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
	}

	tbuf_data_size = tbuf.size - sizeof (struct t_buf);
	if (xdt_max_recs == 0)
		xdt_max_recs = (xdt_ncpus * tbuf_data_size)
		    / sizeof (struct t_rec);

	return (0);
}

static void
xdt_detach_trace_buffers(void)
{
	size_t len = tbuf.size * tbuf.cnt;

	ASSERT(tbuf.va != NULL);

	hat_unload(kas.a_hat, tbuf.va, len,
	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, tbuf.va, len);
	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}

static void
xdt_update_sched_context(uint_t cpuid, uint_t dom, uint_t vcpu)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];

	sp->cur_domid = dom;
	sp->cur_vcpuid = vcpu;
	sp->curinfo_valid = 1;
}

static void
xdt_update_domain_context(uint_t dom, uint_t vcpu)
{
	xdt_curdom = dom;
	xdt_curvcpu = vcpu;
}

static size_t
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
	uint_t dom, vcpu;
	int eid;
	uint32_t *data;
	uint64_t tsc, addr64, rip64, val64, pte64;
	size_t rec_size;

	ASSERT(rec != NULL);
	ASSERT(xdt_ncpus == xpv_nr_phys_cpus());

	eid = 0;
	if (cpuid >= xdt_ncpus) {
		tbuf.stat_spurious_cpu++;
		goto done;
	}

	/*
	 * If our current state isn't valid, and if this is not
	 * an event that will update our state, skip it.
	 */
	if (!sp->curinfo_valid &&
	    rec->event != TRC_SCHED_SWITCH &&
	    rec->event != TRC_LOST_RECORDS)
		goto done;

	if (rec->cycles_included) {
		data = rec->u.cycles.extra_u32;
		tsc = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
		    | rec->u.cycles.cycles_lo;
	} else {
		data = rec->u.nocycles.extra_u32;
		tsc = 0;
	}

	xdt_timestamp = tsc;

	switch (rec->event) {
	/*
	 * Sched probes
	 */
	case TRC_SCHED_SWITCH_INFPREV:
		/*
		 * Info on vCPU being de-scheduled
		 *
		 * data[0] = prev domid
		 * data[1] = time spent on pcpu
		 */
		sp->prev_domid = data[0];
		sp->prev_ctime = data[1];
		break;

	case TRC_SCHED_SWITCH_INFNEXT:
		/*
		 * Info on next vCPU to be scheduled
		 *
		 * data[0] = next domid
		 * data[1] = time spent waiting to get on cpu
		 * data[2] = time slice
		 */
		sp->next_domid = data[0];
		sp->next_wtime = data[1];
		sp->next_ts = data[2];
		break;
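
	/*
	 * A vCPU switch is described by a short sequence of records:
	 * SWITCH_INFPREV and SWITCH_INFNEXT (above) carry the timing
	 * details, and the SWITCH record that follows ties them to the
	 * actual prev/next (domid, vcpuid) pairs before the off-cpu and
	 * on-cpu probes fire.
	 */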

	case TRC_SCHED_SWITCH:
		/*
		 * vCPU switch
		 *
		 * data[0] = prev domid
		 * data[1] = prev vcpuid
		 * data[2] = next domid
		 * data[3] = next vcpuid
		 */

		/*
		 * Provide valid context for this probe if there
		 * wasn't one.
		 */
		if (!sp->curinfo_valid)
			xdt_update_domain_context(data[0], data[1]);

		xdt_update_sched_context(cpuid, data[0], data[1]);

		if (data[0] != sp->prev_domid &&
		    data[2] != sp->next_domid) {
			/* prev and next info don't match doms being sched'd */
			tbuf.stat_spurious_switch++;
			goto switchdone;
		}

		sp->prev_vcpuid = data[1];
		sp->next_vcpuid = data[3];

		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid) ?
		    XDT_SCHED_IDLE_OFF_CPU : XDT_SCHED_OFF_CPU,
		    sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);

		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid) ?
		    XDT_SCHED_IDLE_ON_CPU : XDT_SCHED_ON_CPU,
		    sp->next_domid, sp->next_vcpuid, sp->next_wtime,
		    sp->next_ts);
switchdone:
		xdt_update_sched_context(cpuid, data[2], data[3]);
		xdt_update_domain_context(data[2], data[3]);

		break;

	case TRC_SCHED_BLOCK:
		/*
		 * vCPU blocked
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_BLOCK, data[0], data[1]);
		break;

	case TRC_SCHED_SLEEP:
		/*
		 * Put vCPU to sleep
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_SLEEP, data[0], data[1]);
		break;

	case TRC_SCHED_WAKE:
		/*
		 * Wake up vCPU
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_WAKE, data[0], data[1]);
		break;

	case TRC_SCHED_YIELD:
		/*
		 * vCPU yielded
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_YIELD, data[0], data[1]);
		break;

	case TRC_SCHED_SHUTDOWN:
		/*
		 * Guest shutting down
		 *
		 * data[0] = domid
		 * data[1] = initiating vcpu
		 * data[2] = shutdown code
		 */
		switch (data[2]) {
		case SHUTDOWN_poweroff:
			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
			break;
		case SHUTDOWN_reboot:
			eid = XDT_SCHED_SHUTDOWN_REBOOT;
			break;
		case SHUTDOWN_suspend:
			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
			break;
		case SHUTDOWN_crash:
			eid = XDT_SCHED_SHUTDOWN_CRASH;
			break;
		default:
			tbuf.stat_unknown_shutdown++;
			goto done;
		}

		XDT_PROBE2(eid, data[0], data[1]);
		break;

	case TRC_SCHED_DOM_REM:
	case TRC_SCHED_CTL:
	case TRC_SCHED_S_TIMER_FN:
	case TRC_SCHED_T_TIMER_FN:
	case TRC_SCHED_DOM_TIMER_FN:
		/* unused */
		break;

	case TRC_SCHED_DOM_ADD:
		/*
		 * Add vcpu to a guest.
		 *
		 * data[0] = domid
		 * data[1] = vcpu
		 */
		XDT_PROBE2(XDT_SCHED_ADD_VCPU, data[0], data[1]);
		break;

	case TRC_SCHED_ADJDOM:
		/*
		 * Scheduling parameters for a guest
		 * were modified.
		 *
		 * data[0] = domid;
		 */
		XDT_PROBE1(XDT_SCHED_ADJDOM, data[0]);
		break;

	case TRC_SCHED_RUNSTATE_CHANGE:
		/*
		 * Runstate change for a VCPU.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 * data[1] = oldstate;
		 * data[2] = newstate;
		 */
		XDT_PROBE4(XDT_SCHED_RUNSTATE_CHANGE, data[0] >> 16,
		    data[0] & 0xffff, data[1], data[2]);
		break;

	case TRC_SCHED_CONTINUE_RUNNING:
		/*
		 * A VCPU resumes running on the same physical CPU it was
		 * previously running on.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 */
		XDT_PROBE2(XDT_SCHED_CONTINUE_RUNNING, data[0] >> 16,
		    data[0] & 0xffff);
		break;

	/*
	 * Mem probes
	 */
	case TRC_MEM_PAGE_GRANT_MAP:
		/*
		 * Guest mapped page grant
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_UNMAP:
		/*
		 * Guest unmapped page grant
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_TRANSFER:
		/*
		 * Page grant is being transferred
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, data[0]);
		break;

	/*
	 * Probes for PV domains.
	 */
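	/*
	 * Note that for the TRC_64_FLAG variants below, 64-bit guest values
	 * (rip, vaddr, pte, ...) arrive split across two consecutive 32-bit
	 * data fields, low half first, and are reassembled before the probe
	 * fires.
	 */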
	case TRC_PV_HYPERCALL:
		/*
		 * Hypercall from a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = eax
		 */
		XDT_PROBE2(XDT_PV_HYPERCALL, data[0], data[1]);
		break;

	case TRC_PV_HYPERCALL | TRC_64_FLAG:
		/*
		 * Hypercall from a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = eax;
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE2(XDT_PV_HYPERCALL, rip64, data[2]);
		break;

	case TRC_PV_TRAP:
		/*
		 * Trap in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = trapnr | (error_code_valid << 15)
		 *	| (error_code << 16);
		 */
		XDT_PROBE4(XDT_PV_TRAP, data[0], data[1] & 0x7fff,
		    (data[1] >> 15) & 1, data[1] >> 16);
		break;

	case TRC_PV_TRAP | TRC_64_FLAG:
		/*
		 * Trap in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = trapnr | (error_code_valid << 15)
		 *	| (error_code << 16);
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE4(XDT_PV_TRAP, rip64, data[2] & 0x7fff,
		    (data[2] >> 15) & 1, data[2] >> 16);
		break;

	case TRC_PV_PAGE_FAULT:
		/*
		 * Page fault in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = vaddr
		 * data[2] = error code
		 */
		XDT_PROBE3(XDT_PV_PAGE_FAULT, data[0], data[1], data[2]);
		break;

	case TRC_PV_PAGE_FAULT | TRC_64_FLAG:
		/*
		 * Page fault in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 * data[4] = error code
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE3(XDT_PV_PAGE_FAULT, rip64, addr64, data[4]);
		break;

	case TRC_PV_FORCED_INVALID_OP:
		/*
		 * Hypervisor emulated a forced invalid op (ud2)
		 * in a 32-bit PV domain.
		 *
		 * data[1] = eip
		 */
		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, data[1]);
		break;

	case TRC_PV_FORCED_INVALID_OP | TRC_64_FLAG:
		/*
		 * Hypervisor emulated a forced invalid op (ud2)
		 * in a 64-bit PV domain.
		 *
		 * data[1] = rip(0:31)
		 * data[2] = rip(32:63)
		 */
		rip64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, rip64);
		break;

	case TRC_PV_EMULATE_PRIVOP:
		/*
		 * Hypervisor emulated a privileged operation
		 * in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 */
		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, data[0]);
		break;

	case TRC_PV_EMULATE_PRIVOP | TRC_64_FLAG:
		/*
		 * Hypervisor emulated a privileged operation
		 * in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, rip64);
		break;

	case TRC_PV_EMULATE_4GB:
		/* unused, 32-bit hypervisor only */
		break;

	case TRC_PV_MATH_STATE_RESTORE:
		/*
		 * Hypervisor restores math state after FP DNA trap.
		 *
		 * No arguments.
		 */
		XDT_PROBE0(XDT_PV_MATH_STATE_RESTORE);
		break;

	case TRC_PV_PAGING_FIXUP:
		/*
		 * Hypervisor fixed up a page fault (e.g. it was
		 * a side-effect of hypervisor guest page table
		 * bookkeeping, and not propagated to the guest).
		 *
		 * data[0] = eip
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_PV_PAGING_FIXUP, data[0], data[1]);
		break;
	case TRC_PV_PAGING_FIXUP | TRC_64_FLAG:
		/*
		 * Hypervisor fixed up a page fault (e.g. it was
		 * a side-effect of hypervisor guest page table
		 * bookkeeping, and not propagated to the guest).
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE2(XDT_PV_PAGING_FIXUP, rip64, addr64);
		break;

	case TRC_PV_GDT_LDT_MAPPING_FAULT:
		/*
		 * Descriptor table mapping fault in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = offset
		 */
		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, data[0], data[1]);
		break;

	case TRC_PV_GDT_LDT_MAPPING_FAULT | TRC_64_FLAG:
		/*
		 * Descriptor table mapping fault in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = offset(0:31)
		 * data[3] = offset(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		val64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, rip64, val64);
		break;

	case TRC_PV_PTWR_EMULATION:
	case TRC_PV_PTWR_EMULATION_PAE | TRC_64_FLAG:
		/*
		 * Should only happen on a 32-bit hypervisor; unused.
		 */
		break;

	case TRC_PV_PTWR_EMULATION_PAE:
		/*
		 * PTE write emulation for a 32-bit PV domain.
		 *
		 * data[0] = pte
		 * data[1] = addr
		 * data[2] = eip
		 */
		XDT_PROBE3(XDT_PV_PTWR_EMULATION, data[0], data[1], data[2]);
		break;

	case TRC_PV_PTWR_EMULATION | TRC_64_FLAG:
		/*
		 * PTE write emulation for a 64-bit PV domain.
		 *
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = addr(0:31)
		 * data[3] = addr(32:63)
		 * data[4] = rip(0:31)
		 * data[5] = rip(32:63)
		 */
		pte64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		rip64 = (((uint64_t)data[5]) << 32) | data[4];
		XDT_PROBE3(XDT_PV_PTWR_EMULATION, pte64, addr64, rip64);
		break;

	/*
	 * HVM probes
	 */
	case TRC_HVM_VMENTRY:
		/*
		 * Return to guest via vmx_launch/vmrun
		 */
		XDT_PROBE0(XDT_HVM_VMENTRY);
		break;

	case TRC_HVM_VMEXIT:
		/*
		 * Entry into VMEXIT handler from a 32-bit HVM domain
		 *
		 * data[0] = cpu vendor specific exit code
		 * data[1] = guest eip
		 */
		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], data[1]);
		break;

	case TRC_HVM_VMEXIT64:
		/*
		 * Entry into VMEXIT handler from a 64-bit HVM domain
		 *
		 * data[0] = cpu vendor specific exit code
		 * data[1] = guest rip(0:31)
		 * data[2] = guest rip(32:63)
		 */
		rip64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], rip64);
		break;

	case TRC_HVM_PF_XEN64:
		/*
		 * Pagefault in a guest that is a Xen (e.g. shadow)
		 * artifact, and is not injected back into the guest.
		 *
		 * data[0] = error code
		 * data[1] = guest VA(0:31)
		 * data[2] = guest VA(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], addr64);
		break;

	case TRC_HVM_PF_XEN:
		/*
		 * Same as above, but for a 32-bit HVM domain.
		 *
		 * data[0] = error code
		 * data[1] = guest VA
		 */
		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], data[1]);
		break;

	case TRC_HVM_PF_INJECT:
		/*
		 * 32-bit Xen only.
		 */
		break;
	case TRC_HVM_PF_INJECT64:
		/*
		 * Pagefault injected back into a guest (e.g. the shadow
		 * code found no mapping).
		 *
		 * data[0] = error code
		 * data[1] = guest VA(0:31)
		 * data[2] = guest VA(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_PF_INJECT, data[0], addr64);
		break;

	case TRC_HVM_INJ_EXC:
		/*
		 * Exception injected into an HVM guest.
		 *
		 * data[0] = trap
		 * data[1] = error code
		 */
		XDT_PROBE2(XDT_HVM_EXC_INJECT, data[0], data[1]);
		break;

	case TRC_HVM_INJ_VIRQ:
		/*
		 * Interrupt injected into an HVM guest.
		 *
		 * data[0] = vector
		 */
		XDT_PROBE1(XDT_HVM_VIRQ_INJECT, data[0]);
		break;

	case TRC_HVM_REINJ_VIRQ:
	case TRC_HVM_IO_READ:
	case TRC_HVM_IO_WRITE:
		/* unused */
		break;

	case TRC_HVM_CR_READ64:
		/*
		 * Control register read. Intel VMX only.
		 *
		 * data[0] = control register #
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
		break;

	case TRC_HVM_CR_READ:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	case TRC_HVM_CR_WRITE64:
		/*
		 * Control register write. Intel VMX only.
		 *
		 * data[0] = control register #
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_CR_WRITE, data[0], val64);
		break;

	case TRC_HVM_CR_WRITE:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	case TRC_HVM_DR_READ:
		/*
		 * unused.
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;

	case TRC_HVM_DR_WRITE:
		/*
		 * Debug register write. Not too useful; no values,
		 * so we ignore this.
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;

	case TRC_HVM_MSR_READ:
		/*
		 * MSR read.
		 *
		 * data[0] = MSR
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_MSR_READ, data[0], val64);
		break;

	case TRC_HVM_MSR_WRITE:
		/*
		 * MSR write.
		 *
		 * data[0] = MSR;
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_MSR_WRITE, data[0], val64);
		break;

	case TRC_HVM_CPUID:
		/*
		 * CPUID insn.
		 *
		 * data[0] = %eax (input)
		 * data[1] = %eax
		 * data[2] = %ebx
		 * data[3] = %ecx
		 * data[4] = %edx
		 */
		XDT_PROBE5(XDT_HVM_CPUID, data[0], data[1], data[2], data[3],
		    data[4]);
		break;

	case TRC_HVM_INTR:
		/*
		 * VMEXIT because of an interrupt.
		 */
		XDT_PROBE0(XDT_HVM_INTR);
		break;

	case TRC_HVM_INTR_WINDOW:
		/*
		 * VMEXIT because of an interrupt window (an interrupt
		 * can't be delivered immediately to a HVM guest and must
		 * be delayed).
		 *
		 * data[0] = vector
		 * data[1] = source
		 * data[2] = info
		 */
		XDT_PROBE3(XDT_HVM_INTR_WINDOW, data[0], data[1], data[2]);
		break;

	case TRC_HVM_NMI:
		/*
		 * VMEXIT because of an NMI.
		 */
		XDT_PROBE0(XDT_HVM_NMI);
		break;

	case TRC_HVM_SMI:
		/*
		 * VMEXIT because of an SMI.
		 */
		XDT_PROBE0(XDT_HVM_SMI);
		break;
	case TRC_HVM_VMMCALL:
		/*
		 * VMMCALL insn.
		 *
		 * data[0] = %eax
		 */
		XDT_PROBE1(XDT_HVM_VMMCALL, data[0]);
		break;

	case TRC_HVM_HLT:
		/*
		 * HLT insn.
		 *
		 * data[0] = 1 if VCPU runnable, 0 if not
		 */
		XDT_PROBE1(XDT_HVM_HLT, data[0]);
		break;

	case TRC_HVM_INVLPG64:
		/*
		 * INVLPG insn.
		 *
		 * data[0] = INVLPGA ? 1 : 0
		 * data[1] = vaddr(0:31)
		 * data[2] = vaddr(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_INVLPG, data[0], addr64);
		break;

	case TRC_HVM_INVLPG:
		/*
		 * unused (32-bit Xen only)
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;

	case TRC_HVM_MCE:
		/*
		 * #MCE VMEXIT
		 */
		XDT_PROBE0(XDT_HVM_MCE);
		break;

	case TRC_HVM_IOPORT_READ:
	case TRC_HVM_IOPORT_WRITE:
	case TRC_HVM_IOMEM_READ:
	case TRC_HVM_IOMEM_WRITE:
		/*
		 * data[0] = addr(0:31)
		 * data[1] = addr(32:63)
		 * data[2] = count
		 * data[3] = size
		 */
		switch (rec->event) {
		case TRC_HVM_IOPORT_READ:
			eid = XDT_HVM_IOPORT_READ;
			break;
		case TRC_HVM_IOPORT_WRITE:
			eid = XDT_HVM_IOPORT_WRITE;
			break;
		case TRC_HVM_IOMEM_READ:
			eid = XDT_HVM_IOMEM_READ;
			break;
		case TRC_HVM_IOMEM_WRITE:
			eid = XDT_HVM_IOMEM_WRITE;
			break;
		}
		addr64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE3(eid, addr64, data[2], data[3]);
		break;

	case TRC_HVM_CLTS:
		/*
		 * CLTS insn (Intel VMX only)
		 */
		XDT_PROBE0(XDT_HVM_CLTS);
		break;

	case TRC_HVM_LMSW64:
		/*
		 * LMSW insn.
		 *
		 * data[0] = value(0:31)
		 * data[1] = value(32:63)
		 */
		val64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE1(XDT_HVM_LMSW, val64);
		break;

	case TRC_HVM_LMSW:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	/*
	 * Shadow page table probes (mainly used for HVM domains
	 * without hardware paging support).
	 */
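
	/*
	 * Each TRC_SHADOW_* event code is OR-ed with one of the SH_GUEST_*
	 * flags defined at the top of this file to encode the guest paging
	 * mode; for PAE and 64-bit guests, addresses and PTEs arrive split
	 * across two 32-bit data fields.
	 */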
	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va
		 * data[3] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, data[2], data[3]);
		break;

	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_PAE:
	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_64:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va(0:31)
		 * data[3] = va(32:63)
		 * data[4] = flags
		 */
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, addr64, data[4]);
		break;

	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, data[0]);
		break;

	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_PAE:
	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, addr64);
		break;

	case TRC_SHADOW_FAST_MMIO | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_FAST_MMIO, data[0]);
		break;

	case TRC_SHADOW_FAST_MMIO | SH_GUEST_PAE:
	case TRC_SHADOW_FAST_MMIO | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_FAST_MMIO, addr64);
		break;

	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, data[0]);
		break;

	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_PAE:
	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, addr64);
		break;

	case TRC_SHADOW_MMIO | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_MMIO, data[0]);
		break;

	case TRC_SHADOW_MMIO | SH_GUEST_PAE:
	case TRC_SHADOW_MMIO | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_MMIO, addr64);
		break;

	case TRC_SHADOW_FIXUP | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va
		 * data[3] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, data[2], data[3]);
		break;

	case TRC_SHADOW_FIXUP | SH_GUEST_64:
	case TRC_SHADOW_FIXUP | SH_GUEST_PAE:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va(0:31)
		 * data[3] = va(32:63)
		 * data[4] = flags
		 */
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, addr64, data[4]);
		break;

	case TRC_SHADOW_DOMF_DYING | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_DOMF_DYING, data[0]);
		break;

	case TRC_SHADOW_DOMF_DYING | SH_GUEST_PAE:
	case TRC_SHADOW_DOMF_DYING | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_DOMF_DYING, addr64);
		break;

	case TRC_SHADOW_EMULATE | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = val(0:31)
		 * data[3] = val(32:63)
		 * data[4] = addr
		 * data[5] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		val64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
		    data[5] & 0x7fffffff, data[5] >> 29);
		break;
	case TRC_SHADOW_EMULATE | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE | SH_GUEST_64:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = val(0:31)
		 * data[3] = val(32:63)
		 * data[4] = addr(0:31)
		 * data[5] = addr(32:63)
		 * data[6] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		val64 = ((uint64_t)data[3] << 32) | data[2];
		addr64 = ((uint64_t)data[5] << 32) | data[4];
		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, addr64,
		    data[6] & 0x7fffffff, data[6] >> 29);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, data[0], data[1]);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, val64, addr64);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, data[0],
		    data[1]);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, val64, addr64);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, data[0],
		    data[1]);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, val64,
		    addr64);
		break;

	case TRC_SHADOW_WRMAP_BF:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_WRMAP_BF, val64);
		break;

	case TRC_SHADOW_PREALLOC_UNPIN:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_PREALLOC_UNPIN, val64);
		break;

	case TRC_SHADOW_RESYNC_FULL:
		/*
		 * data[0] = gmfn(0:31)
		 * data[1] = gmfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_RESYNC_FULL, val64);
		break;
	case TRC_SHADOW_RESYNC_ONLY:
		/*
		 * data[0] = gmfn(0:31)
		 * data[1] = gmfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_RESYNC_ONLY, val64);
		break;

	/*
	 * Power management probes.
	 */
	case TRC_PM_FREQ_CHANGE:
		/*
		 * data[0] = old freq
		 * data[1] = new freq
		 */
		XDT_PROBE2(XDT_PM_FREQ_CHANGE, data[0], data[1]);
		break;

	case TRC_PM_IDLE_ENTRY:
		/*
		 * data[0] = C-state
		 * data[1] = time
		 */
		XDT_PROBE2(XDT_PM_IDLE_ENTRY, data[0], data[1]);
		break;

	case TRC_PM_IDLE_EXIT:
		/*
		 * data[0] = C-state
		 * data[1] = time
		 */
		XDT_PROBE2(XDT_PM_IDLE_EXIT, data[0], data[1]);
		break;

	case TRC_LOST_RECORDS:
		vcpu = data[1] >> 16;
		dom = data[1] & 0xffff;
		xdt_update_sched_context(cpuid, dom, vcpu);
		xdt_update_domain_context(dom, vcpu);
		XDT_PROBE1(XDT_TRC_LOST_RECORDS, cpuid);
		tbuf.stat_dropped_recs++;
		break;

	default:
		tbuf.stat_unknown_recs++;
		break;
	}

done:
	rec_size = 4 + (rec->cycles_included ? 8 : 0) + (rec->extra_u32 * 4);
	return (rec_size);
}

/*
 * Scan all CPU buffers for the record with the lowest timestamp so
 * that the probes will fire in order.
 */
static int
xdt_get_first_rec(uint_t *cpuidp, struct t_rec **recp, uint32_t *consp)
{
	uint_t cpuid;
	uint32_t prod, cons, offset;
	struct t_rec *rec;
	uint64_t minstamp = ~0ULL, stamp;
	uintptr_t data;

	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
		cons = tbuf.meta[cpuid]->cons;
		prod = tbuf.meta[cpuid]->prod;
		membar_consumer();
		if (prod == cons)
			continue;

		offset = cons % tbuf_data_size;
		data = (uintptr_t)tbuf.data[cpuid] + offset;
		rec = (struct t_rec *)data;
		ASSERT((caddr_t)rec < tbuf.va + (tbuf.size * (cpuid + 1)));

		/*
		 * All records that we know about have time cycles included.
		 * If this record doesn't have them, assume it's a type
		 * that we don't handle. Use a 0 time value, which will make
		 * it get handled first (it will be thrown away).
		 */
		if (rec->cycles_included)
			stamp = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
			    | rec->u.cycles.cycles_lo;
		else
			stamp = 0;

		if (stamp < minstamp) {
			minstamp = stamp;
			*cpuidp = cpuid;
			*recp = rec;
			*consp = cons;
		}
	}

	if (minstamp != ~0ULL)
		return (1);

	return (0);
}

/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
	uint32_t bytes_done, cons;
	struct t_rec *rec;
	xdt_schedinfo_t *sp;
	uint_t nrecs, cpuid;

	for (nrecs = 0;
	    nrecs < xdt_max_recs && xdt_get_first_rec(&cpuid, &rec, &cons) > 0;
	    nrecs++) {
		xdt_curpcpu = cpuid;
		sp = &xdt_cpu_schedinfo[cpuid];
		if (sp->curinfo_valid)
			xdt_update_domain_context(sp->cur_domid,
			    sp->cur_vcpuid);

		bytes_done = xdt_process_rec(cpuid, rec);
		cons += bytes_done;
		/*
		 * cons and prod are incremented modulo (2 * tbuf_data_size).
		 * See <xen/public/trace.h>.
		 */
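		/*
		 * Using twice the buffer size as the modulus lets the
		 * producer distinguish a completely full buffer from an
		 * empty one: prod == cons means empty, while prod - cons ==
		 * tbuf_data_size means full. The byte offset of a record is
		 * therefore cons % tbuf_data_size, as computed in
		 * xdt_get_first_rec() above.
		 */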
		if (cons >= 2 * tbuf_data_size)
			cons -= 2 * tbuf_data_size;
		membar_exit();
		tbuf.meta[cpuid]->cons = cons;
	}
}

static void
xdt_cyclic_enable(void)
{
	cyc_handler_t hdlr;
	cyc_time_t when;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hdlr.cyh_func = xdt_tbuf_scan;
	hdlr.cyh_arg = NULL;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_interval = xdt_poll_nsec;
	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;

	xdt_cyclic = cyclic_add(&hdlr, &when);
}

static void
xdt_probe_create(xdt_probe_t *p)
{
	ASSERT(p != NULL && p->pr_mod != NULL);

	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
		return;

	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
	    p->pr_name, dtrace_mach_aframes(), p);
}

/*ARGSUSED*/
static void
xdt_provide(void *arg, const dtrace_probedesc_t *desc)
{
	const char *mod, *name;
	int i;

	if (desc == NULL) {
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			xdt_probe_create(&xdt_probe[i]);
		}
	} else {
		mod = desc->dtpd_mod;
		name = desc->dtpd_name;
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			int l1 = strlen(xdt_probe[i].pr_name);
			int l2 = strlen(xdt_probe[i].pr_mod);
			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
				break;
		}

		if (xdt_probe[i].pr_mod == NULL)
			return;
		xdt_probe_create(&xdt_probe[i]);
	}
}

/*ARGSUSED*/
static void
xdt_destroy(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xdt_prid[p->evt_id] = 0;
}

static void
xdt_set_trace_mask(uint32_t mask)
{
	xen_sysctl_tbuf_op_t tbuf_op;

	/* Always need to trace scheduling, for context */
	if (mask != 0)
		mask |= TRC_SCHED;
	tbuf_op.evt_mask = mask;
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_evt_mask;
	(void) xdt_sysctl_tbuf(&tbuf_op);
}

/*ARGSUSED*/
static int
xdt_enable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_prid[p->evt_id] != 0);

	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
	xdt_classinfo[p->class].cnt++;

	if (xdt_classinfo[p->class].cnt == 1) {
		/* set the trace mask for this class */
		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
		xdt_set_trace_mask(cur_trace_mask);
	}

	if (xdt_cyclic == CYCLIC_NONE) {
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
			return (-1);
		}

		xdt_cyclic_enable();
	}
	return (0);
}

/*ARGSUSED*/
static void
xdt_disable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;
	int i, err;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_probemap[p->evt_id] != 0);
	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
	ASSERT(xdt_classinfo[p->class].cnt > 0);

	/*
	 * We could be here in the slight window between the cyclic firing and
	 * a call to dtrace_probe() occurring. We need to be careful if we tear
	 * down any shared state.
	 */

	xdt_probemap[p->evt_id] = 0;
	xdt_classinfo[p->class].cnt--;

	if (xdt_nr_active_probes() == 0) {
		cur_trace_mask = 0;

		if (xdt_cyclic == CYCLIC_NONE)
			return;

		for (i = 0; i < xdt_ncpus; i++)
			xdt_cpu_schedinfo[i].curinfo_valid = 0;

		/*
		 * We will try to disable the trace buffers. If we fail for
		 * some reason we will try again, up to a count of
		 * XDT_TBUF_RETRY. If we still aren't successful we try to
		 * set the trace mask to 0 in order to prevent trace records
		 * from being written.
		 */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
		i = 0;
		do {
			err = xdt_sysctl_tbuf(&tbuf_op);
		} while ((err != 0) && (++i < XDT_TBUF_RETRY));

		if (err != 0) {
			cmn_err(CE_NOTE,
			    "Couldn't disable hypervisor tracing.");
			xdt_set_trace_mask(0);
		} else {
			cyclic_remove(xdt_cyclic);
			xdt_cyclic = CYCLIC_NONE;
			/*
			 * We don't bother making the hypercall to set
			 * the trace mask, since it will be reset when
			 * tracing is re-enabled.
			 */
		}
	} else if (xdt_classinfo[p->class].cnt == 0) {
		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
		/* other probes are enabled, so add the sub-class mask back */
		cur_trace_mask |= 0xF000;
		xdt_set_trace_mask(cur_trace_mask);
	}
}

static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};

static dtrace_pops_t xdt_pops = {
	xdt_provide,		/* dtps_provide() */
	NULL,			/* dtps_provide_module() */
	xdt_enable,		/* dtps_enable() */
	xdt_disable,		/* dtps_disable() */
	NULL,			/* dtps_suspend() */
	NULL,			/* dtps_resume() */
	NULL,			/* dtps_getargdesc() */
	NULL,			/* dtps_getargval() */
	NULL,			/* dtps_usermode() */
	xdt_destroy		/* dtps_destroy() */
};

static int
xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int val;

	if (!DOMAIN_IS_INITDOMAIN(xen_info))
		return (DDI_FAILURE);

	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	xdt_ncpus = xpv_nr_phys_cpus();
	ASSERT(xdt_ncpus > 0);

	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
	    &xdt_pops, NULL, &xdt_id) != 0) {
		if (tbuf.va != NULL)
			xdt_detach_trace_buffers();
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);

	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_zalloc(xdt_ncpus *
	    sizeof (xdt_schedinfo_t), KM_SLEEP);
	xdt_init_trace_masks();
	xdt_kstat_init();

	xdt_devi = devi;
	ddi_report_dev(devi);
	return (DDI_SUCCESS);
}

static int
xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	if (dtrace_unregister(xdt_id) != 0)
		return (DDI_FAILURE);

	xdt_detach_trace_buffers();
	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
	if (xdt_cyclic != CYCLIC_NONE)
		cyclic_remove(xdt_cyclic);
	if (xdt_kstats != NULL)
		kstat_delete(xdt_kstats);
	xdt_devi = (void *)0;
	ddi_remove_minor_node(devi, NULL);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = xdt_devi;
		error = DDI_SUCCESS;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;

	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops xdt_cb_ops = {
	nulldev,		/* open(9E) */
	nodev,			/* close(9E) */
	nodev,			/* strategy(9E) */
	nodev,			/* print(9E) */
	nodev,			/* dump(9E) */
	nodev,			/* read(9E) */
	nodev,			/* write(9E) */
	nodev,			/* ioctl(9E) */
	nodev,			/* devmap(9E) */
	nodev,			/* mmap(9E) */
	nodev,			/* segmap(9E) */
	nochpoll,		/* chpoll(9E) */
	ddi_prop_op,		/* prop_op(9E) */
	NULL,			/* streamtab(9S) */
	D_MP | D_64BIT | D_NEW	/* cb_flag */
};

static struct dev_ops xdt_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdt_info,		/* getinfo(9E) */
	nulldev,		/* identify(9E) */
	nulldev,		/* probe(9E) */
	xdt_attach,		/* attach(9E) */
	xdt_detach,		/* detach(9E) */
	nulldev,		/* devo_reset */
	&xdt_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* power(9E) */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"Hypervisor event tracing",
	&xdt_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}