xref: /dragonfly/lib/libevtr/evtr.c (revision a68e0df0)
1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/tree.h>


#include "evtr.h"
46 
static unsigned evtr_debug;

/*
 * Each debug subsystem is named by a lowercase letter and gets its own
 * bit, so flags can be combined with '|' and tested with '&'.
 * (The previous encoding stored the raw letter offset instead of a
 * bit, so unrelated flags aliased: enabling 'i' and 't' spuriously
 * enabled 'm' because (8 | 19) & 12 != 0.)
 */
#define DEFINE_DEBUG_FLAG(nam, chr)		\
	nam = 1 << ((chr) - 'a')

enum debug_flags {
	DEFINE_DEBUG_FLAG(IO, 'i'),	/* file reads/writes */
	DEFINE_DEBUG_FLAG(DS, 't'),	/* data structures */
	DEFINE_DEBUG_FLAG(MISC, 'm'),	/* everything else */
};

#define printd(subsys, ...)				\
	do {						\
		if (evtr_debug & (subsys)) {	\
			fprintf(stderr, __VA_ARGS__);	\
		}					\
	} while (0)

/*
 * Parse a string of debug-flag letters and OR the corresponding bits
 * into *flags.  'A' enables everything.  Exits with status 2 on an
 * invalid (non-lowercase) flag character.
 */
static
void
printd_set_flags(const char *str, unsigned int *flags)
{
	for (; *str; ++str) {
		if ('A' == *str) {
			*flags = -1;	/* all bits on */
			return;
		}
		/* cast: ctype functions take unsigned char values */
		if (!islower((unsigned char)*str))
			errx(2, "invalid debug flag %c", *str);
		*flags |= 1U << (*str - 'a');
	}
}
83 
84 
/* File-format and implementation constants. */
enum {
	MAX_EVHDR_SIZE = PATH_MAX + 200,	/* upper bound on any event header */
	/* string namespaces (0 is not a valid namespace) */
	EVTR_NS_PATH = 0x1,	/* file paths */
	EVTR_NS_FUNC,		/* function names */
	EVTR_NS_DSTR,		/* data strings (format arguments) */
	EVTR_NS_MAX,		/* one past the last valid namespace */
	NR_BUCKETS = 1023, /* XXX */
	REC_ALIGN = 8,		/* records are padded to this alignment */
	REC_BOUNDARY = 1 << 14,	/* start a fresh record at least this often */
	FILTF_ID = 0x10,	/* filter's fmtid field is valid */
	EVTRF_WR = 0x1,		/* open for writing */
};

/* narrow ids used in the on-disk record format */
typedef uint16_t fileid_t;
typedef uint16_t funcid_t;
typedef uint16_t fmtid_t;
102 
/*
 * On-disk record layouts.  These are written to and read from the
 * trace file verbatim, hence the packed attribute; do not reorder or
 * resize fields without changing the file-format version.
 */
struct trace_event_header {
	uint8_t type;
	uint64_t ts;	/* XXX: this should only be part of probe */
} __attribute__((packed));

/* A probe (tracepoint hit) record; fmt data follows in the stream. */
struct probe_event_header {
	struct trace_event_header eh;
	/*
	 * For these fields, 0 implies "not available"
	 */
	fileid_t file;
	funcid_t caller1;
	funcid_t caller2;
	funcid_t func;
	uint16_t line;
	fmtid_t fmt;
	uint16_t datalen;	/* bytes of fmt data following the header */
	uint8_t cpu;	/* -1 if n/a */
} __attribute__((packed));

/* Defines string <id> in namespace <ns>; <len> bytes follow. */
struct string_event_header {
	struct trace_event_header eh;
	uint16_t ns;
	uint32_t id;
	uint16_t len;
} __attribute__((packed));

/* Defines format <id>; subsys and fmt strings follow back to back. */
struct fmt_event_header {
	struct trace_event_header eh;
	uint16_t id;
	uint8_t subsys_len;
	uint8_t fmt_len;
} __attribute__((packed));

/* Per-cpu metadata (frequency) record. */
struct cpuinfo_event_header {
	double freq;
	uint8_t cpu;
} __attribute__((packed));
141 
/* Node of the string -> id hash table used on the write path. */
struct hashentry {
	const char *str;	/* strdup()ed key */
	uint16_t id;		/* assigned id, starts at 1 */
	struct hashentry *next;	/* chain within the bucket */
};

/* Chained hash table; id is a monotonically increasing counter. */
struct hashtab {
	struct hashentry *buckets[NR_BUCKETS];
	uint16_t id;
};
152 
/* A (subsystem, format string) pair identified by a fmtid. */
struct event_fmt {
	const char *subsys;
	const char *fmt;
};

/* Filter whose fmt string has not been resolved to a fmtid yet. */
struct event_filter_unresolved {
	TAILQ_ENTRY(event_filter_unresolved) link;
	evtr_filter_t filt;
};

/* Red-black tree node mapping an integer id to opaque data. */
struct id_map {
	RB_ENTRY(id_map) rb_node;
	int id;
	const void *data;
};

RB_HEAD(id_tree, id_map);
/* id -> string (read path) */
struct string_map {
	struct id_tree root;
};

/* id -> struct event_fmt (read path) */
struct fmt_map {
	struct id_tree root;
};

RB_HEAD(thread_tree, evtr_thread);

/* kernel thread pointer -> struct evtr_thread */
struct thread_map {
	struct thread_tree root;
};

struct event_callback {
	void (*cb)(evtr_event_t, void *data);
	void *data;	/* this field must be malloc()ed */
};

/* Per-cpu state reconstructed while reading a trace. */
struct cpu {
	struct evtr_thread *td;	/* currently executing thread */
	double freq;
};
193 
194 struct evtr {
195 	FILE *f;
196 	int err;
197 	int flags;
198 	char *errmsg;
199 	off_t bytes;
200 	union {
201 		/*
202 		 * When writing, we keep track of the strings we've
203 		 * already dumped so we only dump them once.
204 		 * Paths, function names etc belong to different
205 		 * namespaces.
206 		 */
207 		struct hashtab *strings[EVTR_NS_MAX - 1];
208 		/*
209 		 * When reading, we build a map from id to string.
210 		 * Every id must be defined at the point of use.
211 		 */
212 		struct string_map maps[EVTR_NS_MAX - 1];
213 	};
214 	union {
215 		/* same as above, but for subsys+fmt pairs */
216 		struct fmt_map fmtmap;
217 		struct hashtab *fmts;
218 	};
219 	/*
220 	 * Filters that have a format specified and we
221 	 * need to resolve that to an fmtid
222 	 */
223 	TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
224 	struct event_callback **cbs;
225 	int ncbs;
226 	struct thread_map threads;
227 	struct cpu *cpus;
228 	int ncpus;
229 };
230 
/* State of an in-progress filtered scan over a trace file. */
struct evtr_query {
	evtr_t evtr;		/* underlying handle */
	off_t off;		/* file offset to resume from */
	evtr_filter_t filt;	/* array of filters (OR semantics) */
	int nfilt;
	int nmatched;		/* events that passed the filters so far */
	int ntried;		/* events examined so far */
	void *buf;		/* scratch buffer for record payloads */
	int bufsize;
};
241 
/*
 * Public knob: enable debug output subsystems from a flag string
 * (one lowercase letter per subsystem, 'A' for all).
 */
void
evtr_set_debug(const char *str)
{
	printd_set_flags(str, &evtr_debug);
}
247 
/*
 * Red-black tree boilerplate.  The *2 variants additionally generate
 * a <name>_RB_LOOKUP(tree, key) that compares the given field (id)
 * against the key directly, so the comparators below must order nodes
 * consistently with plain </>/== on that field.
 */
static int id_map_cmp(struct id_map *, struct id_map *);
RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);

static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
255 
256 static inline
257 void
258 validate_string(const char *str)
259 {
260 	if (!(evtr_debug & MISC))
261 		return;
262 	for (; *str; ++str)
263 		assert(isprint(*str));
264 }
265 
/*
 * Tear down an id map, freeing every node.  The successor is fetched
 * before unlinking so iteration survives the removal.
 *
 * NOTE(review): v->data (the dup'ed string or event_fmt) is not freed
 * here; ownership of those is shared with callers in some paths —
 * verify before freeing them as well.
 */
static
void
id_tree_free(struct id_tree *root)
{
	struct id_map *v, *n;

	for (v = RB_MIN(id_tree, root); v; v = n) {
		n = RB_NEXT(id_tree, root, v);
		RB_REMOVE(id_tree, root, v);
		free(v);	/* was leaked: nodes are malloc()ed in *_map_insert */
	}
}
277 
278 static
279 int
280 evtr_register_callback(evtr_t evtr, void (*fn)(evtr_event_t, void *), void *d)
281 {
282 	struct event_callback *cb;
283 	void *cbs;
284 
285 	if (!(cb = malloc(sizeof(*cb)))) {
286 		evtr->err = ENOMEM;
287 		return !0;
288 	}
289 	cb->cb = fn;
290 	cb->data = d;
291 	if (!(cbs = realloc(evtr->cbs, (++evtr->ncbs) * sizeof(cb)))) {
292 		--evtr->ncbs;
293 		free(cb);
294 		evtr->err = ENOMEM;
295 		return !0;
296 	}
297 	evtr->cbs = cbs;
298 	evtr->cbs[evtr->ncbs - 1] = cb;
299 	return 0;
300 }
301 
302 static
303 void
304 evtr_deregister_callbacks(evtr_t evtr)
305 {
306 	int i;
307 
308 	for (i = 0; i < evtr->ncbs; ++i) {
309 		free(evtr->cbs[i]);
310 	}
311 	free(evtr->cbs);
312 	evtr->cbs = NULL;
313 }
314 
315 static
316 void
317 evtr_run_callbacks(evtr_event_t ev, evtr_t evtr)
318 {
319 	struct event_callback *cb;
320 	int i;
321 
322 	for (i = 0; i < evtr->ncbs; ++i) {
323 		cb = evtr->cbs[i];
324 		cb->cb(ev, cb->data);
325 	}
326 }
327 
328 static
329 struct cpu *
330 evtr_cpu(evtr_t evtr, int c)
331 {
332 	if ((c < 0) || (c >= evtr->ncpus))
333 		return NULL;
334 	return &evtr->cpus[c];
335 }
336 
337 static
338 int
339 parse_format_data(evtr_event_t ev, const char *fmt, ...) __attribute__((format (scanf, 2, 3)));
340 static
341 int
342 parse_format_data(evtr_event_t ev, const char *fmt, ...)
343 {
344 	va_list ap;
345 	char buf[2048];
346 
347 	if (strcmp(fmt, ev->fmt))
348 		return 0;
349 	vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
350 	printd(MISC, "string is: %s\n", buf);
351 	va_start(ap, fmt);
352 	return vsscanf(buf, fmt, ap);
353 }
354 
355 static
356 void
357 evtr_deregister_filters(evtr_t evtr, evtr_filter_t filt, int nfilt)
358 {
359 	struct event_filter_unresolved *u, *tmp;
360 	int i;
361 	TAILQ_FOREACH_MUTABLE(u, &evtr->unresolved_filtq, link, tmp) {
362 		for (i = 0; i < nfilt; ++i) {
363 			if (u->filt == &filt[i]) {
364 				TAILQ_REMOVE(&evtr->unresolved_filtq, u, link);
365 			}
366 		}
367 	}
368 }
369 
370 static
371 void
372 evtr_resolve_filters(evtr_t evtr, const char *fmt, int id)
373 {
374 	struct event_filter_unresolved *u, *tmp;
375 	TAILQ_FOREACH_MUTABLE(u, &evtr->unresolved_filtq, link, tmp) {
376 		if ((u->filt->fmt != NULL) && !strcmp(fmt, u->filt->fmt)) {
377 			u->filt->fmtid = id;
378 			u->filt->flags |= FILTF_ID;
379 			TAILQ_REMOVE(&evtr->unresolved_filtq, u, link);
380 		}
381 	}
382 }
383 
384 static
385 int
386 evtr_filter_register(evtr_t evtr, evtr_filter_t filt)
387 {
388 	struct event_filter_unresolved *res;
389 
390 	if (!(res = malloc(sizeof(*res)))) {
391 		evtr->err = ENOMEM;
392 		return !0;
393 	}
394 	res->filt = filt;
395 	TAILQ_INSERT_TAIL(&evtr->unresolved_filtq, res, link);
396 	return 0;
397 }
398 
/*
 * Render an event's message into buf: format the raw argument data
 * through the event's format string, or copy the format string
 * verbatim when there is no data.
 *
 * NOTE(review): ev->fmtdata is passed where a va_list is expected —
 * same non-portable trick as parse_format_data(); verify per platform.
 */
void
evtr_event_data(evtr_event_t ev, char *buf, size_t len)
{
	/*
	 * XXX: we implicitly trust the format string.
	 * We shouldn't.
	 */
	if (ev->fmtdatalen) {
		vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
	} else {
		strlcpy(buf, ev->fmt, len);
	}
}
412 
413 
414 int
415 evtr_error(evtr_t evtr)
416 {
417 	return evtr->err || (evtr->errmsg != NULL);
418 }
419 
420 const char *
421 evtr_errmsg(evtr_t evtr)
422 {
423 	return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
424 }
425 
426 static
427 int
428 id_map_cmp(struct id_map *a, struct id_map *b)
429 {
430 	return a->id - b->id;
431 }
432 
433 static
434 int
435 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
436 {
437 	return (int)a->id - (int)b->id;
438 }
439 
/*
 * Generate <prefix>_map_find(tree, id): look up an id in an id_tree
 * and return its data cast to <type>, or NULL when absent.
 */
#define DEFINE_MAP_FIND(prefix, type)		\
	static					\
	type				\
	prefix ## _map_find(struct id_tree *tree, int id)\
	{						 \
		struct id_map *sid;			 \
							\
		sid = id_tree_RB_LOOKUP(tree, id);	\
		return sid ? sid->data : NULL;		\
	}

DEFINE_MAP_FIND(string, const char *)
DEFINE_MAP_FIND(fmt, const struct event_fmt *)
453 
/*
 * Look up a thread by kernel thread pointer; NULL when unknown.
 */
static
struct evtr_thread *
thread_map_find(struct thread_map *map, void *id)
{
	return thread_tree_RB_LOOKUP(&map->root, id);
}
460 
/*
 * Generate <prefix>_map_insert(tree, data, id): insert an id -> data
 * mapping.  On success the data is _dup()ed, so the caller keeps
 * ownership of (and responsibility for) its own copy.  Redefining an
 * id to an equal value (per _cmp) is accepted and ignored; redefining
 * it to a different value returns EEXIST.  Returns ENOMEM on OOM.
 */
#define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)	\
	static					\
	int								\
	prefix ## _map_insert(struct id_tree *tree, type data, int id) \
	{								\
	struct id_map *sid, *osid;					\
									\
	sid = malloc(sizeof(*sid));					\
	if (!sid) {							\
		return ENOMEM;						\
	}								\
	sid->id = id;							\
	sid->data = data;						\
	if ((osid = id_tree_RB_INSERT(tree, sid))) {			\
		free(sid);						\
		if (_cmp((type)osid->data, data)) {			\
			return EEXIST;					\
		}							\
		printd(DS, "mapping already exists, skipping\n");		\
		/* we're OK with redefinitions of an id to the same string */ \
		return 0;						\
	}								\
	/* only do the strdup if we're inserting a new string */	\
	sid->data = _dup(data);		/* XXX: oom */			\
	return 0;							\
}
487 
/*
 * Insert a thread into the map, taking ownership of td (and td->comm).
 * If the id (kernel thread pointer) is already present, the existing
 * node is updated in place and td itself is freed.
 */
static
void
thread_map_insert(struct thread_map *map, struct evtr_thread *td)
{
	struct evtr_thread *otd;

	if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
		/*
		 * Thread addresses might be reused, we're
		 * ok with that.
		 * DANGER, Will Robinson: this means the user
		 * of the API needs to copy event->td if they
		 * want it to remain stable.
		 */
		free((void *)otd->comm);
		otd->comm = td->comm;	/* otd takes ownership of the name */
		free(td);
	}
}
507 
508 static
509 int
510 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
511 {
512 	int ret = 0;
513 
514 	if (a->subsys) {
515 		if (b->subsys) {
516 			ret = strcmp(a->subsys, b->subsys);
517 		} else {
518 			ret = strcmp(a->subsys, "");
519 		}
520 	} else if (b->subsys) {
521 			ret = strcmp("", b->subsys);
522 	}
523 	if (ret)
524 		return ret;
525 	return strcmp(a->fmt, b->fmt);
526 }
527 
528 static
529 struct event_fmt *
530 event_fmt_dup(const struct event_fmt *o)
531 {
532 	struct event_fmt *n;
533 
534 	if (!(n = malloc(sizeof(*n)))) {
535 		return n;
536 	}
537 	memcpy(n, o, sizeof(*n));
538 	return n;
539 }
540 
/* Instantiate the id-map inserters for strings and for fmt pairs. */
DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
543 
544 static
545 int
546 hashfunc(const char *str)
547 {
548         unsigned long hash = 5381;
549         int c;
550 
551         while ((c = *str++))
552             hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
553 	return hash  % NR_BUCKETS;
554 }
555 
556 static
557 struct hashentry *
558 hash_find(struct hashtab *tab, const char *str)
559 {
560 	struct hashentry *ent;
561 
562 	for(ent = tab->buckets[hashfunc(str)]; ent && strcmp(ent->str, str);
563 	    ent = ent->next);
564 
565 	return ent;
566 }
567 
568 static
569 struct hashentry *
570 hash_insert(struct hashtab *tab, const char *str)
571 {
572 	struct hashentry *ent;
573 	int hsh;
574 
575 	if (!(ent = malloc(sizeof(*ent)))) {
576 		fprintf(stderr, "out of memory\n");
577 		return NULL;
578 	}
579 	hsh = hashfunc(str);
580 	ent->next = tab->buckets[hsh];
581 	ent->str = strdup(str);
582 	ent->id = ++tab->id;
583 	if (tab->id == 0) {
584 		fprintf(stderr, "too many strings\n");
585 		free(ent);
586 		return NULL;
587 	}
588 	tab->buckets[hsh] = ent;
589 	return ent;
590 }
591 
592 static
593 void
594 thread_creation_callback(evtr_event_t ev, void *d)
595 {
596 	evtr_t evtr = (evtr_t)d;
597 	struct evtr_thread *td;
598 	void *ktd;
599 	char buf[20];
600 
601 	if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
602 		return;
603 	}
604 	buf[19] = '\0';
605 
606 	if (!(td = malloc(sizeof(*td)))) {
607 		evtr->err = ENOMEM;
608 		return;
609 	}
610 	td->id = ktd;
611 	td->userdata = NULL;
612 	if (!(td->comm = strdup(buf))) {
613 		free(td);
614 		evtr->err = ENOMEM;
615 		return;
616 	}
617 	printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
618 	thread_map_insert(&evtr->threads, td);
619 }
620 
/*
 * Event callback: on a "sw <from> > <to>" probe, update the cpu's
 * current-thread pointer.  If the destination thread has never been
 * seen, a thread-creation event is synthesized (named after its
 * pointer) so the thread map always has an entry for it.
 */
static
void
thread_switch_callback(evtr_event_t ev, void *d)
{
	evtr_t evtr = (evtr_t)d;
	struct evtr_thread *tdp, *tdn;
	void *ktdp, *ktdn;
	struct cpu *cpu;
	/* static: any fields not assigned below stay zeroed across calls */
	static struct evtr_event tdcr;
	static char *fmt = "new_td %p %s";
	char tidstr[40];
	/* raw argument record for the synthesized event: a %p then a %s */
	char fmtdata[sizeof(void *) + sizeof(char *)];

	cpu = evtr_cpu(evtr, ev->cpu);
	if (!cpu) {
		warnx("invalid cpu %d\n", ev->cpu);
		return;
	}
	/* the double space matches the kernel's format string exactly */
	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
		return;
	}
	tdp = thread_map_find(&evtr->threads, ktdp);
	if (!tdp) {
		printd(DS, "switching from unknown thread %p\n", ktdp);
	}
	tdn = thread_map_find(&evtr->threads, ktdn);
	if (!tdn) {
		/*
		 * Fake a thread creation event for threads we
		 * haven't seen before.
		 */
		tdcr.type = EVTR_TYPE_PROBE;
		tdcr.ts = ev->ts;
		tdcr.file = NULL;
		tdcr.func = NULL;
		tdcr.line = 0;
		tdcr.fmt = fmt;
		tdcr.fmtdata = &fmtdata;
		tdcr.fmtdatalen = sizeof(fmtdata);
		tdcr.cpu = ev->cpu;
		tdcr.td = NULL;
		/* use the thread pointer's text as its name */
		snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
		((void **)fmtdata)[0] = ktdn;
		((char **)fmtdata)[1] = &tidstr[0];
		thread_creation_callback(&tdcr, evtr);

		tdn = thread_map_find(&evtr->threads, ktdn);
		assert(tdn != NULL);
		printd(DS, "switching to unknown thread %p\n", ktdn);
		cpu->td = tdn;
		return;
	}
	printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
	cpu->td = tdn;
}
676 
677 static
678 void
679 assert_foff_in_sync(evtr_t evtr)
680 {
681 	off_t off;
682 
683 	/*
684 	 * We keep our own offset because we
685 	 * might want to support mmap()
686 	 */
687 	off = ftello(evtr->f);
688 	if (evtr->bytes != off) {
689 		fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
690 		abort();
691 	}
692 }
693 
694 static
695 int
696 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
697 {
698 	assert_foff_in_sync(evtr);
699 	if (fwrite(buf, bytes, 1, evtr->f) != 1) {
700 		evtr->err = errno;
701 		evtr->errmsg = strerror(errno);
702 		return !0;
703 	}
704 	evtr->bytes += bytes;
705 	assert_foff_in_sync(evtr);
706 	return 0;
707 }
708 
/*
 * Called after dumping a record to make sure the next
 * record is REC_ALIGN aligned. This does not make much sense,
 * as we shouldn't be using packed structs anyway.
 *
 * NOTE(review): when the offset is already aligned, pad computes to a
 * full REC_ALIGN and this writes REC_ALIGN zero bytes rather than
 * none; the reader presumably performs the same computation — verify
 * before changing.
 */
static
int
evtr_dump_pad(evtr_t evtr)
{
	size_t pad;
	static char buf[REC_ALIGN];	/* zero-initialized padding source */

	pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
	if (pad > 0) {
		return evtr_write(evtr, buf, pad);
	}
	return 0;
}
727 
728 /*
729  * We make sure that there is a new record every REC_BOUNDARY
730  * bytes, this costs next to nothing in space and allows for
731  * fast seeking.
732  */
733 static
734 int
735 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
736 {
737 	unsigned pad, i;
738 	static char buf[256];
739 
740 	pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
741 	/* if adding @bytes would cause us to cross a boundary... */
742 	if (bytes > pad) {
743 		/* then pad to the boundary */
744 		for (i = 0; i < (pad / sizeof(buf)); ++i) {
745 			if (evtr_write(evtr, buf, sizeof(buf))) {
746 				return !0;
747 			}
748 		}
749 		i = pad % sizeof(buf);
750 		if (i) {
751 			if (evtr_write(evtr, buf, i)) {
752 				return !0;
753 			}
754 		}
755 	}
756 	return 0;
757 }
758 
/*
 * Ensure the event's (subsys, fmt) pair has been dumped to the file
 * and return its id.  Ids start at 1; 0 is returned on error (with
 * err/errmsg set) or when the pair was dumped by an earlier failing
 * call.  The subsystem is currently always the empty string.
 */
static
int
evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
{
	struct fmt_event_header fmt;
	struct hashentry *ent;
	char *subsys = "", buf[1024];

	/* the hash key is subsys and fmt concatenated */
	if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
		evtr->errmsg = "name of subsystem is too large";
		evtr->err = ERANGE;
		return 0;
	}
	if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
		evtr->errmsg = "fmt + name of subsystem is too large";
		evtr->err = ERANGE;
		return 0;
	}

	/* already dumped: just reuse the id */
	if ((ent = hash_find(evtr->fmts, buf))) {
		return ent->id;
	}
	if (!(ent = hash_insert(evtr->fmts, buf))) {
		/* id != 0 means insert failed on OOM, 0 means id wrap */
		evtr->err = evtr->fmts->id ? ENOMEM : ERANGE;
		return 0;
	}

	fmt.eh.type = EVTR_TYPE_FMT;
	fmt.eh.ts = ts;
	fmt.subsys_len = strlen(subsys);
	fmt.fmt_len = strlen(ev->fmt);
	fmt.id = ent->id;
	if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
				     fmt.fmt_len))
		return 0;
	if (evtr_write(evtr, &fmt, sizeof(fmt)))
		return 0;
	if (evtr_write(evtr, subsys, fmt.subsys_len))
		return 0;
	if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
		return 0;
	if (evtr_dump_pad(evtr))
		return 0;
	return fmt.id;
}
804 
/*
 * Replace string pointers or string ids in fmtdata.
 *
 * Walks the printf-style format string, computing how many bytes each
 * conversion consumes in the raw argument record, and calls replace()
 * on every %s slot so a string pointer can be swapped for an id (or
 * vice versa).  Returns the number of strings replaced, or -1 on an
 * unknown conversion specifier.
 *
 * NOTE(review): the byte layout assumed here (e.g. %c consuming the
 * promoted int size, plain %f consuming a float unless preceded by
 * 'l') must match how the kernel packs probe arguments — verify
 * against the kernel-side recorder.
 */
static
int
mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
		   const char *(*replace)(void *, const char *), void *ctx)
{
	const char *f, *p;
	size_t skipsize, intsz;
	int ret = 0;

	for (f = fmt; f[0] != '\0'; ++f) {
		if (f[0] != '%')
			continue;
		++f;
		skipsize = 0;
		for (p = f; p[0]; ++p) {
			int again = 0;
			/*
			 * Eat flags. Notice this will accept duplicate
			 * flags.
			 */
			switch (p[0]) {
			case '#':
			case '0':
			case '-':
			case ' ':
			case '+':
			case '\'':
				again = !0;
				break;
			}
			if (!again)
				break;
		}
		/* Eat minimum field width, if any */
		for (; isdigit(p[0]); ++p)
			;
		if (p[0] == '.')
			++p;
		/* Eat precision, if any */
		for (; isdigit(p[0]); ++p)
			;
		/* length modifier determines the integer argument size */
		intsz = 0;
		switch (p[0]) {
		case 'l':
			if (p[1] == 'l') {
				++p;
				intsz = sizeof(long long);
			} else {
				intsz = sizeof(long);
			}
			break;
		case 'j':
			intsz = sizeof(intmax_t);
			break;
		case 't':
			intsz = sizeof(ptrdiff_t);
			break;
		case 'z':
			intsz = sizeof(size_t);
			break;
		default:
			break;
		}
		if (intsz != 0)
			++p;
		else
			intsz = sizeof(int);

		/* conversion specifier: how many bytes does it consume? */
		switch (p[0]) {
		case 'd':
		case 'i':
		case 'o':
		case 'u':
		case 'x':
		case 'X':
		case 'c':
			skipsize = intsz;
			break;
		case 'p':
			skipsize = sizeof(void *);
			break;
		case 'f':
			if (p[-1] == 'l')
				skipsize = sizeof(double);
			else
				skipsize = sizeof(float);
			break;
		case 's':
			/* swap the in-place string pointer via replace() */
			((const char **)fmtdata)[0] =
				replace(ctx, ((char **)fmtdata)[0]);
			skipsize = sizeof(char *);
			++ret;
			break;
		default:
			fprintf(stderr, "Unknown conversion specifier %c "
				"in fmt starting with %s", p[0], f - 1);
			return -1;
		}
		fmtdata += skipsize;
	}
	return ret;
}
910 
/* XXX: do we really want the timestamp? */
/*
 * Ensure str has been dumped to the file in namespace ns and return
 * its id (ids start at 1; 0 means error, with err set).  Strings are
 * dumped at most once per namespace; the length stored in the record
 * is capped at PATH_MAX.
 */
static
int
evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
{
	struct string_event_header s;
	struct hashentry *ent;

	assert((0 <= ns) && (ns < EVTR_NS_MAX));
	if ((ent = hash_find(evtr->strings[ns], str))) {
		return ent->id;	/* already dumped */
	}
	if (!(ent = hash_insert(evtr->strings[ns], str))) {
		/* id != 0 -> OOM; id == 0 -> 16-bit id space exhausted */
		evtr->err = evtr->strings[ns]->id ? ENOMEM : ERANGE;
		return 0;
	}

	printd(DS, "hash_insert %s ns %d id %d\n", str, ns, ent->id);
	s.eh.type = EVTR_TYPE_STR;
	s.eh.ts = ts;
	s.ns = ns;
	s.id = ent->id;
	s.len = strnlen(str, PATH_MAX);

	if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
		return 0;
	if (evtr_write(evtr, &s, sizeof(s)))
		return 0;
	if (evtr_write(evtr, str, s.len))
		return 0;
	if (evtr_dump_pad(evtr))
		return 0;
	return s.id;
}
945 
/* Context passed through mangle_string_ptrs() to the replace hooks. */
struct replace_ctx {
	evtr_t evtr;
	uint64_t ts;	/* timestamp to stamp on emitted string records */
};
950 
951 static
952 const char *
953 replace_strptr(void *_ctx, const char *s)
954 {
955 	struct replace_ctx *ctx = _ctx;
956 	return (const char *)evtr_dump_string(ctx->evtr, ctx->ts, s, EVTR_NS_DSTR);
957 }
958 
959 static
960 const char *
961 replace_strid(void *_ctx, const char *s)
962 {
963 	struct replace_ctx *ctx = _ctx;
964 	const char *ret;
965 
966 	ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
967 			      (uint32_t)s);
968 	if (!ret) {
969 		fprintf(stderr, "Unknown id for data string\n");
970 		ctx->evtr->errmsg = "unknown id for data string";
971 		ctx->evtr->err = !0;
972 	}
973 	validate_string(ret);
974 	printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n", (int)s,
975 	       EVTR_NS_DSTR, ret ? ret : "NULL", (int)ret);
976 	return ret;
977 }
978 
979 static
980 int
981 evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
982 {
983 	struct probe_event_header kev;
984 	char buf[1024];
985 
986 	memset(&kev, '\0', sizeof(kev));
987 	kev.eh.type = ev->type;
988 	kev.eh.ts = ev->ts;
989 	kev.line = ev->line;
990 	kev.cpu = ev->cpu;
991 	if (ev->file) {
992 		kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
993 					    EVTR_NS_PATH);
994 	}
995 	if (ev->func) {
996 		kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
997 					    EVTR_NS_FUNC);
998 	}
999 	if (ev->fmt) {
1000 		kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
1001 	}
1002 	if (ev->fmtdata) {
1003 		struct replace_ctx replctx = {
1004 			.evtr = evtr,
1005 			.ts = ev->ts,
1006 		};
1007 		assert(ev->fmtdatalen <= sizeof(buf));
1008 		kev.datalen = ev->fmtdatalen;
1009 		/*
1010 		 * Replace all string pointers with string ids before dumping
1011 		 * the data.
1012 		 */
1013 		memcpy(buf, ev->fmtdata, ev->fmtdatalen);
1014 		if (mangle_string_ptrs(ev->fmt, buf,
1015 				       replace_strptr, &replctx) < 0)
1016 			return !0;
1017 		if (evtr->err)
1018 			return evtr->err;
1019 	}
1020 	if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
1021 		return !0;
1022 	if (evtr_write(evtr, &kev, sizeof(kev)))
1023 		return !0;
1024 	if (evtr_write(evtr, buf, ev->fmtdatalen))
1025 		return !0;
1026 	if (evtr_dump_pad(evtr))
1027 		return !0;
1028 	return 0;
1029 }
1030 
1031 static
1032 int
1033 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1034 {
1035 	uint8_t type = EVTR_TYPE_SYSINFO;
1036 	uint16_t ncpus = ev->ncpus;
1037 
1038 	if (ncpus <= 0) {
1039 		evtr->errmsg = "invalid number of cpus";
1040 		return !0;
1041 	}
1042 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1043 		return !0;
1044 	if (evtr_write(evtr, &type, sizeof(type))) {
1045 		return !0;
1046 	}
1047 	if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1048 		return !0;
1049 	}
1050 	if (evtr_dump_pad(evtr))
1051 		return !0;
1052 	return 0;
1053 }
1054 static
1055 int
1056 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1057 {
1058 	struct cpuinfo_event_header ci;
1059 	uint8_t type;
1060 
1061 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1062 		return !0;
1063 	type = EVTR_TYPE_CPUINFO;
1064 	if (evtr_write(evtr, &type, sizeof(type))) {
1065 		return !0;
1066 	}
1067 	ci.cpu = ev->cpu;
1068 	ci.freq = ev->cpuinfo.freq;
1069 	if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1070 		return !0;
1071 	if (evtr_write(evtr, &ci, sizeof(ci))) {
1072 		return !0;
1073 	}
1074 	if (evtr_dump_pad(evtr))
1075 		return !0;
1076 	return 0;
1077 }
1078 
1079 int
1080 evtr_rewind(evtr_t evtr)
1081 {
1082 	assert((evtr->flags & EVTRF_WR) == 0);
1083 	evtr->bytes = 0;
1084 	if (fseek(evtr->f, 0, SEEK_SET)) {
1085 		evtr->err = errno;
1086 		return !0;
1087 	}
1088 	return 0;
1089 }
1090 
1091 int
1092 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1093 {
1094 	switch (ev->type) {
1095 	case EVTR_TYPE_PROBE:
1096 		return evtr_dump_probe(evtr, ev);
1097 	case EVTR_TYPE_SYSINFO:
1098 		return evtr_dump_sysinfo(evtr, ev);
1099 	case EVTR_TYPE_CPUINFO:
1100 		return evtr_dump_cpuinfo(evtr, ev);
1101 	}
1102 	evtr->errmsg = "unknown event type";
1103 	return !0;
1104 }
1105 
1106 static
1107 evtr_t
1108 evtr_alloc(FILE *f)
1109 {
1110 	evtr_t evtr;
1111 	if (!(evtr = malloc(sizeof(*evtr)))) {
1112 		return NULL;
1113 	}
1114 
1115 	evtr->f = f;
1116 	evtr->err = 0;
1117 	evtr->errmsg = NULL;
1118 	evtr->bytes = 0;
1119 	TAILQ_INIT(&evtr->unresolved_filtq);
1120 	return evtr;
1121 }
1122 
/*
 * Open a trace file for reading.  Initializes the id maps and thread
 * tracking, registers the built-in callbacks, and pre-reads the first
 * event so sysinfo/cpuinfo state is available, then rewinds.
 * Returns NULL on failure (the handle is torn down via the goto
 * cleanup chain).
 */
evtr_t
evtr_open_read(FILE *f)
{
	evtr_t evtr;
	struct evtr_event ev;
	int i;

	if (!(evtr = evtr_alloc(f))) {
		return NULL;
	}
	evtr->flags = 0;
	for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
		RB_INIT(&evtr->maps[i].root);
	}
	RB_INIT(&evtr->fmtmap.root);
	TAILQ_INIT(&evtr->unresolved_filtq);
	evtr->cbs = 0;
	evtr->ncbs = 0;
	RB_INIT(&evtr->threads.root);
	evtr->cpus = NULL;
	evtr->ncpus = 0;
	if (evtr_register_callback(evtr, &thread_creation_callback, evtr)) {
		goto free_evtr;
	}
	if (evtr_register_callback(evtr, &thread_switch_callback, evtr)) {
		goto free_cbs;
	}
	/*
	 * Load the first event so we can pick up any
	 * sysinfo entries.
	 */
	if (evtr_next_event(evtr, &ev)) {
		goto free_cbs;
	}
	if (evtr_rewind(evtr))
		goto free_cbs;
	return evtr;
free_cbs:
	/* falls through to free_evtr after releasing the callbacks */
	evtr_deregister_callbacks(evtr);
free_evtr:
	free(evtr);
	return NULL;
}
1166 
/*
 * Open a trace file for writing.  Allocates the fmt hash table and a
 * string hash table per namespace slot; on partial failure everything
 * allocated so far is released.  Returns NULL on failure.
 *
 * NOTE(review): the loop below populates strings[0..EVTR_NS_MAX-1],
 * i.e. EVTR_NS_MAX slots — verify that struct evtr's strings[] array
 * actually has that many elements.
 */
evtr_t
evtr_open_write(FILE *f)
{
	evtr_t evtr;
	int i, j;

	if (!(evtr = evtr_alloc(f))) {
		return NULL;
	}

	evtr->flags = EVTRF_WR;
	if (!(evtr->fmts = calloc(sizeof(struct hashtab), 1)))
		goto free_evtr;

	for (i = 0; i < EVTR_NS_MAX; ++i) {
		evtr->strings[i] = calloc(sizeof(struct hashtab), 1);
		if (!evtr->strings[i]) {
			/* unwind the tables allocated so far */
			for (j = 0; j < i; ++j) {
				free(evtr->strings[j]);
			}
			goto free_fmts;
		}
	}

	return evtr;
free_fmts:
	free(evtr->fmts);
free_evtr:
	free(evtr);
	return NULL;
}
1198 
1199 static
1200 void
1201 hashtab_destroy(struct hashtab *h)
1202 {
1203 	struct hashentry *ent, *next;
1204 	int i;
1205 	for (i = 0; i < NR_BUCKETS; ++i) {
1206 		for (ent = h->buckets[i]; ent; ent = next) {
1207 			next = ent->next;
1208 			free(ent);
1209 		}
1210 	}
1211 	free(h);
1212 }
1213 
/*
 * Release a handle and all of its per-mode state.  Does not close the
 * underlying FILE; that remains the caller's responsibility.
 *
 * NOTE(review): the write path destroys strings[0..EVTR_NS_MAX-1] —
 * verify struct evtr's strings[] array actually has EVTR_NS_MAX
 * elements.
 */
void
evtr_close(evtr_t evtr)
{
	int i;

	if (evtr->flags & EVTRF_WR) {
		hashtab_destroy(evtr->fmts);
		for (i = 0; i < EVTR_NS_MAX; ++i)
			hashtab_destroy(evtr->strings[i]);
	} else {
		id_tree_free(&evtr->fmtmap.root);
		for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
			id_tree_free(&evtr->maps[i].root);
		}
	}
	free(evtr);
}
1231 
1232 static
1233 int
1234 evtr_read(evtr_t evtr, void *buf, size_t size)
1235 {
1236 	assert(size > 0);
1237 	assert_foff_in_sync(evtr);
1238 	printd(IO, "evtr_read at %#jx, %zd bytes\n", evtr->bytes, size);
1239 	if (fread(buf, size, 1, evtr->f) != 1) {
1240 		if (feof(evtr->f)) {
1241 			evtr->errmsg = "incomplete record";
1242 		} else {
1243 			evtr->errmsg = strerror(errno);
1244 		}
1245 		return !0;
1246 	}
1247 	evtr->bytes += size;
1248 	assert_foff_in_sync(evtr);
1249 	return 0;
1250 }
1251 
1252 static
1253 int
1254 evtr_load_fmt(evtr_t evtr, char *buf)
1255 {
1256 	struct fmt_event_header *evh = (struct fmt_event_header *)buf;
1257 	struct event_fmt *fmt;
1258 	char *subsys = NULL, *fmtstr;
1259 
1260 	if (!(fmt = malloc(sizeof(*fmt)))) {
1261 		evtr->err = errno;
1262 		return !0;
1263 	}
1264 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1265 		      sizeof(*evh) - sizeof(evh->eh))) {
1266 		goto free_fmt;
1267 	}
1268 	assert(!evh->subsys_len);
1269 	if (evh->subsys_len) {
1270 		if (!(subsys = malloc(evh->subsys_len))) {
1271 			evtr->err = errno;
1272 			goto free_fmt;
1273 		}
1274 		if (evtr_read(evtr, subsys, evh->subsys_len)) {
1275 			goto free_subsys;
1276 		}
1277 		fmt->subsys = subsys;
1278 	} else {
1279 		fmt->subsys = "";
1280 	}
1281 	if (!(fmtstr = malloc(evh->fmt_len + 1))) {
1282 		evtr->err = errno;
1283 		goto free_subsys;
1284 	}
1285 	if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
1286 		goto free_fmtstr;
1287 	}
1288 	fmtstr[evh->fmt_len] = '\0';
1289 	fmt->fmt = fmtstr;
1290 
1291 	printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
1292 	evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
1293 	switch (evtr->err) {
1294 	case ENOMEM:
1295 		evtr->errmsg = "out of memory";
1296 		break;
1297 	case EEXIST:
1298 		evtr->errmsg = "redefinition of an id to a "
1299 			"different format (corrupt input)";
1300 		break;
1301 	default:
1302 		evtr_resolve_filters(evtr, fmt->fmt, evh->id);
1303 	}
1304 	return 0;
1305 
1306 free_fmtstr:
1307 	free(fmtstr);
1308 free_subsys:
1309 	if (subsys)
1310 		free(subsys);
1311 free_fmt:
1312 	free(fmt);
1313 	return !0;
1314 }
1315 
1316 static
1317 int
1318 evtr_load_string(evtr_t evtr, char *buf)
1319 {
1320 	char sbuf[PATH_MAX + 1];
1321 	struct string_event_header *evh = (struct string_event_header *)buf;
1322 
1323 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1324 		      sizeof(*evh) - sizeof(evh->eh))) {
1325 		return !0;
1326 	}
1327 	if (evh->len > PATH_MAX) {
1328 		evtr->errmsg = "string too large (corrupt input)";
1329 		return !0;
1330 	}
1331 	if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1332 		return !0;
1333 	}
1334 	sbuf[evh->len] = 0;
1335 	if (evh->ns >= EVTR_NS_MAX) {
1336 		evtr->errmsg = "invalid namespace (corrupt input)";
1337 		return !0;
1338 	}
1339 	validate_string(sbuf);
1340 	printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1341 	       sbuf);
1342 	evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1343 	switch (evtr->err) {
1344 	case ENOMEM:
1345 		evtr->errmsg = "out of memory";
1346 		break;
1347 	case EEXIST:
1348 		evtr->errmsg = "redefinition of an id to a "
1349 			"different string (corrupt input)";
1350 		break;
1351 	default:
1352 		;
1353 	}
1354 	return 0;
1355 }
1356 
1357 static
1358 int
1359 evtr_filter_match(evtr_filter_t f, struct probe_event_header *pev)
1360 {
1361 	if ((f->cpu != -1) && (f->cpu != pev->cpu))
1362 		return 0;
1363 	if (!f->fmtid)
1364 		return !0;
1365 	/*
1366 	 * If we don't have an id for the required format
1367 	 * string, the format string won't match anyway
1368 	 * (we require that id <-> fmt mappings appear
1369 	 * before the first appearance of the fmt string),
1370 	 * so don't bother comparing.
1371 	 */
1372 	if (!(f->flags & FILTF_ID))
1373 		return 0;
1374 	if(pev->fmt == f->fmtid)
1375 		return !0;
1376 	return 0;
1377 }
1378 
1379 static
1380 int
1381 evtr_match_filters(struct evtr_query *q, struct probe_event_header *pev)
1382 {
1383 	int i;
1384 
1385 	/* no filters means we're interested in all events */
1386 	if (!q->nfilt)
1387 		return !0;
1388 	++q->ntried;
1389 	for (i = 0; i < q->nfilt; ++i) {
1390 		if (evtr_filter_match(&q->filt[i], pev)) {
1391 			++q->nmatched;
1392 			return !0;
1393 		}
1394 	}
1395 	return 0;
1396 }
1397 
1398 static
1399 int
1400 evtr_skip(evtr_t evtr, off_t bytes)
1401 {
1402 	if (fseek(evtr->f, bytes, SEEK_CUR)) {
1403 		evtr->err = errno;
1404 		evtr->errmsg = strerror(errno);
1405 		return !0;
1406 	}
1407 	evtr->bytes += bytes;
1408 	return 0;
1409 }
1410 
1411 /*
1412  * Make sure q->buf is at least len bytes
1413  */
1414 static
1415 int
1416 evtr_query_reserve_buf(struct evtr_query *q, int len)
1417 {
1418 	void *tmp;
1419 
1420 	if (q->bufsize >= len)
1421 		return 0;
1422 	if (!(tmp = realloc(q->buf, len)))
1423 		return !0;
1424 	q->buf = tmp;
1425 	q->bufsize = len;
1426 	return 0;
1427 }
1428 
/*
 * Parse a probe record into *ev, resolving file-path and format-string
 * ids to strings via the maps built from earlier records.  `buf`
 * already holds the common trace_event_header; the rest of the fixed
 * header plus any format data is read from the stream.
 *
 * Returns 0 on success, -1 if the event does not pass the query's
 * filters, and non-zero (evtr->err or !0) on error.  Note that
 * unresolvable ids set evtr->err but parsing still continues.
 */
static
int
evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
{
	struct probe_event_header *evh = (struct probe_event_header *)buf;
	struct cpu *cpu;

	/* finish reading the fixed-size part of the probe header */
	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
		      sizeof(*evh) - sizeof(evh->eh)))
		return !0;
	memset(ev, '\0', sizeof(*ev));
	ev->ts = evh->eh.ts;
	ev->type = EVTR_TYPE_PROBE;
	ev->line = evh->line;
	ev->cpu = evh->cpu;
	/* attach the thread last seen running on that cpu, if known */
	if ((cpu = evtr_cpu(evtr, evh->cpu))) {
		ev->td = cpu->td;
	} else {
		ev->td = NULL;
	}
	/* resolve the file-path string id (0 means "not recorded") */
	if (evh->file) {
		ev->file = string_map_find(
			&evtr->maps[EVTR_NS_PATH - 1].root,
			evh->file);
		if (!ev->file) {
			/* flag the error but keep going with a placeholder */
			evtr->errmsg = "unknown id for file path";
			evtr->err = !0;
			ev->file = "<unknown>";
		} else {
			validate_string(ev->file);
		}
	} else {
		ev->file = "<unknown>";
	}
	/* resolve the format-string id (0 means "no format") */
	if (evh->fmt) {
		const struct event_fmt *fmt;
		if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
			evtr->errmsg = "unknown id for event fmt";
			evtr->err = !0;
			ev->fmt = NULL;
		} else {
			ev->fmt = fmt->fmt;
			validate_string(fmt->fmt);
		}
	}
	if (evh->datalen) {
		/* +1 leaves room for the NUL terminator appended below */
		if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
			evtr->err = ENOMEM;
		} else if (!evtr_read(evtr, q->buf, evh->datalen)) {
			struct replace_ctx replctx = {
				.evtr = evtr,
				.ts = ev->ts,
			};
			/*
			 * NOTE(review): if the fmt id was unknown above,
			 * ev->fmt is NULL here and this assert fires --
			 * presumably that only happens on corrupt input;
			 * confirm.
			 */
			assert(ev->fmt);

			ev->fmtdata = q->buf;
			/*
			 * If the format specifies any string pointers, there
			 * is a string id stored in the fmtdata. Look it up
			 * and replace it with a string pointer before
			 * returning it to the user.
			 */
			if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
								  ev->fmtdata),
					       replace_strid, &replctx) < 0)
				return evtr->err;
			if (evtr->err)
				return evtr->err;
			((char *)ev->fmtdata)[evh->datalen] = '\0';
			ev->fmtdatalen = evh->datalen;
		}
	}
	evtr_run_callbacks(ev, evtr);
	/* we can't filter before running the callbacks */
	if (!evtr_match_filters(q, evh)) {
		return -1;	/* no match */
	}

	return evtr->err;
}
1509 
1510 static
1511 int
1512 evtr_skip_to_record(evtr_t evtr)
1513 {
1514 	int skip;
1515 
1516 	skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
1517 	if (skip > 0) {
1518 		if (fseek(evtr->f, skip, SEEK_CUR)) {
1519 			evtr->err = errno;
1520 			evtr->errmsg = strerror(errno);
1521 			return !0;
1522 		}
1523 		evtr->bytes += skip;
1524 	}
1525 	return 0;
1526 }
1527 
1528 static
1529 int
1530 evtr_load_sysinfo(evtr_t evtr)
1531 {
1532 	uint16_t ncpus;
1533 	int i;
1534 
1535 	if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1536 		return !0;
1537 	}
1538 	if (evtr->cpus)
1539 		return 0;
1540 	evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1541 	if (!evtr->cpus) {
1542 		evtr->err = ENOMEM;
1543 		return !0;
1544 	}
1545 	evtr->ncpus = ncpus;
1546 	for (i = 0; i < ncpus; ++i) {
1547 		evtr->cpus[i].td = NULL;
1548 		evtr->cpus[i].freq = -1.0;
1549 	}
1550 	return 0;
1551 }
1552 
1553 static
1554 int
1555 evtr_load_cpuinfo(evtr_t evtr)
1556 {
1557 	struct cpuinfo_event_header cih;
1558 	struct cpu *cpu;
1559 
1560 	if (evtr_read(evtr, &cih, sizeof(cih))) {
1561 		return !0;
1562 	}
1563 	if (cih.freq < 0.0) {
1564 		evtr->errmsg = "cpu freq is negative";
1565 		evtr->err = EINVAL;
1566 		return !0;
1567 	}
1568 	/*
1569 	 * Notice that freq is merely a multiplier with
1570 	 * which we convert a timestamp to seconds; if
1571 	 * ts is not in cycles, freq is not the frequency.
1572 	 */
1573 	if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1574 		evtr->errmsg = "freq for invalid cpu";
1575 		evtr->err = EINVAL;
1576 		return !0;
1577 	}
1578 	cpu->freq = cih.freq;
1579 	return 0;
1580 }
1581 
1582 static
1583 int
1584 _evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
1585 {
1586 	char buf[MAX_EVHDR_SIZE];
1587 	int ret, err, ntried, nmatched;
1588 	struct trace_event_header *evhdr = (struct trace_event_header *)buf;
1589 
1590 	for (ret = 0; !ret;) {
1591 		if (evtr_read(evtr, &evhdr->type, 1)) {
1592 			if (feof(evtr->f)) {
1593 				evtr->errmsg = NULL;
1594 				evtr->err = 0;
1595 				return -1;
1596 			}
1597 			return !0;
1598 		}
1599 		/*
1600 		 * skip pad records -- this will only happen if there's a
1601 		 * variable sized record close to the boundary
1602 		 */
1603 		if (evhdr->type == EVTR_TYPE_PAD) {
1604 			evtr_skip_to_record(evtr);
1605 			continue;
1606 		}
1607 		if (evhdr->type == EVTR_TYPE_SYSINFO) {
1608 			evtr_load_sysinfo(evtr);
1609 			continue;
1610 		} else if (evhdr->type == EVTR_TYPE_CPUINFO) {
1611 			evtr_load_cpuinfo(evtr);
1612 			continue;
1613 		}
1614 		if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
1615 			return feof(evtr->f) ? -1 : !0;
1616 		switch (evhdr->type) {
1617 		case EVTR_TYPE_PROBE:
1618 			ntried = q->ntried;
1619 			nmatched = q->nmatched;
1620 			if ((err = evtr_load_probe(evtr, ev, buf, q))) {
1621 				if (err == -1) {
1622 					/* no match */
1623 					ret = 0;
1624 				} else {
1625 					return !0;
1626 				}
1627 			} else {
1628 				ret = !0;
1629 			}
1630 			break;
1631 		case EVTR_TYPE_STR:
1632 			if (evtr_load_string(evtr, buf)) {
1633 				return !0;
1634 			}
1635 			break;
1636 		case EVTR_TYPE_FMT:
1637 			if (evtr_load_fmt(evtr, buf)) {
1638 				return !0;
1639 			}
1640 			break;
1641 		default:
1642 			evtr->err = !0;
1643 			evtr->errmsg = "unknown event type (corrupt input?)";
1644 			return !0;
1645 		}
1646 		evtr_skip_to_record(evtr);
1647 		if (ret) {
1648 			q->off = evtr->bytes;
1649 			return 0;
1650 		}
1651 	}
1652 	/* can't get here */
1653 	return !0;
1654 }
1655 
1656 int
1657 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1658 {
1659 	struct evtr_query *q;
1660 	int ret;
1661 
1662 	if (!(q = evtr_query_init(evtr, NULL, 0))) {
1663 		evtr->err = ENOMEM;
1664 		return !0;
1665 	}
1666 	ret = _evtr_next_event(evtr, ev, q);
1667 	evtr_query_destroy(q);
1668 	return ret;
1669 }
1670 
/*
 * Read through the entire stream, leaving the last event in *ev,
 * then rewind.  Returns 0 on success, non-zero on error.
 */
int
evtr_last_event(evtr_t evtr, evtr_event_t ev)
{
	struct stat st;
	int fd;
	off_t last_boundary;

	fd = fileno(evtr->f);
	if (fstat(fd, &st))
		return !0;
	/*
	 * This skips pseudo records, so we can't provide
	 * an event with all fields filled in this way.
	 * It's doable, just needs some care. TBD.
	 */
	/* NOTE: fast path deliberately disabled via `0 &&` -- see above */
	if (0 && (st.st_mode & S_IFREG)) {
		/*
		 * Skip to last boundary, that's the closest to the EOF
		 * location that we are sure contains a header so we can
		 * pick up the stream.
		 */
		last_boundary = (st.st_size / REC_BOUNDARY) * REC_BOUNDARY;
		/* XXX: ->bytes should be in query */
		assert(evtr->bytes == 0);
		evtr_skip(evtr, last_boundary);
	}


	/*
	 * If we can't seek, we need to go through the whole file.
	 * Since you can't seek back, this is pretty useless unless
	 * you really are interested only in the last event.
	 */
	while (!evtr_next_event(evtr, ev))
		;
	if (evtr_error(evtr))
		return !0;
	evtr_rewind(evtr);
	return 0;
}
1711 
1712 struct evtr_query *
1713 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1714 {
1715 	struct evtr_query *q;
1716 	int i;
1717 
1718 	if (!(q = malloc(sizeof(*q)))) {
1719 		return q;
1720 	}
1721 	q->bufsize = 2;
1722 	if (!(q->buf = malloc(q->bufsize))) {
1723 		goto free_q;
1724 	}
1725 	q->evtr = evtr;
1726 	q->off = 0;
1727 	q->filt = filt;
1728 	q->nfilt = nfilt;
1729 	q->nmatched = 0;
1730 	for (i = 0; i < nfilt; ++i) {
1731 		filt[i].flags = 0;
1732 		if (filt[i].fmt == NULL)
1733 			continue;
1734 		if (evtr_filter_register(evtr, &filt[i])) {
1735 			evtr_deregister_filters(evtr, filt, i);
1736 			goto free_buf;
1737 		}
1738 	}
1739 
1740 	return q;
1741 free_buf:
1742 	free(q->buf);
1743 free_q:
1744 	free(q);
1745 	return NULL;
1746 }
1747 
1748 void
1749 evtr_query_destroy(struct evtr_query *q)
1750 {
1751 	evtr_deregister_filters(q->evtr, q->filt, q->nfilt);
1752 	free(q->buf);
1753 	free(q);
1754 }
1755 
1756 int
1757 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1758 {
1759 	/* we may support that in the future */
1760 	if (q->off != q->evtr->bytes)
1761 		return !0;
1762 	return _evtr_next_event(q->evtr, ev, q);
1763 }
1764 
1765 int
1766 evtr_ncpus(evtr_t evtr)
1767 {
1768 	return evtr->ncpus;
1769 }
1770 
1771 int
1772 evtr_cpufreqs(evtr_t evtr, double *freqs)
1773 {
1774 	int i;
1775 
1776 	if (!freqs)
1777 		return EINVAL;
1778 	for (i = 0; i < evtr->ncpus; ++i) {
1779 		freqs[i] = evtr->cpus[i].freq;
1780 	}
1781 	return 0;
1782 }
1783