1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <assert.h>
33 #include <ctype.h>
34 #include <err.h>
35 #include <errno.h>
36 #include <limits.h>
37 #include <stdarg.h>
38 #include <stddef.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <sys/param.h>
43 #include <sys/queue.h>
44 #include <sys/stat.h>
45 #include <sys/tree.h>
46 
47 
48 #include "evtr.h"
49 #include "internal.h"
50 
51 unsigned evtr_debug;
52 
53 static
54 void
55 printd_set_flags(const char *str, unsigned int *flags)
56 {
57 	/*
58 	 * This is suboptimal: we accept any lowercase letter,
59 	 * even ones that do not correspond to a defined debug flag.
60 	 */
61 	for (; *str; ++str) {
62 		if ('A' == *str) {
63 			*flags = -1;
64 			return;
65 		}
66 		if (!islower(*str))
67 			errx(2, "invalid debug flag %c", *str);
68 		*flags |= 1 << (*str - 'a');
69 	}
70 }
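
/*
 * Illustrative note (not part of the original source): each lowercase
 * letter in the string selects one debug bit, so for example
 *
 *	evtr_set_debug("ms");	enables bits ('m' - 'a') and ('s' - 'a')
 *	evtr_set_debug("A");	enables all debug output (flags = -1)
 *
 * Which letter corresponds to which subsystem (MISC, DS, IO, ...) is
 * defined outside this file (see internal.h).
 */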
71 
72 
73 enum {
74 	MAX_EVHDR_SIZE = PATH_MAX + 200,
75 	/* string namespaces */
76 	EVTR_NS_PATH = 0x1,
77 	EVTR_NS_FUNC,
78 	EVTR_NS_DSTR,
79 	EVTR_NS_MAX,
80 	NR_BUCKETS = 1021,	/* prime */
81 	PARSE_ERR_BUFSIZE = 256,
82 	REC_ALIGN = 8,
83 	REC_BOUNDARY = 1 << 14,
84 	FILTF_ID = 0x10,
85 	EVTRF_WR = 0x1,		/* open for writing */
86 	EVTRQF_PENDING = 0x1,
87 };
88 
89 typedef uint16_t fileid_t;
90 typedef uint16_t funcid_t;
91 typedef uint16_t fmtid_t;
92 
93 struct trace_event_header {
94 	uint8_t type;
95 	uint64_t ts;	/* XXX: this should only be part of probe */
96 } __attribute__((packed));
97 
98 struct probe_event_header {
99 	struct trace_event_header eh;
100 	/*
101 	 * For these fields, 0 implies "not available"
102 	 */
103 	fileid_t file;
104 	funcid_t caller1;
105 	funcid_t caller2;
106 	funcid_t func;
107 	uint16_t line;
108 	fmtid_t fmt;
109 	uint16_t datalen;
110 	uint8_t cpu;	/* -1 if n/a */
111 } __attribute__((packed));
112 
113 struct string_event_header {
114 	struct trace_event_header eh;
115 	uint16_t ns;
116 	uint32_t id;
117 	uint16_t len;
118 } __attribute__((packed));
119 
120 struct fmt_event_header {
121 	struct trace_event_header eh;
122 	uint16_t id;
123 	uint8_t subsys_len;
124 	uint8_t fmt_len;
125 } __attribute__((packed));
126 
127 struct cpuinfo_event_header {
128 	double freq;
129 	uint8_t cpu;
130 } __attribute__((packed));
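
/*
 * Layout note (added for clarity, not in the original source): with the
 * packed headers above, a probe record on disk is
 *
 *	struct probe_event_header	24 bytes (1 + 8 + 7*2 + 1)
 *	fmtdata				datalen bytes
 *	zero padding			so the next record starts on a
 *					REC_ALIGN (8 byte) boundary
 *
 * String and fmt records follow the same pattern: fixed header, variable
 * payload, padding (see evtr_dump_pad() below).
 */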
131 
132 struct hashentry {
133 	uintptr_t key;
134 	uintptr_t val;
135 	struct hashentry *next;
136 };
137 
138 struct hashtab {
139 	struct hashentry *buckets[NR_BUCKETS];
140 	uintptr_t (*hashfunc)(uintptr_t);
141 	uintptr_t (*cmpfunc)(uintptr_t, uintptr_t);
142 };
143 
144 struct symtab {
145 	struct hashtab tab;
146 };
147 
148 struct event_fmt {
149 	const char *subsys;
150 	const char *fmt;
151 };
152 
153 struct event_filter_unresolved {
154 	TAILQ_ENTRY(event_filter_unresolved) link;
155 	evtr_filter_t filt;
156 };
157 
158 struct id_map {
159 	RB_ENTRY(id_map) rb_node;
160 	int id;
161 	const void *data;
162 };
163 
164 RB_HEAD(id_tree, id_map);
165 struct string_map {
166 	struct id_tree root;
167 };
168 
169 struct fmt_map {
170 	struct id_tree root;
171 };
172 
173 RB_HEAD(thread_tree, evtr_thread);
174 
175 struct thread_map {
176 	struct thread_tree root;
177 };
178 
179 struct event_callback {
180 	void (*cb)(evtr_event_t, void *data);
181 	void *data;	/* this field must be malloc()ed */
182 };
183 
184 struct cpu {
185 	struct evtr_thread *td;	/* currently executing thread */
186 	double freq;
187 };
188 
189 struct evtr {
190 	FILE *f;
191 	int flags;
192 	int err;
193 	const char *errmsg;
194 	off_t bytes;
195 	union {
196 		/*
197 		 * When writing, we keep track of the strings we've
198 		 * already dumped so we only dump them once.
199 		 * Paths, function names etc belong to different
200 		 * namespaces.
201 		 */
202 		struct hashtab_str *strings[EVTR_NS_MAX - 1];
203 		/*
204 		 * When reading, we build a map from id to string.
205 		 * Every id must be defined at the point of use.
206 		 */
207 		struct string_map maps[EVTR_NS_MAX - 1];
208 	};
209 	union {
210 		/* same as above, but for subsys+fmt pairs */
211 		struct fmt_map fmtmap;
212 		struct hashtab_str *fmts;
213 	};
214 	struct thread_map threads;
215 	struct cpu *cpus;
216 	int ncpus;
217 };
218 
219 struct evtr_query {
220 	evtr_t evtr;
221 	off_t off;
222 	evtr_filter_t filt;
223 	int nfilt;
224 	int nmatched;
225 	int ntried;
226 	void *buf;
227 	int bufsize;
228 	struct symtab *symtab;
229 	int ncbs;
230 	struct event_callback **cbs;
231 	/*
232 	 * Filters that have a format specified and we
233 	 * need to resolve that to an fmtid
234 	 */
235 	TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
236 	int err;
237 	const char *errmsg;
238 	char parse_err_buf[PARSE_ERR_BUFSIZE];
239 	int flags;
240 	struct evtr_event pending_event;
241 };
242 
243 void
244 evtr_set_debug(const char *str)
245 {
246 	printd_set_flags(str, &evtr_debug);
247 }
248 
249 static int id_map_cmp(struct id_map *, struct id_map *);
250 RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
251 RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);
252 
253 static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
254 RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
255 RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
256 
257 static inline
258 void
259 validate_string(const char *str)
260 {
261 	if (!(evtr_debug & MISC))
262 		return;
263 	for (; *str; ++str)
264 		assert(isprint(*str));
265 }
266 
267 static
268 void
269 id_tree_free(struct id_tree *root)
270 {
271 	struct id_map *v, *n;
272 
273 	for (v = RB_MIN(id_tree, root); v; v = n) {
274 		n = RB_NEXT(id_tree, root, v);
275 		RB_REMOVE(id_tree, root, v);
276 	}
277 }
278 
279 static
280 int
281 evtr_register_callback(evtr_query_t q, void (*fn)(evtr_event_t, void *), void *d)
282 {
283 	struct event_callback *cb;
284 	void *cbs;
285 
286 	if (!(cb = malloc(sizeof(*cb)))) {
287 		q->err = ENOMEM;
288 		return !0;
289 	}
290 	cb->cb = fn;
291 	cb->data = d;
292 	if (!(cbs = realloc(q->cbs, (++q->ncbs) * sizeof(cb)))) {
293 		--q->ncbs;
294 		free(cb);
295 		q->err = ENOMEM;
296 		return !0;
297 	}
298 	q->cbs = cbs;
299 	q->cbs[q->ncbs - 1] = cb;
300 	return 0;
301 }
302 
303 static
304 void
305 evtr_deregister_callbacks(evtr_query_t q)
306 {
307 	int i;
308 
309 	for (i = 0; i < q->ncbs; ++i) {
310 		free(q->cbs[i]);
311 	}
312 	free(q->cbs);
313 	q->cbs = NULL;
314 }
315 
316 static
317 void
318 evtr_run_callbacks(evtr_event_t ev, evtr_query_t q)
319 {
320 	struct event_callback *cb;
321 	int i;
322 
323 	for (i = 0; i < q->ncbs; ++i) {
324 		cb = q->cbs[i];
325 		cb->cb(ev, cb->data);
326 	}
327 }
328 
329 static
330 struct cpu *
331 evtr_cpu(evtr_t evtr, int c)
332 {
333 	if ((c < 0) || (c >= evtr->ncpus))
334 		return NULL;
335 	return &evtr->cpus[c];
336 }
337 
338 static int parse_format_data(evtr_event_t ev, const char *fmt, ...)
339 	       __printflike(2, 3) __scanflike(2, 3);
340 
341 static
342 int
343 parse_format_data(evtr_event_t ev, const char *fmt, ...)
344 {
345 	va_list ap;
346 	char buf[2048];
347 
348 	if (strcmp(fmt, ev->fmt))
349 		return 0;
350 	vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
351 	printd(MISC, "string is: %s\n", buf);
352 	va_start(ap, fmt);
353 	return vsscanf(buf, fmt, ap);
354 }
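
/*
 * Illustrative example (not part of the original source): the thread
 * switch probes recorded by the kernel use the format "sw  %p > %p", so
 * the callbacks further down recover the raw thread pointers with
 *
 *	void *ktdp, *ktdn;
 *
 *	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) == 2)
 *		... both pointers were extracted ...
 *
 * A return value other than the number of conversions means the event's
 * format string did not match.
 */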
355 
356 static
357 void
358 evtr_deregister_filters(evtr_query_t q, evtr_filter_t filt, int nfilt)
359 {
360 	struct event_filter_unresolved *u, *tmp;
361 	int i;
362 	TAILQ_FOREACH_MUTABLE(u, &q->unresolved_filtq, link, tmp) {
363 		for (i = 0; i < nfilt; ++i) {
364 			if (u->filt == &filt[i]) {
365 				TAILQ_REMOVE(&q->unresolved_filtq, u, link);
366 			}
367 		}
368 	}
369 }
370 
371 static
372 int
373 evtr_filter_register(evtr_query_t q, evtr_filter_t filt)
374 {
375 	struct event_filter_unresolved *res;
376 
377 	if (!(res = malloc(sizeof(*res)))) {
378 		q->err = ENOMEM;
379 		return !0;
380 	}
381 	res->filt = filt;
382 	TAILQ_INSERT_TAIL(&q->unresolved_filtq, res, link);
383 	return 0;
384 }
385 
386 static
387 int
388 evtr_query_needs_parsing(evtr_query_t q)
389 {
390 	int i;
391 
392 	for (i = 0; i < q->nfilt; ++i)
393 		if (q->filt[i].ev_type == EVTR_TYPE_STMT)
394 			return !0;
395 	return 0;
396 }
397 
398 void
399 evtr_event_data(evtr_event_t ev, char *buf, size_t len)
400 {
401 	/*
402 	 * XXX: we implicitly trust the format string.
403 	 * We shouldn't.
404 	 */
405 	if (ev->fmtdatalen) {
406 		vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
407 	} else {
408 		strlcpy(buf, ev->fmt, len);
409 	}
410 }
411 
412 int
413 evtr_error(evtr_t evtr)
414 {
415 	return evtr->err || (evtr->errmsg != NULL);
416 }
417 
418 const char *
419 evtr_errmsg(evtr_t evtr)
420 {
421 	return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
422 }
423 
424 int
425 evtr_query_error(evtr_query_t q)
426 {
427 	return q->err || (q->errmsg != NULL) || evtr_error(q->evtr);
428 }
429 
430 const char *
431 evtr_query_errmsg(evtr_query_t q)
432 {
433 	return q->errmsg ? q->errmsg :
434 		(q->err ? strerror(q->err) :
435 		 (evtr_errmsg(q->evtr)));
436 }
437 
438 static
439 int
440 id_map_cmp(struct id_map *a, struct id_map *b)
441 {
442 	return a->id - b->id;
443 }
444 
445 static
446 int
447 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
448 {
449 	ptrdiff_t d;
450 	d =  a->id - b->id;
451 	if (d < 0)
452 		return -1;
453 	if (!d)
454 		return 0;
455 	return 1;
456 }
457 
458 #define DEFINE_MAP_FIND(prefix, type)		\
459 	static					\
460 	type				\
461 	prefix ## _map_find(struct id_tree *tree, int id)\
462 	{						 \
463 		struct id_map *sid;			 \
464 							\
465 		sid = id_tree_RB_LOOKUP(tree, id);	\
466 		return sid ? sid->data : NULL;		\
467 	}
468 
469 DEFINE_MAP_FIND(string, const char *)
470 DEFINE_MAP_FIND(fmt, const struct event_fmt *)
471 
472 static
473 struct evtr_thread *
474 thread_map_find(struct thread_map *map, void *id)
475 {
476 	return thread_tree_RB_LOOKUP(&map->root, id);
477 }
478 
479 #define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)	\
480 	static					\
481 	int								\
482 	prefix ## _map_insert(struct id_tree *tree, type data, int id) \
483 	{								\
484 	struct id_map *sid, *osid;					\
485 									\
486 	sid = malloc(sizeof(*sid));					\
487 	if (!sid) {							\
488 		return ENOMEM;						\
489 	}								\
490 	sid->id = id;							\
491 	sid->data = data;						\
492 	if ((osid = id_tree_RB_INSERT(tree, sid))) {			\
493 		free(sid);						\
494 		if (_cmp((type)osid->data, data)) {			\
495 			return EEXIST;					\
496 		}							\
497 		printd(DS, "mapping already exists, skipping\n");		\
498 		/* we're OK with redefinitions of an id to the same string */ \
499 		return 0;						\
500 	}								\
501 	/* only do the strdup if we're inserting a new string */	\
502 	sid->data = _dup(data);		/* XXX: oom */			\
503 	return 0;							\
504 }
505 
506 static
507 void
508 thread_map_insert(struct thread_map *map, struct evtr_thread *td)
509 {
510 	struct evtr_thread *otd;
511 
512 	if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
513 		/*
514 		 * Thread addresses might be reused, we're
515 		 * ok with that.
516 		 * DANGER, Will Robinson: this means the user
517 		 * of the API needs to copy event->td if they
518 		 * want it to remain stable.
519 		 */
520 		free((void *)otd->comm);
521 		otd->comm = td->comm;
522 		free(td);
523 	}
524 }
525 
526 static
527 int
528 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
529 {
530 	int ret = 0;
531 
532 	if (a->subsys) {
533 		if (b->subsys) {
534 			ret = strcmp(a->subsys, b->subsys);
535 		} else {
536 			ret = strcmp(a->subsys, "");
537 		}
538 	} else if (b->subsys) {
539 		ret = strcmp("", b->subsys);
540 	}
541 	if (ret)
542 		return ret;
543 	return strcmp(a->fmt, b->fmt);
544 }
545 
546 static
547 struct event_fmt *
548 event_fmt_dup(const struct event_fmt *o)
549 {
550 	struct event_fmt *n;
551 
552 	if (!(n = malloc(sizeof(*n)))) {
553 		return n;
554 	}
555 	memcpy(n, o, sizeof(*n));
556 	return n;
557 }
558 
559 DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
560 DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
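
/*
 * Note (added for clarity, not in the original source): the two macro
 * invocations above expand to
 *
 *	int string_map_insert(struct id_tree *, const char *, int id);
 *	int fmt_map_insert(struct id_tree *, const struct event_fmt *, int id);
 *
 * which record an id -> value mapping for the read path, accept a
 * redefinition of an id to an equal value and reject a conflicting one
 * with EEXIST.
 */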
561 
562 int
563 hash_find(const struct hashtab *tab, uintptr_t key, uintptr_t *val)
564 {
565 	struct hashentry *ent;
566 
567 	for(ent = tab->buckets[tab->hashfunc(key)];
568 	    ent && tab->cmpfunc(ent->key, key);
569 	    ent = ent->next);
570 
571 	if (!ent)
572 		return !0;
573 	*val = ent->val;
574 	return 0;
575 }
576 
577 struct hashentry *
578 hash_insert(struct hashtab *tab, uintptr_t key, uintptr_t val)
579 {
580 	struct hashentry *ent;
581 	int hsh;
582 
583 	if (!(ent = malloc(sizeof(*ent)))) {
584 		fprintf(stderr, "out of memory\n");
585 		return NULL;
586 	}
587 	hsh = tab->hashfunc(key);
588 	ent->next = tab->buckets[hsh];
589 	ent->key = key;
590 	ent->val = val;
591 	tab->buckets[hsh] = ent;
592 	return ent;
593 }
594 
595 static
596 uintptr_t
597 cmpfunc_pointer(uintptr_t a, uintptr_t b)
598 {
599 	return b - a;
600 }
601 
602 static
603 uintptr_t
604 hashfunc_pointer(uintptr_t p)
605 {
606 	return p % NR_BUCKETS;
607 }
608 
609 struct hashtab *
610 hash_new(void)
611 {
612 	struct hashtab *tab;
613 	if (!(tab = calloc(1, sizeof(struct hashtab))))
614 		return tab;
615 	tab->hashfunc = &hashfunc_pointer;
616 	tab->cmpfunc = &cmpfunc_pointer;
617 	return tab;
618 }
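
/*
 * Usage sketch (not part of the original source): the pointer hashtable
 * maps a uintptr_t key to a uintptr_t value.
 *
 *	struct hashtab *tab = hash_new();
 *	uintptr_t val;
 *
 *	hash_insert(tab, (uintptr_t)key, (uintptr_t)value);
 *	if (!hash_find(tab, (uintptr_t)key, &val))
 *		... val now holds value ...
 *
 * hash_find() returns 0 on success and non-zero if the key is missing;
 * key and value here are hypothetical caller-side variables.
 */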
619 
620 struct hashtab_str {	/* string -> id map */
621 	struct hashtab tab;
622 	uint16_t id;
623 };
624 
625 static
626 uintptr_t
627 hashfunc_string(uintptr_t p)
628 {
629 	const char *str = (char *)p;
630 	unsigned long hash = 5381;
631 	int c;
632 
633 	while ((c = *str++))
634 		hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
635 	return hash % NR_BUCKETS;
636 }
637 
638 static
639 uintptr_t
640 cmpfunc_string(uintptr_t a, uintptr_t b)
641 {
642 	return strcmp((char *)a, (char *)b);
643 }
644 
645 
646 static
647 struct hashtab_str *
648 strhash_new(void)
649 {
650 	struct hashtab_str *strtab;
651 	if (!(strtab = calloc(1, sizeof(struct hashtab_str))))
652 		return strtab;
653 	strtab->tab.hashfunc = &hashfunc_string;
654 	strtab->tab.cmpfunc = &cmpfunc_string;
655 	return strtab;
656 }
657 
658 static
659 void
660 strhash_destroy(struct hashtab_str *strtab)
661 {
662 	free(strtab);
663 }
664 
665 static
666 int
667 strhash_find(struct hashtab_str *strtab, const char *str, uint16_t *id)
668 {
669 	uintptr_t val;
670 
671 	if (hash_find(&strtab->tab, (uintptr_t)str, &val))
672 		return !0;
673 	*id = (uint16_t)val;
674 	return 0;
675 }
676 
677 static
678 int
679 strhash_insert(struct hashtab_str *strtab, const char *str, uint16_t *id)
680 {
681 	uintptr_t val;
682 
683 	val = ++strtab->id;
684 	if (strtab->id == 0) {
685 		fprintf(stderr, "too many strings\n");
686 		return ERANGE;
687 	}
688 	str = strdup(str);
689 	if (!str) {
690 		fprintf(stderr, "out of memory\n");
691 		--strtab->id;
692 		return ENOMEM;
693 	}
694 	hash_insert(&strtab->tab, (uintptr_t)str, (uintptr_t)val);
695 	*id = strtab->id;
696 	return 0;
697 }
698 
699 struct symtab *
700 symtab_new(void)
701 {
702 	struct symtab *symtab;
703 	if (!(symtab = calloc(1, sizeof(struct symtab))))
704 		return symtab;
705 	symtab->tab.hashfunc = &hashfunc_string;
706 	symtab->tab.cmpfunc = &cmpfunc_string;
707 	return symtab;
708 }
709 
710 void
711 symtab_destroy(struct symtab *symtab)
712 {
713 	free(symtab);
714 }
715 
716 struct evtr_variable *
717 symtab_find(const struct symtab *symtab, const char *str)
718 {
719 	uintptr_t val;
720 
721 	if (hash_find(&symtab->tab, (uintptr_t)str, &val))
722 		return NULL;
723 	return (struct evtr_variable *)val;
724 }
725 
726 int
727 symtab_insert(struct symtab *symtab, const char *name,
728 	       struct evtr_variable *var)
729 {
730 	name = strdup(name);
731 	if (!name) {
732 		fprintf(stderr, "out of memory\n");
733 		return ENOMEM;
734 	}
735 	hash_insert(&symtab->tab, (uintptr_t)name, (uintptr_t)var);
736 	return 0;
737 }
738 
739 static
740 int
741 evtr_filter_match(evtr_query_t q, evtr_filter_t f, evtr_event_t ev)
742 {
743 	if ((f->cpu != -1) && (f->cpu != ev->cpu))
744 		return 0;
745 
746 	assert(!(f->flags & FILTF_ID));
747 	if (ev->type != f->ev_type)
748 		return 0;
749 	if (ev->type == EVTR_TYPE_PROBE) {
750 		if (f->fmt && strcmp(ev->fmt, f->fmt))
751 			return 0;
752 	} else if (ev->type == EVTR_TYPE_STMT) {
753 		struct evtr_variable *var;
754 		/* resolve var */
755 		/* XXX: no need to do that *every* time */
756 		parse_var(f->var, q->symtab, &var, &q->parse_err_buf[0],
757 			  PARSE_ERR_BUFSIZE);
758 		/*
759 		 * Ignore errors, they're expected since the
760 		 * variable might not be instantiated yet
761 		 */
762 		if (var != ev->stmt.var)
763 			return 0;
764 	}
765 	return !0;
766 }
767 
768 static
769 int
770 evtr_match_filters(struct evtr_query *q, evtr_event_t ev)
771 {
772 	int i;
773 
774 	/* no filters means we're interested in all events */
775 	if (!q->nfilt)
776 		return !0;
777 	++q->ntried;
778 	for (i = 0; i < q->nfilt; ++i) {
779 		if (evtr_filter_match(q, &q->filt[i], ev)) {
780 			++q->nmatched;
781 			return !0;
782 		}
783 	}
784 	return 0;
785 }
786 
787 static
788 void
789 parse_callback(evtr_event_t ev, void *d)
790 {
791 	evtr_query_t q = (evtr_query_t)d;
792 	if (ev->type != EVTR_TYPE_PROBE)
793 		return;
794 	if (!ev->fmt || (ev->fmt[0] != '#'))
795 		return;
796 	/*
797 	 * Copy the event to ->pending_event, then call
798 	 * the parser to convert it into a synthesized
799 	 * EVTR_TYPE_STMT event.
800 	 */
801 	memcpy(&q->pending_event, ev, sizeof(*ev));
802 	parse_string(&q->pending_event, q->symtab, &ev->fmt[1],
803 		     &q->parse_err_buf[0], PARSE_ERR_BUFSIZE);
804 	if (q->parse_err_buf[0]) {	/* parse error */
805 		q->errmsg = &q->parse_err_buf[0];
806 		return;
807 	}
808 	if (!evtr_match_filters(q, &q->pending_event))
809 		return;
810 	/*
811 	 * This will cause us to return ->pending_event next time
812 	 * we're called.
813 	 */
814 	q->flags |= EVTRQF_PENDING;
815 }
816 
817 static
818 void
819 thread_creation_callback(evtr_event_t ev, void *d)
820 {
821 	evtr_query_t q = (evtr_query_t)d;
822 	evtr_t evtr = q->evtr;
823 	struct evtr_thread *td;
824 	void *ktd;
825 	char buf[20];
826 
827 	if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
828 		return;
829 	}
830 	buf[19] = '\0';
831 
832 	if (!(td = malloc(sizeof(*td)))) {
833 		q->err = ENOMEM;
834 		return;
835 	}
836 	td->id = ktd;
837 	td->userdata = NULL;
838 	if (!(td->comm = strdup(buf))) {
839 		free(td);
840 		q->err = ENOMEM;
841 		return;
842 	}
843 	printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
844 	thread_map_insert(&evtr->threads, td);
845 }
846 
847 static
848 void
849 thread_switch_callback(evtr_event_t ev, void *d)
850 {
851 	evtr_t evtr = ((evtr_query_t)d)->evtr;
852 	struct evtr_thread *tdp, *tdn;
853 	void *ktdp, *ktdn;
854 	struct cpu *cpu;
855 	static struct evtr_event tdcr;
856 	static char *fmt = "new_td %p %s";
857 	char tidstr[40];
858 	void *fmtdata[2];
859 
860 	cpu = evtr_cpu(evtr, ev->cpu);
861 	if (!cpu) {
862 		printw("invalid cpu %d\n", ev->cpu);
863 		return;
864 	}
865 	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
866 		return;
867 	}
868 	tdp = thread_map_find(&evtr->threads, ktdp);
869 	if (!tdp) {
870 		printd(DS, "switching from unknown thread %p\n", ktdp);
871 	}
872 	tdn = thread_map_find(&evtr->threads, ktdn);
873 	if (!tdn) {
874 		/*
875 		 * Fake a thread creation event for threads we
876 		 * haven't seen before.
877 		 */
878 		tdcr.type = EVTR_TYPE_PROBE;
879 		tdcr.ts = ev->ts;
880 		tdcr.file = NULL;
881 		tdcr.func = NULL;
882 		tdcr.line = 0;
883 		tdcr.fmt = fmt;
884 		tdcr.fmtdata = &fmtdata;
885 		tdcr.fmtdatalen = sizeof(fmtdata);
886 		tdcr.cpu = ev->cpu;
887 		tdcr.td = NULL;
888 		snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
889 		fmtdata[0] = ktdn;
890 		fmtdata[1] = tidstr;
891 		thread_creation_callback(&tdcr, d);
892 
893 		tdn = thread_map_find(&evtr->threads, ktdn);
894 		assert(tdn != NULL);
895 		printd(DS, "switching to unknown thread %p\n", ktdn);
896 		cpu->td = tdn;
897 		return;
898 	}
899 	printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
900 	cpu->td = tdn;
901 }
902 
903 static
904 void
905 assert_foff_in_sync(evtr_t evtr)
906 {
907 	off_t off;
908 
909 	/*
910 	 * We keep our own offset because we
911 	 * might want to support mmap()
912 	 */
913 	off = ftello(evtr->f);
914 	if (evtr->bytes != off) {
915 		fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
916 		abort();
917 	}
918 }
919 
920 static
921 int
922 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
923 {
924 	assert_foff_in_sync(evtr);
925 	if (fwrite(buf, bytes, 1, evtr->f) != 1) {
926 		evtr->err = errno;
927 		evtr->errmsg = strerror(errno);
928 		return !0;
929 	}
930 	evtr->bytes += bytes;
931 	assert_foff_in_sync(evtr);
932 	return 0;
933 }
934 
935 /*
936  * Called after dumping a record to make sure the next
937  * record is REC_ALIGN aligned. This does not make much sense,
938  * as we shouldn't be using packed structs anyway.
939  */
940 static
941 int
942 evtr_dump_pad(evtr_t evtr)
943 {
944 	size_t pad;
945 	static char buf[REC_ALIGN];
946 
947 	pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
948 	if (pad > 0) {
949 		return evtr_write(evtr, buf, pad);
950 	}
951 	return 0;
952 }
953 
954 /*
955  * We make sure that there is a new record every REC_BOUNDARY
956  * bytes, this costs next to nothing in space and allows for
957  * fast seeking.
958  */
959 static
960 int
961 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
962 {
963 	unsigned pad, i;
964 	static char buf[256];
965 
966 	pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
967 	/* if adding @bytes would cause us to cross a boundary... */
968 	if (bytes > pad) {
969 		/* then pad to the boundary */
970 		for (i = 0; i < (pad / sizeof(buf)); ++i) {
971 			if (evtr_write(evtr, buf, sizeof(buf))) {
972 				return !0;
973 			}
974 		}
975 		i = pad % sizeof(buf);
976 		if (i) {
977 			if (evtr_write(evtr, buf, i)) {
978 				return !0;
979 			}
980 		}
981 	}
982 	return 0;
983 }
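
/*
 * Worked example (not part of the original source): with REC_BOUNDARY =
 * 16384, if evtr->bytes is 16380 and the next record needs 10 bytes, pad
 * is 4 and 4 < 10, so 4 bytes of zeros are written first and the record
 * starts exactly on the boundary.  This is why evtr_last_event() can in
 * principle seek to rounddown(size, REC_BOUNDARY) and pick up the stream
 * from there.
 */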
984 
985 static
986 int
987 evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
988 {
989 	struct fmt_event_header fmt;
990 	uint16_t id;
991 	int err;
992 	char *subsys = "", buf[1024];
993 
994 	if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
995 		evtr->errmsg = "name of subsystem is too large";
996 		evtr->err = ERANGE;
997 		return 0;
998 	}
999 	if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
1000 		evtr->errmsg = "fmt + name of subsystem is too large";
1001 		evtr->err = ERANGE;
1002 		return 0;
1003 	}
1004 
1005 	if (!strhash_find(evtr->fmts, buf, &id)) {
1006 		return id;
1007 	}
1008 	if ((err = strhash_insert(evtr->fmts, buf, &id))) {
1009 		evtr->err = err;
1010 		return 0;
1011 	}
1012 
1013 	fmt.eh.type = EVTR_TYPE_FMT;
1014 	fmt.eh.ts = ts;
1015 	fmt.subsys_len = strlen(subsys);
1016 	fmt.fmt_len = strlen(ev->fmt);
1017 	fmt.id = id;
1018 	if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
1019 				     fmt.fmt_len))
1020 		return 0;
1021 	if (evtr_write(evtr, &fmt, sizeof(fmt)))
1022 		return 0;
1023 	if (evtr_write(evtr, subsys, fmt.subsys_len))
1024 		return 0;
1025 	if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
1026 		return 0;
1027 	if (evtr_dump_pad(evtr))
1028 		return 0;
1029 	return fmt.id;
1030 }
1031 
1032 /*
1033  * Replace string pointers or string ids in fmtdata
1034  */
1035 static
1036 int
1037 mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
1038 		   const char *(*replace)(void *, const char *), void *ctx)
1039 {
1040 	const char *f, *p;
1041 	size_t skipsize, intsz;
1042 	int ret = 0;
1043 
1044 	for (f = fmt; f[0] != '\0'; ++f) {
1045 		if (f[0] != '%')
1046 			continue;
1047 		++f;
1048 		skipsize = 0;
1049 		for (p = f; p[0]; ++p) {
1050 			int again = 0;
1051 			/*
1052 			 * Eat flags. Notice this will accept duplicate
1053 			 * flags.
1054 			 */
1055 			switch (p[0]) {
1056 			case '#':
1057 			case '0':
1058 			case '-':
1059 			case ' ':
1060 			case '+':
1061 			case '\'':
1062 				again = !0;
1063 				break;
1064 			}
1065 			if (!again)
1066 				break;
1067 		}
1068 		/* Eat minimum field width, if any */
1069 		for (; isdigit(p[0]); ++p)
1070 			;
1071 		if (p[0] == '.')
1072 			++p;
1073 		/* Eat precision, if any */
1074 		for (; isdigit(p[0]); ++p)
1075 			;
1076 		intsz = 0;
1077 		switch (p[0]) {
1078 		case 'l':
1079 			if (p[1] == 'l') {
1080 				++p;
1081 				intsz = sizeof(long long);
1082 			} else {
1083 				intsz = sizeof(long);
1084 			}
1085 			break;
1086 		case 'j':
1087 			intsz = sizeof(intmax_t);
1088 			break;
1089 		case 't':
1090 			intsz = sizeof(ptrdiff_t);
1091 			break;
1092 		case 'z':
1093 			intsz = sizeof(size_t);
1094 			break;
1095 		default:
1096 			break;
1097 		}
1098 		if (intsz != 0)
1099 			++p;
1100 		else
1101 			intsz = sizeof(int);
1102 
1103 		switch (p[0]) {
1104 		case 'd':
1105 		case 'i':
1106 		case 'o':
1107 		case 'u':
1108 		case 'x':
1109 		case 'X':
1110 		case 'c':
1111 			skipsize = intsz;
1112 			break;
1113 		case 'p':
1114 			skipsize = sizeof(void *);
1115 			break;
1116 		case 'f':
1117 			if (p[-1] == 'l')
1118 				skipsize = sizeof(double);
1119 			else
1120 				skipsize = sizeof(float);
1121 			break;
1122 		case 's':
1123 			((const char **)fmtdata)[0] =
1124 				replace(ctx, ((char **)fmtdata)[0]);
1125 			skipsize = sizeof(char *);
1126 			++ret;
1127 			break;
1128 		default:
1129 			fprintf(stderr, "Unknown conversion specifier %c "
1130 				"in fmt starting with %s", p[0], f - 1);
1131 			return -1;
1132 		}
1133 		fmtdata += skipsize;
1134 	}
1135 	return ret;
1136 }
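
/*
 * Illustrative example (not part of the original source): for a probe
 * with fmt "%d %s", fmtdata contains sizeof(int) bytes of integer
 * followed by a char * sized slot.  On the write path the replace
 * callback is replace_strptr(), which dumps the string and stores its
 * id in that slot; on the read path it is replace_strid(), which maps
 * the id back to a string pointer before the data reaches the user.
 */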
1137 
1138 /* XXX: do we really want the timestamp? */
1139 static
1140 int
1141 evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
1142 {
1143 	struct string_event_header s;
1144 	int err;
1145 	uint16_t id;
1146 
1147 	assert((0 < ns) && (ns < EVTR_NS_MAX));
1148 	if (!strhash_find(evtr->strings[ns - 1], str, &id)) {
1149 		return id;
1150 	}
1151 	if ((err = strhash_insert(evtr->strings[ns - 1], str, &id))) {
1152 		evtr->err = err;
1153 		return 0;
1154 	}
1155 
1156 	printd(DS, "hash_insert %s ns %d id %d\n", str, ns, id);
1157 	s.eh.type = EVTR_TYPE_STR;
1158 	s.eh.ts = ts;
1159 	s.ns = ns;
1160 	s.id = id;
1161 	s.len = strnlen(str, PATH_MAX);
1162 
1163 	if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
1164 		return 0;
1165 	if (evtr_write(evtr, &s, sizeof(s)))
1166 		return 0;
1167 	if (evtr_write(evtr, str, s.len))
1168 		return 0;
1169 	if (evtr_dump_pad(evtr))
1170 		return 0;
1171 	return s.id;
1172 }
1173 
1174 struct replace_ctx {
1175 	evtr_t evtr;
1176 	uint64_t ts;
1177 };
1178 
1179 static
1180 const char *
1181 replace_strptr(void *_ctx, const char *s)
1182 {
1183 	struct replace_ctx *ctx = _ctx;
1184 	return (const char *)(uintptr_t)evtr_dump_string(ctx->evtr, ctx->ts, s,
1185 							 EVTR_NS_DSTR);
1186 }
1187 
1188 static
1189 const char *
1190 replace_strid(void *_ctx, const char *s)
1191 {
1192 	struct replace_ctx *ctx = _ctx;
1193 	const char *ret;
1194 
1195 	ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
1196 			      (int)(uintptr_t)s);
1197 	if (!ret) {
1198 		fprintf(stderr, "Unknown id for data string\n");
1199 		ctx->evtr->errmsg = "unknown id for data string";
1200 		ctx->evtr->err = !0;
1201 	}
1202 	validate_string(ret);
1203 	printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n",
1204 	       (int)(uintptr_t)s, EVTR_NS_DSTR, ret ? ret : "NULL", (int)(uintptr_t)ret);
1205 	return ret;
1206 }
1207 
1208 static
1209 int
1210 evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
1211 {
1212 	struct probe_event_header kev;
1213 	char buf[1024];
1214 
1215 	memset(&kev, '\0', sizeof(kev));
1216 	kev.eh.type = ev->type;
1217 	kev.eh.ts = ev->ts;
1218 	kev.line = ev->line;
1219 	kev.cpu = ev->cpu;
1220 	if (ev->file) {
1221 		kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
1222 					    EVTR_NS_PATH);
1223 	}
1224 	if (ev->func) {
1225 		kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
1226 					    EVTR_NS_FUNC);
1227 	}
1228 	if (ev->fmt) {
1229 		kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
1230 	}
1231 	if (ev->fmtdata) {
1232 		struct replace_ctx replctx = {
1233 			.evtr = evtr,
1234 			.ts = ev->ts,
1235 		};
1236 		assert(ev->fmtdatalen <= (int)sizeof(buf));
1237 		kev.datalen = ev->fmtdatalen;
1238 		/*
1239 		 * Replace all string pointers with string ids before dumping
1240 		 * the data.
1241 		 */
1242 		memcpy(buf, ev->fmtdata, ev->fmtdatalen);
1243 		if (mangle_string_ptrs(ev->fmt, buf,
1244 				       replace_strptr, &replctx) < 0)
1245 			return !0;
1246 		if (evtr->err)
1247 			return evtr->err;
1248 	}
1249 	if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
1250 		return !0;
1251 	if (evtr_write(evtr, &kev, sizeof(kev)))
1252 		return !0;
1253 	if (evtr_write(evtr, buf, ev->fmtdatalen))
1254 		return !0;
1255 	if (evtr_dump_pad(evtr))
1256 		return !0;
1257 	return 0;
1258 }
1259 
1260 static
1261 int
1262 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1263 {
1264 	uint8_t type = EVTR_TYPE_SYSINFO;
1265 	uint16_t ncpus = ev->ncpus;
1266 
1267 	if (ncpus == 0) {
1268 		evtr->errmsg = "invalid number of cpus";
1269 		return !0;
1270 	}
1271 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1272 		return !0;
1273 	if (evtr_write(evtr, &type, sizeof(type))) {
1274 		return !0;
1275 	}
1276 	if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1277 		return !0;
1278 	}
1279 	if (evtr_dump_pad(evtr))
1280 		return !0;
1281 	return 0;
1282 }
1283 static
1284 int
1285 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1286 {
1287 	struct cpuinfo_event_header ci;
1288 	uint8_t type;
1289 
1290 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1291 		return !0;
1292 	type = EVTR_TYPE_CPUINFO;
1293 	if (evtr_write(evtr, &type, sizeof(type))) {
1294 		return !0;
1295 	}
1296 	ci.cpu = ev->cpu;
1297 	ci.freq = ev->cpuinfo.freq;
1298 	if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1299 		return !0;
1300 	if (evtr_write(evtr, &ci, sizeof(ci))) {
1301 		return !0;
1302 	}
1303 	if (evtr_dump_pad(evtr))
1304 		return !0;
1305 	return 0;
1306 }
1307 
1308 int
1309 evtr_rewind(evtr_t evtr)
1310 {
1311 	assert((evtr->flags & EVTRF_WR) == 0);
1312 	evtr->bytes = 0;
1313 	if (fseek(evtr->f, 0, SEEK_SET)) {
1314 		evtr->err = errno;
1315 		return !0;
1316 	}
1317 	return 0;
1318 }
1319 
1320 int
1321 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1322 {
1323 	switch (ev->type) {
1324 	case EVTR_TYPE_PROBE:
1325 		return evtr_dump_probe(evtr, ev);
1326 	case EVTR_TYPE_SYSINFO:
1327 		return evtr_dump_sysinfo(evtr, ev);
1328 	case EVTR_TYPE_CPUINFO:
1329 		return evtr_dump_cpuinfo(evtr, ev);
1330 	}
1331 	evtr->errmsg = "unknown event type";
1332 	return !0;
1333 }
1334 
1335 static
1336 evtr_t
1337 evtr_alloc(FILE *f)
1338 {
1339 	evtr_t evtr;
1340 	if (!(evtr = malloc(sizeof(*evtr)))) {
1341 		return NULL;
1342 	}
1343 
1344 	evtr->f = f;
1345 	evtr->err = 0;
1346 	evtr->errmsg = NULL;
1347 	evtr->bytes = 0;
1348 	return evtr;
1349 }
1350 
1351 static int evtr_next_event(evtr_t, evtr_event_t);
1352 
1353 evtr_t
1354 evtr_open_read(FILE *f)
1355 {
1356 	evtr_t evtr;
1357 	struct evtr_event ev;
1358 	int i;
1359 
1360 	if (!(evtr = evtr_alloc(f))) {
1361 		return NULL;
1362 	}
1363 	evtr->flags = 0;
1364 	for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
1365 		RB_INIT(&evtr->maps[i].root);
1366 	}
1367 	RB_INIT(&evtr->fmtmap.root);
1368 	RB_INIT(&evtr->threads.root);
1369 	evtr->cpus = NULL;
1370 	evtr->ncpus = 0;
1371 	/*
1372 	 * Load the first event so we can pick up any
1373 	 * sysinfo entries.
1374 	 */
1375 	if (evtr_next_event(evtr, &ev)) {
1376 		goto free_evtr;
1377 	}
1378 	if (evtr_rewind(evtr))
1379 		goto free_evtr;
1380 	return evtr;
1381 free_evtr:
1382 	free(evtr);
1383 	return NULL;
1384 }
1385 
1386 evtr_t
1387 evtr_open_write(FILE *f)
1388 {
1389 	evtr_t evtr;
1390 	int i, j;
1391 
1392 	if (!(evtr = evtr_alloc(f))) {
1393 		return NULL;
1394 	}
1395 
1396 	evtr->flags = EVTRF_WR;
1397 	if (!(evtr->fmts = strhash_new()))
1398 		goto free_evtr;
1399 	for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1400 		evtr->strings[i] = strhash_new();
1401 		if (!evtr->strings[i]) {
1402 			for (j = 0; j < i; ++j) {
1403 				strhash_destroy(evtr->strings[j]);
1404 			}
1405 			goto free_fmts;
1406 		}
1407 	}
1408 
1409 	return evtr;
1410 free_fmts:
1411 	strhash_destroy(evtr->fmts);
1412 free_evtr:
1413 	free(evtr);
1414 	return NULL;
1415 }
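
/*
 * Usage sketch (not part of the original source, error handling omitted;
 * the file name is hypothetical):
 *
 *	FILE *fp = fopen("trace.evtr", "w");
 *	evtr_t evtr = evtr_open_write(fp);
 *	struct evtr_event ev;
 *
 *	... fill in ev (type, ts, cpu, fmt, fmtdata, ...) ...
 *	evtr_dump_event(evtr, &ev);
 *	evtr_close(evtr);
 *	fclose(fp);
 */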
1416 
1417 static
1418 void
1419 hashtab_destroy(struct hashtab *h)
1420 {
1421 	struct hashentry *ent, *next;
1422 	int i;
1423 	for (i = 0; i < NR_BUCKETS; ++i) {
1424 		for (ent = h->buckets[i]; ent; ent = next) {
1425 			next = ent->next;
1426 			free(ent);
1427 		}
1428 	}
1429 	free(h);
1430 }
1431 
1432 void
1433 evtr_close(evtr_t evtr)
1434 {
1435 	int i;
1436 
1437 	if (evtr->flags & EVTRF_WR) {
1438 		hashtab_destroy(&evtr->fmts->tab);
1439 		for (i = 0; i < EVTR_NS_MAX - 1; ++i)
1440 			hashtab_destroy(&evtr->strings[i]->tab);
1441 	} else {
1442 		id_tree_free(&evtr->fmtmap.root);
1443 		for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1444 			id_tree_free(&evtr->maps[i].root);
1445 		}
1446 	}
1447 	free(evtr);
1448 }
1449 
1450 static
1451 int
1452 evtr_read(evtr_t evtr, void *buf, size_t size)
1453 {
1454 	assert(size > 0);
1455 	assert_foff_in_sync(evtr);
1456 	printd(IO, "evtr_read at %#jx, %zu bytes\n", evtr->bytes, size);
1457 	if (fread(buf, size, 1, evtr->f) != 1) {
1458 		if (feof(evtr->f)) {
1459 			evtr->errmsg = "incomplete record";
1460 		} else {
1461 			evtr->errmsg = strerror(errno);
1462 		}
1463 		return !0;
1464 	}
1465 	evtr->bytes += size;
1466 	assert_foff_in_sync(evtr);
1467 	return 0;
1468 }
1469 
1470 static
1471 int
1472 evtr_load_fmt(evtr_query_t q, char *buf)
1473 {
1474 	evtr_t evtr = q->evtr;
1475 	struct fmt_event_header *evh = (struct fmt_event_header *)buf;
1476 	struct event_fmt *fmt;
1477 	char *subsys = NULL, *fmtstr;
1478 
1479 	if (!(fmt = malloc(sizeof(*fmt)))) {
1480 		evtr->err = errno;
1481 		return !0;
1482 	}
1483 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1484 		      sizeof(*evh) - sizeof(evh->eh))) {
1485 		goto free_fmt;
1486 	}
1487 	assert(!evh->subsys_len);
1488 	if (evh->subsys_len) {
1489 		if (!(subsys = malloc(evh->subsys_len))) {
1490 			evtr->err = errno;
1491 			goto free_fmt;
1492 		}
1493 		if (evtr_read(evtr, subsys, evh->subsys_len)) {
1494 			goto free_subsys;
1495 		}
1496 		fmt->subsys = subsys;
1497 	} else {
1498 		fmt->subsys = "";
1499 	}
1500 	if (!(fmtstr = malloc(evh->fmt_len + 1))) {
1501 		evtr->err = errno;
1502 		goto free_subsys;
1503 	}
1504 	if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
1505 		goto free_fmtstr;
1506 	}
1507 	fmtstr[evh->fmt_len] = '\0';
1508 	fmt->fmt = fmtstr;
1509 
1510 	printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
1511 	evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
1512 	switch (evtr->err) {
1513 	case ENOMEM:
1514 		evtr->errmsg = "out of memory";
1515 		break;
1516 	case EEXIST:
1517 		evtr->errmsg = "redefinition of an id to a "
1518 			"different format (corrupt input)";
1519 		break;
1520 	default:
1521 		;
1522 	}
1523 	return evtr->err;
1524 
1525 free_fmtstr:
1526 	free(fmtstr);
1527 free_subsys:
1528 	if (subsys)
1529 		free(subsys);
1530 free_fmt:
1531 	free(fmt);
1532 	return !0;
1533 }
1534 
1535 static
1536 int
1537 evtr_load_string(evtr_t evtr, char *buf)
1538 {
1539 	char sbuf[PATH_MAX + 1];
1540 	struct string_event_header *evh = (struct string_event_header *)buf;
1541 
1542 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1543 		      sizeof(*evh) - sizeof(evh->eh))) {
1544 		return !0;
1545 	}
1546 	if (evh->len > PATH_MAX) {
1547 		evtr->errmsg = "string too large (corrupt input)";
1548 		return !0;
1549 	}
1550 	if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1551 		return !0;
1552 	}
1553 	sbuf[evh->len] = 0;
1554 	if ((evh->ns == 0) || (evh->ns >= EVTR_NS_MAX)) {
1555 		evtr->errmsg = "invalid namespace (corrupt input)";
1556 		return !0;
1557 	}
1558 	validate_string(sbuf);
1559 	printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1560 	       sbuf);
1561 	evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1562 	switch (evtr->err) {
1563 	case ENOMEM:
1564 		evtr->errmsg = "out of memory";
1565 		break;
1566 	case EEXIST:
1567 		evtr->errmsg = "redefinition of an id to a "
1568 			"different string (corrupt input)";
1569 		break;
1570 	default:
1571 		;
1572 	}
1573 	return 0;
1574 }
1575 
1576 static
1577 int
1578 evtr_skip(evtr_t evtr, off_t bytes)
1579 {
1580 	if (fseek(evtr->f, bytes, SEEK_CUR)) {
1581 		evtr->err = errno;
1582 		evtr->errmsg = strerror(errno);
1583 		return !0;
1584 	}
1585 	evtr->bytes += bytes;
1586 	return 0;
1587 }
1588 
1589 /*
1590  * Make sure q->buf is at least len bytes
1591  */
1592 static
1593 int
1594 evtr_query_reserve_buf(struct evtr_query *q, int len)
1595 {
1596 	void *tmp;
1597 
1598 	if (q->bufsize >= len)
1599 		return 0;
1600 	if (!(tmp = realloc(q->buf, len)))
1601 		return !0;
1602 	q->buf = tmp;
1603 	q->bufsize = len;
1604 	return 0;
1605 }
1606 
1607 static
1608 int
1609 evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
1610 {
1611 	struct probe_event_header *evh = (struct probe_event_header *)buf;
1612 	struct cpu *cpu;
1613 
1614 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1615 		      sizeof(*evh) - sizeof(evh->eh)))
1616 		return !0;
1617 	memset(ev, '\0', sizeof(*ev));
1618 	ev->ts = evh->eh.ts;
1619 	ev->type = EVTR_TYPE_PROBE;
1620 	ev->line = evh->line;
1621 	ev->cpu = evh->cpu;
1622 	if ((cpu = evtr_cpu(evtr, evh->cpu))) {
1623 		ev->td = cpu->td;
1624 	} else {
1625 		ev->td = NULL;
1626 	}
1627 	if (evh->file) {
1628 		ev->file = string_map_find(
1629 			&evtr->maps[EVTR_NS_PATH - 1].root,
1630 			evh->file);
1631 		if (!ev->file) {
1632 			evtr->errmsg = "unknown id for file path";
1633 			evtr->err = !0;
1634 			ev->file = "<unknown>";
1635 		} else {
1636 			validate_string(ev->file);
1637 		}
1638 	} else {
1639 		ev->file = "<unknown>";
1640 	}
1641 	if (evh->fmt) {
1642 		const struct event_fmt *fmt;
1643 		if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
1644 			evtr->errmsg = "unknown id for event fmt";
1645 			evtr->err = !0;
1646 			ev->fmt = NULL;
1647 		} else {
1648 			ev->fmt = fmt->fmt;
1649 			validate_string(fmt->fmt);
1650 		}
1651 	}
1652 	if (evh->datalen) {
1653 		if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
1654 			evtr->err = ENOMEM;
1655 		} else if (!evtr_read(evtr, q->buf, evh->datalen)) {
1656 			struct replace_ctx replctx = {
1657 				.evtr = evtr,
1658 				.ts = ev->ts,
1659 			};
1660 			assert(ev->fmt);
1661 
1662 			ev->fmtdata = q->buf;
1663 			/*
1664 			 * If the format specifies any string pointers, there
1665 			 * is a string id stored in the fmtdata. Look it up
1666 			 * and replace it with a string pointer before
1667 			 * returning it to the user.
1668 			 */
1669 			if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
1670 								  ev->fmtdata),
1671 					       replace_strid, &replctx) < 0)
1672 				return evtr->err;
1673 			if (evtr->err)
1674 				return evtr->err;
1675 			((char *)ev->fmtdata)[evh->datalen] = '\0';
1676 			ev->fmtdatalen = evh->datalen;
1677 		}
1678 	}
1679 	evtr_run_callbacks(ev, q);
1680 	return evtr->err;
1681 }
1682 
1683 static
1684 int
1685 evtr_skip_to_record(evtr_t evtr)
1686 {
1687 	int skip;
1688 
1689 	skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
1690 	if (skip > 0) {
1691 		if (fseek(evtr->f, skip, SEEK_CUR)) {
1692 			evtr->err = errno;
1693 			evtr->errmsg = strerror(errno);
1694 			return !0;
1695 		}
1696 		evtr->bytes += skip;
1697 	}
1698 	return 0;
1699 }
1700 
1701 static
1702 int
1703 evtr_load_sysinfo(evtr_t evtr)
1704 {
1705 	uint16_t ncpus;
1706 	int i;
1707 
1708 	if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1709 		return !0;
1710 	}
1711 	if (evtr->cpus)
1712 		return 0;
1713 	evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1714 	if (!evtr->cpus) {
1715 		evtr->err = ENOMEM;
1716 		return !0;
1717 	}
1718 	evtr->ncpus = ncpus;
1719 	for (i = 0; i < ncpus; ++i) {
1720 		evtr->cpus[i].td = NULL;
1721 		evtr->cpus[i].freq = -1.0;
1722 	}
1723 	return 0;
1724 }
1725 
1726 static
1727 int
1728 evtr_load_cpuinfo(evtr_t evtr)
1729 {
1730 	struct cpuinfo_event_header cih;
1731 	struct cpu *cpu;
1732 
1733 	if (evtr_read(evtr, &cih, sizeof(cih))) {
1734 		return !0;
1735 	}
1736 	if (cih.freq < 0.0) {
1737 		evtr->errmsg = "cpu freq is negative";
1738 		evtr->err = EINVAL;
1739 		return !0;
1740 	}
1741 	/*
1742 	 * Notice that freq is merely a multiplier with
1743 	 * which we convert a timestamp to seconds; if
1744 	 * ts is not in cycles, freq is not the frequency.
1745 	 */
1746 	if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1747 		evtr->errmsg = "freq for invalid cpu";
1748 		evtr->err = EINVAL;
1749 		return !0;
1750 	}
1751 	cpu->freq = cih.freq;
1752 	return 0;
1753 }
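
/*
 * Illustrative example (not part of the original source): once the
 * cpuinfo records have been seen, the per-cpu freq can be used to scale
 * raw timestamps, e.g. assuming ts is in cycles and freq in cycles/sec:
 *
 *	double freqs[evtr_ncpus(evtr)];
 *
 *	evtr_cpufreqs(evtr, freqs);
 *	seconds = ev->ts / freqs[ev->cpu];
 */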
1754 
1755 static
1756 int
1757 _evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
1758 {
1759 	char buf[MAX_EVHDR_SIZE];
1760 	int ret, err;
1761 	struct trace_event_header *evhdr = (struct trace_event_header *)buf;
1762 
1763 	for (ret = 0; !ret;) {
1764 		if (q->flags & EVTRQF_PENDING) {
1765 			q->off = evtr->bytes;
1766 			memcpy(ev, &q->pending_event, sizeof(*ev));
1767 			q->flags &= ~EVTRQF_PENDING;
1768 			return 0;
1769 		}
1770 		if (evtr_read(evtr, &evhdr->type, 1)) {
1771 			if (feof(evtr->f)) {
1772 				evtr->errmsg = NULL;
1773 				evtr->err = 0;
1774 				return -1;
1775 			}
1776 			return !0;
1777 		}
1778 		/*
1779 		 * skip pad records -- this will only happen if there's a
1780 		 * variable sized record close to the boundary
1781 		 */
1782 		if (evhdr->type == EVTR_TYPE_PAD) {
1783 			evtr_skip_to_record(evtr);
1784 			continue;
1785 		}
1786 		if (evhdr->type == EVTR_TYPE_SYSINFO) {
1787 			evtr_load_sysinfo(evtr);
1788 			continue;
1789 		} else if (evhdr->type == EVTR_TYPE_CPUINFO) {
1790 			evtr_load_cpuinfo(evtr);
1791 			continue;
1792 		}
1793 		if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
1794 			return feof(evtr->f) ? -1 : !0;
1795 		switch (evhdr->type) {
1796 		case EVTR_TYPE_PROBE:
1797 			if ((err = evtr_load_probe(evtr, ev, buf, q))) {
1798 				if (err == -1) {
1799 					/* no match */
1800 					ret = 0;
1801 				} else {
1802 					return !0;
1803 				}
1804 			} else {
1805 				ret = !0;
1806 			}
1807 			break;
1808 		case EVTR_TYPE_STR:
1809 			if (evtr_load_string(evtr, buf)) {
1810 				return !0;
1811 			}
1812 			break;
1813 		case EVTR_TYPE_FMT:
1814 			if (evtr_load_fmt(q, buf)) {
1815 				return !0;
1816 			}
1817 			break;
1818 		default:
1819 			evtr->err = !0;
1820 			evtr->errmsg = "unknown event type (corrupt input?)";
1821 			return !0;
1822 		}
1823 		evtr_skip_to_record(evtr);
1824 		if (ret) {
1825 			if (!evtr_match_filters(q, ev)) {
1826 				ret = 0;
1827 				continue;
1828 			}
1829 			q->off = evtr->bytes;
1830 			return 0;
1831 		}
1832 	}
1833 	/* can't get here */
1834 	return !0;
1835 }
1836 
1837 static
1838 int
1839 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1840 {
1841 	struct evtr_query *q;
1842 	int ret;
1843 
1844 	if (!(q = evtr_query_init(evtr, NULL, 0))) {
1845 		evtr->err = ENOMEM;
1846 		return !0;
1847 	}
1848 	ret = _evtr_next_event(evtr, ev, q);
1849 	evtr_query_destroy(q);
1850 	return ret;
1851 }
1852 
1853 int
1854 evtr_last_event(evtr_t evtr, evtr_event_t ev)
1855 {
1856 	struct stat st;
1857 	int fd;
1858 	off_t last_boundary;
1859 
1860 	if (evtr_error(evtr))
1861 		return !0;
1862 
1863 	fd = fileno(evtr->f);
1864 	if (fstat(fd, &st))
1865 		return !0;
1866 	/*
1867 	 * This skips pseudo records, so we can't provide
1868 	 * an event with all fields filled in this way.
1869 	 * It's doable, just needs some care. TBD.
1870 	 */
1871 	if (0 && (st.st_mode & S_IFREG)) {
1872 		/*
1873 		 * Skip to last boundary, that's the closest to the EOF
1874 		 * location that we are sure contains a header so we can
1875 		 * pick up the stream.
1876 		 */
1877 		last_boundary = rounddown(st.st_size, REC_BOUNDARY);
1878 		/* XXX: ->bytes should be in query */
1879 		assert(evtr->bytes == 0);
1880 		evtr_skip(evtr, last_boundary);
1881 	}
1882 
1883 
1884 	/*
1885 	 * If we can't seek, we need to go through the whole file.
1886 	 * Since you can't seek back, this is pretty useless unless
1887 	 * you really are interested only in the last event.
1888 	 */
1889 	while (!evtr_next_event(evtr, ev))
1890 		;
1891 	if (evtr_error(evtr))
1892 		return !0;
1893 	evtr_rewind(evtr);
1894 	return 0;
1895 }
1896 
1897 struct evtr_query *
1898 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1899 {
1900 	struct evtr_query *q;
1901 	int i;
1902 
1903 	if (!(q = malloc(sizeof(*q)))) {
1904 		return q;
1905 	}
1906 	q->bufsize = 2;
1907 	if (!(q->buf = malloc(q->bufsize))) {
1908 		goto free_q;
1909 	}
1910 	if (!(q->symtab = symtab_new()))
1911 		goto free_buf;
1912 	q->evtr = evtr;
1913 	q->off = 0;
1914 	q->filt = filt;
1915 	q->nfilt = nfilt;
1916 	TAILQ_INIT(&q->unresolved_filtq);
1917 	q->nmatched = 0;
1918 	q->cbs = NULL;
1919 	q->ncbs = 0;
1920 	q->flags = 0;
1921 	memset(&q->pending_event, '\0', sizeof(q->pending_event));
1922 	if (evtr_register_callback(q, &thread_creation_callback, q)) {
1923 		goto free_symtab;
1924 	}
1925 	if (evtr_register_callback(q, &thread_switch_callback, q)) {
1926 		goto free_cbs;
1927 	}
1928 	if (evtr_query_needs_parsing(q) &&
1929 	    evtr_register_callback(q, &parse_callback, q)) {
1930 		goto free_cbs;
1931 	}
1932 
1933 	for (i = 0; i < nfilt; ++i) {
1934 		filt[i].flags = 0;
1935 		if (filt[i].fmt == NULL)
1936 			continue;
1937 		if (evtr_filter_register(q, &filt[i])) {
1938 			evtr_deregister_filters(q, filt, i);
1939 			goto free_symtab;
1940 		}
1941 	}
1942 
1943 	return q;
1944 free_cbs:
1945 	evtr_deregister_callbacks(q);
1946 free_symtab:
1947 	symtab_destroy(q->symtab);
1948 free_buf:
1949 	free(q->buf);
1950 free_q:
1951 	free(q);
1952 	return NULL;
1953 }
1954 
1955 void
1956 evtr_query_destroy(struct evtr_query *q)
1957 {
1958 	evtr_deregister_filters(q, q->filt, q->nfilt);
1959 
1960 	free(q->buf);
1961 	free(q);
1962 }
1963 
1964 int
1965 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1966 {
1967 	if (evtr_query_error(q))
1968 		return !0;
1969 	/* we may support that in the future */
1970 	if (q->off != q->evtr->bytes) {
1971 		q->errmsg = "evtr/query offset mismatch";
1972 		return !0;
1973 	}
1974 	return _evtr_next_event(q->evtr, ev, q);
1975 }
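
/*
 * Usage sketch (not part of the original source, error handling
 * abbreviated; the filter initializer is illustrative only):
 *
 *	struct evtr_event ev;
 *	struct evtr_filter filt = { .ev_type = EVTR_TYPE_PROBE, .cpu = -1 };
 *	evtr_query_t q = evtr_query_init(evtr, &filt, 1);
 *
 *	while (!evtr_query_next(q, &ev))
 *		... process ev ...
 *	if (evtr_query_error(q))
 *		errx(1, "%s", evtr_query_errmsg(q));
 *	evtr_query_destroy(q);
 */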
1976 
1977 int
1978 evtr_ncpus(evtr_t evtr)
1979 {
1980 	return evtr->ncpus;
1981 }
1982 
1983 int
1984 evtr_cpufreqs(evtr_t evtr, double *freqs)
1985 {
1986 	int i;
1987 
1988 	if (!freqs)
1989 		return EINVAL;
1990 	for (i = 0; i < evtr->ncpus; ++i) {
1991 		freqs[i] = evtr->cpus[i].freq;
1992 	}
1993 	return 0;
1994 }
1995