xref: /dragonfly/lib/libevtr/evtr.c (revision d8d5b238)
1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/tree.h>


#include "evtr.h"
#include "internal.h"
49 
/* Global bitmask of enabled debug facilities (consumed by printd()). */
unsigned evtr_debug;

/*
 * Parse a debug-flag string into *flags.  Each lowercase letter enables
 * one facility bit ('a' == bit 0); the single character 'A' enables all
 * facilities.  Exits with status 2 on a non-lowercase character.
 */
static
void
printd_set_flags(const char *str, unsigned int *flags)
{
	/*
	 * This is suboptimal as we don't detect
	 * invalid flags.
	 */
	for (; *str; ++str) {
		if ('A' == *str) {
			*flags = -1;	/* all bits on */
			return;
		}
		/* ctype functions require an unsigned char value */
		if (!islower((unsigned char)*str))
			errx(2, "invalid debug flag %c", *str);
		*flags |= 1 << (*str - 'a');
	}
}
70 
71 
enum {
	MAX_EVHDR_SIZE = PATH_MAX + 200,	/* upper bound on any event header */
	/* string namespaces */
	EVTR_NS_PATH = 0x1,	/* file paths */
	EVTR_NS_FUNC,		/* function names */
	EVTR_NS_DSTR,		/* data strings referenced by probe arguments */
	EVTR_NS_MAX,		/* one past the last valid namespace */
	NR_BUCKETS = 1021,	/* prime */
	PARSE_ERR_BUFSIZE = 256,	/* size of the per-query parse diagnostic buffer */
	REC_ALIGN = 8,		/* records are padded to this alignment */
	REC_BOUNDARY = 1 << 14,	/* a fresh record starts at least this often */
	FILTF_ID = 0x10,	/* filter is id-based (must be resolved first) */
	EVTRF_WR = 0x1,		/* open for writing */
	EVTRQF_PENDING = 0x1,	/* query holds a synthesized event to return next */
};
87 
/* 16-bit ids referring to strings interned per namespace */
typedef uint16_t fileid_t;
typedef uint16_t funcid_t;
typedef uint16_t fmtid_t;

/*
 * On-disk record layouts.  These are packed because they are written
 * to / read from the trace file verbatim.
 */
struct trace_event_header {
	uint8_t type;
	uint64_t ts;	/* XXX: this should only be part of probe */
} __attribute__((packed));

struct probe_event_header {
	struct trace_event_header eh;
	/*
	 * For these fields, 0 implies "not available"
	 */
	fileid_t file;
	funcid_t caller1;
	funcid_t caller2;
	funcid_t func;
	uint16_t line;
	fmtid_t fmt;
	uint16_t datalen;	/* bytes of packed format arguments that follow */
	uint8_t cpu;	/* -1 if n/a */
} __attribute__((packed));

/* defines a (namespace, id) -> string mapping; string data follows */
struct string_event_header {
	struct trace_event_header eh;
	uint16_t ns;
	uint32_t id;
	uint16_t len;	/* string length, NUL not included */
} __attribute__((packed));

/* registers a format string; subsys and fmt text follow back to back */
struct fmt_event_header {
	struct trace_event_header eh;
	uint16_t id;
	uint8_t subsys_len;
	uint8_t fmt_len;
} __attribute__((packed));

/* per-cpu metadata record */
struct cpuinfo_event_header {
	double freq;
	uint8_t cpu;
} __attribute__((packed));
130 
/* chain node for the fixed-size hash tables below */
struct hashentry {
	uintptr_t key;
	uintptr_t val;
	struct hashentry *next;
};

/* generic chained hash table; behavior comes from the two callbacks */
struct hashtab {
	struct hashentry *buckets[NR_BUCKETS];
	uintptr_t (*hashfunc)(uintptr_t);
	uintptr_t (*cmpfunc)(uintptr_t, uintptr_t);	/* zero means "equal" */
};

/* string -> evtr_variable map used by the statement parser */
struct symtab {
	struct hashtab tab;
};

struct event_fmt {
	const char *subsys;
	const char *fmt;
};

/* a filter whose format string still needs resolving to an fmt id */
struct event_filter_unresolved {
	TAILQ_ENTRY(event_filter_unresolved) link;
	evtr_filter_t filt;
};

/* red-black tree node mapping an integer id to opaque data */
struct id_map {
	RB_ENTRY(id_map) rb_node;
	int id;
	const void *data;
};

RB_HEAD(id_tree, id_map);
struct string_map {	/* id -> interned string */
	struct id_tree root;
};

struct fmt_map {	/* id -> struct event_fmt */
	struct id_tree root;
};

RB_HEAD(thread_tree, evtr_thread);

/* kernel thread address -> struct evtr_thread */
struct thread_map {
	struct thread_tree root;
};

struct event_callback {
	void (*cb)(evtr_event_t, void *data);
	void *data;	/* this field must be malloc()ed */
};

struct cpu {
	struct evtr_thread *td;	/* currently executing thread */
	double freq;
};
187 
/* handle for a trace file open for reading or writing */
struct evtr {
	FILE *f;
	int flags;	/* EVTRF_* */
	int err;	/* errno-style code, 0 if none */
	const char *errmsg;	/* static message, NULL if none */
	off_t bytes;	/* our own notion of the stream offset */
	union {
		/*
		 * When writing, we keep track of the strings we've
		 * already dumped so we only dump them once.
		 * Paths, function names etc belong to different
		 * namespaces.
		 */
		struct hashtab_str *strings[EVTR_NS_MAX - 1];
		/*
		 * When reading, we build a map from id to string.
		 * Every id must be defined at the point of use.
		 */
		struct string_map maps[EVTR_NS_MAX - 1];
	};
	union {
		/* same as above, but for subsys+fmt pairs */
		struct fmt_map fmtmap;
		struct hashtab_str *fmts;
	};
	struct thread_map threads;	/* every thread seen so far */
	struct cpu *cpus;	/* per-cpu state, ncpus entries */
	int ncpus;
};

/* an iteration over the events of a trace that match some filters */
struct evtr_query {
	evtr_t evtr;
	off_t off;
	evtr_filter_t filt;	/* caller-owned filter array */
	int nfilt;
	int nmatched;	/* events that passed the filters so far */
	int ntried;	/* events tested against the filters so far */
	void *buf;	/* scratch buffer for raw records */
	int bufsize;
	struct symtab *symtab;	/* variables for parsed statement events */
	int ncbs;
	struct event_callback **cbs;	/* internal per-event observers */
	/*
	 * Filters that have a format specified and we
	 * need to resolve that to an fmtid
	 */
	TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
	int err;
	const char *errmsg;
	char parse_err_buf[PARSE_ERR_BUFSIZE];	/* last parser diagnostic */
	int flags;	/* EVTRQF_* */
	struct evtr_event pending_event;	/* synthesized event, see EVTRQF_PENDING */
};
241 
/* Public entry: parse a debug-flag string (see printd_set_flags()). */
void
evtr_set_debug(const char *str)
{
	printd_set_flags(str, &evtr_debug);
}
247 
/* Red-black tree boilerplate; lookups are keyed on the trees' id fields. */
static int id_map_cmp(struct id_map *, struct id_map *);
RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);

static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
255 
256 static inline
257 void
258 validate_string(const char *str)
259 {
260 	if (!(evtr_debug & MISC))
261 		return;
262 	for (; *str; ++str)
263 		assert(isprint(*str));
264 }
265 
266 static
267 void
268 id_tree_free(struct id_tree *root)
269 {
270 	struct id_map *v, *n;
271 
272 	for (v = RB_MIN(id_tree, root); v; v = n) {
273 		n = RB_NEXT(id_tree, root, v);
274 		RB_REMOVE(id_tree, root, v);
275 	}
276 }
277 
278 static
279 int
280 evtr_register_callback(evtr_query_t q, void (*fn)(evtr_event_t, void *), void *d)
281 {
282 	struct event_callback *cb;
283 	void *cbs;
284 
285 	if (!(cb = malloc(sizeof(*cb)))) {
286 		q->err = ENOMEM;
287 		return !0;
288 	}
289 	cb->cb = fn;
290 	cb->data = d;
291 	if (!(cbs = realloc(q->cbs, (++q->ncbs) * sizeof(cb)))) {
292 		--q->ncbs;
293 		free(cb);
294 		q->err = ENOMEM;
295 		return !0;
296 	}
297 	q->cbs = cbs;
298 	q->cbs[q->ncbs - 1] = cb;
299 	return 0;
300 }
301 
302 static
303 void
304 evtr_deregister_callbacks(evtr_query_t q)
305 {
306 	int i;
307 
308 	for (i = 0; i < q->ncbs; ++i) {
309 		free(q->cbs[i]);
310 	}
311 	free(q->cbs);
312 	q->cbs = NULL;
313 }
314 
315 static
316 void
317 evtr_run_callbacks(evtr_event_t ev, evtr_query_t q)
318 {
319 	struct event_callback *cb;
320 	int i;
321 
322 	for (i = 0; i < q->ncbs; ++i) {
323 		cb = q->cbs[i];
324 		cb->cb(ev, cb->data);
325 	}
326 }
327 
328 static
329 struct cpu *
330 evtr_cpu(evtr_t evtr, int c)
331 {
332 	if ((c < 0) || (c >= evtr->ncpus))
333 		return NULL;
334 	return &evtr->cpus[c];
335 }
336 
337 static int parse_format_data(evtr_event_t ev, const char *fmt, ...)
338 	       __printflike(2, 3) __scanflike(2, 3);
339 
340 static
341 int
342 parse_format_data(evtr_event_t ev, const char *fmt, ...)
343 {
344 	va_list ap;
345 	char buf[2048];
346 
347 	if (strcmp(fmt, ev->fmt))
348 		return 0;
349 	vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
350 	printd(MISC, "string is: %s\n", buf);
351 	va_start(ap, fmt);
352 	return vsscanf(buf, fmt, ap);
353 }
354 
355 static
356 void
357 evtr_deregister_filters(evtr_query_t q, evtr_filter_t filt, int nfilt)
358 {
359 	struct event_filter_unresolved *u, *tmp;
360 	int i;
361 	TAILQ_FOREACH_MUTABLE(u, &q->unresolved_filtq, link, tmp) {
362 		for (i = 0; i < nfilt; ++i) {
363 			if (u->filt == &filt[i]) {
364 				TAILQ_REMOVE(&q->unresolved_filtq, u, link);
365 			}
366 		}
367 	}
368 }
369 
370 static
371 int
372 evtr_filter_register(evtr_query_t q, evtr_filter_t filt)
373 {
374 	struct event_filter_unresolved *res;
375 
376 	if (!(res = malloc(sizeof(*res)))) {
377 		q->err = ENOMEM;
378 		return !0;
379 	}
380 	res->filt = filt;
381 	TAILQ_INSERT_TAIL(&q->unresolved_filtq, res, link);
382 	return 0;
383 }
384 
385 static
386 int
387 evtr_query_needs_parsing(evtr_query_t q)
388 {
389 	int i;
390 
391 	for (i = 0; i < q->nfilt; ++i)
392 		if (q->filt[i].ev_type == EVTR_TYPE_STMT)
393 			return !0;
394 	return 0;
395 }
396 
/*
 * Render an event's message (format plus packed arguments) into buf.
 * Events without argument data get the format string copied verbatim.
 */
void
evtr_event_data(evtr_event_t ev, char *buf, size_t len)
{
	/*
	 * XXX: we implicitly trust the format string.
	 * We shouldn't.
	 */
	if (ev->fmtdatalen) {
		/* ev->fmtdata is a packed argument buffer passed off as a
		 * va_list; this relies on the platform's va_list ABI */
		vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
	} else {
		strlcpy(buf, ev->fmt, len);
	}
}
410 
411 int
412 evtr_error(evtr_t evtr)
413 {
414 	return evtr->err || (evtr->errmsg != NULL);
415 }
416 
417 const char *
418 evtr_errmsg(evtr_t evtr)
419 {
420 	return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
421 }
422 
423 int
424 evtr_query_error(evtr_query_t q)
425 {
426 	return q->err || (q->errmsg != NULL) || evtr_error(q->evtr);
427 }
428 
429 const char *
430 evtr_query_errmsg(evtr_query_t q)
431 {
432 	return q->errmsg ? q->errmsg :
433 		(q->err ? strerror(q->err) :
434 		 (evtr_errmsg(q->evtr)));
435 }
436 
437 static
438 int
439 id_map_cmp(struct id_map *a, struct id_map *b)
440 {
441 	return a->id - b->id;
442 }
443 
444 static
445 int
446 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
447 {
448 	ptrdiff_t d;
449 	d =  a->id - b->id;
450 	if (d < 0)
451 		return -1;
452 	if (!d)
453 		return 0;
454 	return 1;
455 }
456 
/*
 * Generate a lookup helper returning the data stored under an integer
 * id, or NULL if the id is unknown.
 */
#define DEFINE_MAP_FIND(prefix, type)		\
	static					\
	type				\
	prefix ## _map_find(struct id_tree *tree, int id)\
	{						 \
		struct id_map *sid;			 \
							\
		sid = id_tree_RB_LOOKUP(tree, id);	\
		return sid ? sid->data : NULL;		\
	}

DEFINE_MAP_FIND(string, const char *)
DEFINE_MAP_FIND(fmt, const struct event_fmt *)
470 
/* Look up a thread by kernel address; NULL if we haven't seen it. */
static
struct evtr_thread *
thread_map_find(struct thread_map *map, void *id)
{
	return thread_tree_RB_LOOKUP(&map->root, id);
}
477 
/*
 * Generate an id -> data inserter.  Redefining an existing id is only
 * accepted when the new data compares equal (via _cmp) to the old; on
 * first insertion the data is copied with _dup.  Returns 0, ENOMEM or
 * EEXIST.  NOTE(review): the _dup result is not checked for NULL (see
 * the XXX below), so an OOM during duplication stores a NULL mapping.
 */
#define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)	\
	static					\
	int								\
	prefix ## _map_insert(struct id_tree *tree, type data, int id) \
	{								\
	struct id_map *sid, *osid;					\
									\
	sid = malloc(sizeof(*sid));					\
	if (!sid) {							\
		return ENOMEM;						\
	}								\
	sid->id = id;							\
	sid->data = data;						\
	if ((osid = id_tree_RB_INSERT(tree, sid))) {			\
		free(sid);						\
		if (_cmp((type)osid->data, data)) {			\
			return EEXIST;					\
		}							\
		printd(DS, "mapping already exists, skipping\n");		\
		/* we're OK with redefinitions of an id to the same string */ \
		return 0;						\
	}								\
	/* only do the strdup if we're inserting a new string */	\
	sid->data = _dup(data);		/* XXX: oom */			\
	return 0;							\
}
504 
/*
 * Insert td into the map.  If its kernel address is already present
 * (addresses get reused), the existing node adopts td's comm and td
 * itself is consumed (freed).
 */
static
void
thread_map_insert(struct thread_map *map, struct evtr_thread *td)
{
	struct evtr_thread *otd;

	if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
		/*
		 * Thread addresses might be reused, we're
		 * ok with that.
		 * DANGER, Will Robinson: this means the user
		 * of the API needs to copy event->td if they
		 * want it to remain stable.
		 */
		free((void *)otd->comm);
		otd->comm = td->comm;
		free(td);
	}
}
524 
525 static
526 int
527 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
528 {
529 	int ret = 0;
530 
531 	if (a->subsys) {
532 		if (b->subsys) {
533 			ret = strcmp(a->subsys, b->subsys);
534 		} else {
535 			ret = strcmp(a->subsys, "");
536 		}
537 	} else if (b->subsys) {
538 			ret = strcmp("", b->subsys);
539 	}
540 	if (ret)
541 		return ret;
542 	return strcmp(a->fmt, b->fmt);
543 }
544 
545 static
546 struct event_fmt *
547 event_fmt_dup(const struct event_fmt *o)
548 {
549 	struct event_fmt *n;
550 
551 	if (!(n = malloc(sizeof(*n)))) {
552 		return n;
553 	}
554 	memcpy(n, o, sizeof(*n));
555 	return n;
556 }
557 
/* Concrete inserters: string ids map to strdup()ed strings, fmt ids to
 * (shallow-)duplicated struct event_fmt records. */
DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
560 
561 int
562 hash_find(const struct hashtab *tab, uintptr_t key, uintptr_t *val)
563 {
564 	struct hashentry *ent;
565 
566 	for(ent = tab->buckets[tab->hashfunc(key)];
567 	    ent && tab->cmpfunc(ent->key, key);
568 	    ent = ent->next);
569 
570 	if (!ent)
571 		return !0;
572 	*val = ent->val;
573 	return 0;
574 }
575 
576 struct hashentry *
577 hash_insert(struct hashtab *tab, uintptr_t key, uintptr_t val)
578 {
579 	struct hashentry *ent;
580 	int hsh;
581 
582 	if (!(ent = malloc(sizeof(*ent)))) {
583 		fprintf(stderr, "out of memory\n");
584 		return NULL;
585 	}
586 	hsh = tab->hashfunc(key);
587 	ent->next = tab->buckets[hsh];
588 	ent->key = key;
589 	ent->val = val;
590 	tab->buckets[hsh] = ent;
591 	return ent;
592 }
593 
/* Comparator for pointer keys: zero iff equal (callers only test zero). */
static
uintptr_t
cmpfunc_pointer(uintptr_t a, uintptr_t b)
{
	return b - a;
}
600 
601 static
602 uintptr_t
603 hashfunc_pointer(uintptr_t p)
604 {
605 	return p % NR_BUCKETS;
606 }
607 
608 struct hashtab *
609 hash_new(void)
610 {
611 	struct hashtab *tab;
612 	if (!(tab = calloc(sizeof(struct hashtab), 1)))
613 		return tab;
614 	tab->hashfunc = &hashfunc_pointer;
615 	tab->cmpfunc = &cmpfunc_pointer;
616 	return tab;
617 }
618 
struct hashtab_str {	/* string -> id map */
	struct hashtab tab;
	uint16_t id;	/* last id handed out; ids start at 1 */
};
623 
624 static
625 uintptr_t
626 hashfunc_string(uintptr_t p)
627 {
628 	const char *str = (char *)p;
629         unsigned long hash = 5381;
630         int c;
631 
632         while ((c = *str++))
633             hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
634 	return hash  % NR_BUCKETS;
635 }
636 
/* Comparator for string keys: zero iff the two C strings are equal. */
static
uintptr_t
cmpfunc_string(uintptr_t a, uintptr_t b)
{
	return (uintptr_t)strcmp((const char *)a, (const char *)b);
}
643 
644 
645 static
646 struct hashtab_str *
647 strhash_new(void)
648 {
649 	struct hashtab_str *strtab;
650 	if (!(strtab = calloc(sizeof(struct hashtab_str), 1)))
651 		return strtab;
652 	strtab->tab.hashfunc = &hashfunc_string;
653 	strtab->tab.cmpfunc = &cmpfunc_string;
654 	return strtab;
655 }
656 
657 static
658 void
659 strhash_destroy(struct hashtab_str *strtab)
660 {
661 	free(strtab);
662 }
663 
664 static
665 int
666 strhash_find(struct hashtab_str *strtab, const char *str, uint16_t *id)
667 {
668 	uintptr_t val;
669 
670 	if (hash_find(&strtab->tab, (uintptr_t)str, &val))
671 		return !0;
672 	*id = (uint16_t)val;
673 	return 0;
674 }
675 
676 static
677 int
678 strhash_insert(struct hashtab_str *strtab, const char *str, uint16_t *id)
679 {
680 	uintptr_t val;
681 
682 	val = ++strtab->id;
683 	if (strtab->id == 0) {
684 		fprintf(stderr, "too many strings\n");
685 		return ERANGE;
686 	}
687 	str = strdup(str);
688 	if (!str) {
689 		fprintf(stderr, "out of memory\n");
690 		--strtab->id;
691 		return ENOMEM;
692 	}
693 	hash_insert(&strtab->tab, (uintptr_t)str, (uintptr_t)val);
694 	*id = strtab->id;
695 	return 0;
696 }
697 
698 struct symtab *
699 symtab_new(void)
700 {
701 	struct symtab *symtab;
702 	if (!(symtab = calloc(sizeof(struct symtab), 1)))
703 		return symtab;
704 	symtab->tab.hashfunc = &hashfunc_string;
705 	symtab->tab.cmpfunc = &cmpfunc_string;
706 	return symtab;
707 }
708 
709 void
710 symtab_destroy(struct symtab *symtab)
711 {
712 	free(symtab);
713 }
714 
715 struct evtr_variable *
716 symtab_find(const struct symtab *symtab, const char *str)
717 {
718 	uintptr_t val;
719 
720 	if (hash_find(&symtab->tab, (uintptr_t)str, &val))
721 		return NULL;
722 	return (struct evtr_variable *)val;
723 }
724 
725 int
726 symtab_insert(struct symtab *symtab, const char *name,
727 	       struct evtr_variable *var)
728 {
729 	name = strdup(name);
730 	if (!name) {
731 		fprintf(stderr, "out of memory\n");
732 		return ENOMEM;
733 	}
734 	hash_insert(&symtab->tab, (uintptr_t)name, (uintptr_t)var);
735 	return 0;
736 }
737 
/*
 * Return non-zero iff ev passes filter f.  Wildcards: a cpu of -1
 * matches any cpu, a NULL fmt matches any probe format.
 */
static
int
evtr_filter_match(evtr_query_t q, evtr_filter_t f, evtr_event_t ev)
{
	if ((f->cpu != -1) && (f->cpu != ev->cpu))
		return 0;

	/* id-based filters must have been resolved before matching */
	assert(!(f->flags & FILTF_ID));
	if (ev->type != f->ev_type)
		return 0;
	if (ev->type == EVTR_TYPE_PROBE) {
		if (f->fmt && strcmp(ev->fmt, f->fmt))
			return 0;
	} else if (ev->type == EVTR_TYPE_STMT) {
		struct evtr_variable *var;
		/* resolve var */
		/* XXX: no need to do that *every* time */
		parse_var(f->var, q->symtab, &var, &q->parse_err_buf[0],
			  PARSE_ERR_BUFSIZE);
		/*
		 * Ignore errors, they're expected since the
		 * variable might not be instantiated yet
		 */
		if (var != ev->stmt.var)
			return 0;
	}
	return !0;
}
766 
767 static
768 int
769 evtr_match_filters(struct evtr_query *q, evtr_event_t ev)
770 {
771 	int i;
772 
773 	/* no filters means we're interested in all events */
774 	if (!q->nfilt)
775 		return !0;
776 	++q->ntried;
777 	for (i = 0; i < q->nfilt; ++i) {
778 		if (evtr_filter_match(q, &q->filt[i], ev)) {
779 			++q->nmatched;
780 			return !0;
781 		}
782 	}
783 	return 0;
784 }
785 
/*
 * Internal callback: probe events whose format string starts with '#'
 * carry statements; parse them into a synthesized EVTR_TYPE_STMT event
 * that is delivered on the next iteration of the query.
 */
static
void
parse_callback(evtr_event_t ev, void *d)
{
	evtr_query_t q = (evtr_query_t)d;
	if (ev->type != EVTR_TYPE_PROBE)
		return;
	if (!ev->fmt || (ev->fmt[0] != '#'))
		return;
	/*
	 * Copy the event to ->pending_event, then call
	 * the parser to convert it into a synthesized
	 * EVTR_TYPE_STMT event.
	 */
	memcpy(&q->pending_event, ev, sizeof(*ev));
	parse_string(&q->pending_event, q->symtab, &ev->fmt[1],
		     &q->parse_err_buf[0], PARSE_ERR_BUFSIZE);
	if (q->parse_err_buf[0]) {	/* parse error */
		q->errmsg = &q->parse_err_buf[0];
		return;
	}
	if (!evtr_match_filters(q, &q->pending_event))
		return;
	/*
	 * This will cause us to return ->pending_event next time
	 * we're called.
	 */
	q->flags |= EVTRQF_PENDING;
}
815 
/*
 * Internal callback: when a "new_td %p %s" probe is seen, record the
 * new thread (kernel address + comm) in the thread map.
 */
static
void
thread_creation_callback(evtr_event_t ev, void *d)
{
	evtr_query_t q = (evtr_query_t)d;
	evtr_t evtr = q->evtr;
	struct evtr_thread *td;
	void *ktd;
	char buf[20];

	/*
	 * NOTE(review): the unbounded %s conversion can write past
	 * buf[20] if the event's comm string exceeds 19 characters —
	 * TODO confirm what the producer guarantees.  A field width
	 * can't simply be added here: the format must match ev->fmt
	 * byte for byte or parse_format_data() rejects the event.
	 */
	if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
		return;
	}
	buf[19] = '\0';

	if (!(td = malloc(sizeof(*td)))) {
		q->err = ENOMEM;
		return;
	}
	td->id = ktd;
	td->userdata = NULL;
	if (!(td->comm = strdup(buf))) {
		free(td);
		q->err = ENOMEM;
		return;
	}
	printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
	/* thread_map_insert consumes td on duplicate addresses */
	thread_map_insert(&evtr->threads, td);
}
845 
/*
 * Internal callback: track which thread runs on each cpu by watching
 * "sw  %p > %p" (context switch) probes.  Threads we haven't seen a
 * creation event for get a synthesized one so the map stays populated.
 */
static
void
thread_switch_callback(evtr_event_t ev, void *d)
{
	evtr_t evtr = ((evtr_query_t)d)->evtr;
	struct evtr_thread *tdp, *tdn;
	void *ktdp, *ktdn;
	struct cpu *cpu;
	static struct evtr_event tdcr;	/* scratch for the fake creation event */
	static char *fmt = "new_td %p %s";
	char tidstr[40];
	char fmtdata[sizeof(void *) + sizeof(char *)];	/* packed %p %s args */

	cpu = evtr_cpu(evtr, ev->cpu);
	if (!cpu) {
		printw("invalid cpu %d\n", ev->cpu);
		return;
	}
	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
		return;
	}
	tdp = thread_map_find(&evtr->threads, ktdp);
	if (!tdp) {
		printd(DS, "switching from unknown thread %p\n", ktdp);
	}
	tdn = thread_map_find(&evtr->threads, ktdn);
	if (!tdn) {
		/*
		 * Fake a thread creation event for threads we
		 * haven't seen before.
		 */
		tdcr.type = EVTR_TYPE_PROBE;
		tdcr.ts = ev->ts;
		tdcr.file = NULL;
		tdcr.func = NULL;
		tdcr.line = 0;
		tdcr.fmt = fmt;
		tdcr.fmtdata = &fmtdata;
		tdcr.fmtdatalen = sizeof(fmtdata);
		tdcr.cpu = ev->cpu;
		tdcr.td = NULL;
		/* the fake thread's comm is just its address in hex */
		snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
		((void **)fmtdata)[0] = ktdn;
		((char **)fmtdata)[1] = &tidstr[0];
		thread_creation_callback(&tdcr, d);

		tdn = thread_map_find(&evtr->threads, ktdn);
		assert(tdn != NULL);
		printd(DS, "switching to unknown thread %p\n", ktdn);
		cpu->td = tdn;
		return;
	}
	printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
	cpu->td = tdn;
}
901 
902 static
903 void
904 assert_foff_in_sync(evtr_t evtr)
905 {
906 	off_t off;
907 
908 	/*
909 	 * We keep our own offset because we
910 	 * might want to support mmap()
911 	 */
912 	off = ftello(evtr->f);
913 	if (evtr->bytes != off) {
914 		fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
915 		abort();
916 	}
917 }
918 
919 static
920 int
921 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
922 {
923 	assert_foff_in_sync(evtr);
924 	if (fwrite(buf, bytes, 1, evtr->f) != 1) {
925 		evtr->err = errno;
926 		evtr->errmsg = strerror(errno);
927 		return !0;
928 	}
929 	evtr->bytes += bytes;
930 	assert_foff_in_sync(evtr);
931 	return 0;
932 }
933 
/*
 * Called after dumping a record to make sure the next
 * record is REC_ALIGN aligned. This does not make much sense,
 * as we shouldn't be using packed structs anyway.
 */
static
int
evtr_dump_pad(evtr_t evtr)
{
	size_t pad;
	static char buf[REC_ALIGN];

	/*
	 * NOTE(review): when evtr->bytes is already aligned this
	 * computes pad == REC_ALIGN and emits a full REC_ALIGN bytes
	 * of zero padding (the `pad > 0` test is always true).  The
	 * reader must mirror this calculation, so don't "fix" the
	 * formula unilaterally — it would change the file format.
	 */
	pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
	if (pad > 0) {
		return evtr_write(evtr, buf, pad);
	}
	return 0;
}
952 
953 /*
954  * We make sure that there is a new record every REC_BOUNDARY
955  * bytes, this costs next to nothing in space and allows for
956  * fast seeking.
957  */
958 static
959 int
960 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
961 {
962 	unsigned pad, i;
963 	static char buf[256];
964 
965 	pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
966 	/* if adding @bytes would cause us to cross a boundary... */
967 	if (bytes > pad) {
968 		/* then pad to the boundary */
969 		for (i = 0; i < (pad / sizeof(buf)); ++i) {
970 			if (evtr_write(evtr, buf, sizeof(buf))) {
971 				return !0;
972 			}
973 		}
974 		i = pad % sizeof(buf);
975 		if (i) {
976 			if (evtr_write(evtr, buf, i)) {
977 				return !0;
978 			}
979 		}
980 	}
981 	return 0;
982 }
983 
/*
 * Intern the event's (subsys, fmt) pair, emitting a fmt record the
 * first time it is seen.  Returns the fmt id (ids start at 1), or 0
 * on error with evtr->err/errmsg set.  The subsystem is currently
 * always the empty string.
 */
static
int
evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
{
	struct fmt_event_header fmt;
	uint16_t id;
	int err;
	char *subsys = "", buf[1024];

	/* the hash key is the concatenation subsys + fmt */
	if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
		evtr->errmsg = "name of subsystem is too large";
		evtr->err = ERANGE;
		return 0;
	}
	if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
		evtr->errmsg = "fmt + name of subsystem is too large";
		evtr->err = ERANGE;
		return 0;
	}

	/* already interned? then no record needs to be written */
	if (!strhash_find(evtr->fmts, buf, &id)) {
		return id;
	}
	if ((err = strhash_insert(evtr->fmts, buf, &id))) {
		evtr->err = err;
		return 0;
	}

	fmt.eh.type = EVTR_TYPE_FMT;
	fmt.eh.ts = ts;
	fmt.subsys_len = strlen(subsys);
	fmt.fmt_len = strlen(ev->fmt);
	fmt.id = id;
	if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
				     fmt.fmt_len))
		return 0;
	if (evtr_write(evtr, &fmt, sizeof(fmt)))
		return 0;
	if (evtr_write(evtr, subsys, fmt.subsys_len))
		return 0;
	if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
		return 0;
	if (evtr_dump_pad(evtr))
		return 0;
	return fmt.id;
}
1030 
/*
 * Replace string pointers or string ids in fmtdata
 *
 * Walk the printf conversion specifications in fmt, advancing through
 * the packed argument bytes in fmtdata in lockstep.  Every %s argument
 * is substituted in place via replace(); all other conversions are
 * only skipped over.  Returns the number of replaced strings, or -1
 * on an unknown conversion specifier.
 */
static
int
mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
		   const char *(*replace)(void *, const char *), void *ctx)
{
	const char *f, *p;
	size_t skipsize, intsz;
	int ret = 0;

	for (f = fmt; f[0] != '\0'; ++f) {
		if (f[0] != '%')
			continue;
		++f;
		skipsize = 0;
		for (p = f; p[0]; ++p) {
			int again = 0;
			/*
			 * Eat flags. Notice this will accept duplicate
			 * flags.
			 */
			switch (p[0]) {
			case '#':
			case '0':
			case '-':
			case ' ':
			case '+':
			case '\'':
				again = !0;
				break;
			}
			if (!again)
				break;
		}
		/* Eat minimum field width, if any */
		for (; isdigit((unsigned char)p[0]); ++p)
			;
		if (p[0] == '.')
			++p;
		/* Eat precision, if any */
		for (; isdigit((unsigned char)p[0]); ++p)
			;
		intsz = 0;
		/* a length modifier fixes the size of integer arguments */
		switch (p[0]) {
		case 'l':
			if (p[1] == 'l') {
				++p;
				intsz = sizeof(long long);
			} else {
				intsz = sizeof(long);
			}
			break;
		case 'j':
			intsz = sizeof(intmax_t);
			break;
		case 't':
			intsz = sizeof(ptrdiff_t);
			break;
		case 'z':
			intsz = sizeof(size_t);
			break;
		default:
			break;
		}
		if (intsz != 0)
			++p;
		else
			intsz = sizeof(int);

		switch (p[0]) {
		case 'd':
		case 'i':
		case 'o':
		case 'u':
		case 'x':
		case 'X':
		case 'c':
			skipsize = intsz;
			break;
		case 'p':
			skipsize = sizeof(void *);
			break;
		case 'f':
			if (p[-1] == 'l')
				skipsize = sizeof(double);
			else
				skipsize = sizeof(float);
			break;
		case 's':
			/* patch the string pointer/id in place */
			((const char **)fmtdata)[0] =
				replace(ctx, ((char **)fmtdata)[0]);
			skipsize = sizeof(char *);
			++ret;
			break;
		default:
			fprintf(stderr, "Unknown conversion specifier %c "
				"in fmt starting with %s", p[0], f - 1);
			return -1;
		}
		fmtdata += skipsize;
	}
	return ret;
}
1136 
/* XXX: do we really want the timestamp? */
/*
 * Intern str in namespace ns, emitting a string record the first time
 * it is seen.  Returns the string id (ids start at 1), or 0 on error
 * with evtr->err set.
 *
 * NOTE(review): strings[] has EVTR_NS_MAX - 1 entries, yet the assert
 * admits ns up to EVTR_NS_MAX - 1, so the highest namespace appears to
 * index one past the end (the read path uses maps[ns - 1] instead) —
 * verify against evtr_open_write's initialization.
 */
static
int
evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
{
	struct string_event_header s;
	int err;
	uint16_t id;

	assert((0 <= ns) && (ns < EVTR_NS_MAX));
	/* already interned? then no record needs to be written */
	if (!strhash_find(evtr->strings[ns], str, &id)) {
		return id;
	}
	if ((err = strhash_insert(evtr->strings[ns], str, &id))) {
		evtr->err = err;
		return 0;
	}

	printd(DS, "hash_insert %s ns %d id %d\n", str, ns, id);
	s.eh.type = EVTR_TYPE_STR;
	s.eh.ts = ts;
	s.ns = ns;
	s.id = id;
	s.len = strnlen(str, PATH_MAX);

	if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
		return 0;
	if (evtr_write(evtr, &s, sizeof(s)))
		return 0;
	if (evtr_write(evtr, str, s.len))
		return 0;
	if (evtr_dump_pad(evtr))
		return 0;
	return s.id;
}
1172 
/* context for the replace_* callbacks below */
struct replace_ctx {
	evtr_t evtr;
	uint64_t ts;
};

/*
 * Writer-side %s mangling: intern the string in the data-string
 * namespace (dumping it if new) and smuggle its id back through the
 * pointer return value.
 */
static
const char *
replace_strptr(void *_ctx, const char *s)
{
	struct replace_ctx *ctx = _ctx;
	return (const char *)(uintptr_t)evtr_dump_string(ctx->evtr, ctx->ts, s,
							 EVTR_NS_DSTR);
}
1186 
/*
 * Reader-side %s mangling: the stored "pointer" is really a string
 * id; look it up and substitute the interned string.
 */
static
const char *
replace_strid(void *_ctx, const char *s)
{
	struct replace_ctx *ctx = _ctx;
	const char *ret;

	ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
			      (int)(uintptr_t)s);
	if (!ret) {
		fprintf(stderr, "Unknown id for data string\n");
		ctx->evtr->errmsg = "unknown id for data string";
		ctx->evtr->err = !0;
	}
	/* NOTE(review): ret may be NULL here; validate_string dereferences
	 * its argument when MISC debugging is enabled — verify */
	validate_string(ret);
	printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n",
	       (int)(uintptr_t)s, EVTR_NS_DSTR, ret ? ret : "NULL", (int)(uintptr_t)ret);
	return ret;
}
1206 
/*
 * Emit a probe record: a fixed header (with file/func/fmt strings
 * replaced by interned ids) followed by the packed format-argument
 * bytes, in which %s pointers have likewise been replaced by string
 * ids.  Returns 0 on success, non-zero on error.
 */
static
int
evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
{
	struct probe_event_header kev;
	char buf[1024];

	memset(&kev, '\0', sizeof(kev));
	kev.eh.type = ev->type;
	kev.eh.ts = ev->ts;
	kev.line = ev->line;
	kev.cpu = ev->cpu;
	if (ev->file) {
		kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
					    EVTR_NS_PATH);
	}
	if (ev->func) {
		kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
					    EVTR_NS_FUNC);
	}
	if (ev->fmt) {
		kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
	}
	if (ev->fmtdata) {
		struct replace_ctx replctx = {
			.evtr = evtr,
			.ts = ev->ts,
		};
		assert(ev->fmtdatalen <= (int)sizeof(buf));
		kev.datalen = ev->fmtdatalen;
		/*
		 * Replace all string pointers with string ids before dumping
		 * the data.
		 */
		memcpy(buf, ev->fmtdata, ev->fmtdatalen);
		if (mangle_string_ptrs(ev->fmt, buf,
				       replace_strptr, &replctx) < 0)
			return !0;
		if (evtr->err)
			return evtr->err;
	}
	if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
		return !0;
	if (evtr_write(evtr, &kev, sizeof(kev)))
		return !0;
	/* when fmtdata was absent, fmtdatalen is expected to be 0 here */
	if (evtr_write(evtr, buf, ev->fmtdatalen))
		return !0;
	if (evtr_dump_pad(evtr))
		return !0;
	return 0;
}
1258 
1259 static
1260 int
1261 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1262 {
1263 	uint8_t type = EVTR_TYPE_SYSINFO;
1264 	uint16_t ncpus = ev->ncpus;
1265 
1266 	if (ncpus <= 0) {
1267 		evtr->errmsg = "invalid number of cpus";
1268 		return !0;
1269 	}
1270 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1271 		return !0;
1272 	if (evtr_write(evtr, &type, sizeof(type))) {
1273 		return !0;
1274 	}
1275 	if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1276 		return !0;
1277 	}
1278 	if (evtr_dump_pad(evtr))
1279 		return !0;
1280 	return 0;
1281 }
1282 static
1283 int
1284 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1285 {
1286 	struct cpuinfo_event_header ci;
1287 	uint8_t type;
1288 
1289 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1290 		return !0;
1291 	type = EVTR_TYPE_CPUINFO;
1292 	if (evtr_write(evtr, &type, sizeof(type))) {
1293 		return !0;
1294 	}
1295 	ci.cpu = ev->cpu;
1296 	ci.freq = ev->cpuinfo.freq;
1297 	if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1298 		return !0;
1299 	if (evtr_write(evtr, &ci, sizeof(ci))) {
1300 		return !0;
1301 	}
1302 	if (evtr_dump_pad(evtr))
1303 		return !0;
1304 	return 0;
1305 }
1306 
1307 int
1308 evtr_rewind(evtr_t evtr)
1309 {
1310 	assert((evtr->flags & EVTRF_WR) == 0);
1311 	evtr->bytes = 0;
1312 	if (fseek(evtr->f, 0, SEEK_SET)) {
1313 		evtr->err = errno;
1314 		return !0;
1315 	}
1316 	return 0;
1317 }
1318 
1319 int
1320 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1321 {
1322 	switch (ev->type) {
1323 	case EVTR_TYPE_PROBE:
1324 		return evtr_dump_probe(evtr, ev);
1325 	case EVTR_TYPE_SYSINFO:
1326 		return evtr_dump_sysinfo(evtr, ev);
1327 	case EVTR_TYPE_CPUINFO:
1328 		return evtr_dump_cpuinfo(evtr, ev);
1329 	}
1330 	evtr->errmsg = "unknown event type";
1331 	return !0;
1332 }
1333 
1334 static
1335 evtr_t
1336 evtr_alloc(FILE *f)
1337 {
1338 	evtr_t evtr;
1339 	if (!(evtr = malloc(sizeof(*evtr)))) {
1340 		return NULL;
1341 	}
1342 
1343 	evtr->f = f;
1344 	evtr->err = 0;
1345 	evtr->errmsg = NULL;
1346 	evtr->bytes = 0;
1347 	return evtr;
1348 }
1349 
1350 static int evtr_next_event(evtr_t, evtr_event_t);
1351 
1352 evtr_t
1353 evtr_open_read(FILE *f)
1354 {
1355 	evtr_t evtr;
1356 	struct evtr_event ev;
1357 	int i;
1358 
1359 	if (!(evtr = evtr_alloc(f))) {
1360 		return NULL;
1361 	}
1362 	evtr->flags = 0;
1363 	for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
1364 		RB_INIT(&evtr->maps[i].root);
1365 	}
1366 	RB_INIT(&evtr->fmtmap.root);
1367 	RB_INIT(&evtr->threads.root);
1368 	evtr->cpus = NULL;
1369 	evtr->ncpus = 0;
1370 	/*
1371 	 * Load the first event so we can pick up any
1372 	 * sysinfo entries.
1373 	 */
1374 	if (evtr_next_event(evtr, &ev)) {
1375 		goto free_evtr;
1376 	}
1377 	if (evtr_rewind(evtr))
1378 		goto free_evtr;
1379 	return evtr;
1380 free_evtr:
1381 	free(evtr);
1382 	return NULL;
1383 }
1384 
1385 evtr_t
1386 evtr_open_write(FILE *f)
1387 {
1388 	evtr_t evtr;
1389 	int i, j;
1390 
1391 	if (!(evtr = evtr_alloc(f))) {
1392 		return NULL;
1393 	}
1394 
1395 	evtr->flags = EVTRF_WR;
1396 	if (!(evtr->fmts = strhash_new()))
1397 		goto free_evtr;
1398 	for (i = 0; i < EVTR_NS_MAX; ++i) {
1399 		evtr->strings[i] = strhash_new();
1400 		if (!evtr->strings[i]) {
1401 			for (j = 0; j < i; ++j) {
1402 				strhash_destroy(evtr->strings[j]);
1403 			}
1404 			goto free_fmts;
1405 		}
1406 	}
1407 
1408 	return evtr;
1409 free_fmts:
1410 	strhash_destroy(evtr->fmts);
1411 free_evtr:
1412 	free(evtr);
1413 	return NULL;
1414 }
1415 
1416 static
1417 void
1418 hashtab_destroy(struct hashtab *h)
1419 {
1420 	struct hashentry *ent, *next;
1421 	int i;
1422 	for (i = 0; i < NR_BUCKETS; ++i) {
1423 		for (ent = h->buckets[i]; ent; ent = next) {
1424 			next = ent->next;
1425 			free(ent);
1426 		}
1427 	}
1428 	free(h);
1429 }
1430 
1431 void
1432 evtr_close(evtr_t evtr)
1433 {
1434 	int i;
1435 
1436 	if (evtr->flags & EVTRF_WR) {
1437 		hashtab_destroy(&evtr->fmts->tab);
1438 		for (i = 0; i < EVTR_NS_MAX - 1; ++i)
1439 			hashtab_destroy(&evtr->strings[i]->tab);
1440 	} else {
1441 		id_tree_free(&evtr->fmtmap.root);
1442 		for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1443 			id_tree_free(&evtr->maps[i].root);
1444 		}
1445 	}
1446 	free(evtr);
1447 }
1448 
1449 static
1450 int
1451 evtr_read(evtr_t evtr, void *buf, size_t size)
1452 {
1453 	assert(size > 0);
1454 	assert_foff_in_sync(evtr);
1455 	printd(IO, "evtr_read at %#jx, %zd bytes\n", evtr->bytes, size);
1456 	if (fread(buf, size, 1, evtr->f) != 1) {
1457 		if (feof(evtr->f)) {
1458 			evtr->errmsg = "incomplete record";
1459 		} else {
1460 			evtr->errmsg = strerror(errno);
1461 		}
1462 		return !0;
1463 	}
1464 	evtr->bytes += size;
1465 	assert_foff_in_sync(evtr);
1466 	return 0;
1467 }
1468 
/*
 * Parse an EVTR_TYPE_FMT record.  `buf' already holds the generic
 * trace_event_header; the rest of the fmt header and the (subsys, fmt)
 * strings are read here and the resulting event_fmt is registered in
 * the id -> format map.
 *
 * Returns 0 on success, non-zero on error; map insertion failures set
 * evtr->err/errmsg.
 */
static
int
evtr_load_fmt(evtr_query_t q, char *buf)
{
	evtr_t evtr = q->evtr;
	struct fmt_event_header *evh = (struct fmt_event_header *)buf;
	struct event_fmt *fmt;
	char *subsys = NULL, *fmtstr;

	if (!(fmt = malloc(sizeof(*fmt)))) {
		evtr->err = errno;
		return !0;
	}
	/* Read the fmt-specific header fields that follow the generic one. */
	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
		      sizeof(*evh) - sizeof(evh->eh))) {
		goto free_fmt;
	}
	/*
	 * NOTE(review): this assert makes the subsys branch below dead in
	 * debug builds -- subsystem strings are apparently never emitted
	 * yet; the branch is kept for when they are.
	 */
	assert(!evh->subsys_len);
	if (evh->subsys_len) {
		/*
		 * NOTE(review): unlike fmtstr below, no +1 is reserved for
		 * a terminating NUL -- presumably subsys_len includes it;
		 * verify against the writer before enabling this path.
		 */
		if (!(subsys = malloc(evh->subsys_len))) {
			evtr->err = errno;
			goto free_fmt;
		}
		if (evtr_read(evtr, subsys, evh->subsys_len)) {
			goto free_subsys;
		}
		fmt->subsys = subsys;
	} else {
		fmt->subsys = "";
	}
	/* +1 for the NUL terminator appended below. */
	if (!(fmtstr = malloc(evh->fmt_len + 1))) {
		evtr->err = errno;
		goto free_subsys;
	}
	if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
		goto free_fmtstr;
	}
	fmtstr[evh->fmt_len] = '\0';
	fmt->fmt = fmtstr;

	printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
	evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
	switch (evtr->err) {
	case ENOMEM:
		evtr->errmsg = "out of memory";
		break;
	case EEXIST:
		evtr->errmsg = "redefinition of an id to a "
			"different format (corrupt input)";
		break;
	default:
		;
	}
	/*
	 * NOTE(review): on insertion failure fmt/fmtstr/subsys are not
	 * freed here -- confirm whether fmt_map_insert takes ownership
	 * even on error; otherwise this path leaks.
	 */
	return evtr->err;

free_fmtstr:
	free(fmtstr);
free_subsys:
	if (subsys)
		free(subsys);
free_fmt:
	free(fmt);
	return !0;
}
1533 
1534 static
1535 int
1536 evtr_load_string(evtr_t evtr, char *buf)
1537 {
1538 	char sbuf[PATH_MAX + 1];
1539 	struct string_event_header *evh = (struct string_event_header *)buf;
1540 
1541 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1542 		      sizeof(*evh) - sizeof(evh->eh))) {
1543 		return !0;
1544 	}
1545 	if (evh->len > PATH_MAX) {
1546 		evtr->errmsg = "string too large (corrupt input)";
1547 		return !0;
1548 	}
1549 	if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1550 		return !0;
1551 	}
1552 	sbuf[evh->len] = 0;
1553 	if (evh->ns >= EVTR_NS_MAX) {
1554 		evtr->errmsg = "invalid namespace (corrupt input)";
1555 		return !0;
1556 	}
1557 	validate_string(sbuf);
1558 	printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1559 	       sbuf);
1560 	evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1561 	switch (evtr->err) {
1562 	case ENOMEM:
1563 		evtr->errmsg = "out of memory";
1564 		break;
1565 	case EEXIST:
1566 		evtr->errmsg = "redefinition of an id to a "
1567 			"different string (corrupt input)";
1568 		break;
1569 	default:
1570 		;
1571 	}
1572 	return 0;
1573 }
1574 
1575 static
1576 int
1577 evtr_skip(evtr_t evtr, off_t bytes)
1578 {
1579 	if (fseek(evtr->f, bytes, SEEK_CUR)) {
1580 		evtr->err = errno;
1581 		evtr->errmsg = strerror(errno);
1582 		return !0;
1583 	}
1584 	evtr->bytes += bytes;
1585 	return 0;
1586 }
1587 
1588 /*
1589  * Make sure q->buf is at least len bytes
1590  */
1591 static
1592 int
1593 evtr_query_reserve_buf(struct evtr_query *q, int len)
1594 {
1595 	void *tmp;
1596 
1597 	if (q->bufsize >= len)
1598 		return 0;
1599 	if (!(tmp = realloc(q->buf, len)))
1600 		return !0;
1601 	q->buf = tmp;
1602 	q->bufsize = len;
1603 	return 0;
1604 }
1605 
/*
 * Parse an EVTR_TYPE_PROBE record into *ev.  String and format ids in
 * the record are resolved to pointers via the maps built from earlier
 * STR/FMT records; variable-length format data is read into the
 * query's scratch buffer.  Registered callbacks run on the decoded
 * event before returning.
 *
 * Returns evtr->err (0 on success); a hard read error returns !0.
 */
static
int
evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
{
	struct probe_event_header *evh = (struct probe_event_header *)buf;
	struct cpu *cpu;

	/* Read the probe-specific fields that follow the generic header. */
	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
		      sizeof(*evh) - sizeof(evh->eh)))
		return !0;
	memset(ev, '\0', sizeof(*ev));
	ev->ts = evh->eh.ts;
	ev->type = EVTR_TYPE_PROBE;
	ev->line = evh->line;
	ev->cpu = evh->cpu;
	/* Attribute the event to the thread last seen on this cpu, if any. */
	if ((cpu = evtr_cpu(evtr, evh->cpu))) {
		ev->td = cpu->td;
	} else {
		ev->td = NULL;
	}
	/* A zero id means no file path was recorded. */
	if (evh->file) {
		ev->file = string_map_find(
			&evtr->maps[EVTR_NS_PATH - 1].root,
			evh->file);
		if (!ev->file) {
			/* Unknown id: record the error but keep going. */
			evtr->errmsg = "unknown id for file path";
			evtr->err = !0;
			ev->file = "<unknown>";
		} else {
			validate_string(ev->file);
		}
	} else {
		ev->file = "<unknown>";
	}
	if (evh->fmt) {
		const struct event_fmt *fmt;
		if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
			evtr->errmsg = "unknown id for event fmt";
			evtr->err = !0;
			ev->fmt = NULL;
		} else {
			ev->fmt = fmt->fmt;
			validate_string(fmt->fmt);
		}
	}
	if (evh->datalen) {
		/* +1 leaves room for the NUL terminator appended below. */
		if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
			/* NOTE(review): on OOM the data is not consumed here;
			 * the stream offset is presumably resynced by the
			 * caller's skip-to-record -- verify. */
			evtr->err = ENOMEM;
		} else if (!evtr_read(evtr, q->buf, evh->datalen)) {
			struct replace_ctx replctx = {
				.evtr = evtr,
				.ts = ev->ts,
			};
			/* Format data without a format cannot be decoded. */
			assert(ev->fmt);

			ev->fmtdata = q->buf;
			/*
			 * If the format specifies any string pointers, there
			 * is a string id stored in the fmtdata. Look it up
			 * and replace it with a string pointer before
			 * returning it to the user.
			 */
			if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
								  ev->fmtdata),
					       replace_strid, &replctx) < 0)
				return evtr->err;
			if (evtr->err)
				return evtr->err;
			((char *)ev->fmtdata)[evh->datalen] = '\0';
			ev->fmtdatalen = evh->datalen;
		}
	}
	/* Let registered callbacks (thread tracking etc.) see the event. */
	evtr_run_callbacks(ev, q);
	return evtr->err;
}
1681 
/*
 * Skip ahead to the next record-alignment boundary, updating the
 * logical offset.  Returns 0 on success.
 *
 * NOTE(review): when evtr->bytes is already aligned, `skip' computes
 * to REC_ALIGN (not 0) and a full alignment unit is skipped.  This is
 * presumably symmetric with the writer's padding (evtr_dump_pad) --
 * confirm the writer also emits a full pad record in that case.
 */
static
int
evtr_skip_to_record(evtr_t evtr)
{
	int skip;

	skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
	if (skip > 0) {
		if (fseek(evtr->f, skip, SEEK_CUR)) {
			evtr->err = errno;
			evtr->errmsg = strerror(errno);
			return !0;
		}
		evtr->bytes += skip;
	}
	return 0;
}
1699 
1700 static
1701 int
1702 evtr_load_sysinfo(evtr_t evtr)
1703 {
1704 	uint16_t ncpus;
1705 	int i;
1706 
1707 	if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1708 		return !0;
1709 	}
1710 	if (evtr->cpus)
1711 		return 0;
1712 	evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1713 	if (!evtr->cpus) {
1714 		evtr->err = ENOMEM;
1715 		return !0;
1716 	}
1717 	evtr->ncpus = ncpus;
1718 	for (i = 0; i < ncpus; ++i) {
1719 		evtr->cpus[i].td = NULL;
1720 		evtr->cpus[i].freq = -1.0;
1721 	}
1722 	return 0;
1723 }
1724 
1725 static
1726 int
1727 evtr_load_cpuinfo(evtr_t evtr)
1728 {
1729 	struct cpuinfo_event_header cih;
1730 	struct cpu *cpu;
1731 
1732 	if (evtr_read(evtr, &cih, sizeof(cih))) {
1733 		return !0;
1734 	}
1735 	if (cih.freq < 0.0) {
1736 		evtr->errmsg = "cpu freq is negative";
1737 		evtr->err = EINVAL;
1738 		return !0;
1739 	}
1740 	/*
1741 	 * Notice that freq is merely a multiplier with
1742 	 * which we convert a timestamp to seconds; if
1743 	 * ts is not in cycles, freq is not the frequency.
1744 	 */
1745 	if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1746 		evtr->errmsg = "freq for invalid cpu";
1747 		evtr->err = EINVAL;
1748 		return !0;
1749 	}
1750 	cpu->freq = cih.freq;
1751 	return 0;
1752 }
1753 
/*
 * Core record loop: consume records from the stream until a probe
 * event passing the query's filters is produced in *ev.
 *
 * Returns 0 when an event was stored in *ev, -1 on clean EOF, !0 on
 * error.  Pseudo records (PAD/SYSINFO/CPUINFO/STR/FMT) are handled as
 * side effects and the loop continues past them.
 */
static
int
_evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
{
	char buf[MAX_EVHDR_SIZE];
	int ret, err;
	struct trace_event_header *evhdr = (struct trace_event_header *)buf;

	for (ret = 0; !ret;) {
		/*
		 * A callback may have synthesized an event on a previous
		 * iteration; hand it out before reading more input.
		 */
		if (q->flags & EVTRQF_PENDING) {
			q->off = evtr->bytes;
			memcpy(ev, &q->pending_event, sizeof(*ev));
			q->flags &= ~EVTRQF_PENDING;
			return 0;
		}
		/* Peek at the record type (first byte of the header). */
		if (evtr_read(evtr, &evhdr->type, 1)) {
			if (feof(evtr->f)) {
				/* Clean EOF, not an error. */
				evtr->errmsg = NULL;
				evtr->err = 0;
				return -1;
			}
			return !0;
		}
		/*
		 * skip pad records -- this will only happen if there's a
		 * variable sized record close to the boundary
		 */
		if (evhdr->type == EVTR_TYPE_PAD) {
			evtr_skip_to_record(evtr);
			continue;
		}
		/*
		 * SYSINFO/CPUINFO carry no generic header beyond the type
		 * byte.  NOTE(review): their return values are ignored
		 * here; a failure only surfaces later via evtr->err.
		 */
		if (evhdr->type == EVTR_TYPE_SYSINFO) {
			evtr_load_sysinfo(evtr);
			continue;
		} else if (evhdr->type == EVTR_TYPE_CPUINFO) {
			evtr_load_cpuinfo(evtr);
			continue;
		}
		/* Read the remainder of the generic header. */
		if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
			return feof(evtr->f) ? -1 : !0;
		switch (evhdr->type) {
		case EVTR_TYPE_PROBE:
			if ((err = evtr_load_probe(evtr, ev, buf, q))) {
				if (err == -1) {
					/* no match */
					ret = 0;
				} else {
					return !0;
				}
			} else {
				/* Got a candidate event; filter it below. */
				ret = !0;
			}
			break;
		case EVTR_TYPE_STR:
			if (evtr_load_string(evtr, buf)) {
				return !0;
			}
			break;
		case EVTR_TYPE_FMT:
			if (evtr_load_fmt(q, buf)) {
				return !0;
			}
			break;
		default:
			evtr->err = !0;
			evtr->errmsg = "unknown event type (corrupt input?)";
			return !0;
		}
		/* Records are padded; realign before the next header. */
		evtr_skip_to_record(evtr);
		if (ret) {
			if (!evtr_match_filters(q, ev)) {
				/* Filtered out; keep scanning. */
				ret = 0;
				continue;
			}
			q->off = evtr->bytes;
			return 0;
		}
	}
	/* can't get here */
	return !0;
}
1835 
1836 static
1837 int
1838 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1839 {
1840 	struct evtr_query *q;
1841 	int ret;
1842 
1843 	if (!(q = evtr_query_init(evtr, NULL, 0))) {
1844 		evtr->err = ENOMEM;
1845 		return !0;
1846 	}
1847 	ret = _evtr_next_event(evtr, ev, q);
1848 	evtr_query_destroy(q);
1849 	return ret;
1850 }
1851 
/*
 * Scan forward to the last event in the trace, leaving it in *ev, then
 * rewind the stream.  Returns 0 on success.
 */
int
evtr_last_event(evtr_t evtr, evtr_event_t ev)
{
	struct stat st;
	int fd;
	off_t last_boundary;

	if (evtr_error(evtr))
		return !0;

	fd = fileno(evtr->f);
	if (fstat(fd, &st))
		return !0;
	/*
	 * This skips pseudo records, so we can't provide
	 * an event with all fields filled in this way.
	 * It's doable, just needs some care. TBD.
	 */
	/* Deliberately disabled (if (0 && ...)) until the above is solved. */
	if (0 && (st.st_mode & S_IFREG)) {
		/*
		 * Skip to last boundary, that's the closest to the EOF
		 * location that we are sure contains a header so we can
		 * pick up the stream.
		 */
		last_boundary = rounddown(st.st_size, REC_BOUNDARY);
		/* XXX: ->bytes should be in query */
		assert(evtr->bytes == 0);
		evtr_skip(evtr, last_boundary);
	}


	/*
	 * If we can't seek, we need to go through the whole file.
	 * Since you can't seek back, this is pretty useless unless
	 * you really are interested only in the last event.
	 */
	while (!evtr_next_event(evtr, ev))
		;
	if (evtr_error(evtr))
		return !0;
	evtr_rewind(evtr);
	return 0;
}
1895 
1896 struct evtr_query *
1897 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1898 {
1899 	struct evtr_query *q;
1900 	int i;
1901 
1902 	if (!(q = malloc(sizeof(*q)))) {
1903 		return q;
1904 	}
1905 	q->bufsize = 2;
1906 	if (!(q->buf = malloc(q->bufsize))) {
1907 		goto free_q;
1908 	}
1909 	if (!(q->symtab = symtab_new()))
1910 		goto free_buf;
1911 	q->evtr = evtr;
1912 	q->off = 0;
1913 	q->filt = filt;
1914 	q->nfilt = nfilt;
1915 	TAILQ_INIT(&q->unresolved_filtq);
1916 	q->nmatched = 0;
1917 	q->cbs = NULL;
1918 	q->ncbs = 0;
1919 	q->flags = 0;
1920 	memset(&q->pending_event, '\0', sizeof(q->pending_event));
1921 	if (evtr_register_callback(q, &thread_creation_callback, q)) {
1922 		goto free_symtab;
1923 	}
1924 	if (evtr_register_callback(q, &thread_switch_callback, q)) {
1925 		goto free_cbs;
1926 	}
1927 	if (evtr_query_needs_parsing(q) &&
1928 	    evtr_register_callback(q, &parse_callback, q)) {
1929 		goto free_cbs;
1930 	}
1931 
1932 	for (i = 0; i < nfilt; ++i) {
1933 		filt[i].flags = 0;
1934 		if (filt[i].fmt == NULL)
1935 			continue;
1936 		if (evtr_filter_register(q, &filt[i])) {
1937 			evtr_deregister_filters(q, filt, i);
1938 			goto free_symtab;
1939 		}
1940 	}
1941 
1942 	return q;
1943 free_cbs:
1944 	evtr_deregister_callbacks(q);
1945 free_symtab:
1946 	symtab_destroy(q->symtab);
1947 free_buf:
1948 	free(q->buf);
1949 free_q:
1950 	free(q);
1951 	return NULL;
1952 }
1953 
1954 void
1955 evtr_query_destroy(struct evtr_query *q)
1956 {
1957 	evtr_deregister_filters(q, q->filt, q->nfilt);
1958 
1959 	free(q->buf);
1960 	free(q);
1961 }
1962 
1963 int
1964 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1965 {
1966 	if (evtr_query_error(q))
1967 		return !0;
1968 	/* we may support that in the future */
1969 	if (q->off != q->evtr->bytes) {
1970 		q->errmsg = "evtr/query offset mismatch";
1971 		return !0;
1972 	}
1973 	return _evtr_next_event(q->evtr, ev, q);
1974 }
1975 
1976 int
1977 evtr_ncpus(evtr_t evtr)
1978 {
1979 	return evtr->ncpus;
1980 }
1981 
1982 int
1983 evtr_cpufreqs(evtr_t evtr, double *freqs)
1984 {
1985 	int i;
1986 
1987 	if (!freqs)
1988 		return EINVAL;
1989 	for (i = 0; i < evtr->ncpus; ++i) {
1990 		freqs[i] = evtr->cpus[i].freq;
1991 	}
1992 	return 0;
1993 }
1994