xref: /dragonfly/lib/libevtr/evtr.c (revision 9348a738)
1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <assert.h>
33 #include <ctype.h>
34 #include <err.h>
35 #include <errno.h>
36 #include <limits.h>
37 #include <stdarg.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <sys/queue.h>
42 #include <sys/stat.h>
43 #include <sys/tree.h>
44 
45 
46 #include "evtr.h"
47 #include "internal.h"
48 
49 unsigned evtr_debug;
50 
51 static
52 void
53 printd_set_flags(const char *str, unsigned int *flags)
54 {
55 	/*
56 	 * This is suboptimal as we don't detect
57 	 * invalid flags.
58 	 */
59 	for (; *str; ++str) {
60 		if ('A' == *str) {
61 			*flags = -1;
62 			return;
63 		}
64 		if (!islower(*str))
65 			errx(2, "invalid debug flag %c", *str);
66 		*flags |= 1 << (*str - 'a');
67 	}
68 }
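/*
 * Illustrative sketch (not part of the build) of how a debug-flag string
 * maps to bits in evtr_debug: each lowercase letter selects one bit
 * ('a' -> bit 0, 'b' -> bit 1, ...) and 'A' enables everything.  Which
 * letter corresponds to which printd() facility depends on the flag
 * macros in internal.h; "ds" below is only an example string.
 */
#if 0
	unsigned flags = 0;

	printd_set_flags("ds", &flags);
	assert(flags == ((1u << ('d' - 'a')) | (1u << ('s' - 'a'))));

	printd_set_flags("A", &flags);	/* all facilities */
	assert(flags == (unsigned)-1);
#endif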
69 
70 
71 enum {
72 	MAX_EVHDR_SIZE = PATH_MAX + 200,
73 	/* string namespaces */
74 	EVTR_NS_PATH = 0x1,
75 	EVTR_NS_FUNC,
76 	EVTR_NS_DSTR,
77 	EVTR_NS_MAX,
78 	NR_BUCKETS = 1021,	/* prime */
79 	PARSE_ERR_BUFSIZE = 256,
80 	REC_ALIGN = 8,
81 	REC_BOUNDARY = 1 << 14,
82 	FILTF_ID = 0x10,
83 	EVTRF_WR = 0x1,		/* open for writing */
84 	EVTRQF_PENDING = 0x1,
85 };
86 
87 typedef uint16_t fileid_t;
88 typedef uint16_t funcid_t;
89 typedef uint16_t fmtid_t;
90 
91 struct trace_event_header {
92 	uint8_t type;
93 	uint64_t ts;	/* XXX: this should only be part of probe */
94 } __attribute__((packed));
95 
96 struct probe_event_header {
97 	struct trace_event_header eh;
98 	/*
99 	 * For these fields, 0 implies "not available"
100 	 */
101 	fileid_t file;
102 	funcid_t caller1;
103 	funcid_t caller2;
104 	funcid_t func;
105 	uint16_t line;
106 	fmtid_t fmt;
107 	uint16_t datalen;
108 	uint8_t cpu;	/* -1 if n/a */
109 } __attribute__((packed));
110 
111 struct string_event_header {
112 	struct trace_event_header eh;
113 	uint16_t ns;
114 	uint32_t id;
115 	uint16_t len;
116 } __attribute__((packed));
117 
118 struct fmt_event_header {
119 	struct trace_event_header eh;
120 	uint16_t id;
121 	uint8_t subsys_len;
122 	uint8_t fmt_len;
123 } __attribute__((packed));
124 
125 struct cpuinfo_event_header {
126 	double freq;
127 	uint8_t cpu;
128 } __attribute__((packed));
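/*
 * Sketch of the on-disk header sizes implied by the packed layouts above
 * (not compiled into the library).  Every record begins with the
 * one-byte type; probe, string and fmt records embed the full
 * trace_event_header.
 */
#if 0
_Static_assert(sizeof(struct trace_event_header) == 9, "type + 64-bit ts");
_Static_assert(sizeof(struct probe_event_header) == 24, "9 + 7 * 2 + 1");
_Static_assert(sizeof(struct string_event_header) == 17, "9 + 2 + 4 + 2");
_Static_assert(sizeof(struct fmt_event_header) == 13, "9 + 2 + 1 + 1");
_Static_assert(sizeof(struct cpuinfo_event_header) == 9, "8 + 1");
#endif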
129 
130 struct hashentry {
131 	uintptr_t key;
132 	uintptr_t val;
133 	struct hashentry *next;
134 };
135 
136 struct hashtab {
137 	struct hashentry *buckets[NR_BUCKETS];
138 	uintptr_t (*hashfunc)(uintptr_t);
139 	uintptr_t (*cmpfunc)(uintptr_t, uintptr_t);
140 };
141 
142 struct symtab {
143 	struct hashtab tab;
144 };
145 
146 struct event_fmt {
147 	const char *subsys;
148 	const char *fmt;
149 };
150 
151 struct event_filter_unresolved {
152 	TAILQ_ENTRY(event_filter_unresolved) link;
153 	evtr_filter_t filt;
154 };
155 
156 struct id_map {
157 	RB_ENTRY(id_map) rb_node;
158 	int id;
159 	const void *data;
160 };
161 
162 RB_HEAD(id_tree, id_map);
163 struct string_map {
164 	struct id_tree root;
165 };
166 
167 struct fmt_map {
168 	struct id_tree root;
169 };
170 
171 RB_HEAD(thread_tree, evtr_thread);
172 
173 struct thread_map {
174 	struct thread_tree root;
175 };
176 
177 struct event_callback {
178 	void (*cb)(evtr_event_t, void *data);
179 	void *data;	/* this field must be malloc()ed */
180 };
181 
182 struct cpu {
183 	struct evtr_thread *td;	/* currently executing thread */
184 	double freq;
185 };
186 
187 struct evtr {
188 	FILE *f;
189 	int flags;
190 	int err;
191 	const char *errmsg;
192 	off_t bytes;
193 	union {
194 		/*
195 		 * When writing, we keep track of the strings we've
196 		 * already dumped so we only dump them once.
197 		 * Paths, function names etc belong to different
198 		 * namespaces.
199 		 */
200 		struct hashtab_str *strings[EVTR_NS_MAX - 1];
201 		/*
202 		 * When reading, we build a map from id to string.
203 		 * Every id must be defined at the point of use.
204 		 */
205 		struct string_map maps[EVTR_NS_MAX - 1];
206 	};
207 	union {
208 		/* same as above, but for subsys+fmt pairs */
209 		struct fmt_map fmtmap;
210 		struct hashtab_str *fmts;
211 	};
212 	struct thread_map threads;
213 	struct cpu *cpus;
214 	int ncpus;
215 };
216 
217 struct evtr_query {
218 	evtr_t evtr;
219 	off_t off;
220 	evtr_filter_t filt;
221 	int nfilt;
222 	int nmatched;
223 	int ntried;
224 	void *buf;
225 	int bufsize;
226 	struct symtab *symtab;
227 	int ncbs;
228 	struct event_callback **cbs;
229 	/*
230 	 * Filters that have a format specified and we
231 	 * need to resolve that to an fmtid
232 	 */
233 	TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
234 	int err;
235 	const char *errmsg;
236 	char parse_err_buf[PARSE_ERR_BUFSIZE];
237 	int flags;
238 	struct evtr_event pending_event;
239 };
240 
241 void
242 evtr_set_debug(const char *str)
243 {
244 	printd_set_flags(str, &evtr_debug);
245 }
246 
247 static int id_map_cmp(struct id_map *, struct id_map *);
248 RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
249 RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);
250 
251 static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
252 RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
253 RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
254 
255 static inline
256 void
257 validate_string(const char *str)
258 {
259 	if (!(evtr_debug & MISC))
260 		return;
261 	for (; *str; ++str)
262 		assert(isprint(*str));
263 }
264 
265 static
266 void
267 id_tree_free(struct id_tree *root)
268 {
269 	struct id_map *v, *n;
270 
271 	for (v = RB_MIN(id_tree, root); v; v = n) {
272 		n = RB_NEXT(id_tree, root, v);
273 		RB_REMOVE(id_tree, root, v);
		free(v);	/* release the unlinked node, not just detach it */
274 	}
275 }
276 
277 static
278 int
279 evtr_register_callback(evtr_query_t q, void (*fn)(evtr_event_t, void *), void *d)
280 {
281 	struct event_callback *cb;
282 	void *cbs;
283 
284 	if (!(cb = malloc(sizeof(*cb)))) {
285 		q->err = ENOMEM;
286 		return !0;
287 	}
288 	cb->cb = fn;
289 	cb->data = d;
290 	if (!(cbs = realloc(q->cbs, (++q->ncbs) * sizeof(cb)))) {
291 		--q->ncbs;
292 		free(cb);
293 		q->err = ENOMEM;
294 		return !0;
295 	}
296 	q->cbs = cbs;
297 	q->cbs[q->ncbs - 1] = cb;
298 	return 0;
299 }
300 
301 static
302 void
303 evtr_deregister_callbacks(evtr_query_t q)
304 {
305 	int i;
306 
307 	for (i = 0; i < q->ncbs; ++i) {
308 		free(q->cbs[i]);
309 	}
310 	free(q->cbs);
311 	q->cbs = NULL;
312 }
313 
314 static
315 void
316 evtr_run_callbacks(evtr_event_t ev, evtr_query_t q)
317 {
318 	struct event_callback *cb;
319 	int i;
320 
321 	for (i = 0; i < q->ncbs; ++i) {
322 		cb = q->cbs[i];
323 		cb->cb(ev, cb->data);
324 	}
325 }
326 
327 static
328 struct cpu *
329 evtr_cpu(evtr_t evtr, int c)
330 {
331 	if ((c < 0) || (c >= evtr->ncpus))
332 		return NULL;
333 	return &evtr->cpus[c];
334 }
335 
336 static int parse_format_data(evtr_event_t ev, const char *fmt, ...)
337 	       __printflike(2, 3) __scanflike(2, 3);
338 
339 static
340 int
341 parse_format_data(evtr_event_t ev, const char *fmt, ...)
342 {
343 	va_list ap;
344 	char buf[2048];
345 
346 	if (strcmp(fmt, ev->fmt))
347 		return 0;
348 	vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
349 	printd(MISC, "string is: %s\n", buf);
350 	va_start(ap, fmt);
351 	return vsscanf(buf, fmt, ap);
352 }
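/*
 * Minimal usage sketch for parse_format_data() (illustration only,
 * assuming an evtr_event_t ev in scope): the caller passes the exact
 * format string it expects; the event's raw fmtdata is rendered with
 * vsnprintf() and scanned back with vsscanf(), so the return value is
 * the number of fields converted.
 */
#if 0
	void *ktdp, *ktdn;

	/* matches only events whose fmt is exactly "sw  %p > %p" */
	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) == 2) {
		/* ktdp/ktdn now hold the two pointers from the event data */
	}
#endif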
353 
354 static
355 void
356 evtr_deregister_filters(evtr_query_t q, evtr_filter_t filt, int nfilt)
357 {
358 	struct event_filter_unresolved *u, *tmp;
359 	int i;
360 	TAILQ_FOREACH_MUTABLE(u, &q->unresolved_filtq, link, tmp) {
361 		for (i = 0; i < nfilt; ++i) {
362 			if (u->filt == &filt[i]) {
363 				TAILQ_REMOVE(&q->unresolved_filtq, u, link);
364 			}
365 		}
366 	}
367 }
368 
369 static
370 int
371 evtr_filter_register(evtr_query_t q, evtr_filter_t filt)
372 {
373 	struct event_filter_unresolved *res;
374 
375 	if (!(res = malloc(sizeof(*res)))) {
376 		q->err = ENOMEM;
377 		return !0;
378 	}
379 	res->filt = filt;
380 	TAILQ_INSERT_TAIL(&q->unresolved_filtq, res, link);
381 	return 0;
382 }
383 
384 static
385 int
386 evtr_query_needs_parsing(evtr_query_t q)
387 {
388 	int i;
389 
390 	for (i = 0; i < q->nfilt; ++i)
391 		if (q->filt[i].ev_type == EVTR_TYPE_STMT)
392 			return !0;
393 	return 0;
394 }
395 
396 void
397 evtr_event_data(evtr_event_t ev, char *buf, size_t len)
398 {
399 	/*
400 	 * XXX: we implicitly trust the format string.
401 	 * We shouldn't.
402 	 */
403 	if (ev->fmtdatalen) {
404 		vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
405 	} else {
406 		strlcpy(buf, ev->fmt, len);
407 	}
408 }
409 
410 int
411 evtr_error(evtr_t evtr)
412 {
413 	return evtr->err || (evtr->errmsg != NULL);
414 }
415 
416 const char *
417 evtr_errmsg(evtr_t evtr)
418 {
419 	return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
420 }
421 
422 int
423 evtr_query_error(evtr_query_t q)
424 {
425 	return q->err || (q->errmsg != NULL) || evtr_error(q->evtr);
426 }
427 
428 const char *
429 evtr_query_errmsg(evtr_query_t q)
430 {
431 	return q->errmsg ? q->errmsg :
432 		(q->err ? strerror(q->err) :
433 		 (evtr_errmsg(q->evtr)));
434 }
435 
436 static
437 int
438 id_map_cmp(struct id_map *a, struct id_map *b)
439 {
440 	return a->id - b->id;
441 }
442 
443 static
444 int
445 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
446 {
447 	ptrdiff_t d;
448 	d =  a->id - b->id;
449 	if (d < 0)
450 		return -1;
451 	if (!d)
452 		return 0;
453 	return 1;
454 }
455 
456 #define DEFINE_MAP_FIND(prefix, type)		\
457 	static					\
458 	type				\
459 	prefix ## _map_find(struct id_tree *tree, int id)\
460 	{						 \
461 		struct id_map *sid;			 \
462 							\
463 		sid = id_tree_RB_LOOKUP(tree, id);	\
464 		return sid ? sid->data : NULL;		\
465 	}
466 
467 DEFINE_MAP_FIND(string, const char *)
468 DEFINE_MAP_FIND(fmt, const struct event_fmt *)
469 
470 static
471 struct evtr_thread *
472 thread_map_find(struct thread_map *map, void *id)
473 {
474 	return thread_tree_RB_LOOKUP(&map->root, id);
475 }
476 
477 #define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)	\
478 	static					\
479 	int								\
480 	prefix ## _map_insert(struct id_tree *tree, type data, int id) \
481 	{								\
482 	struct id_map *sid, *osid;					\
483 									\
484 	sid = malloc(sizeof(*sid));					\
485 	if (!sid) {							\
486 		return ENOMEM;						\
487 	}								\
488 	sid->id = id;							\
489 	sid->data = data;						\
490 	if ((osid = id_tree_RB_INSERT(tree, sid))) {			\
491 		free(sid);						\
492 		if (_cmp((type)osid->data, data)) {			\
493 			return EEXIST;					\
494 		}							\
495 		printd(DS, "mapping already exists, skipping\n");		\
496 		/* we're OK with redefinitions of an id to the same string */ \
497 		return 0;						\
498 	}								\
499 	/* only do the strdup if we're inserting a new string */	\
500 	sid->data = _dup(data);		/* XXX: oom */			\
501 	return 0;							\
502 }
503 
504 static
505 void
506 thread_map_insert(struct thread_map *map, struct evtr_thread *td)
507 {
508 	struct evtr_thread *otd;
509 
510 	if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
511 		/*
512 		 * Thread addresses might be reused, we're
513 		 * ok with that.
514 		 * DANGER, Will Robinson: this means the user
515 		 * of the API needs to copy event->td if they
516 		 * want it to remain stable.
517 		 */
518 		free((void *)otd->comm);
519 		otd->comm = td->comm;
520 		free(td);
521 	}
522 }
523 
524 static
525 int
526 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
527 {
528 	int ret = 0;
529 
530 	if (a->subsys) {
531 		if (b->subsys) {
532 			ret = strcmp(a->subsys, b->subsys);
533 		} else {
534 			ret = strcmp(a->subsys, "");
535 		}
536 	} else if (b->subsys) {
537 			ret = strcmp("", b->subsys);
538 	}
539 	if (ret)
540 		return ret;
541 	return strcmp(a->fmt, b->fmt);
542 }
543 
544 static
545 struct event_fmt *
546 event_fmt_dup(const struct event_fmt *o)
547 {
548 	struct event_fmt *n;
549 
550 	if (!(n = malloc(sizeof(*n)))) {
551 		return n;
552 	}
553 	memcpy(n, o, sizeof(*n));
554 	return n;
555 }
556 
557 DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
558 DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
559 
560 int
561 hash_find(const struct hashtab *tab, uintptr_t key, uintptr_t *val)
562 {
563 	struct hashentry *ent;
564 
565 	for(ent = tab->buckets[tab->hashfunc(key)];
566 	    ent && tab->cmpfunc(ent->key, key);
567 	    ent = ent->next);
568 
569 	if (!ent)
570 		return !0;
571 	*val = ent->val;
572 	return 0;
573 }
574 
575 struct hashentry *
576 hash_insert(struct hashtab *tab, uintptr_t key, uintptr_t val)
577 {
578 	struct hashentry *ent;
579 	int hsh;
580 
581 	if (!(ent = malloc(sizeof(*ent)))) {
582 		fprintf(stderr, "out of memory\n");
583 		return NULL;
584 	}
585 	hsh = tab->hashfunc(key);
586 	ent->next = tab->buckets[hsh];
587 	ent->key = key;
588 	ent->val = val;
589 	tab->buckets[hsh] = ent;
590 	return ent;
591 }
592 
593 static
594 uintptr_t
595 cmpfunc_pointer(uintptr_t a, uintptr_t b)
596 {
597 	return b - a;
598 }
599 
600 static
601 uintptr_t
602 hashfunc_pointer(uintptr_t p)
603 {
604 	return p % NR_BUCKETS;
605 }
606 
607 struct hashtab *
608 hash_new(void)
609 {
610 	struct hashtab *tab;
611 	if (!(tab = calloc(sizeof(struct hashtab), 1)))
612 		return tab;
613 	tab->hashfunc = &hashfunc_pointer;
614 	tab->cmpfunc = &cmpfunc_pointer;
615 	return tab;
616 }
617 
618 struct hashtab_str {	/* string -> id map */
619 	struct hashtab tab;
620 	uint16_t id;
621 };
622 
623 static
624 uintptr_t
625 hashfunc_string(uintptr_t p)
626 {
627 	const char *str = (char *)p;
628 	unsigned long hash = 5381;
629 	int c;
630 
631 	while ((c = *str++))
632 		hash = ((hash << 5) + hash) + c;	/* djb2: hash * 33 + c */
633 	return hash % NR_BUCKETS;
634 }
635 
636 static
637 uintptr_t
638 cmpfunc_string(uintptr_t a, uintptr_t b)
639 {
640 	return strcmp((char *)a, (char *)b);
641 }
642 
643 
644 static
645 struct hashtab_str *
646 strhash_new(void)
647 {
648 	struct hashtab_str *strtab;
649 	if (!(strtab = calloc(sizeof(struct hashtab_str), 1)))
650 		return strtab;
651 	strtab->tab.hashfunc = &hashfunc_string;
652 	strtab->tab.cmpfunc = &cmpfunc_string;
653 	return strtab;
654 }
655 
656 static
657 void
658 strhash_destroy(struct hashtab_str *strtab)
659 {
660 	free(strtab);
661 }
662 
663 static
664 int
665 strhash_find(struct hashtab_str *strtab, const char *str, uint16_t *id)
666 {
667 	uintptr_t val;
668 
669 	if (hash_find(&strtab->tab, (uintptr_t)str, &val))
670 		return !0;
671 	*id = (uint16_t)val;
672 	return 0;
673 }
674 
675 static
676 int
677 strhash_insert(struct hashtab_str *strtab, const char *str, uint16_t *id)
678 {
679 	uintptr_t val;
680 
681 	val = ++strtab->id;
682 	if (strtab->id == 0) {
683 		fprintf(stderr, "too many strings\n");
684 		return ERANGE;
685 	}
686 	str = strdup(str);
687 	if (!str) {
688 		fprintf(stderr, "out of memory\n");
689 		--strtab->id;
690 		return ENOMEM;
691 	}
692 	hash_insert(&strtab->tab, (uintptr_t)str, (uintptr_t)val);
693 	*id = strtab->id;
694 	return 0;
695 }
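/*
 * Sketch of the string-interning pattern the writer relies on (not part
 * of the build; "lwkt_switch" is just an arbitrary example string):
 * strhash_find() returns 0 on a hit, so a string is assigned a fresh id,
 * and needs to be written to the output, only the first time it is seen.
 */
#if 0
	struct hashtab_str *tab;
	uint16_t id;

	if ((tab = strhash_new()) == NULL)
		return;
	if (strhash_find(tab, "lwkt_switch", &id)) {
		/* miss: intern the string under the next free id */
		if (strhash_insert(tab, "lwkt_switch", &id) == 0)
			printf("new id %u\n", id);
	} else {
		printf("already interned as id %u\n", id);
	}
#endif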
696 
697 struct symtab *
698 symtab_new(void)
699 {
700 	struct symtab *symtab;
701 	if (!(symtab = calloc(sizeof(struct symtab), 1)))
702 		return symtab;
703 	symtab->tab.hashfunc = &hashfunc_string;
704 	symtab->tab.cmpfunc = &cmpfunc_string;
705 	return symtab;
706 }
707 
708 void
709 symtab_destroy(struct symtab *symtab)
710 {
711 	free(symtab);
712 }
713 
714 struct evtr_variable *
715 symtab_find(const struct symtab *symtab, const char *str)
716 {
717 	uintptr_t val;
718 
719 	if (hash_find(&symtab->tab, (uintptr_t)str, &val))
720 		return NULL;
721 	return (struct evtr_variable *)val;
722 }
723 
724 int
725 symtab_insert(struct symtab *symtab, const char *name,
726 	       struct evtr_variable *var)
727 {
728 	name = strdup(name);
729 	if (!name) {
730 		fprintf(stderr, "out of memory\n");
731 		return ENOMEM;
732 	}
733 	hash_insert(&symtab->tab, (uintptr_t)name, (uintptr_t)var);
734 	return 0;
735 }
736 
737 static
738 int
739 evtr_filter_match(evtr_query_t q, evtr_filter_t f, evtr_event_t ev)
740 {
741 	if ((f->cpu != -1) && (f->cpu != ev->cpu))
742 		return 0;
743 
744 	assert(!(f->flags & FILTF_ID));
745 	if (ev->type != f->ev_type)
746 		return 0;
747 	if (ev->type == EVTR_TYPE_PROBE) {
748 		if (f->fmt && strcmp(ev->fmt, f->fmt))
749 			return 0;
750 	} else if (ev->type == EVTR_TYPE_STMT) {
751 		struct evtr_variable *var;
752 		/* resolve var */
753 		/* XXX: no need to do that *every* time */
754 		parse_var(f->var, q->symtab, &var, &q->parse_err_buf[0],
755 			  PARSE_ERR_BUFSIZE);
756 		/*
757 		 * Ignore errors, they're expected since the
758 		 * variable might not be instantiated yet
759 		 */
760 		if (var != ev->stmt.var)
761 			return 0;
762 	}
763 	return !0;
764 }
765 
766 static
767 int
768 evtr_match_filters(struct evtr_query *q, evtr_event_t ev)
769 {
770 	int i;
771 
772 	/* no filters means we're interested in all events */
773 	if (!q->nfilt)
774 		return !0;
775 	++q->ntried;
776 	for (i = 0; i < q->nfilt; ++i) {
777 		if (evtr_filter_match(q, &q->filt[i], ev)) {
778 			++q->nmatched;
779 			return !0;
780 		}
781 	}
782 	return 0;
783 }
784 
785 static
786 void
787 parse_callback(evtr_event_t ev, void *d)
788 {
789 	evtr_query_t q = (evtr_query_t)d;
790 	if (ev->type != EVTR_TYPE_PROBE)
791 		return;
792 	if (!ev->fmt || (ev->fmt[0] != '#'))
793 		return;
794 	/*
795 	 * Copy the event to ->pending_event, then call
796 	 * the parser to convert it into a synthesized
797 	 * EVTR_TYPE_STMT event.
798 	 */
799 	memcpy(&q->pending_event, ev, sizeof(*ev));
800 	parse_string(&q->pending_event, q->symtab, &ev->fmt[1],
801 		     &q->parse_err_buf[0], PARSE_ERR_BUFSIZE);
802 	if (q->parse_err_buf[0]) {	/* parse error */
803 		q->errmsg = &q->parse_err_buf[0];
804 		return;
805 	}
806 	if (!evtr_match_filters(q, &q->pending_event))
807 		return;
808 	/*
809 	 * This will cause us to return ->pending_event next time
810 	 * we're called.
811 	 */
812 	q->flags |= EVTRQF_PENDING;
813 }
814 
815 static
816 void
817 thread_creation_callback(evtr_event_t ev, void *d)
818 {
819 	evtr_query_t q = (evtr_query_t)d;
820 	evtr_t evtr = q->evtr;
821 	struct evtr_thread *td;
822 	void *ktd;
823 	char buf[20];
824 
825 	if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
826 		return;
827 	}
828 	buf[19] = '\0';
829 
830 	if (!(td = malloc(sizeof(*td)))) {
831 		q->err = ENOMEM;
832 		return;
833 	}
834 	td->id = ktd;
835 	td->userdata = NULL;
836 	if (!(td->comm = strdup(buf))) {
837 		free(td);
838 		q->err = ENOMEM;
839 		return;
840 	}
841 	printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
842 	thread_map_insert(&evtr->threads, td);
843 }
844 
845 static
846 void
847 thread_switch_callback(evtr_event_t ev, void *d)
848 {
849 	evtr_t evtr = ((evtr_query_t)d)->evtr;
850 	struct evtr_thread *tdp, *tdn;
851 	void *ktdp, *ktdn;
852 	struct cpu *cpu;
853 	static struct evtr_event tdcr;
854 	static char *fmt = "new_td %p %s";
855 	char tidstr[40];
856 	char fmtdata[sizeof(void *) + sizeof(char *)];
857 
858 	cpu = evtr_cpu(evtr, ev->cpu);
859 	if (!cpu) {
860 		printw("invalid cpu %d\n", ev->cpu);
861 		return;
862 	}
863 	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
864 		return;
865 	}
866 	tdp = thread_map_find(&evtr->threads, ktdp);
867 	if (!tdp) {
868 		printd(DS, "switching from unknown thread %p\n", ktdp);
869 	}
870 	tdn = thread_map_find(&evtr->threads, ktdn);
871 	if (!tdn) {
872 		/*
873 		 * Fake a thread creation event for threads we
874 		 * haven't seen before.
875 		 */
876 		tdcr.type = EVTR_TYPE_PROBE;
877 		tdcr.ts = ev->ts;
878 		tdcr.file = NULL;
879 		tdcr.func = NULL;
880 		tdcr.line = 0;
881 		tdcr.fmt = fmt;
882 		tdcr.fmtdata = &fmtdata;
883 		tdcr.fmtdatalen = sizeof(fmtdata);
884 		tdcr.cpu = ev->cpu;
885 		tdcr.td = NULL;
886 		snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
887 		((void **)fmtdata)[0] = ktdn;
888 		((char **)fmtdata)[1] = &tidstr[0];
889 		thread_creation_callback(&tdcr, d);
890 
891 		tdn = thread_map_find(&evtr->threads, ktdn);
892 		assert(tdn != NULL);
893 		printd(DS, "switching to unknown thread %p\n", ktdn);
894 		cpu->td = tdn;
895 		return;
896 	}
897 	printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
898 	cpu->td = tdn;
899 }
900 
901 static
902 void
903 assert_foff_in_sync(evtr_t evtr)
904 {
905 	off_t off;
906 
907 	/*
908 	 * We keep our own offset because we
909 	 * might want to support mmap()
910 	 */
911 	off = ftello(evtr->f);
912 	if (evtr->bytes != off) {
913 		fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
914 		abort();
915 	}
916 }
917 
918 static
919 int
920 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
921 {
922 	assert_foff_in_sync(evtr);
923 	if (fwrite(buf, bytes, 1, evtr->f) != 1) {
924 		evtr->err = errno;
925 		evtr->errmsg = strerror(errno);
926 		return !0;
927 	}
928 	evtr->bytes += bytes;
929 	assert_foff_in_sync(evtr);
930 	return 0;
931 }
932 
933 /*
934  * Called after dumping a record to make sure the next
935  * record is REC_ALIGN aligned. This does not make much sense,
936  * as we shouldn't be using packed structs anyway.
937  */
938 static
939 int
940 evtr_dump_pad(evtr_t evtr)
941 {
942 	size_t pad;
943 	static char buf[REC_ALIGN];
944 
945 	pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
946 	if (pad > 0) {
947 		return evtr_write(evtr, buf, pad);
948 	}
949 	return 0;
950 }
951 
952 /*
953  * We make sure that there is a new record every REC_BOUNDARY
954  * bytes, this costs next to nothing in space and allows for
955  * fast seeking.
956  */
957 static
958 int
959 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
960 {
961 	unsigned pad, i;
962 	static char buf[256];
963 
964 	pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
965 	/* if adding @bytes would cause us to cross a boundary... */
966 	if (bytes > pad) {
967 		/* then pad to the boundary */
968 		for (i = 0; i < (pad / sizeof(buf)); ++i) {
969 			if (evtr_write(evtr, buf, sizeof(buf))) {
970 				return !0;
971 			}
972 		}
973 		i = pad % sizeof(buf);
974 		if (i) {
975 			if (evtr_write(evtr, buf, i)) {
976 				return !0;
977 			}
978 		}
979 	}
980 	return 0;
981 }
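/*
 * Worked example of the two padding mechanisms above.  Suppose
 * evtr->bytes == 16380 and the next record is a 24-byte probe header:
 * evtr_dump_avoid_boundary() computes pad = 16384 - 16380 = 4, sees that
 * 24 > 4, writes 4 zero bytes, and the record starts exactly at the 16KB
 * boundary.  After every record, evtr_dump_pad() writes
 * REC_ALIGN - (bytes % REC_ALIGN) zero bytes (a full 8 when the offset
 * is already aligned); evtr_skip_to_record() on the read side skips the
 * same amount, and the zero fill is read back as EVTR_TYPE_PAD records,
 * which is what lets a reader resynchronize at any REC_BOUNDARY multiple.
 */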
982 
983 static
984 int
985 evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
986 {
987 	struct fmt_event_header fmt;
988 	uint16_t id;
989 	int err;
990 	char *subsys = "", buf[1024];
991 
992 	if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
993 		evtr->errmsg = "name of subsystem is too large";
994 		evtr->err = ERANGE;
995 		return 0;
996 	}
997 	if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
998 		evtr->errmsg = "fmt + name of subsystem is too large";
999 		evtr->err = ERANGE;
1000 		return 0;
1001 	}
1002 
1003 	if (!strhash_find(evtr->fmts, buf, &id)) {
1004 		return id;
1005 	}
1006 	if ((err = strhash_insert(evtr->fmts, buf, &id))) {
1007 		evtr->err = err;
1008 		return 0;
1009 	}
1010 
1011 	fmt.eh.type = EVTR_TYPE_FMT;
1012 	fmt.eh.ts = ts;
1013 	fmt.subsys_len = strlen(subsys);
1014 	fmt.fmt_len = strlen(ev->fmt);
1015 	fmt.id = id;
1016 	if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
1017 				     fmt.fmt_len))
1018 		return 0;
1019 	if (evtr_write(evtr, &fmt, sizeof(fmt)))
1020 		return 0;
1021 	if (evtr_write(evtr, subsys, fmt.subsys_len))
1022 		return 0;
1023 	if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
1024 		return 0;
1025 	if (evtr_dump_pad(evtr))
1026 		return 0;
1027 	return fmt.id;
1028 }
1029 
1030 /*
1031  * Replace string pointers or string ids in fmtdata
1032  */
1033 static
1034 int
1035 mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
1036 		   const char *(*replace)(void *, const char *), void *ctx)
1037 {
1038 	const char *f, *p;
1039 	size_t skipsize, intsz;
1040 	int ret = 0;
1041 
1042 	for (f = fmt; f[0] != '\0'; ++f) {
1043 		if (f[0] != '%')
1044 			continue;
1045 		++f;
1046 		skipsize = 0;
1047 		for (p = f; p[0]; ++p) {
1048 			int again = 0;
1049 			/*
1050 			 * Eat flags. Notice this will accept duplicate
1051 			 * flags.
1052 			 */
1053 			switch (p[0]) {
1054 			case '#':
1055 			case '0':
1056 			case '-':
1057 			case ' ':
1058 			case '+':
1059 			case '\'':
1060 				again = !0;
1061 				break;
1062 			}
1063 			if (!again)
1064 				break;
1065 		}
1066 		/* Eat minimum field width, if any */
1067 		for (; isdigit(p[0]); ++p)
1068 			;
1069 		if (p[0] == '.')
1070 			++p;
1071 		/* Eat precision, if any */
1072 		for (; isdigit(p[0]); ++p)
1073 			;
1074 		intsz = 0;
1075 		switch (p[0]) {
1076 		case 'l':
1077 			if (p[1] == 'l') {
1078 				++p;
1079 				intsz = sizeof(long long);
1080 			} else {
1081 				intsz = sizeof(long);
1082 			}
1083 			break;
1084 		case 'j':
1085 			intsz = sizeof(intmax_t);
1086 			break;
1087 		case 't':
1088 			intsz = sizeof(ptrdiff_t);
1089 			break;
1090 		case 'z':
1091 			intsz = sizeof(size_t);
1092 			break;
1093 		default:
1094 			break;
1095 		}
1096 		if (intsz != 0)
1097 			++p;
1098 		else
1099 			intsz = sizeof(int);
1100 
1101 		switch (p[0]) {
1102 		case 'd':
1103 		case 'i':
1104 		case 'o':
1105 		case 'u':
1106 		case 'x':
1107 		case 'X':
1108 		case 'c':
1109 			skipsize = intsz;
1110 			break;
1111 		case 'p':
1112 			skipsize = sizeof(void *);
1113 			break;
1114 		case 'f':
1115 			if (p[-1] == 'l')
1116 				skipsize = sizeof(double);
1117 			else
1118 				skipsize = sizeof(float);
1119 			break;
1120 		case 's':
1121 			((const char **)fmtdata)[0] =
1122 				replace(ctx, ((char **)fmtdata)[0]);
1123 			skipsize = sizeof(char *);
1124 			++ret;
1125 			break;
1126 		default:
1127 			fprintf(stderr, "Unknown conversion specifier %c "
1128 				"in fmt starting with %s", p[0], f - 1);
1129 			return -1;
1130 		}
1131 		fmtdata += skipsize;
1132 	}
1133 	return ret;
1134 }
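/*
 * Sketch of what mangle_string_ptrs() does to a probe payload
 * (illustration only; it assumes the arguments really are laid out back
 * to back, which is what the parser above expects, and 'w' stands for an
 * open write handle).  For a format like "open %s flags %d" the raw
 * fmtdata is [char *][int]; on the write side the pointer is replaced in
 * place by a small string id via replace_strptr(), and on the read side
 * replace_strid() turns the id back into a pointer into the string map.
 * The integer argument is only skipped over.
 */
#if 0
	struct {
		const char *path;	/* consumed by %s, rewritten */
		int flags;		/* consumed by %d, skipped */
	} __attribute__((packed)) data = { "/etc/motd", 3 };
	struct replace_ctx replctx = { .evtr = w, .ts = 0 };

	mangle_string_ptrs("open %s flags %d", (uint8_t *)&data,
			   replace_strptr, &replctx);
	/* data.path now holds the string id, cast to pointer width */
#endif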
1135 
1136 /* XXX: do we really want the timestamp? */
1137 static
1138 int
1139 evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
1140 {
1141 	struct string_event_header s;
1142 	int err;
1143 	uint16_t id;
1144 
1145 	assert((0 < ns) && (ns < EVTR_NS_MAX));	/* namespaces are 1-based */
1146 	if (!strhash_find(evtr->strings[ns - 1], str, &id)) {
1147 		return id;
1148 	}
1149 	if ((err = strhash_insert(evtr->strings[ns - 1], str, &id))) {
1150 		evtr->err = err;
1151 		return 0;
1152 	}
1153 
1154 	printd(DS, "hash_insert %s ns %d id %d\n", str, ns, id);
1155 	s.eh.type = EVTR_TYPE_STR;
1156 	s.eh.ts = ts;
1157 	s.ns = ns;
1158 	s.id = id;
1159 	s.len = strnlen(str, PATH_MAX);
1160 
1161 	if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
1162 		return 0;
1163 	if (evtr_write(evtr, &s, sizeof(s)))
1164 		return 0;
1165 	if (evtr_write(evtr, str, s.len))
1166 		return 0;
1167 	if (evtr_dump_pad(evtr))
1168 		return 0;
1169 	return s.id;
1170 }
1171 
1172 struct replace_ctx {
1173 	evtr_t evtr;
1174 	uint64_t ts;
1175 };
1176 
1177 static
1178 const char *
1179 replace_strptr(void *_ctx, const char *s)
1180 {
1181 	struct replace_ctx *ctx = _ctx;
1182 	return (const char *)(uintptr_t)evtr_dump_string(ctx->evtr, ctx->ts, s,
1183 							 EVTR_NS_DSTR);
1184 }
1185 
1186 static
1187 const char *
1188 replace_strid(void *_ctx, const char *s)
1189 {
1190 	struct replace_ctx *ctx = _ctx;
1191 	const char *ret;
1192 
1193 	ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
1194 			      (int)(uintptr_t)s);
1195 	if (!ret) {
1196 		fprintf(stderr, "Unknown id for data string\n");
1197 		ctx->evtr->errmsg = "unknown id for data string";
1198 		ctx->evtr->err = !0;
1199 	}
1200 	validate_string(ret);
1201 	printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n",
1202 	       (int)(uintptr_t)s, EVTR_NS_DSTR, ret ? ret : "NULL", (int)(uintptr_t)ret);
1203 	return ret;
1204 }
1205 
1206 static
1207 int
1208 evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
1209 {
1210 	struct probe_event_header kev;
1211 	char buf[1024];
1212 
1213 	memset(&kev, '\0', sizeof(kev));
1214 	kev.eh.type = ev->type;
1215 	kev.eh.ts = ev->ts;
1216 	kev.line = ev->line;
1217 	kev.cpu = ev->cpu;
1218 	if (ev->file) {
1219 		kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
1220 					    EVTR_NS_PATH);
1221 	}
1222 	if (ev->func) {
1223 		kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
1224 					    EVTR_NS_FUNC);
1225 	}
1226 	if (ev->fmt) {
1227 		kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
1228 	}
1229 	if (ev->fmtdata) {
1230 		struct replace_ctx replctx = {
1231 			.evtr = evtr,
1232 			.ts = ev->ts,
1233 		};
1234 		assert(ev->fmtdatalen <= (int)sizeof(buf));
1235 		kev.datalen = ev->fmtdatalen;
1236 		/*
1237 		 * Replace all string pointers with string ids before dumping
1238 		 * the data.
1239 		 */
1240 		memcpy(buf, ev->fmtdata, ev->fmtdatalen);
1241 		if (mangle_string_ptrs(ev->fmt, buf,
1242 				       replace_strptr, &replctx) < 0)
1243 			return !0;
1244 		if (evtr->err)
1245 			return evtr->err;
1246 	}
1247 	if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
1248 		return !0;
1249 	if (evtr_write(evtr, &kev, sizeof(kev)))
1250 		return !0;
1251 	if (evtr_write(evtr, buf, ev->fmtdatalen))
1252 		return !0;
1253 	if (evtr_dump_pad(evtr))
1254 		return !0;
1255 	return 0;
1256 }
1257 
1258 static
1259 int
1260 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1261 {
1262 	uint8_t type = EVTR_TYPE_SYSINFO;
1263 	uint16_t ncpus = ev->ncpus;
1264 
1265 	if (ncpus <= 0) {
1266 		evtr->errmsg = "invalid number of cpus";
1267 		return !0;
1268 	}
1269 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1270 		return !0;
1271 	if (evtr_write(evtr, &type, sizeof(type))) {
1272 		return !0;
1273 	}
1274 	if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1275 		return !0;
1276 	}
1277 	if (evtr_dump_pad(evtr))
1278 		return !0;
1279 	return 0;
1280 }
1281 static
1282 int
1283 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1284 {
1285 	struct cpuinfo_event_header ci;
1286 	uint8_t type;
1287 
1288 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1289 		return !0;
1290 	type = EVTR_TYPE_CPUINFO;
1291 	if (evtr_write(evtr, &type, sizeof(type))) {
1292 		return !0;
1293 	}
1294 	ci.cpu = ev->cpu;
1295 	ci.freq = ev->cpuinfo.freq;
1296 	if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1297 		return !0;
1298 	if (evtr_write(evtr, &ci, sizeof(ci))) {
1299 		return !0;
1300 	}
1301 	if (evtr_dump_pad(evtr))
1302 		return !0;
1303 	return 0;
1304 }
1305 
1306 int
1307 evtr_rewind(evtr_t evtr)
1308 {
1309 	assert((evtr->flags & EVTRF_WR) == 0);
1310 	evtr->bytes = 0;
1311 	if (fseek(evtr->f, 0, SEEK_SET)) {
1312 		evtr->err = errno;
1313 		return !0;
1314 	}
1315 	return 0;
1316 }
1317 
1318 int
1319 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1320 {
1321 	switch (ev->type) {
1322 	case EVTR_TYPE_PROBE:
1323 		return evtr_dump_probe(evtr, ev);
1324 	case EVTR_TYPE_SYSINFO:
1325 		return evtr_dump_sysinfo(evtr, ev);
1326 	case EVTR_TYPE_CPUINFO:
1327 		return evtr_dump_cpuinfo(evtr, ev);
1328 	}
1329 	evtr->errmsg = "unknown event type";
1330 	return !0;
1331 }
1332 
1333 static
1334 evtr_t
1335 evtr_alloc(FILE *f)
1336 {
1337 	evtr_t evtr;
1338 	if (!(evtr = malloc(sizeof(*evtr)))) {
1339 		return NULL;
1340 	}
1341 
1342 	evtr->f = f;
1343 	evtr->err = 0;
1344 	evtr->errmsg = NULL;
1345 	evtr->bytes = 0;
1346 	return evtr;
1347 }
1348 
1349 static int evtr_next_event(evtr_t, evtr_event_t);
1350 
1351 evtr_t
1352 evtr_open_read(FILE *f)
1353 {
1354 	evtr_t evtr;
1355 	struct evtr_event ev;
1356 	int i;
1357 
1358 	if (!(evtr = evtr_alloc(f))) {
1359 		return NULL;
1360 	}
1361 	evtr->flags = 0;
1362 	for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
1363 		RB_INIT(&evtr->maps[i].root);
1364 	}
1365 	RB_INIT(&evtr->fmtmap.root);
1366 	RB_INIT(&evtr->threads.root);
1367 	evtr->cpus = NULL;
1368 	evtr->ncpus = 0;
1369 	/*
1370 	 * Load the first event so we can pick up any
1371 	 * sysinfo entries.
1372 	 */
1373 	if (evtr_next_event(evtr, &ev)) {
1374 		goto free_evtr;
1375 	}
1376 	if (evtr_rewind(evtr))
1377 		goto free_evtr;
1378 	return evtr;
1379 free_evtr:
1380 	free(evtr);
1381 	return NULL;
1382 }
1383 
1384 evtr_t
1385 evtr_open_write(FILE *f)
1386 {
1387 	evtr_t evtr;
1388 	int i, j;
1389 
1390 	if (!(evtr = evtr_alloc(f))) {
1391 		return NULL;
1392 	}
1393 
1394 	evtr->flags = EVTRF_WR;
1395 	if (!(evtr->fmts = strhash_new()))
1396 		goto free_evtr;
1397 	for (i = 0; i < EVTR_NS_MAX - 1; ++i) {	/* strings[] has EVTR_NS_MAX - 1 slots */
1398 		evtr->strings[i] = strhash_new();
1399 		if (!evtr->strings[i]) {
1400 			for (j = 0; j < i; ++j) {
1401 				strhash_destroy(evtr->strings[j]);
1402 			}
1403 			goto free_fmts;
1404 		}
1405 	}
1406 
1407 	return evtr;
1408 free_fmts:
1409 	strhash_destroy(evtr->fmts);
1410 free_evtr:
1411 	free(evtr);
1412 	return NULL;
1413 }
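/*
 * Minimal write-path sketch (illustration only; the full evtr_event
 * field list lives in evtr.h, "trace.evtr" is just an example file name
 * and error handling is mostly omitted).  A stream normally starts with
 * a SYSINFO record carrying the cpu count, followed by probe records.
 */
#if 0
	FILE *fp = fopen("trace.evtr", "w");
	evtr_t w = evtr_open_write(fp);
	struct evtr_event ev;
	int arg = 42;

	memset(&ev, 0, sizeof(ev));
	ev.type = EVTR_TYPE_SYSINFO;
	ev.ncpus = 2;
	evtr_dump_event(w, &ev);

	memset(&ev, 0, sizeof(ev));
	ev.type = EVTR_TYPE_PROBE;
	ev.ts = 12345;
	ev.cpu = 0;
	ev.file = __FILE__;
	ev.func = __func__;
	ev.line = __LINE__;
	ev.fmt = "hello %d";
	ev.fmtdata = &arg;
	ev.fmtdatalen = sizeof(arg);
	evtr_dump_event(w, &ev);

	if (evtr_error(w))
		fprintf(stderr, "%s\n", evtr_errmsg(w));
	evtr_close(w);
	fclose(fp);
#endif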
1414 
1415 static
1416 void
1417 hashtab_destroy(struct hashtab *h)
1418 {
1419 	struct hashentry *ent, *next;
1420 	int i;
1421 	for (i = 0; i < NR_BUCKETS; ++i) {
1422 		for (ent = h->buckets[i]; ent; ent = next) {
1423 			next = ent->next;
1424 			free(ent);
1425 		}
1426 	}
1427 	free(h);
1428 }
1429 
1430 void
1431 evtr_close(evtr_t evtr)
1432 {
1433 	int i;
1434 
1435 	if (evtr->flags & EVTRF_WR) {
1436 		hashtab_destroy(&evtr->fmts->tab);
1437 		for (i = 0; i < EVTR_NS_MAX - 1; ++i)
1438 			hashtab_destroy(&evtr->strings[i]->tab);
1439 	} else {
1440 		id_tree_free(&evtr->fmtmap.root);
1441 		for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1442 			id_tree_free(&evtr->maps[i].root);
1443 		}
1444 	}
1445 	free(evtr);
1446 }
1447 
1448 static
1449 int
1450 evtr_read(evtr_t evtr, void *buf, size_t size)
1451 {
1452 	assert(size > 0);
1453 	assert_foff_in_sync(evtr);
1454 	printd(IO, "evtr_read at %#jx, %zd bytes\n", evtr->bytes, size);
1455 	if (fread(buf, size, 1, evtr->f) != 1) {
1456 		if (feof(evtr->f)) {
1457 			evtr->errmsg = "incomplete record";
1458 		} else {
1459 			evtr->errmsg = strerror(errno);
1460 		}
1461 		return !0;
1462 	}
1463 	evtr->bytes += size;
1464 	assert_foff_in_sync(evtr);
1465 	return 0;
1466 }
1467 
1468 static
1469 int
1470 evtr_load_fmt(evtr_query_t q, char *buf)
1471 {
1472 	evtr_t evtr = q->evtr;
1473 	struct fmt_event_header *evh = (struct fmt_event_header *)buf;
1474 	struct event_fmt *fmt;
1475 	char *subsys = NULL, *fmtstr;
1476 
1477 	if (!(fmt = malloc(sizeof(*fmt)))) {
1478 		evtr->err = errno;
1479 		return !0;
1480 	}
1481 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1482 		      sizeof(*evh) - sizeof(evh->eh))) {
1483 		goto free_fmt;
1484 	}
1485 	assert(!evh->subsys_len);
1486 	if (evh->subsys_len) {
1487 		if (!(subsys = malloc(evh->subsys_len))) {
1488 			evtr->err = errno;
1489 			goto free_fmt;
1490 		}
1491 		if (evtr_read(evtr, subsys, evh->subsys_len)) {
1492 			goto free_subsys;
1493 		}
1494 		fmt->subsys = subsys;
1495 	} else {
1496 		fmt->subsys = "";
1497 	}
1498 	if (!(fmtstr = malloc(evh->fmt_len + 1))) {
1499 		evtr->err = errno;
1500 		goto free_subsys;
1501 	}
1502 	if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
1503 		goto free_fmtstr;
1504 	}
1505 	fmtstr[evh->fmt_len] = '\0';
1506 	fmt->fmt = fmtstr;
1507 
1508 	printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
1509 	evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
1510 	switch (evtr->err) {
1511 	case ENOMEM:
1512 		evtr->errmsg = "out of memory";
1513 		break;
1514 	case EEXIST:
1515 		evtr->errmsg = "redefinition of an id to a "
1516 			"different format (corrupt input)";
1517 		break;
1518 	default:
1519 		;
1520 	}
1521 	return evtr->err;
1522 
1523 free_fmtstr:
1524 	free(fmtstr);
1525 free_subsys:
1526 	if (subsys)
1527 		free(subsys);
1528 free_fmt:
1529 	free(fmt);
1530 	return !0;
1531 }
1532 
1533 static
1534 int
1535 evtr_load_string(evtr_t evtr, char *buf)
1536 {
1537 	char sbuf[PATH_MAX + 1];
1538 	struct string_event_header *evh = (struct string_event_header *)buf;
1539 
1540 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1541 		      sizeof(*evh) - sizeof(evh->eh))) {
1542 		return !0;
1543 	}
1544 	if (evh->len > PATH_MAX) {
1545 		evtr->errmsg = "string too large (corrupt input)";
1546 		return !0;
1547 	}
1548 	if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1549 		return !0;
1550 	}
1551 	sbuf[evh->len] = 0;
1552 	if ((evh->ns < EVTR_NS_PATH) || (evh->ns >= EVTR_NS_MAX)) {
1553 		evtr->errmsg = "invalid namespace (corrupt input)";
1554 		return !0;
1555 	}
1556 	validate_string(sbuf);
1557 	printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1558 	       sbuf);
1559 	evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1560 	switch (evtr->err) {
1561 	case ENOMEM:
1562 		evtr->errmsg = "out of memory";
1563 		break;
1564 	case EEXIST:
1565 		evtr->errmsg = "redefinition of an id to a "
1566 			"different string (corrupt input)";
1567 		break;
1568 	default:
1569 		;
1570 	}
1571 	return 0;
1572 }
1573 
1574 static
1575 int
1576 evtr_skip(evtr_t evtr, off_t bytes)
1577 {
1578 	if (fseek(evtr->f, bytes, SEEK_CUR)) {
1579 		evtr->err = errno;
1580 		evtr->errmsg = strerror(errno);
1581 		return !0;
1582 	}
1583 	evtr->bytes += bytes;
1584 	return 0;
1585 }
1586 
1587 /*
1588  * Make sure q->buf is at least len bytes
1589  */
1590 static
1591 int
1592 evtr_query_reserve_buf(struct evtr_query *q, int len)
1593 {
1594 	void *tmp;
1595 
1596 	if (q->bufsize >= len)
1597 		return 0;
1598 	if (!(tmp = realloc(q->buf, len)))
1599 		return !0;
1600 	q->buf = tmp;
1601 	q->bufsize = len;
1602 	return 0;
1603 }
1604 
1605 static
1606 int
1607 evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
1608 {
1609 	struct probe_event_header *evh = (struct probe_event_header *)buf;
1610 	struct cpu *cpu;
1611 
1612 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1613 		      sizeof(*evh) - sizeof(evh->eh)))
1614 		return !0;
1615 	memset(ev, '\0', sizeof(*ev));
1616 	ev->ts = evh->eh.ts;
1617 	ev->type = EVTR_TYPE_PROBE;
1618 	ev->line = evh->line;
1619 	ev->cpu = evh->cpu;
1620 	if ((cpu = evtr_cpu(evtr, evh->cpu))) {
1621 		ev->td = cpu->td;
1622 	} else {
1623 		ev->td = NULL;
1624 	}
1625 	if (evh->file) {
1626 		ev->file = string_map_find(
1627 			&evtr->maps[EVTR_NS_PATH - 1].root,
1628 			evh->file);
1629 		if (!ev->file) {
1630 			evtr->errmsg = "unknown id for file path";
1631 			evtr->err = !0;
1632 			ev->file = "<unknown>";
1633 		} else {
1634 			validate_string(ev->file);
1635 		}
1636 	} else {
1637 		ev->file = "<unknown>";
1638 	}
1639 	if (evh->fmt) {
1640 		const struct event_fmt *fmt;
1641 		if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
1642 			evtr->errmsg = "unknown id for event fmt";
1643 			evtr->err = !0;
1644 			ev->fmt = NULL;
1645 		} else {
1646 			ev->fmt = fmt->fmt;
1647 			validate_string(fmt->fmt);
1648 		}
1649 	}
1650 	if (evh->datalen) {
1651 		if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
1652 			evtr->err = ENOMEM;
1653 		} else if (!evtr_read(evtr, q->buf, evh->datalen)) {
1654 			struct replace_ctx replctx = {
1655 				.evtr = evtr,
1656 				.ts = ev->ts,
1657 			};
1658 			assert(ev->fmt);
1659 
1660 			ev->fmtdata = q->buf;
1661 			/*
1662 			 * If the format specifies any string pointers, there
1663 			 * is a string id stored in the fmtdata. Look it up
1664 			 * and replace it with a string pointer before
1665 			 * returning it to the user.
1666 			 */
1667 			if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
1668 								  ev->fmtdata),
1669 					       replace_strid, &replctx) < 0)
1670 				return evtr->err;
1671 			if (evtr->err)
1672 				return evtr->err;
1673 			((char *)ev->fmtdata)[evh->datalen] = '\0';
1674 			ev->fmtdatalen = evh->datalen;
1675 		}
1676 	}
1677 	evtr_run_callbacks(ev, q);
1678 	return evtr->err;
1679 }
1680 
1681 static
1682 int
1683 evtr_skip_to_record(evtr_t evtr)
1684 {
1685 	int skip;
1686 
1687 	skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
1688 	if (skip > 0) {
1689 		if (fseek(evtr->f, skip, SEEK_CUR)) {
1690 			evtr->err = errno;
1691 			evtr->errmsg = strerror(errno);
1692 			return !0;
1693 		}
1694 		evtr->bytes += skip;
1695 	}
1696 	return 0;
1697 }
1698 
1699 static
1700 int
1701 evtr_load_sysinfo(evtr_t evtr)
1702 {
1703 	uint16_t ncpus;
1704 	int i;
1705 
1706 	if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1707 		return !0;
1708 	}
1709 	if (evtr->cpus)
1710 		return 0;
1711 	evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1712 	if (!evtr->cpus) {
1713 		evtr->err = ENOMEM;
1714 		return !0;
1715 	}
1716 	evtr->ncpus = ncpus;
1717 	for (i = 0; i < ncpus; ++i) {
1718 		evtr->cpus[i].td = NULL;
1719 		evtr->cpus[i].freq = -1.0;
1720 	}
1721 	return 0;
1722 }
1723 
1724 static
1725 int
1726 evtr_load_cpuinfo(evtr_t evtr)
1727 {
1728 	struct cpuinfo_event_header cih;
1729 	struct cpu *cpu;
1730 
1731 	if (evtr_read(evtr, &cih, sizeof(cih))) {
1732 		return !0;
1733 	}
1734 	if (cih.freq < 0.0) {
1735 		evtr->errmsg = "cpu freq is negative";
1736 		evtr->err = EINVAL;
1737 		return !0;
1738 	}
1739 	/*
1740 	 * Notice that freq is merely a multiplier with
1741 	 * which we convert a timestamp to seconds; if
1742 	 * ts is not in cycles, freq is not the frequency.
1743 	 */
1744 	if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1745 		evtr->errmsg = "freq for invalid cpu";
1746 		evtr->err = EINVAL;
1747 		return !0;
1748 	}
1749 	cpu->freq = cih.freq;
1750 	return 0;
1751 }
1752 
1753 static
1754 int
1755 _evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
1756 {
1757 	char buf[MAX_EVHDR_SIZE];
1758 	int ret, err;
1759 	struct trace_event_header *evhdr = (struct trace_event_header *)buf;
1760 
1761 	for (ret = 0; !ret;) {
1762 		if (q->flags & EVTRQF_PENDING) {
1763 			q->off = evtr->bytes;
1764 			memcpy(ev, &q->pending_event, sizeof(*ev));
1765 			q->flags &= ~EVTRQF_PENDING;
1766 			return 0;
1767 		}
1768 		if (evtr_read(evtr, &evhdr->type, 1)) {
1769 			if (feof(evtr->f)) {
1770 				evtr->errmsg = NULL;
1771 				evtr->err = 0;
1772 				return -1;
1773 			}
1774 			return !0;
1775 		}
1776 		/*
1777 		 * skip pad records -- this will only happen if there's a
1778 		 * variable sized record close to the boundary
1779 		 */
1780 		if (evhdr->type == EVTR_TYPE_PAD) {
1781 			evtr_skip_to_record(evtr);
1782 			continue;
1783 		}
1784 		if (evhdr->type == EVTR_TYPE_SYSINFO) {
1785 			evtr_load_sysinfo(evtr);
1786 			continue;
1787 		} else if (evhdr->type == EVTR_TYPE_CPUINFO) {
1788 			evtr_load_cpuinfo(evtr);
1789 			continue;
1790 		}
1791 		if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
1792 			return feof(evtr->f) ? -1 : !0;
1793 		switch (evhdr->type) {
1794 		case EVTR_TYPE_PROBE:
1795 			if ((err = evtr_load_probe(evtr, ev, buf, q))) {
1796 				if (err == -1) {
1797 					/* no match */
1798 					ret = 0;
1799 				} else {
1800 					return !0;
1801 				}
1802 			} else {
1803 				ret = !0;
1804 			}
1805 			break;
1806 		case EVTR_TYPE_STR:
1807 			if (evtr_load_string(evtr, buf)) {
1808 				return !0;
1809 			}
1810 			break;
1811 		case EVTR_TYPE_FMT:
1812 			if (evtr_load_fmt(q, buf)) {
1813 				return !0;
1814 			}
1815 			break;
1816 		default:
1817 			evtr->err = !0;
1818 			evtr->errmsg = "unknown event type (corrupt input?)";
1819 			return !0;
1820 		}
1821 		evtr_skip_to_record(evtr);
1822 		if (ret) {
1823 			if (!evtr_match_filters(q, ev)) {
1824 				ret = 0;
1825 				continue;
1826 			}
1827 			q->off = evtr->bytes;
1828 			return 0;
1829 		}
1830 	}
1831 	/* can't get here */
1832 	return !0;
1833 }
1834 
1835 static
1836 int
1837 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1838 {
1839 	struct evtr_query *q;
1840 	int ret;
1841 
1842 	if (!(q = evtr_query_init(evtr, NULL, 0))) {
1843 		evtr->err = ENOMEM;
1844 		return !0;
1845 	}
1846 	ret = _evtr_next_event(evtr, ev, q);
1847 	evtr_query_destroy(q);
1848 	return ret;
1849 }
1850 
1851 int
1852 evtr_last_event(evtr_t evtr, evtr_event_t ev)
1853 {
1854 	struct stat st;
1855 	int fd;
1856 	off_t last_boundary;
1857 
1858 	if (evtr_error(evtr))
1859 		return !0;
1860 
1861 	fd = fileno(evtr->f);
1862 	if (fstat(fd, &st))
1863 		return !0;
1864 	/*
1865 	 * This skips pseudo records, so we can't provide
1866 	 * an event with all fields filled in this way.
1867 	 * It's doable, just needs some care. TBD.
1868 	 */
1869 	if (0 && (st.st_mode & S_IFREG)) {
1870 		/*
1871 		 * Skip to last boundary, that's the closest to the EOF
1872 		 * location that we are sure contains a header so we can
1873 		 * pick up the stream.
1874 		 */
1875 		last_boundary = (st.st_size / REC_BOUNDARY) * REC_BOUNDARY;
1876 		/* XXX: ->bytes should be in query */
1877 		assert(evtr->bytes == 0);
1878 		evtr_skip(evtr, last_boundary);
1879 	}
1880 
1881 
1882 	/*
1883 	 * If we can't seek, we need to go through the whole file.
1884 	 * Since you can't seek back, this is pretty useless unless
1885 	 * you really are interested only in the last event.
1886 	 */
1887 	while (!evtr_next_event(evtr, ev))
1888 		;
1889 	if (evtr_error(evtr))
1890 		return !0;
1891 	evtr_rewind(evtr);
1892 	return 0;
1893 }
1894 
1895 struct evtr_query *
1896 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1897 {
1898 	struct evtr_query *q;
1899 	int i;
1900 
1901 	if (!(q = malloc(sizeof(*q)))) {
1902 		return q;
1903 	}
1904 	q->bufsize = 2;
1905 	if (!(q->buf = malloc(q->bufsize))) {
1906 		goto free_q;
1907 	}
1908 	if (!(q->symtab = symtab_new()))
1909 		goto free_buf;
1910 	q->evtr = evtr;
1911 	q->off = 0;
1912 	q->filt = filt;
1913 	q->nfilt = nfilt;
1914 	TAILQ_INIT(&q->unresolved_filtq);
1915 	q->nmatched = 0;
1916 	q->cbs = NULL;
1917 	q->ncbs = 0;
1918 	q->flags = 0;
1919 	memset(&q->pending_event, '\0', sizeof(q->pending_event));
1920 	if (evtr_register_callback(q, &thread_creation_callback, q)) {
1921 		goto free_symtab;
1922 	}
1923 	if (evtr_register_callback(q, &thread_switch_callback, q)) {
1924 		goto free_cbs;
1925 	}
1926 	if (evtr_query_needs_parsing(q) &&
1927 	    evtr_register_callback(q, &parse_callback, q)) {
1928 		goto free_cbs;
1929 	}
1930 
1931 	for (i = 0; i < nfilt; ++i) {
1932 		filt[i].flags = 0;
1933 		if (filt[i].fmt == NULL)
1934 			continue;
1935 		if (evtr_filter_register(q, &filt[i])) {
1936 			evtr_deregister_filters(q, filt, i);
1937 			goto free_symtab;
1938 		}
1939 	}
1940 
1941 	return q;
1942 free_cbs:
1943 	evtr_deregister_callbacks(q);
1944 free_symtab:
1945 	symtab_destroy(q->symtab);
1946 free_buf:
1947 	free(q->buf);
1948 free_q:
1949 	free(q);
1950 	return NULL;
1951 }
1952 
1953 void
1954 evtr_query_destroy(struct evtr_query *q)
1955 {
1956 	evtr_deregister_filters(q, q->filt, q->nfilt);
	evtr_deregister_callbacks(q);
	symtab_destroy(q->symtab);
1957 
1958 	free(q->buf);
1959 	free(q);
1960 }
1961 
1962 int
1963 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1964 {
1965 	if (evtr_query_error(q))
1966 		return !0;
1967 	/* we may support that in the future */
1968 	if (q->off != q->evtr->bytes) {
1969 		q->errmsg = "evtr/query offset mismatch";
1970 		return !0;
1971 	}
1972 	return _evtr_next_event(q->evtr, ev, q);
1973 }
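/*
 * Minimal read-path sketch (illustration only; struct evtr_filter is
 * declared in evtr.h and error handling is mostly omitted).  The filter
 * below keeps probes from any cpu and any format; with no filters at
 * all, every probe is returned.  As thread_map_insert() warns, ev.td and
 * its ->comm may be recycled, so copy them if they must outlive the next
 * evtr_query_next() call.
 */
#if 0
	FILE *fp = fopen("trace.evtr", "r");
	evtr_t r = evtr_open_read(fp);
	struct evtr_filter filt = {
		.ev_type = EVTR_TYPE_PROBE,
		.fmt = NULL,	/* any format */
		.cpu = -1,	/* any cpu */
	};
	evtr_query_t q = evtr_query_init(r, &filt, 1);
	struct evtr_event ev;
	char buf[256];

	while (!evtr_query_next(q, &ev)) {
		evtr_event_data(&ev, buf, sizeof(buf));
		printf("%ju cpu%d %s\n", (uintmax_t)ev.ts, ev.cpu, buf);
	}
	if (evtr_query_error(q))
		fprintf(stderr, "%s\n", evtr_query_errmsg(q));
	evtr_query_destroy(q);
	evtr_close(r);
	fclose(fp);
#endif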
1974 
1975 int
1976 evtr_ncpus(evtr_t evtr)
1977 {
1978 	return evtr->ncpus;
1979 }
1980 
1981 int
1982 evtr_cpufreqs(evtr_t evtr, double *freqs)
1983 {
1984 	int i;
1985 
1986 	if (!freqs)
1987 		return EINVAL;
1988 	for (i = 0; i < evtr->ncpus; ++i) {
1989 		freqs[i] = evtr->cpus[i].freq;
1990 	}
1991 	return 0;
1992 }
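/*
 * Sketch of converting timestamps using the per-cpu multiplier
 * (illustration only, continuing the read sketch above; it assumes the
 * timestamps are cycle counts and freq is the corresponding frequency,
 * as the comment in evtr_load_cpuinfo() cautions).
 */
#if 0
	double freqs[64];
	int n = evtr_ncpus(r);

	if (n > 0 && n <= 64 && evtr_cpufreqs(r, freqs) == 0 &&
	    freqs[ev.cpu] > 0.0) {
		double secs = ev.ts / freqs[ev.cpu];
		printf("event at %.9f seconds\n", secs);
	}
#endif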
1993