xref: /linux/tools/perf/builtin-record.c (revision 44f57d78)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "perf.h"
12 
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18 
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/session.h"
27 #include "util/tool.h"
28 #include "util/symbol.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "util/bpf-event.h"
45 #include "asm/bug.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <locale.h>
50 #include <poll.h>
51 #include <unistd.h>
52 #include <sched.h>
53 #include <signal.h>
54 #include <sys/mman.h>
55 #include <sys/wait.h>
56 #include <linux/time64.h>
57 
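/*
 * State for 'perf record --switch-output': a new output file can be
 * triggered by a SIGUSR2 signal, a written-bytes threshold or a time
 * period, optionally rotating among num_files generated files.
 */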
58 struct switch_output {
59 	bool		 enabled;
60 	bool		 signal;
61 	unsigned long	 size;
62 	unsigned long	 time;
63 	const char	*str;
64 	bool		 set;
65 	char		 **filenames;
66 	int		 num_files;
67 	int		 cur_file;
68 };
69 
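/*
 * Per-session state of the record command: tool callbacks, parsed options,
 * the perf.data output, the event list, and bookkeeping such as
 * bytes_written that drives the switch-output size threshold.
 */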
70 struct record {
71 	struct perf_tool	tool;
72 	struct record_opts	opts;
73 	u64			bytes_written;
74 	struct perf_data	data;
75 	struct auxtrace_record	*itr;
76 	struct perf_evlist	*evlist;
77 	struct perf_session	*session;
78 	int			realtime_prio;
79 	bool			no_buildid;
80 	bool			no_buildid_set;
81 	bool			no_buildid_cache;
82 	bool			no_buildid_cache_set;
83 	bool			buildid_all;
84 	bool			timestamp_filename;
85 	bool			timestamp_boundary;
86 	struct switch_output	switch_output;
87 	unsigned long long	samples;
88 	cpu_set_t		affinity_mask;
89 };
90 
91 static volatile int auxtrace_record__snapshot_started;
92 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93 static DEFINE_TRIGGER(switch_output_trigger);
94 
95 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96 	"SYS", "NODE", "CPU"
97 };
98 
99 static bool switch_output_signal(struct record *rec)
100 {
101 	return rec->switch_output.signal &&
102 	       trigger_is_ready(&switch_output_trigger);
103 }
104 
105 static bool switch_output_size(struct record *rec)
106 {
107 	return rec->switch_output.size &&
108 	       trigger_is_ready(&switch_output_trigger) &&
109 	       (rec->bytes_written >= rec->switch_output.size);
110 }
111 
112 static bool switch_output_time(struct record *rec)
113 {
114 	return rec->switch_output.time &&
115 	       trigger_is_ready(&switch_output_trigger);
116 }
117 
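/*
 * Append a block of already formatted event data to the perf.data file,
 * account for it in rec->bytes_written and fire the switch-output trigger
 * once the configured size threshold is crossed.
 */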
118 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119 			 void *bf, size_t size)
120 {
121 	struct perf_data_file *file = &rec->session->data->file;
122 
123 	if (perf_data_file__write(file, bf, size) < 0) {
124 		pr_err("failed to write perf data, error: %m\n");
125 		return -1;
126 	}
127 
128 	rec->bytes_written += size;
129 
130 	if (switch_output_size(rec))
131 		trigger_hit(&switch_output_trigger);
132 
133 	return 0;
134 }
135 
136 static int record__aio_enabled(struct record *rec);
137 static int record__comp_enabled(struct record *rec);
138 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
139 			    void *src, size_t src_size);
140 
141 #ifdef HAVE_AIO_SUPPORT
142 static int record__aio_write(struct aiocb *cblock, int trace_fd,
143 		void *buf, size_t size, off_t off)
144 {
145 	int rc;
146 
147 	cblock->aio_fildes = trace_fd;
148 	cblock->aio_buf    = buf;
149 	cblock->aio_nbytes = size;
150 	cblock->aio_offset = off;
151 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
152 
153 	do {
154 		rc = aio_write(cblock);
155 		if (rc == 0) {
156 			break;
157 		} else if (errno != EAGAIN) {
158 			cblock->aio_fildes = -1;
159 			pr_err("failed to queue perf data, error: %m\n");
160 			break;
161 		}
162 	} while (1);
163 
164 	return rc;
165 }
166 
167 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
168 {
169 	void *rem_buf;
170 	off_t rem_off;
171 	size_t rem_size;
172 	int rc, aio_errno;
173 	ssize_t aio_ret, written;
174 
175 	aio_errno = aio_error(cblock);
176 	if (aio_errno == EINPROGRESS)
177 		return 0;
178 
179 	written = aio_ret = aio_return(cblock);
180 	if (aio_ret < 0) {
181 		if (aio_errno != EINTR)
182 			pr_err("failed to write perf data, error: %m\n");
183 		written = 0;
184 	}
185 
186 	rem_size = cblock->aio_nbytes - written;
187 
188 	if (rem_size == 0) {
189 		cblock->aio_fildes = -1;
190 		/*
191 		 * md->refcount is incremented in record__aio_pushfn() for
192 		 * every aio write request started in record__aio_push() so
193 		 * decrement it because the request is now complete.
194 		 */
195 		perf_mmap__put(md);
196 		rc = 1;
197 	} else {
198 		/*
199 		 * The aio write request may need to be restarted with the
200 		 * remainder if the kernel didn't write the whole
201 		 * chunk at once.
202 		 */
203 		rem_off = cblock->aio_offset + written;
204 		rem_buf = (void *)(cblock->aio_buf + written);
205 		record__aio_write(cblock, cblock->aio_fildes,
206 				rem_buf, rem_size, rem_off);
207 		rc = 0;
208 	}
209 
210 	return rc;
211 }
212 
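/*
 * Reap completed AIO writes for this map: with sync_all == false, return
 * the index of the first free control block as soon as one is available;
 * with sync_all == true, keep suspending until every outstanding write has
 * completed and then return -1.
 */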
213 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
214 {
215 	struct aiocb **aiocb = md->aio.aiocb;
216 	struct aiocb *cblocks = md->aio.cblocks;
217 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
218 	int i, do_suspend;
219 
220 	do {
221 		do_suspend = 0;
222 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
223 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
224 				if (sync_all)
225 					aiocb[i] = NULL;
226 				else
227 					return i;
228 			} else {
229 				/*
230 				 * The started aio write is not complete yet,
231 				 * so it has to be waited for before the
232 				 * next allocation.
233 				 */
234 				aiocb[i] = &cblocks[i];
235 				do_suspend = 1;
236 			}
237 		}
238 		if (!do_suspend)
239 			return -1;
240 
241 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
242 			if (!(errno == EAGAIN || errno == EINTR))
243 				pr_err("failed to sync perf data, error: %m\n");
244 		}
245 	} while (1);
246 }
247 
248 struct record_aio {
249 	struct record	*rec;
250 	void		*data;
251 	size_t		size;
252 };
253 
254 static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
255 {
256 	struct record_aio *aio = to;
257 
258 	/*
259 	 * The map->base data pointed to by buf is copied into a free map->aio.data[]
260 	 * buffer, to release space in the kernel buffer as fast as possible by
261 	 * calling perf_mmap__consume() from the perf_mmap__push() function.
262 	 *
263 	 * That lets the kernel proceed with storing more profiling data into
264 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
265 	 *
266 	 * Copying can be done in two steps in case the chunk of profiling data
267 	 * crosses the upper bound of the kernel buffer. In this case we first move
268 	 * part of the data from map->start up to the upper bound, and then the
269 	 * remainder from the beginning of the kernel buffer up to the end of the data chunk.
270 	 */
271 
272 	if (record__comp_enabled(aio->rec)) {
273 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
274 				     perf_mmap__mmap_len(map) - aio->size,
275 				     buf, size);
276 	} else {
277 		memcpy(aio->data + aio->size, buf, size);
278 	}
279 
280 	if (!aio->size) {
281 		/*
282 		 * Increment map->refcount to guard the map->aio.data[] buffer
283 		 * from premature deallocation, because the map object can be
284 		 * released before the aio write request started on the
285 		 * map->aio.data[] buffer is complete.
286 		 *
287 		 * perf_mmap__put() is done at record__aio_complete() after the
288 		 * started aio request completes, or at record__aio_push()
289 		 * if the request failed to start.
290 		 */
291 		perf_mmap__get(map);
292 	}
293 
294 	aio->size += size;
295 
296 	return size;
297 }
298 
299 static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
300 {
301 	int ret, idx;
302 	int trace_fd = rec->session->data->file.fd;
303 	struct record_aio aio = { .rec = rec, .size = 0 };
304 
305 	/*
306 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
307 	 * becomes available after the previous aio write operation.
308 	 */
309 
310 	idx = record__aio_sync(map, false);
311 	aio.data = map->aio.data[idx];
312 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
313 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
314 		return ret;
315 
316 	rec->samples++;
317 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
318 	if (!ret) {
319 		*off += aio.size;
320 		rec->bytes_written += aio.size;
321 		if (switch_output_size(rec))
322 			trigger_hit(&switch_output_trigger);
323 	} else {
324 		/*
325 		 * Decrement the map->refcount that was incremented in record__aio_pushfn()
326 		 * if the record__aio_write() operation failed to start; otherwise
327 		 * map->refcount is decremented in record__aio_complete() after the
328 		 * aio write operation finishes successfully.
329 		 */
330 		perf_mmap__put(map);
331 	}
332 
333 	return ret;
334 }
335 
336 static off_t record__aio_get_pos(int trace_fd)
337 {
338 	return lseek(trace_fd, 0, SEEK_CUR);
339 }
340 
341 static void record__aio_set_pos(int trace_fd, off_t pos)
342 {
343 	lseek(trace_fd, pos, SEEK_SET);
344 }
345 
346 static void record__aio_mmap_read_sync(struct record *rec)
347 {
348 	int i;
349 	struct perf_evlist *evlist = rec->evlist;
350 	struct perf_mmap *maps = evlist->mmap;
351 
352 	if (!record__aio_enabled(rec))
353 		return;
354 
355 	for (i = 0; i < evlist->nr_mmaps; i++) {
356 		struct perf_mmap *map = &maps[i];
357 
358 		if (map->base)
359 			record__aio_sync(map, true);
360 	}
361 }
362 
363 static int nr_cblocks_default = 1;
364 static int nr_cblocks_max = 4;
365 
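/*
 * Parse the number of AIO control blocks used for trace writing (assumed
 * to back an --aio[=n] style option, mirroring the record.aio config key
 * handled in perf_record_config()); a missing or zero value falls back to
 * nr_cblocks_default.
 */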
366 static int record__aio_parse(const struct option *opt,
367 			     const char *str,
368 			     int unset)
369 {
370 	struct record_opts *opts = (struct record_opts *)opt->value;
371 
372 	if (unset) {
373 		opts->nr_cblocks = 0;
374 	} else {
375 		if (str)
376 			opts->nr_cblocks = strtol(str, NULL, 0);
377 		if (!opts->nr_cblocks)
378 			opts->nr_cblocks = nr_cblocks_default;
379 	}
380 
381 	return 0;
382 }
383 #else /* HAVE_AIO_SUPPORT */
384 static int nr_cblocks_max = 0;
385 
386 static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
387 			    off_t *off __maybe_unused)
388 {
389 	return -1;
390 }
391 
392 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
393 {
394 	return -1;
395 }
396 
397 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
398 {
399 }
400 
401 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
402 {
403 }
404 #endif
405 
406 static int record__aio_enabled(struct record *rec)
407 {
408 	return rec->opts.nr_cblocks > 0;
409 }
410 
411 #define MMAP_FLUSH_DEFAULT 1
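/*
 * Parse the minimal number of bytes that must accumulate in a ring buffer
 * before it is flushed to perf.data (assumed to back the --mmap-flush
 * option).  The value accepts B/K/M/G suffixes, e.g. an illustrative
 * '--mmap-flush 16M', and is capped at a quarter of the mmap size.
 */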
412 static int record__mmap_flush_parse(const struct option *opt,
413 				    const char *str,
414 				    int unset)
415 {
416 	int flush_max;
417 	struct record_opts *opts = (struct record_opts *)opt->value;
418 	static struct parse_tag tags[] = {
419 			{ .tag  = 'B', .mult = 1       },
420 			{ .tag  = 'K', .mult = 1 << 10 },
421 			{ .tag  = 'M', .mult = 1 << 20 },
422 			{ .tag  = 'G', .mult = 1 << 30 },
423 			{ .tag  = 0 },
424 	};
425 
426 	if (unset)
427 		return 0;
428 
429 	if (str) {
430 		opts->mmap_flush = parse_tag_value(str, tags);
431 		if (opts->mmap_flush == (int)-1)
432 			opts->mmap_flush = strtol(str, NULL, 0);
433 	}
434 
435 	if (!opts->mmap_flush)
436 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
437 
438 	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
439 	flush_max /= 4;
440 	if (opts->mmap_flush > flush_max)
441 		opts->mmap_flush = flush_max;
442 
443 	return 0;
444 }
445 
446 #ifdef HAVE_ZSTD_SUPPORT
447 static unsigned int comp_level_default = 1;
448 
449 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
450 {
451 	struct record_opts *opts = opt->value;
452 
453 	if (unset) {
454 		opts->comp_level = 0;
455 	} else {
456 		if (str)
457 			opts->comp_level = strtol(str, NULL, 0);
458 		if (!opts->comp_level)
459 			opts->comp_level = comp_level_default;
460 	}
461 
462 	return 0;
463 }
464 #endif
465 static unsigned int comp_level_max = 22;
466 
467 static int record__comp_enabled(struct record *rec)
468 {
469 	return rec->opts.comp_level > 0;
470 }
471 
472 static int process_synthesized_event(struct perf_tool *tool,
473 				     union perf_event *event,
474 				     struct perf_sample *sample __maybe_unused,
475 				     struct machine *machine __maybe_unused)
476 {
477 	struct record *rec = container_of(tool, struct record, tool);
478 	return record__write(rec, NULL, event, event->header.size);
479 }
480 
481 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
482 {
483 	struct record *rec = to;
484 
485 	if (record__comp_enabled(rec)) {
486 		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
487 		bf   = map->data;
488 	}
489 
490 	rec->samples++;
491 	return record__write(rec, map, bf, size);
492 }
493 
494 static volatile int done;
495 static volatile int signr = -1;
496 static volatile int child_finished;
497 
498 static void sig_handler(int sig)
499 {
500 	if (sig == SIGCHLD)
501 		child_finished = 1;
502 	else
503 		signr = sig;
504 
505 	done = 1;
506 }
507 
508 static void sigsegv_handler(int sig)
509 {
510 	perf_hooks__recover();
511 	sighandler_dump_stack(sig);
512 }
513 
514 static void record__sig_exit(void)
515 {
516 	if (signr == -1)
517 		return;
518 
519 	signal(signr, SIG_DFL);
520 	raise(signr);
521 }
522 
523 #ifdef HAVE_AUXTRACE_SUPPORT
524 
525 static int record__process_auxtrace(struct perf_tool *tool,
526 				    struct perf_mmap *map,
527 				    union perf_event *event, void *data1,
528 				    size_t len1, void *data2, size_t len2)
529 {
530 	struct record *rec = container_of(tool, struct record, tool);
531 	struct perf_data *data = &rec->data;
532 	size_t padding;
533 	u8 pad[8] = {0};
534 
535 	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
536 		off_t file_offset;
537 		int fd = perf_data__fd(data);
538 		int err;
539 
540 		file_offset = lseek(fd, 0, SEEK_CUR);
541 		if (file_offset == -1)
542 			return -1;
543 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
544 						     event, file_offset);
545 		if (err)
546 			return err;
547 	}
548 
549 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
550 	padding = (len1 + len2) & 7;
551 	if (padding)
552 		padding = 8 - padding;
553 
554 	record__write(rec, map, event, event->header.size);
555 	record__write(rec, map, data1, len1);
556 	if (len2)
557 		record__write(rec, map, data2, len2);
558 	record__write(rec, map, &pad, padding);
559 
560 	return 0;
561 }
562 
563 static int record__auxtrace_mmap_read(struct record *rec,
564 				      struct perf_mmap *map)
565 {
566 	int ret;
567 
568 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
569 				  record__process_auxtrace);
570 	if (ret < 0)
571 		return ret;
572 
573 	if (ret)
574 		rec->samples++;
575 
576 	return 0;
577 }
578 
579 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
580 					       struct perf_mmap *map)
581 {
582 	int ret;
583 
584 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
585 					   record__process_auxtrace,
586 					   rec->opts.auxtrace_snapshot_size);
587 	if (ret < 0)
588 		return ret;
589 
590 	if (ret)
591 		rec->samples++;
592 
593 	return 0;
594 }
595 
596 static int record__auxtrace_read_snapshot_all(struct record *rec)
597 {
598 	int i;
599 	int rc = 0;
600 
601 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
602 		struct perf_mmap *map = &rec->evlist->mmap[i];
603 
604 		if (!map->auxtrace_mmap.base)
605 			continue;
606 
607 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
608 			rc = -1;
609 			goto out;
610 		}
611 	}
612 out:
613 	return rc;
614 }
615 
616 static void record__read_auxtrace_snapshot(struct record *rec)
617 {
618 	pr_debug("Recording AUX area tracing snapshot\n");
619 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
620 		trigger_error(&auxtrace_snapshot_trigger);
621 	} else {
622 		if (auxtrace_record__snapshot_finish(rec->itr))
623 			trigger_error(&auxtrace_snapshot_trigger);
624 		else
625 			trigger_ready(&auxtrace_snapshot_trigger);
626 	}
627 }
628 
629 static int record__auxtrace_init(struct record *rec)
630 {
631 	int err;
632 
633 	if (!rec->itr) {
634 		rec->itr = auxtrace_record__init(rec->evlist, &err);
635 		if (err)
636 			return err;
637 	}
638 
639 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
640 					      rec->opts.auxtrace_snapshot_opts);
641 	if (err)
642 		return err;
643 
644 	return auxtrace_parse_filters(rec->evlist);
645 }
646 
647 #else
648 
649 static inline
650 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
651 			       struct perf_mmap *map __maybe_unused)
652 {
653 	return 0;
654 }
655 
656 static inline
657 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
658 {
659 }
660 
661 static inline
662 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
663 {
664 	return 0;
665 }
666 
667 static int record__auxtrace_init(struct record *rec __maybe_unused)
668 {
669 	return 0;
670 }
671 
672 #endif
673 
674 static int record__mmap_evlist(struct record *rec,
675 			       struct perf_evlist *evlist)
676 {
677 	struct record_opts *opts = &rec->opts;
678 	char msg[512];
679 
680 	if (opts->affinity != PERF_AFFINITY_SYS)
681 		cpu__setup_cpunode_map();
682 
683 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
684 				 opts->auxtrace_mmap_pages,
685 				 opts->auxtrace_snapshot_mode,
686 				 opts->nr_cblocks, opts->affinity,
687 				 opts->mmap_flush, opts->comp_level) < 0) {
688 		if (errno == EPERM) {
689 			pr_err("Permission error mapping pages.\n"
690 			       "Consider increasing "
691 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
692 			       "or try again with a smaller value of -m/--mmap_pages.\n"
693 			       "(current value: %u,%u)\n",
694 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
695 			return -errno;
696 		} else {
697 			pr_err("failed to mmap with %d (%s)\n", errno,
698 				str_error_r(errno, msg, sizeof(msg)));
699 			if (errno)
700 				return -errno;
701 			else
702 				return -EINVAL;
703 		}
704 	}
705 	return 0;
706 }
707 
708 static int record__mmap(struct record *rec)
709 {
710 	return record__mmap_evlist(rec, rec->evlist);
711 }
712 
713 static int record__open(struct record *rec)
714 {
715 	char msg[BUFSIZ];
716 	struct perf_evsel *pos;
717 	struct perf_evlist *evlist = rec->evlist;
718 	struct perf_session *session = rec->session;
719 	struct record_opts *opts = &rec->opts;
720 	int rc = 0;
721 
722 	/*
723 	 * For initial_delay we need to add a dummy event so that we can track
724 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
725 	 * real events, the ones asked for by the user.
726 	 */
727 	if (opts->initial_delay) {
728 		if (perf_evlist__add_dummy(evlist))
729 			return -ENOMEM;
730 
731 		pos = perf_evlist__first(evlist);
732 		pos->tracking = 0;
733 		pos = perf_evlist__last(evlist);
734 		pos->tracking = 1;
735 		pos->attr.enable_on_exec = 1;
736 	}
737 
738 	perf_evlist__config(evlist, opts, &callchain_param);
739 
740 	evlist__for_each_entry(evlist, pos) {
741 try_again:
742 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
743 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
744 				if (verbose > 0)
745 					ui__warning("%s\n", msg);
746 				goto try_again;
747 			}
748 			if ((errno == EINVAL || errno == EBADF) &&
749 			    pos->leader != pos &&
750 			    pos->weak_group) {
751 			        pos = perf_evlist__reset_weak_group(evlist, pos);
752 				goto try_again;
753 			}
754 			rc = -errno;
755 			perf_evsel__open_strerror(pos, &opts->target,
756 						  errno, msg, sizeof(msg));
757 			ui__error("%s\n", msg);
758 			goto out;
759 		}
760 
761 		pos->supported = true;
762 	}
763 
764 	if (perf_evlist__apply_filters(evlist, &pos)) {
765 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
766 			pos->filter, perf_evsel__name(pos), errno,
767 			str_error_r(errno, msg, sizeof(msg)));
768 		rc = -1;
769 		goto out;
770 	}
771 
772 	rc = record__mmap(rec);
773 	if (rc)
774 		goto out;
775 
776 	session->evlist = evlist;
777 	perf_session__set_id_hdr_size(session);
778 out:
779 	return rc;
780 }
781 
782 static int process_sample_event(struct perf_tool *tool,
783 				union perf_event *event,
784 				struct perf_sample *sample,
785 				struct perf_evsel *evsel,
786 				struct machine *machine)
787 {
788 	struct record *rec = container_of(tool, struct record, tool);
789 
790 	if (rec->evlist->first_sample_time == 0)
791 		rec->evlist->first_sample_time = sample->time;
792 
793 	rec->evlist->last_sample_time = sample->time;
794 
795 	if (rec->buildid_all)
796 		return 0;
797 
798 	rec->samples++;
799 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
800 }
801 
802 static int process_buildids(struct record *rec)
803 {
804 	struct perf_session *session = rec->session;
805 
806 	if (perf_data__size(&rec->data) == 0)
807 		return 0;
808 
809 	/*
810 	 * During this process, it'll load the kernel map and replace
811 	 * dso->long_name with a real pathname it found.  In this case
812 	 * we prefer a vmlinux path like
813 	 *   /lib/modules/3.16.4/build/vmlinux
814 	 *
815 	 * rather than a build-id path (in the debug directory), e.g.
816 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
817 	 */
818 	symbol_conf.ignore_vmlinux_buildid = true;
819 
820 	/*
821 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
822 	 * so there is no need to process samples. But if timestamp_boundary is
823 	 * enabled, it still needs to walk all samples to get the timestamps of
824 	 * the first/last samples.
825 	 */
826 	if (rec->buildid_all && !rec->timestamp_boundary)
827 		rec->tool.sample = NULL;
828 
829 	return perf_session__process_events(session);
830 }
831 
832 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
833 {
834 	int err;
835 	struct perf_tool *tool = data;
836 	/*
837 	 * For the guest kernel, when processing the record & report subcommands,
838 	 * we arrange the module mmap prior to the guest kernel mmap and trigger
839 	 * a dso preload, because the default guest module symbols are loaded
840 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
841 	 * method avoids missing symbols when the first address is
842 	 * in a module instead of in the guest kernel.
843 	 */
844 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
845 					     machine);
846 	if (err < 0)
847 		pr_err("Couldn't record guest kernel [%d]'s reference"
848 		       " relocation symbol.\n", machine->pid);
849 
850 	/*
851 	 * We use _stext for the guest kernel because the guest kernel's
852 	 * /proc/kallsyms sometimes has no _text.
853 	 */
854 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
855 						 machine);
856 	if (err < 0)
857 		pr_err("Couldn't record guest kernel [%d]'s reference"
858 		       " relocation symbol.\n", machine->pid);
859 }
860 
861 static struct perf_event_header finished_round_event = {
862 	.size = sizeof(struct perf_event_header),
863 	.type = PERF_RECORD_FINISHED_ROUND,
864 };
865 
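/*
 * When a non-default affinity mode is selected (see record__parse_affinity()
 * below, assumed to back an --affinity=node|cpu option), migrate the
 * recording thread onto the CPU mask associated with the map that is about
 * to be read, so the copy runs close to the data.
 */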
866 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
867 {
868 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
869 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
870 		CPU_ZERO(&rec->affinity_mask);
871 		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
872 		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
873 	}
874 }
875 
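/*
 * Layout callback passed to zstd_compress_stream_to_records(): it appears
 * to be called with increment == 0 to reserve a PERF_RECORD_COMPRESSED
 * header, and with the number of freshly compressed bytes afterwards to
 * grow header.size accordingly.
 */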
876 static size_t process_comp_header(void *record, size_t increment)
877 {
878 	struct compressed_event *event = record;
879 	size_t size = sizeof(*event);
880 
881 	if (increment) {
882 		event->header.size += increment;
883 		return increment;
884 	}
885 
886 	event->header.type = PERF_RECORD_COMPRESSED;
887 	event->header.size = size;
888 
889 	return size;
890 }
891 
892 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
893 			    void *src, size_t src_size)
894 {
895 	size_t compressed;
896 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct compressed_event) - 1;
897 
898 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
899 						     max_record_size, process_comp_header);
900 
901 	session->bytes_transferred += src_size;
902 	session->bytes_compressed  += compressed;
903 
904 	return compressed;
905 }
906 
907 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
908 				    bool overwrite, bool synch)
909 {
910 	u64 bytes_written = rec->bytes_written;
911 	int i;
912 	int rc = 0;
913 	struct perf_mmap *maps;
914 	int trace_fd = rec->data.file.fd;
915 	off_t off = 0;
916 
917 	if (!evlist)
918 		return 0;
919 
920 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
921 	if (!maps)
922 		return 0;
923 
924 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
925 		return 0;
926 
927 	if (record__aio_enabled(rec))
928 		off = record__aio_get_pos(trace_fd);
929 
930 	for (i = 0; i < evlist->nr_mmaps; i++) {
931 		u64 flush = 0;
932 		struct perf_mmap *map = &maps[i];
933 
934 		if (map->base) {
935 			record__adjust_affinity(rec, map);
936 			if (synch) {
937 				flush = map->flush;
938 				map->flush = 1;
939 			}
940 			if (!record__aio_enabled(rec)) {
941 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
942 					if (synch)
943 						map->flush = flush;
944 					rc = -1;
945 					goto out;
946 				}
947 			} else {
948 				if (record__aio_push(rec, map, &off) < 0) {
949 					record__aio_set_pos(trace_fd, off);
950 					if (synch)
951 						map->flush = flush;
952 					rc = -1;
953 					goto out;
954 				}
955 			}
956 			if (synch)
957 				map->flush = flush;
958 		}
959 
960 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
961 		    record__auxtrace_mmap_read(rec, map) != 0) {
962 			rc = -1;
963 			goto out;
964 		}
965 	}
966 
967 	if (record__aio_enabled(rec))
968 		record__aio_set_pos(trace_fd, off);
969 
970 	/*
971 	 * Mark the round finished in case we wrote
972 	 * at least one event.
973 	 */
974 	if (bytes_written != rec->bytes_written)
975 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
976 
977 	if (overwrite)
978 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
979 out:
980 	return rc;
981 }
982 
983 static int record__mmap_read_all(struct record *rec, bool synch)
984 {
985 	int err;
986 
987 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
988 	if (err)
989 		return err;
990 
991 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
992 }
993 
994 static void record__init_features(struct record *rec)
995 {
996 	struct perf_session *session = rec->session;
997 	int feat;
998 
999 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1000 		perf_header__set_feat(&session->header, feat);
1001 
1002 	if (rec->no_buildid)
1003 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1004 
1005 	if (!have_tracepoints(&rec->evlist->entries))
1006 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1007 
1008 	if (!rec->opts.branch_stack)
1009 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1010 
1011 	if (!rec->opts.full_auxtrace)
1012 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1013 
1014 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1015 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1016 
1017 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1018 	if (!record__comp_enabled(rec))
1019 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1020 
1021 	perf_header__clear_feat(&session->header, HEADER_STAT);
1022 }
1023 
1024 static void
1025 record__finish_output(struct record *rec)
1026 {
1027 	struct perf_data *data = &rec->data;
1028 	int fd = perf_data__fd(data);
1029 
1030 	if (data->is_pipe)
1031 		return;
1032 
1033 	rec->session->header.data_size += rec->bytes_written;
1034 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1035 
1036 	if (!rec->no_buildid) {
1037 		process_buildids(rec);
1038 
1039 		if (rec->buildid_all)
1040 			dsos__hit_all(rec->session);
1041 	}
1042 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1043 
1044 	return;
1045 }
1046 
1047 static int record__synthesize_workload(struct record *rec, bool tail)
1048 {
1049 	int err;
1050 	struct thread_map *thread_map;
1051 
1052 	if (rec->opts.tail_synthesize != tail)
1053 		return 0;
1054 
1055 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1056 	if (thread_map == NULL)
1057 		return -1;
1058 
1059 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1060 						 process_synthesized_event,
1061 						 &rec->session->machines.host,
1062 						 rec->opts.sample_address);
1063 	thread_map__put(thread_map);
1064 	return err;
1065 }
1066 
1067 static int record__synthesize(struct record *rec, bool tail);
1068 
1069 static int
1070 record__switch_output(struct record *rec, bool at_exit)
1071 {
1072 	struct perf_data *data = &rec->data;
1073 	int fd, err;
1074 	char *new_filename;
1075 
1076 	/* Same size as a timestamp, e.g. "2015122520103046" */
1077 	char timestamp[] = "InvalidTimestamp";
1078 
1079 	record__aio_mmap_read_sync(rec);
1080 
1081 	record__synthesize(rec, true);
1082 	if (target__none(&rec->opts.target))
1083 		record__synthesize_workload(rec, true);
1084 
1085 	rec->samples = 0;
1086 	record__finish_output(rec);
1087 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1088 	if (err) {
1089 		pr_err("Failed to get current timestamp\n");
1090 		return -EINVAL;
1091 	}
1092 
1093 	fd = perf_data__switch(data, timestamp,
1094 				    rec->session->header.data_offset,
1095 				    at_exit, &new_filename);
1096 	if (fd >= 0 && !at_exit) {
1097 		rec->bytes_written = 0;
1098 		rec->session->header.data_size = 0;
1099 	}
1100 
1101 	if (!quiet)
1102 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1103 			data->path, timestamp);
1104 
1105 	if (rec->switch_output.num_files) {
1106 		int n = rec->switch_output.cur_file + 1;
1107 
1108 		if (n >= rec->switch_output.num_files)
1109 			n = 0;
1110 		rec->switch_output.cur_file = n;
1111 		if (rec->switch_output.filenames[n]) {
1112 			remove(rec->switch_output.filenames[n]);
1113 			free(rec->switch_output.filenames[n]);
1114 		}
1115 		rec->switch_output.filenames[n] = new_filename;
1116 	} else {
1117 		free(new_filename);
1118 	}
1119 
1120 	/* Output tracking events */
1121 	if (!at_exit) {
1122 		record__synthesize(rec, false);
1123 
1124 		/*
1125 		 * In 'perf record --switch-output' without -a,
1126 		 * record__synthesize() in record__switch_output() won't
1127 		 * generate tracking events because there's no thread_map
1128 		 * in the evlist, which causes the newly created perf.data
1129 		 * to lack map and comm information.
1130 		 * Create a fake thread_map and directly call
1131 		 * perf_event__synthesize_thread_map() for those events.
1132 		 */
1133 		if (target__none(&rec->opts.target))
1134 			record__synthesize_workload(rec, false);
1135 	}
1136 	return fd;
1137 }
1138 
1139 static volatile int workload_exec_errno;
1140 
1141 /*
1142  * perf_evlist__prepare_workload will send a SIGUSR1
1143  * if the fork fails, since we asked for it by setting its
1144  * want_signal to true.
1145  */
1146 static void workload_exec_failed_signal(int signo __maybe_unused,
1147 					siginfo_t *info,
1148 					void *ucontext __maybe_unused)
1149 {
1150 	workload_exec_errno = info->si_value.sival_int;
1151 	done = 1;
1152 	child_finished = 1;
1153 }
1154 
1155 static void snapshot_sig_handler(int sig);
1156 static void alarm_sig_handler(int sig);
1157 
1158 int __weak
1159 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1160 			    struct perf_tool *tool __maybe_unused,
1161 			    perf_event__handler_t process __maybe_unused,
1162 			    struct machine *machine __maybe_unused)
1163 {
1164 	return 0;
1165 }
1166 
1167 static const struct perf_event_mmap_page *
1168 perf_evlist__pick_pc(struct perf_evlist *evlist)
1169 {
1170 	if (evlist) {
1171 		if (evlist->mmap && evlist->mmap[0].base)
1172 			return evlist->mmap[0].base;
1173 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1174 			return evlist->overwrite_mmap[0].base;
1175 	}
1176 	return NULL;
1177 }
1178 
1179 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1180 {
1181 	const struct perf_event_mmap_page *pc;
1182 
1183 	pc = perf_evlist__pick_pc(rec->evlist);
1184 	if (pc)
1185 		return pc;
1186 	return NULL;
1187 }
1188 
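/*
 * Emit the synthetic (non-sample) events describing the current system
 * state: attrs/features/tracing data in pipe mode, time conversion and
 * auxtrace info, kernel and module maps, guest machines, thread/cpu maps,
 * BPF events and the pre-existing threads of the target.
 */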
1189 static int record__synthesize(struct record *rec, bool tail)
1190 {
1191 	struct perf_session *session = rec->session;
1192 	struct machine *machine = &session->machines.host;
1193 	struct perf_data *data = &rec->data;
1194 	struct record_opts *opts = &rec->opts;
1195 	struct perf_tool *tool = &rec->tool;
1196 	int fd = perf_data__fd(data);
1197 	int err = 0;
1198 
1199 	if (rec->opts.tail_synthesize != tail)
1200 		return 0;
1201 
1202 	if (data->is_pipe) {
1203 		/*
1204 		 * We need to synthesize events first, because some
1205 		 * features work on top of them (on the report side).
1206 		 */
1207 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1208 						   process_synthesized_event);
1209 		if (err < 0) {
1210 			pr_err("Couldn't synthesize attrs.\n");
1211 			goto out;
1212 		}
1213 
1214 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1215 						      process_synthesized_event);
1216 		if (err < 0) {
1217 			pr_err("Couldn't synthesize features.\n");
1218 			return err;
1219 		}
1220 
1221 		if (have_tracepoints(&rec->evlist->entries)) {
1222 			/*
1223 			 * FIXME: err <= 0 here actually means that
1224 			 * there were no tracepoints, so it's not really
1225 			 * an error, just that we don't need to
1226 			 * synthesize anything.  We really have to
1227 			 * return this more properly and also
1228 			 * propagate errors that are now calling die()
1229 			 */
1230 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1231 								  process_synthesized_event);
1232 			if (err <= 0) {
1233 				pr_err("Couldn't record tracing data.\n");
1234 				goto out;
1235 			}
1236 			rec->bytes_written += err;
1237 		}
1238 	}
1239 
1240 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1241 					  process_synthesized_event, machine);
1242 	if (err)
1243 		goto out;
1244 
1245 	if (rec->opts.full_auxtrace) {
1246 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1247 					session, process_synthesized_event);
1248 		if (err)
1249 			goto out;
1250 	}
1251 
1252 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1253 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1254 							 machine);
1255 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1256 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1257 				   "Check /proc/kallsyms permission or run as root.\n");
1258 
1259 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1260 						     machine);
1261 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1262 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1263 				   "Check /proc/modules permission or run as root.\n");
1264 	}
1265 
1266 	if (perf_guest) {
1267 		machines__process_guests(&session->machines,
1268 					 perf_event__synthesize_guest_os, tool);
1269 	}
1270 
1271 	err = perf_event__synthesize_extra_attr(&rec->tool,
1272 						rec->evlist,
1273 						process_synthesized_event,
1274 						data->is_pipe);
1275 	if (err)
1276 		goto out;
1277 
1278 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1279 						 process_synthesized_event,
1280 						NULL);
1281 	if (err < 0) {
1282 		pr_err("Couldn't synthesize thread map.\n");
1283 		return err;
1284 	}
1285 
1286 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1287 					     process_synthesized_event, NULL);
1288 	if (err < 0) {
1289 		pr_err("Couldn't synthesize cpu map.\n");
1290 		return err;
1291 	}
1292 
1293 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1294 						machine, opts);
1295 	if (err < 0)
1296 		pr_warning("Couldn't synthesize bpf events.\n");
1297 
1298 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
1299 					    process_synthesized_event, opts->sample_address,
1300 					    1);
1301 out:
1302 	return err;
1303 }
1304 
1305 static int __cmd_record(struct record *rec, int argc, const char **argv)
1306 {
1307 	int err;
1308 	int status = 0;
1309 	unsigned long waking = 0;
1310 	const bool forks = argc > 0;
1311 	struct perf_tool *tool = &rec->tool;
1312 	struct record_opts *opts = &rec->opts;
1313 	struct perf_data *data = &rec->data;
1314 	struct perf_session *session;
1315 	bool disabled = false, draining = false;
1316 	struct perf_evlist *sb_evlist = NULL;
1317 	int fd;
1318 	float ratio = 0;
1319 
1320 	atexit(record__sig_exit);
1321 	signal(SIGCHLD, sig_handler);
1322 	signal(SIGINT, sig_handler);
1323 	signal(SIGTERM, sig_handler);
1324 	signal(SIGSEGV, sigsegv_handler);
1325 
1326 	if (rec->opts.record_namespaces)
1327 		tool->namespace_events = true;
1328 
1329 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1330 		signal(SIGUSR2, snapshot_sig_handler);
1331 		if (rec->opts.auxtrace_snapshot_mode)
1332 			trigger_on(&auxtrace_snapshot_trigger);
1333 		if (rec->switch_output.enabled)
1334 			trigger_on(&switch_output_trigger);
1335 	} else {
1336 		signal(SIGUSR2, SIG_IGN);
1337 	}
1338 
1339 	session = perf_session__new(data, false, tool);
1340 	if (session == NULL) {
1341 		pr_err("Perf session creation failed.\n");
1342 		return -1;
1343 	}
1344 
1345 	fd = perf_data__fd(data);
1346 	rec->session = session;
1347 
1348 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1349 		pr_err("Compression initialization failed.\n");
1350 		return -1;
1351 	}
1352 
1353 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1354 	session->header.env.comp_level = rec->opts.comp_level;
1355 
1356 	record__init_features(rec);
1357 
1358 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1359 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1360 
1361 	if (forks) {
1362 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1363 						    argv, data->is_pipe,
1364 						    workload_exec_failed_signal);
1365 		if (err < 0) {
1366 			pr_err("Couldn't run the workload!\n");
1367 			status = err;
1368 			goto out_delete_session;
1369 		}
1370 	}
1371 
1372 	/*
1373 	 * If we have just a single event and are sending data
1374 	 * through a pipe, we need to force the id allocation,
1375 	 * because we synthesize the event name through the pipe
1376 	 * and need the id for that.
1377 	 */
1378 	if (data->is_pipe && rec->evlist->nr_entries == 1)
1379 		rec->opts.sample_id = true;
1380 
1381 	if (record__open(rec) != 0) {
1382 		err = -1;
1383 		goto out_child;
1384 	}
1385 	session->header.env.comp_mmap_len = session->evlist->mmap_len;
1386 
1387 	err = bpf__apply_obj_config();
1388 	if (err) {
1389 		char errbuf[BUFSIZ];
1390 
1391 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1392 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1393 			 errbuf);
1394 		goto out_child;
1395 	}
1396 
1397 	/*
1398 	 * Normally perf_session__new would do this, but it doesn't have the
1399 	 * evlist.
1400 	 */
1401 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1402 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1403 		rec->tool.ordered_events = false;
1404 	}
1405 
1406 	if (!rec->evlist->nr_groups)
1407 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1408 
1409 	if (data->is_pipe) {
1410 		err = perf_header__write_pipe(fd);
1411 		if (err < 0)
1412 			goto out_child;
1413 	} else {
1414 		err = perf_session__write_header(session, rec->evlist, fd, false);
1415 		if (err < 0)
1416 			goto out_child;
1417 	}
1418 
1419 	if (!rec->no_buildid
1420 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1421 		pr_err("Couldn't generate buildids. "
1422 		       "Use --no-buildid to profile anyway.\n");
1423 		err = -1;
1424 		goto out_child;
1425 	}
1426 
1427 	if (!opts->no_bpf_event)
1428 		bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1429 
1430 	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1431 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1432 		opts->no_bpf_event = true;
1433 	}
1434 
1435 	err = record__synthesize(rec, false);
1436 	if (err < 0)
1437 		goto out_child;
1438 
1439 	if (rec->realtime_prio) {
1440 		struct sched_param param;
1441 
1442 		param.sched_priority = rec->realtime_prio;
1443 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1444 			pr_err("Could not set realtime priority.\n");
1445 			err = -1;
1446 			goto out_child;
1447 		}
1448 	}
1449 
1450 	/*
1451 	 * When perf is starting the traced process, all the events
1452 	 * (apart from group members) have enable_on_exec=1 set,
1453 	 * so don't spoil it by prematurely enabling them.
1454 	 */
1455 	if (!target__none(&opts->target) && !opts->initial_delay)
1456 		perf_evlist__enable(rec->evlist);
1457 
1458 	/*
1459 	 * Let the child rip
1460 	 */
1461 	if (forks) {
1462 		struct machine *machine = &session->machines.host;
1463 		union perf_event *event;
1464 		pid_t tgid;
1465 
1466 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1467 		if (event == NULL) {
1468 			err = -ENOMEM;
1469 			goto out_child;
1470 		}
1471 
1472 		/*
1473 		 * Some H/W events are generated before the COMM event,
1474 		 * which is emitted during exec(), so perf script
1475 		 * cannot see a correct process name for those events.
1476 		 * Synthesize a COMM event to prevent that.
1477 		 */
1478 		tgid = perf_event__synthesize_comm(tool, event,
1479 						   rec->evlist->workload.pid,
1480 						   process_synthesized_event,
1481 						   machine);
1482 		free(event);
1483 
1484 		if (tgid == -1)
1485 			goto out_child;
1486 
1487 		event = malloc(sizeof(event->namespaces) +
1488 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1489 			       machine->id_hdr_size);
1490 		if (event == NULL) {
1491 			err = -ENOMEM;
1492 			goto out_child;
1493 		}
1494 
1495 		/*
1496 		 * Synthesize NAMESPACES event for the command specified.
1497 		 */
1498 		perf_event__synthesize_namespaces(tool, event,
1499 						  rec->evlist->workload.pid,
1500 						  tgid, process_synthesized_event,
1501 						  machine);
1502 		free(event);
1503 
1504 		perf_evlist__start_workload(rec->evlist);
1505 	}
1506 
1507 	if (opts->initial_delay) {
1508 		usleep(opts->initial_delay * USEC_PER_MSEC);
1509 		perf_evlist__enable(rec->evlist);
1510 	}
1511 
1512 	trigger_ready(&auxtrace_snapshot_trigger);
1513 	trigger_ready(&switch_output_trigger);
1514 	perf_hooks__invoke_record_start();
1515 	for (;;) {
1516 		unsigned long long hits = rec->samples;
1517 
1518 		/*
1519 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1520 		 * here: when done == true and hits != rec->samples in
1521 		 * the previous round.
1522 		 *
1523 		 * perf_evlist__toggle_bkw_mmap() ensures we never
1524 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1525 		 */
1526 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1527 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1528 
1529 		if (record__mmap_read_all(rec, false) < 0) {
1530 			trigger_error(&auxtrace_snapshot_trigger);
1531 			trigger_error(&switch_output_trigger);
1532 			err = -1;
1533 			goto out_child;
1534 		}
1535 
1536 		if (auxtrace_record__snapshot_started) {
1537 			auxtrace_record__snapshot_started = 0;
1538 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1539 				record__read_auxtrace_snapshot(rec);
1540 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1541 				pr_err("AUX area tracing snapshot failed\n");
1542 				err = -1;
1543 				goto out_child;
1544 			}
1545 		}
1546 
1547 		if (trigger_is_hit(&switch_output_trigger)) {
1548 			/*
1549 			 * If switch_output_trigger is hit, the data in the
1550 			 * overwritable ring buffer should have been collected,
1551 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1552 			 *
1553 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1554 			 * record__mmap_read_all() didn't collect data from the
1555 			 * overwritable ring buffer. Read again.
1556 			 */
1557 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1558 				continue;
1559 			trigger_ready(&switch_output_trigger);
1560 
1561 			/*
1562 			 * Re-enable events in the overwrite ring buffer after
1563 			 * record__mmap_read_all(): we should have collected
1564 			 * data from it.
1565 			 */
1566 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1567 
1568 			if (!quiet)
1569 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1570 					waking);
1571 			waking = 0;
1572 			fd = record__switch_output(rec, false);
1573 			if (fd < 0) {
1574 				pr_err("Failed to switch to new file\n");
1575 				trigger_error(&switch_output_trigger);
1576 				err = fd;
1577 				goto out_child;
1578 			}
1579 
1580 			/* re-arm the alarm */
1581 			if (rec->switch_output.time)
1582 				alarm(rec->switch_output.time);
1583 		}
1584 
1585 		if (hits == rec->samples) {
1586 			if (done || draining)
1587 				break;
1588 			err = perf_evlist__poll(rec->evlist, -1);
1589 			/*
1590 			 * Propagate the error only if there is one. Ignore a positive
1591 			 * number of returned events and interrupt errors.
1592 			 */
1593 			if (err > 0 || (err < 0 && errno == EINTR))
1594 				err = 0;
1595 			waking++;
1596 
1597 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1598 				draining = true;
1599 		}
1600 
1601 		/*
1602 		 * When perf is starting the traced process, the events die
1603 		 * with the process at the end and we wait for that. Thus there
1604 		 * is no need to disable events in this case.
1605 		 */
1606 		if (done && !disabled && !target__none(&opts->target)) {
1607 			trigger_off(&auxtrace_snapshot_trigger);
1608 			perf_evlist__disable(rec->evlist);
1609 			disabled = true;
1610 		}
1611 	}
1612 	trigger_off(&auxtrace_snapshot_trigger);
1613 	trigger_off(&switch_output_trigger);
1614 
1615 	if (forks && workload_exec_errno) {
1616 		char msg[STRERR_BUFSIZE];
1617 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1618 		pr_err("Workload failed: %s\n", emsg);
1619 		err = -1;
1620 		goto out_child;
1621 	}
1622 
1623 	if (!quiet)
1624 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1625 
1626 	if (target__none(&rec->opts.target))
1627 		record__synthesize_workload(rec, true);
1628 
1629 out_child:
1630 	record__mmap_read_all(rec, true);
1631 	record__aio_mmap_read_sync(rec);
1632 
1633 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1634 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1635 		session->header.env.comp_ratio = ratio + 0.5;
1636 	}
1637 
1638 	if (forks) {
1639 		int exit_status;
1640 
1641 		if (!child_finished)
1642 			kill(rec->evlist->workload.pid, SIGTERM);
1643 
1644 		wait(&exit_status);
1645 
1646 		if (err < 0)
1647 			status = err;
1648 		else if (WIFEXITED(exit_status))
1649 			status = WEXITSTATUS(exit_status);
1650 		else if (WIFSIGNALED(exit_status))
1651 			signr = WTERMSIG(exit_status);
1652 	} else
1653 		status = err;
1654 
1655 	record__synthesize(rec, true);
1656 	/* this will be recalculated during process_buildids() */
1657 	rec->samples = 0;
1658 
1659 	if (!err) {
1660 		if (!rec->timestamp_filename) {
1661 			record__finish_output(rec);
1662 		} else {
1663 			fd = record__switch_output(rec, true);
1664 			if (fd < 0) {
1665 				status = fd;
1666 				goto out_delete_session;
1667 			}
1668 		}
1669 	}
1670 
1671 	perf_hooks__invoke_record_end();
1672 
1673 	if (!err && !quiet) {
1674 		char samples[128];
1675 		const char *postfix = rec->timestamp_filename ?
1676 					".<timestamp>" : "";
1677 
1678 		if (rec->samples && !rec->opts.full_auxtrace)
1679 			scnprintf(samples, sizeof(samples),
1680 				  " (%" PRIu64 " samples)", rec->samples);
1681 		else
1682 			samples[0] = '\0';
1683 
1684 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1685 			perf_data__size(data) / 1024.0 / 1024.0,
1686 			data->path, postfix, samples);
1687 		if (ratio) {
1688 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1689 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1690 					ratio);
1691 		}
1692 		fprintf(stderr, " ]\n");
1693 	}
1694 
1695 out_delete_session:
1696 	zstd_fini(&session->zstd_data);
1697 	perf_session__delete(session);
1698 
1699 	if (!opts->no_bpf_event)
1700 		perf_evlist__stop_sb_thread(sb_evlist);
1701 	return status;
1702 }
1703 
1704 static void callchain_debug(struct callchain_param *callchain)
1705 {
1706 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1707 
1708 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1709 
1710 	if (callchain->record_mode == CALLCHAIN_DWARF)
1711 		pr_debug("callchain: stack dump size %d\n",
1712 			 callchain->dump_size);
1713 }
1714 
1715 int record_opts__parse_callchain(struct record_opts *record,
1716 				 struct callchain_param *callchain,
1717 				 const char *arg, bool unset)
1718 {
1719 	int ret;
1720 	callchain->enabled = !unset;
1721 
1722 	/* --no-call-graph */
1723 	if (unset) {
1724 		callchain->record_mode = CALLCHAIN_NONE;
1725 		pr_debug("callchain: disabled\n");
1726 		return 0;
1727 	}
1728 
1729 	ret = parse_callchain_record_opt(arg, callchain);
1730 	if (!ret) {
1731 		/* Enable data address sampling for DWARF unwind. */
1732 		if (callchain->record_mode == CALLCHAIN_DWARF)
1733 			record->sample_address = true;
1734 		callchain_debug(callchain);
1735 	}
1736 
1737 	return ret;
1738 }
1739 
1740 int record_parse_callchain_opt(const struct option *opt,
1741 			       const char *arg,
1742 			       int unset)
1743 {
1744 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1745 }
1746 
1747 int record_callchain_opt(const struct option *opt,
1748 			 const char *arg __maybe_unused,
1749 			 int unset __maybe_unused)
1750 {
1751 	struct callchain_param *callchain = opt->value;
1752 
1753 	callchain->enabled = true;
1754 
1755 	if (callchain->record_mode == CALLCHAIN_NONE)
1756 		callchain->record_mode = CALLCHAIN_FP;
1757 
1758 	callchain_debug(callchain);
1759 	return 0;
1760 }
1761 
1762 static int perf_record_config(const char *var, const char *value, void *cb)
1763 {
1764 	struct record *rec = cb;
1765 
1766 	if (!strcmp(var, "record.build-id")) {
1767 		if (!strcmp(value, "cache"))
1768 			rec->no_buildid_cache = false;
1769 		else if (!strcmp(value, "no-cache"))
1770 			rec->no_buildid_cache = true;
1771 		else if (!strcmp(value, "skip"))
1772 			rec->no_buildid = true;
1773 		else
1774 			return -1;
1775 		return 0;
1776 	}
1777 	if (!strcmp(var, "record.call-graph")) {
1778 		var = "call-graph.record-mode";
1779 		return perf_default_config(var, value, cb);
1780 	}
1781 #ifdef HAVE_AIO_SUPPORT
1782 	if (!strcmp(var, "record.aio")) {
1783 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1784 		if (!rec->opts.nr_cblocks)
1785 			rec->opts.nr_cblocks = nr_cblocks_default;
1786 	}
1787 #endif
1788 
1789 	return 0;
1790 }
1791 
1792 struct clockid_map {
1793 	const char *name;
1794 	int clockid;
1795 };
1796 
1797 #define CLOCKID_MAP(n, c)	\
1798 	{ .name = n, .clockid = (c), }
1799 
1800 #define CLOCKID_END	{ .name = NULL, }
1801 
1802 
1803 /*
1804  * Add the missing ones; we need to build on many distros...
1805  */
1806 #ifndef CLOCK_MONOTONIC_RAW
1807 #define CLOCK_MONOTONIC_RAW 4
1808 #endif
1809 #ifndef CLOCK_BOOTTIME
1810 #define CLOCK_BOOTTIME 7
1811 #endif
1812 #ifndef CLOCK_TAI
1813 #define CLOCK_TAI 11
1814 #endif
1815 
1816 static const struct clockid_map clockids[] = {
1817 	/* available for all events, NMI safe */
1818 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1819 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1820 
1821 	/* available for some events */
1822 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1823 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1824 	CLOCKID_MAP("tai", CLOCK_TAI),
1825 
1826 	/* available for the lazy */
1827 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1828 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1829 	CLOCKID_MAP("real", CLOCK_REALTIME),
1830 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1831 
1832 	CLOCKID_END,
1833 };
1834 
1835 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1836 {
1837 	struct timespec res;
1838 
1839 	*res_ns = 0;
1840 	if (!clock_getres(clk_id, &res))
1841 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1842 	else
1843 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1844 
1845 	return 0;
1846 }
1847 
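/*
 * Parse the clock used for sample timestamps (assumed to back a --clockid
 * option): either a raw clockid number or one of the names in clockids[]
 * above, optionally written with a "CLOCK_" prefix, e.g. the illustrative
 * 'CLOCK_MONOTONIC_RAW' or 'mono'.
 */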
1848 static int parse_clockid(const struct option *opt, const char *str, int unset)
1849 {
1850 	struct record_opts *opts = (struct record_opts *)opt->value;
1851 	const struct clockid_map *cm;
1852 	const char *ostr = str;
1853 
1854 	if (unset) {
1855 		opts->use_clockid = 0;
1856 		return 0;
1857 	}
1858 
1859 	/* no arg passed */
1860 	if (!str)
1861 		return 0;
1862 
1863 	/* no setting it twice */
1864 	if (opts->use_clockid)
1865 		return -1;
1866 
1867 	opts->use_clockid = true;
1868 
1869 	/* if it's a number, we're done */
1870 	if (sscanf(str, "%d", &opts->clockid) == 1)
1871 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1872 
1873 	/* allow a "CLOCK_" prefix to the name */
1874 	if (!strncasecmp(str, "CLOCK_", 6))
1875 		str += 6;
1876 
1877 	for (cm = clockids; cm->name; cm++) {
1878 		if (!strcasecmp(str, cm->name)) {
1879 			opts->clockid = cm->clockid;
1880 			return get_clockid_res(opts->clockid,
1881 					       &opts->clockid_res_ns);
1882 		}
1883 	}
1884 
1885 	opts->use_clockid = false;
1886 	ui__warning("unknown clockid %s, check man page\n", ostr);
1887 	return -1;
1888 }
1889 
1890 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1891 {
1892 	struct record_opts *opts = (struct record_opts *)opt->value;
1893 
1894 	if (unset || !str)
1895 		return 0;
1896 
1897 	if (!strcasecmp(str, "node"))
1898 		opts->affinity = PERF_AFFINITY_NODE;
1899 	else if (!strcasecmp(str, "cpu"))
1900 		opts->affinity = PERF_AFFINITY_CPU;
1901 
1902 	return 0;
1903 }
1904 
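/*
 * Parse the -m/--mmap_pages argument mentioned in record__mmap_evlist()'s
 * error message: a single value sizes the event ring buffers, and an
 * optional second value after a comma sizes the AUX area mmaps, e.g. an
 * illustrative '-m 512,1024'.
 */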
1905 static int record__parse_mmap_pages(const struct option *opt,
1906 				    const char *str,
1907 				    int unset __maybe_unused)
1908 {
1909 	struct record_opts *opts = opt->value;
1910 	char *s, *p;
1911 	unsigned int mmap_pages;
1912 	int ret;
1913 
1914 	if (!str)
1915 		return -EINVAL;
1916 
1917 	s = strdup(str);
1918 	if (!s)
1919 		return -ENOMEM;
1920 
1921 	p = strchr(s, ',');
1922 	if (p)
1923 		*p = '\0';
1924 
1925 	if (*s) {
1926 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1927 		if (ret)
1928 			goto out_free;
1929 		opts->mmap_pages = mmap_pages;
1930 	}
1931 
1932 	if (!p) {
1933 		ret = 0;
1934 		goto out_free;
1935 	}
1936 
1937 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1938 	if (ret)
1939 		goto out_free;
1940 
1941 	opts->auxtrace_mmap_pages = mmap_pages;
1942 
1943 out_free:
1944 	free(s);
1945 	return ret;
1946 }
1947 
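/*
 * Warn when the --switch-output size threshold is below half of the mmap
 * buffer size (the wakeup size): the switch is only checked once a chunk of
 * data has been written, so the resulting perf.data files can end up larger
 * than the requested threshold.
 */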
1948 static void switch_output_size_warn(struct record *rec)
1949 {
1950 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1951 	struct switch_output *s = &rec->switch_output;
1952 
1953 	wakeup_size /= 2;
1954 
1955 	if (s->size < wakeup_size) {
1956 		char buf[100];
1957 
1958 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1959 		pr_warning("WARNING: switch-output data size is lower than the "
1960 			   "wakeup kernel buffer size (%s); "
1961 			   "expect bigger perf.data sizes\n", buf);
1962 	}
1963 }
1964 
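/*
 * Parse the --switch-output argument: "signal" switches output on SIGUSR2, a
 * size with a B/K/M/G suffix switches once that much data has been written,
 * and a time with an s/m/h/d suffix switches periodically, e.g.
 * --switch-output=signal, --switch-output=2G or --switch-output=30s.
 * Any of these implies timestamped output filenames.
 */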
1965 static int switch_output_setup(struct record *rec)
1966 {
1967 	struct switch_output *s = &rec->switch_output;
1968 	static struct parse_tag tags_size[] = {
1969 		{ .tag  = 'B', .mult = 1       },
1970 		{ .tag  = 'K', .mult = 1 << 10 },
1971 		{ .tag  = 'M', .mult = 1 << 20 },
1972 		{ .tag  = 'G', .mult = 1 << 30 },
1973 		{ .tag  = 0 },
1974 	};
1975 	static struct parse_tag tags_time[] = {
1976 		{ .tag  = 's', .mult = 1        },
1977 		{ .tag  = 'm', .mult = 60       },
1978 		{ .tag  = 'h', .mult = 60*60    },
1979 		{ .tag  = 'd', .mult = 60*60*24 },
1980 		{ .tag  = 0 },
1981 	};
1982 	unsigned long val;
1983 
1984 	if (!s->set)
1985 		return 0;
1986 
1987 	if (!strcmp(s->str, "signal")) {
1988 		s->signal = true;
1989 		pr_debug("switch-output with SIGUSR2 signal\n");
1990 		goto enabled;
1991 	}
1992 
1993 	val = parse_tag_value(s->str, tags_size);
1994 	if (val != (unsigned long) -1) {
1995 		s->size = val;
1996 		pr_debug("switch-output with %s size threshold\n", s->str);
1997 		goto enabled;
1998 	}
1999 
2000 	val = parse_tag_value(s->str, tags_time);
2001 	if (val != (unsigned long) -1) {
2002 		s->time = val;
2003 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2004 			 s->str, s->time);
2005 		goto enabled;
2006 	}
2007 
2008 	return -1;
2009 
2010 enabled:
2011 	rec->timestamp_filename = true;
2012 	s->enabled              = true;
2013 
2014 	if (s->size && !rec->opts.no_buffering)
2015 		switch_output_size_warn(rec);
2016 
2017 	return 0;
2018 }
2019 
2020 static const char * const __record_usage[] = {
2021 	"perf record [<options>] [<command>]",
2022 	"perf record [<options>] -- <command> [<options>]",
2023 	NULL
2024 };
2025 const char * const *record_usage = __record_usage;
2026 
2027 /*
2028  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2029  * because we need access to it in record__exit(), which is called after
2030  * cmd_record() exits, but since record_options needs to be accessible to
2031  * builtin-script, leave it here.
2032  *
2033  * At least we don't touch it in all the other functions here directly.
2034  *
2035  * Just say no to tons of global variables, sigh.
2036  */
2037 static struct record record = {
2038 	.opts = {
2039 		.sample_time	     = true,
2040 		.mmap_pages	     = UINT_MAX,
2041 		.user_freq	     = UINT_MAX,
2042 		.user_interval	     = ULLONG_MAX,
2043 		.freq		     = 4000,
2044 		.target		     = {
2045 			.uses_mmap   = true,
2046 			.default_per_cpu = true,
2047 		},
2048 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2049 	},
2050 	.tool = {
2051 		.sample		= process_sample_event,
2052 		.fork		= perf_event__process_fork,
2053 		.exit		= perf_event__process_exit,
2054 		.comm		= perf_event__process_comm,
2055 		.namespaces	= perf_event__process_namespaces,
2056 		.mmap		= perf_event__process_mmap,
2057 		.mmap2		= perf_event__process_mmap2,
2058 		.ordered_events	= true,
2059 	},
2060 };
2061 
2062 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2063 	"\n\t\t\t\tDefault: fp";
2064 
2065 static bool dry_run;
2066 
2067 /*
2068  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2069  * with it and switch to using the library functions in perf_evlist that came
2070  * from builtin-record.c, i.e. use record_opts,
2071  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2072  * using pipes, etc.
2073  */
2074 static struct option __record_options[] = {
2075 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2076 		     "event selector. use 'perf list' to list available events",
2077 		     parse_events_option),
2078 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2079 		     "event filter", parse_filter),
2080 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2081 			   NULL, "don't record events from perf itself",
2082 			   exclude_perf),
2083 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2084 		    "record events on existing process id"),
2085 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2086 		    "record events on existing thread id"),
2087 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2088 		    "collect data with this RT SCHED_FIFO priority"),
2089 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2090 		    "collect data without buffering"),
2091 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2092 		    "collect raw sample records from all opened counters"),
2093 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2094 			    "system-wide collection from all CPUs"),
2095 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2096 		    "list of cpus to monitor"),
2097 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2098 	OPT_STRING('o', "output", &record.data.path, "file",
2099 		    "output file name"),
2100 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2101 			&record.opts.no_inherit_set,
2102 			"child tasks do not inherit counters"),
2103 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2104 		    "synthesize non-sample events at the end of output"),
2105 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2106 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2107 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2108 		    "Fail if the specified frequency can't be used"),
2109 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2110 		     "profile at this frequency",
2111 		      record__parse_freq),
2112 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2113 		     "number of mmap data pages and AUX area tracing mmap pages",
2114 		     record__parse_mmap_pages),
2115 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2116 		     "Minimum number of bytes extracted from mmap data pages (default: 1)",
2117 		     record__mmap_flush_parse),
2118 	OPT_BOOLEAN(0, "group", &record.opts.group,
2119 		    "put the counters into a counter group"),
2120 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2121 			   NULL, "enables call-graph recording" ,
2122 			   &record_callchain_opt),
2123 	OPT_CALLBACK(0, "call-graph", &record.opts,
2124 		     "record_mode[,record_size]", record_callchain_help,
2125 		     &record_parse_callchain_opt),
2126 	OPT_INCR('v', "verbose", &verbose,
2127 		    "be more verbose (show counter open errors, etc)"),
2128 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2129 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2130 		    "per thread counts"),
2131 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2132 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2133 		    "Record the sample physical addresses"),
2134 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2135 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2136 			&record.opts.sample_time_set,
2137 			"Record the sample timestamps"),
2138 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2139 			"Record the sample period"),
2140 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2141 		    "don't sample"),
2142 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2143 			&record.no_buildid_cache_set,
2144 			"do not update the buildid cache"),
2145 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2146 			&record.no_buildid_set,
2147 			"do not collect buildids in perf.data"),
2148 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2149 		     "monitor event in cgroup name only",
2150 		     parse_cgroups),
2151 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2152 		  "ms to wait before starting measurement after program start"),
2153 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2154 		   "user to profile"),
2155 
2156 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2157 		     "branch any", "sample any taken branches",
2158 		     parse_branch_stack),
2159 
2160 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2161 		     "branch filter mask", "branch stack filter modes",
2162 		     parse_branch_stack),
2163 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2164 		    "sample by weight (on special events only)"),
2165 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2166 		    "sample transaction flags (special events only)"),
2167 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2168 		    "use per-thread mmaps"),
2169 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2170 		    "sample selected machine registers on interrupt,"
2171 		    " use '-I?' to list register names", parse_intr_regs),
2172 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2173 		    "sample selected machine registers in user space,"
2174 		    " use '--user-regs=?' to list register names", parse_user_regs),
2175 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2176 		    "Record running/enabled time of read (:S) events"),
2177 	OPT_CALLBACK('k', "clockid", &record.opts,
2178 	"clockid", "clockid to use for events, see clock_gettime()",
2179 	parse_clockid),
2180 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2181 			  "opts", "AUX area tracing Snapshot Mode", ""),
2182 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2183 			"per thread proc mmap processing timeout in ms"),
2184 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2185 		    "Record namespaces events"),
2186 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2187 		    "Record context switch events"),
2188 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2189 			 "Configure all used events to run in kernel space.",
2190 			 PARSE_OPT_EXCLUSIVE),
2191 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2192 			 "Configure all used events to run in user space.",
2193 			 PARSE_OPT_EXCLUSIVE),
2194 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2195 		   "clang binary to use for compiling BPF scriptlets"),
2196 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2197 		   "options passed to clang when compiling BPF scriptlets"),
2198 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2199 		   "file", "vmlinux pathname"),
2200 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2201 		    "Record build-id of all DSOs regardless of hits"),
2202 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2203 		    "append timestamp to output filename"),
2204 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2205 		    "Record timestamp boundary (time of first/last samples)"),
2206 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2207 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2208 			  "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2209 			  "signal"),
2210 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2211 		   "Limit number of switch output generated files"),
2212 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2213 		    "Parse options then exit"),
2214 #ifdef HAVE_AIO_SUPPORT
2215 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2216 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2217 		     record__aio_parse),
2218 #endif
2219 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2220 		     "Set affinity mask of trace reading thread to the NUMA node CPU mask or the CPU of the mmap buffer being processed",
2221 		     record__parse_affinity),
2222 #ifdef HAVE_ZSTD_SUPPORT
2223 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2224 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2225 			    record__parse_comp_level),
2226 #endif
2227 	OPT_END()
2228 };
2229 
2230 struct option *record_options = __record_options;
2231 
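/*
 * Entry point for 'perf record': parse the options above, validate the
 * target and event list, configure build-id handling, auxtrace and
 * compression, then hand off to __cmd_record() to do the actual recording.
 */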
2232 int cmd_record(int argc, const char **argv)
2233 {
2234 	int err;
2235 	struct record *rec = &record;
2236 	char errbuf[BUFSIZ];
2237 
2238 	setlocale(LC_ALL, "");
2239 
2240 #ifndef HAVE_LIBBPF_SUPPORT
2241 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2242 	set_nobuild('\0', "clang-path", true);
2243 	set_nobuild('\0', "clang-opt", true);
2244 # undef set_nobuild
2245 #endif
2246 
2247 #ifndef HAVE_BPF_PROLOGUE
2248 # if !defined (HAVE_DWARF_SUPPORT)
2249 #  define REASON  "NO_DWARF=1"
2250 # elif !defined (HAVE_LIBBPF_SUPPORT)
2251 #  define REASON  "NO_LIBBPF=1"
2252 # else
2253 #  define REASON  "this architecture doesn't support BPF prologue"
2254 # endif
2255 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2256 	set_nobuild('\0', "vmlinux", true);
2257 # undef set_nobuild
2258 # undef REASON
2259 #endif
2260 
2261 	CPU_ZERO(&rec->affinity_mask);
2262 	rec->opts.affinity = PERF_AFFINITY_SYS;
2263 
2264 	rec->evlist = perf_evlist__new();
2265 	if (rec->evlist == NULL)
2266 		return -ENOMEM;
2267 
2268 	err = perf_config(perf_record_config, rec);
2269 	if (err)
2270 		return err;
2271 
2272 	argc = parse_options(argc, argv, record_options, record_usage,
2273 			    PARSE_OPT_STOP_AT_NON_OPTION);
2274 	if (quiet)
2275 		perf_quiet_option();
2276 
2277 	/* Make system wide (-a) the default target. */
2278 	if (!argc && target__none(&rec->opts.target))
2279 		rec->opts.target.system_wide = true;
2280 
2281 	if (nr_cgroups && !rec->opts.target.system_wide) {
2282 		usage_with_options_msg(record_usage, record_options,
2283 			"cgroup monitoring is only available in system-wide mode");
2285 	}
2286 
2287 	if (rec->opts.comp_level != 0) {
2288 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2289 		rec->no_buildid = true;
2290 	}
2291 
2292 	if (rec->opts.record_switch_events &&
2293 	    !perf_can_record_switch_events()) {
2294 		ui__error("kernel does not support recording context switch events\n");
2295 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2296 		return -EINVAL;
2297 	}
2298 
2299 	if (switch_output_setup(rec)) {
2300 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2301 		return -EINVAL;
2302 	}
2303 
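	/* Arm the alarm that drives the time based --switch-output mode. */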
2304 	if (rec->switch_output.time) {
2305 		signal(SIGALRM, alarm_sig_handler);
2306 		alarm(rec->switch_output.time);
2307 	}
2308 
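	/*
	 * Pre-allocate the array that remembers the generated output
	 * filenames, used by --switch-max-files to limit how many of the
	 * switched files are kept.
	 */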
2309 	if (rec->switch_output.num_files) {
2310 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2311 						      sizeof(char *));
2312 		if (!rec->switch_output.filenames)
2313 			return -ENOMEM;
2314 	}
2315 
2316 	/*
2317 	 * Allow aliases to facilitate the lookup of symbols for address
2318 	 * filters. Refer to auxtrace_parse_filters().
2319 	 */
2320 	symbol_conf.allow_aliases = true;
2321 
2322 	symbol__init(NULL);
2323 
2324 	err = record__auxtrace_init(rec);
2325 	if (err)
2326 		goto out;
2327 
2328 	if (dry_run)
2329 		goto out;
2330 
2331 	err = bpf__setup_stdout(rec->evlist);
2332 	if (err) {
2333 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2334 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2335 			 errbuf);
2336 		goto out;
2337 	}
2338 
2339 	err = -ENOMEM;
2340 
2341 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2342 		pr_warning(
2343 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2344 "check /proc/sys/kernel/kptr_restrict.\n\n"
2345 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2346 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2347 "Samples in kernel modules won't be resolved at all.\n\n"
2348 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2349 "even with a suitable vmlinux or kallsyms file.\n\n");
2350 
2351 	if (rec->no_buildid_cache || rec->no_buildid) {
2352 		disable_buildid_cache();
2353 	} else if (rec->switch_output.enabled) {
2354 		/*
2355 		 * In 'perf record --switch-output', disable buildid
2356 		 * generation by default to reduce data file switching
2357 		 * overhead. Still generate buildids if they are required
2358 		 * explicitly, using
2359 		 *
2360 		 *  perf record --switch-output --no-no-buildid \
2361 		 *              --no-no-buildid-cache
2362 		 *
2363 		 * The following code is equivalent to:
2364 		 *
2365 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2366 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2367 		 *         disable_buildid_cache();
2368 		 */
2369 		bool disable = true;
2370 
2371 		if (rec->no_buildid_set && !rec->no_buildid)
2372 			disable = false;
2373 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2374 			disable = false;
2375 		if (disable) {
2376 			rec->no_buildid = true;
2377 			rec->no_buildid_cache = true;
2378 			disable_buildid_cache();
2379 		}
2380 	}
2381 
2382 	if (record.opts.overwrite)
2383 		record.opts.tail_synthesize = true;
2384 
2385 	if (rec->evlist->nr_entries == 0 &&
2386 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2387 		pr_err("Not enough memory for event selector list\n");
2388 		goto out;
2389 	}
2390 
2391 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2392 		rec->opts.no_inherit = true;
2393 
2394 	err = target__validate(&rec->opts.target);
2395 	if (err) {
2396 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2397 		ui__warning("%s\n", errbuf);
2398 	}
2399 
2400 	err = target__parse_uid(&rec->opts.target);
2401 	if (err) {
2402 		int saved_errno = errno;
2403 
2404 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2405 		ui__error("%s", errbuf);
2406 
2407 		err = -saved_errno;
2408 		goto out;
2409 	}
2410 
2411 	/* Enable ignoring missing threads when -u/-p option is defined. */
2412 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2413 
2414 	err = -ENOMEM;
2415 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2416 		usage_with_options(record_usage, record_options);
2417 
2418 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2419 	if (err)
2420 		goto out;
2421 
2422 	/*
2423 	 * We take all buildids when the file contains
2424 	 * AUX area tracing data, because we do not decode the
2425 	 * trace, as that would take too long.
2426 	 */
2427 	if (rec->opts.full_auxtrace)
2428 		rec->buildid_all = true;
2429 
2430 	if (record_opts__config(&rec->opts)) {
2431 		err = -EINVAL;
2432 		goto out;
2433 	}
2434 
2435 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2436 		rec->opts.nr_cblocks = nr_cblocks_max;
2437 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2438 
2439 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2440 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2441 
2442 	if (rec->opts.comp_level > comp_level_max)
2443 		rec->opts.comp_level = comp_level_max;
2444 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2445 
2446 	err = __cmd_record(&record, argc, argv);
2447 out:
2448 	perf_evlist__delete(rec->evlist);
2449 	symbol__exit();
2450 	auxtrace_record__free(rec->itr);
2451 	return err;
2452 }
2453 
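/*
 * SIGUSR2 handler: kick off an AUX area tracing snapshot if one has been
 * armed, and/or request an output file switch when running with
 * --switch-output=signal.
 */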
2454 static void snapshot_sig_handler(int sig __maybe_unused)
2455 {
2456 	struct record *rec = &record;
2457 
2458 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2459 		trigger_hit(&auxtrace_snapshot_trigger);
2460 		auxtrace_record__snapshot_started = 1;
2461 		if (auxtrace_record__snapshot_start(rec->itr))
2462 			trigger_error(&auxtrace_snapshot_trigger);
2463 	}
2464 
2465 	if (switch_output_signal(rec))
2466 		trigger_hit(&switch_output_trigger);
2467 }
2468 
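/*
 * SIGALRM handler for the time based --switch-output mode: request an output
 * file switch when the alarm armed in cmd_record() fires.
 */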
2469 static void alarm_sig_handler(int sig __maybe_unused)
2470 {
2471 	struct record *rec = &record;
2472 
2473 	if (switch_output_time(rec))
2474 		trigger_hit(&switch_output_trigger);
2475 }
2476