xref: /linux/drivers/iommu/intel/perfmon.c (revision 1e525507)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Support Intel IOMMU PerfMon
4  * Copyright(c) 2023 Intel Corporation.
5  */
6 #define pr_fmt(fmt)	"DMAR: " fmt
7 #define dev_fmt(fmt)	pr_fmt(fmt)
8 
9 #include <linux/dmar.h>
10 #include "iommu.h"
11 #include "perfmon.h"
12 
/* Sysfs "format" attributes: how attr.config encodes an event */
PMU_FORMAT_ATTR(event,		"config:0-27");		/* ES: Events Select */
PMU_FORMAT_ATTR(event_group,	"config:28-31");	/* EGI: Event Group Index */

static struct attribute *iommu_pmu_format_attrs[] = {
	&format_attr_event_group.attr,
	&format_attr_event.attr,
	NULL
};

/* Exposed as <sysfs>/format */
static struct attribute_group iommu_pmu_format_attr_group = {
	.name = "format",
	.attrs = iommu_pmu_format_attrs,
};
26 
/* The available events are added in attr_update later */
static struct attribute *attrs_empty[] = {
	NULL
};

/* Exposed as <sysfs>/events; populated per IOMMU via pmu.attr_update */
static struct attribute_group iommu_pmu_events_attr_group = {
	.name = "events",
	.attrs = attrs_empty,
};
36 
/* CPU(s) managing the IOMMU PMUs; maintained by the cpuhp callbacks below */
static cpumask_t iommu_pmu_cpu_mask;

/* sysfs "cpumask" read handler: print the managing CPU mask */
static ssize_t
cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
}
static DEVICE_ATTR_RO(cpumask);

static struct attribute *iommu_pmu_cpumask_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL
};

static struct attribute_group iommu_pmu_cpumask_attr_group = {
	.attrs = iommu_pmu_cpumask_attrs,
};
54 
/* All static attribute groups of an IOMMU PMU */
static const struct attribute_group *iommu_pmu_attr_groups[] = {
	&iommu_pmu_format_attr_group,
	&iommu_pmu_events_attr_group,
	&iommu_pmu_cpumask_attr_group,
	NULL
};
61 
/* Map the sysfs device of a PMU back to its containing iommu_pmu */
static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev)
{
	/*
	 * The perf_event creates its own dev for each PMU.
	 * See pmu_dev_alloc()
	 */
	return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu);
}
70 
/*
 * Define a sysfs format attribute for one filter field together with an
 * is_visible() callback that only exposes the attribute when the IOMMU
 * enumerates the matching filter capability (iommu_pmu->filter & _filter).
 */
#define IOMMU_PMU_ATTR(_name, _format, _filter)				\
	PMU_FORMAT_ATTR(_name, _format);				\
									\
static struct attribute *_name##_attr[] = {				\
	&format_attr_##_name.attr,					\
	NULL								\
};									\
									\
static umode_t								\
_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)	\
{									\
	struct device *dev = kobj_to_dev(kobj);				\
	struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);		\
									\
	if (!iommu_pmu)							\
		return 0;						\
	return (iommu_pmu->filter & _filter) ? attr->mode : 0;		\
}									\
									\
static struct attribute_group _name = {					\
	.name		= "format",					\
	.attrs		= _name##_attr,					\
	.is_visible	= _name##_is_visible,				\
};
95 
/*
 * Filter knobs: the *_en enable bits live in config1[0:4]; the filter
 * values occupy the config1/config2 bit ranges given below.
 */
IOMMU_PMU_ATTR(filter_requester_id_en,	"config1:0",		IOMMU_PMU_FILTER_REQUESTER_ID);
IOMMU_PMU_ATTR(filter_domain_en,	"config1:1",		IOMMU_PMU_FILTER_DOMAIN);
IOMMU_PMU_ATTR(filter_pasid_en,		"config1:2",		IOMMU_PMU_FILTER_PASID);
IOMMU_PMU_ATTR(filter_ats_en,		"config1:3",		IOMMU_PMU_FILTER_ATS);
IOMMU_PMU_ATTR(filter_page_table_en,	"config1:4",		IOMMU_PMU_FILTER_PAGE_TABLE);
IOMMU_PMU_ATTR(filter_requester_id,	"config1:16-31",	IOMMU_PMU_FILTER_REQUESTER_ID);
IOMMU_PMU_ATTR(filter_domain,		"config1:32-47",	IOMMU_PMU_FILTER_DOMAIN);
IOMMU_PMU_ATTR(filter_pasid,		"config2:0-21",		IOMMU_PMU_FILTER_PASID);
IOMMU_PMU_ATTR(filter_ats,		"config2:24-28",	IOMMU_PMU_FILTER_ATS);
IOMMU_PMU_ATTR(filter_page_table,	"config2:32-36",	IOMMU_PMU_FILTER_PAGE_TABLE);
106 
/*
 * Extract the per-filter enable bits (config1[0:4]) and the filter
 * values from config1/config2, matching the format attributes above.
 */
#define iommu_pmu_en_requester_id(e)		((e) & 0x1)
#define iommu_pmu_en_domain(e)			(((e) >> 1) & 0x1)
#define iommu_pmu_en_pasid(e)			(((e) >> 2) & 0x1)
#define iommu_pmu_en_ats(e)			(((e) >> 3) & 0x1)
#define iommu_pmu_en_page_table(e)		(((e) >> 4) & 0x1)
#define iommu_pmu_get_requester_id(filter)	(((filter) >> 16) & 0xffff)
#define iommu_pmu_get_domain(filter)		(((filter) >> 32) & 0xffff)
#define iommu_pmu_get_pasid(filter)		((filter) & 0x3fffff)
#define iommu_pmu_get_ats(filter)		(((filter) >> 24) & 0x1f)
#define iommu_pmu_get_page_table(filter)	(((filter) >> 32) & 0x1f)
117 
/*
 * Program one filter register of counter @_idx: write the filter value
 * with IOMMU_PMU_FILTER_EN set, but only when the hardware supports the
 * filter and the user enabled it (via the *_en bit in @_econfig).
 * The register offset is derived from the filter's capability bit.
 */
#define iommu_pmu_set_filter(_name, _config, _filter, _idx, _econfig)		\
{										\
	if ((iommu_pmu->filter & _filter) && iommu_pmu_en_##_name(_econfig)) {	\
		dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +	\
			    IOMMU_PMU_CFG_SIZE +				\
			    (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,	\
			    iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\
	}									\
}

/* Disable one filter register of counter @_idx, if the filter exists */
#define iommu_pmu_clear_filter(_filter, _idx)					\
{										\
	if (iommu_pmu->filter & _filter) {					\
		dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +	\
			    IOMMU_PMU_CFG_SIZE +				\
			    (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,	\
			    0);							\
	}									\
}
137 
/*
 * Define the event attr related functions
 * Input: _name: event attr name
 *        _string: string of the event in sysfs
 *        _g_idx: event group encoding
 *        _event: event encoding
 *
 * The event is only made visible in sysfs when this IOMMU enumerates
 * it in evcap[_g_idx].
 */
#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event)			\
	PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string)		\
										\
static struct attribute *_name##_attr[] = {					\
	&event_attr_##_name.attr.attr,						\
	NULL									\
};										\
										\
static umode_t									\
_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)		\
{										\
	struct device *dev = kobj_to_dev(kobj);					\
	struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);			\
										\
	if (!iommu_pmu)								\
		return 0;							\
	return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0;		\
}										\
										\
static struct attribute_group _name = {						\
	.name		= "events",						\
	.attrs		= _name##_attr,						\
	.is_visible	= _name##_is_visible,					\
};
169 
/* Architectural events; visibility filtered per IOMMU by evcap above */
IOMMU_PMU_EVENT_ATTR(iommu_clocks,		"event_group=0x0,event=0x001", 0x0, 0x001)
IOMMU_PMU_EVENT_ATTR(iommu_requests,		"event_group=0x0,event=0x002", 0x0, 0x002)
IOMMU_PMU_EVENT_ATTR(pw_occupancy,		"event_group=0x0,event=0x004", 0x0, 0x004)
IOMMU_PMU_EVENT_ATTR(ats_blocked,		"event_group=0x0,event=0x008", 0x0, 0x008)
IOMMU_PMU_EVENT_ATTR(iommu_mrds,		"event_group=0x1,event=0x001", 0x1, 0x001)
IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked,		"event_group=0x1,event=0x020", 0x1, 0x020)
IOMMU_PMU_EVENT_ATTR(pg_req_posted,		"event_group=0x1,event=0x040", 0x1, 0x040)
IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup,		"event_group=0x2,event=0x001", 0x2, 0x001)
IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit,		"event_group=0x2,event=0x002", 0x2, 0x002)
IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup,	"event_group=0x2,event=0x004", 0x2, 0x004)
IOMMU_PMU_EVENT_ATTR(pasid_cache_hit,		"event_group=0x2,event=0x008", 0x2, 0x008)
IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup,		"event_group=0x2,event=0x010", 0x2, 0x010)
IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit,		"event_group=0x2,event=0x020", 0x2, 0x020)
IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup,		"event_group=0x2,event=0x040", 0x2, 0x040)
IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit,		"event_group=0x2,event=0x080", 0x2, 0x080)
IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup,	"event_group=0x2,event=0x100", 0x2, 0x100)
IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit,		"event_group=0x2,event=0x200", 0x2, 0x200)
IOMMU_PMU_EVENT_ATTR(iotlb_lookup,		"event_group=0x3,event=0x001", 0x3, 0x001)
IOMMU_PMU_EVENT_ATTR(iotlb_hit,			"event_group=0x3,event=0x002", 0x3, 0x002)
IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup,		"event_group=0x3,event=0x004", 0x3, 0x004)
IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit,		"event_group=0x3,event=0x008", 0x3, 0x008)
IOMMU_PMU_EVENT_ATTR(int_cache_lookup,		"event_group=0x4,event=0x001", 0x4, 0x001)
IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted,	"event_group=0x4,event=0x002", 0x4, 0x002)
IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted,	"event_group=0x4,event=0x004", 0x4, 0x004)
194 
/* Optional attributes, pruned per IOMMU by the is_visible() callbacks */
static const struct attribute_group *iommu_pmu_attr_update[] = {
	&filter_requester_id_en,
	&filter_domain_en,
	&filter_pasid_en,
	&filter_ats_en,
	&filter_page_table_en,
	&filter_requester_id,
	&filter_domain,
	&filter_pasid,
	&filter_ats,
	&filter_page_table,
	&iommu_clocks,
	&iommu_requests,
	&pw_occupancy,
	&ats_blocked,
	&iommu_mrds,
	&iommu_mem_blocked,
	&pg_req_posted,
	&ctxt_cache_lookup,
	&ctxt_cache_hit,
	&pasid_cache_lookup,
	&pasid_cache_hit,
	&ss_nonleaf_lookup,
	&ss_nonleaf_hit,
	&fs_nonleaf_lookup,
	&fs_nonleaf_hit,
	&hpt_nonleaf_lookup,
	&hpt_nonleaf_hit,
	&iotlb_lookup,
	&iotlb_hit,
	&hpt_leaf_lookup,
	&hpt_leaf_hit,
	&int_cache_lookup,
	&int_cache_hit_nonposted,
	&int_cache_hit_posted,
	NULL
};
232 
/* MMIO address of counter @idx */
static inline void __iomem *
iommu_event_base(struct iommu_pmu *iommu_pmu, int idx)
{
	return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride;
}
238 
/* MMIO address of the configuration space of counter @idx */
static inline void __iomem *
iommu_config_base(struct iommu_pmu *iommu_pmu, int idx)
{
	return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET;
}
244 
/* Containing iommu_pmu of an event's pmu */
static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event)
{
	return container_of(event->pmu, struct iommu_pmu, pmu);
}
249 
/* Translate perf attr.config into the hardware counter configuration */
static inline u64 iommu_event_config(struct perf_event *event)
{
	u64 config = event->attr.config;

	/* ES and EGI fields, with interrupt on overflow always enabled */
	return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) |
	       (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) |
	       IOMMU_EVENT_CFG_INT;
}
258 
/* Does @event belong to this IOMMU PMU? */
static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu,
				      struct perf_event *event)
{
	return event->pmu == &iommu_pmu->pmu;
}
264 
265 static int iommu_pmu_validate_event(struct perf_event *event)
266 {
267 	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
268 	u32 event_group = iommu_event_group(event->attr.config);
269 
270 	if (event_group >= iommu_pmu->num_eg)
271 		return -EINVAL;
272 
273 	return 0;
274 }
275 
276 static int iommu_pmu_validate_group(struct perf_event *event)
277 {
278 	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
279 	struct perf_event *sibling;
280 	int nr = 0;
281 
282 	/*
283 	 * All events in a group must be scheduled simultaneously.
284 	 * Check whether there is enough counters for all the events.
285 	 */
286 	for_each_sibling_event(sibling, event->group_leader) {
287 		if (!is_iommu_pmu_event(iommu_pmu, sibling) ||
288 		    sibling->state <= PERF_EVENT_STATE_OFF)
289 			continue;
290 
291 		if (++nr > iommu_pmu->num_cntr)
292 			return -EINVAL;
293 	}
294 
295 	return 0;
296 }
297 
298 static int iommu_pmu_event_init(struct perf_event *event)
299 {
300 	struct hw_perf_event *hwc = &event->hw;
301 
302 	if (event->attr.type != event->pmu->type)
303 		return -ENOENT;
304 
305 	/* sampling not supported */
306 	if (event->attr.sample_period)
307 		return -EINVAL;
308 
309 	if (event->cpu < 0)
310 		return -EINVAL;
311 
312 	if (iommu_pmu_validate_event(event))
313 		return -EINVAL;
314 
315 	hwc->config = iommu_event_config(event);
316 
317 	return iommu_pmu_validate_group(event);
318 }
319 
/*
 * Read the counter and accumulate the delta into event->count.
 * Lockless: the xchg/retry loop guards against a concurrent updater
 * (e.g. the overflow handler) racing on prev_count.
 */
static void iommu_pmu_event_update(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_count, new_count, delta;
	int shift = 64 - iommu_pmu->cntr_width;

again:
	prev_count = local64_read(&hwc->prev_count);
	new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
	if (local64_xchg(&hwc->prev_count, new_count) != prev_count)
		goto again;

	/*
	 * The counter width is enumerated. Always shift the counter
	 * before using it.
	 */
	delta = (new_count << shift) - (prev_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
}
342 
/* pmu::start - snapshot the counter and enable it via ECMD */
static void iommu_pmu_start(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct intel_iommu *iommu = iommu_pmu->iommu;
	struct hw_perf_event *hwc = &event->hw;
	u64 count;

	/* Must only be called on a stopped event with a valid counter. */
	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
		return;

	if (flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	hwc->state = 0;

	/* Always reprogram the period */
	count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
	local64_set((&hwc->prev_count), count);

	/*
	 * The error of ecmd will be ignored.
	 * - The existing perf_event subsystem doesn't handle the error.
	 *   Only IOMMU PMU returns runtime HW error. We don't want to
	 *   change the existing generic interfaces for the specific case.
	 * - It's a corner case caused by HW, which is very unlikely to
	 *   happen. There is nothing SW can do.
	 * - The worst case is that the user will get <not count> with
	 *   perf command, which can give the user some hints.
	 */
	ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, 0);

	perf_event_update_userpage(event);
}
379 
/* pmu::stop - disable the counter and fold in the final count */
static void iommu_pmu_stop(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct intel_iommu *iommu = iommu_pmu->iommu;
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->state & PERF_HES_STOPPED)) {
		ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, 0);

		/* Capture the last value before the counter stops moving. */
		iommu_pmu_event_update(event);

		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
	}
}
394 
395 static inline int
396 iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu,
397 				  int idx, struct perf_event *event)
398 {
399 	u32 event_group = iommu_event_group(event->attr.config);
400 	u32 select = iommu_event_select(event->attr.config);
401 
402 	if (!(iommu_pmu->cntr_evcap[idx][event_group] & select))
403 		return -EINVAL;
404 
405 	return 0;
406 }
407 
/*
 * Claim a free counter capable of counting @event, then program its
 * configuration register and filter registers.
 * Returns 0 on success, -EINVAL when no suitable counter is free.
 */
static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu,
				  struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	/*
	 * The counters which support limited events are usually at the end.
	 * Schedule them first to accommodate more events.
	 */
	for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) {
		if (test_and_set_bit(idx, iommu_pmu->used_mask))
			continue;
		/* Check per-counter event capabilities */
		if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event))
			break;
		/* Counter can't count this event: release it, keep looking. */
		clear_bit(idx, iommu_pmu->used_mask);
	}
	if (idx < 0)
		return -EINVAL;

	iommu_pmu->event_list[idx] = event;
	hwc->idx = idx;

	/* config events */
	dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config);

	/* Filter values come from config1/config2; enable bits from config1. */
	iommu_pmu_set_filter(requester_id, event->attr.config1,
			     IOMMU_PMU_FILTER_REQUESTER_ID, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(domain, event->attr.config1,
			     IOMMU_PMU_FILTER_DOMAIN, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(pasid, event->attr.config2,
			     IOMMU_PMU_FILTER_PASID, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(ats, event->attr.config2,
			     IOMMU_PMU_FILTER_ATS, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(page_table, event->attr.config2,
			     IOMMU_PMU_FILTER_PAGE_TABLE, idx,
			     event->attr.config1);

	return 0;
}
453 
454 static int iommu_pmu_add(struct perf_event *event, int flags)
455 {
456 	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
457 	struct hw_perf_event *hwc = &event->hw;
458 	int ret;
459 
460 	ret = iommu_pmu_assign_event(iommu_pmu, event);
461 	if (ret < 0)
462 		return ret;
463 
464 	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
465 
466 	if (flags & PERF_EF_START)
467 		iommu_pmu_start(event, 0);
468 
469 	return 0;
470 }
471 
/* pmu::del - stop the event, then release its counter and filters */
static void iommu_pmu_del(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	int idx = event->hw.idx;

	iommu_pmu_stop(event, PERF_EF_UPDATE);

	/* Clear every filter register the hardware supports. */
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx);

	iommu_pmu->event_list[idx] = NULL;
	event->hw.idx = -1;
	clear_bit(idx, iommu_pmu->used_mask);

	perf_event_update_userpage(event);
}
491 
492 static void iommu_pmu_enable(struct pmu *pmu)
493 {
494 	struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
495 	struct intel_iommu *iommu = iommu_pmu->iommu;
496 
497 	ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, 0);
498 }
499 
500 static void iommu_pmu_disable(struct pmu *pmu)
501 {
502 	struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
503 	struct intel_iommu *iommu = iommu_pmu->iommu;
504 
505 	ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
506 }
507 
/*
 * Fold each overflowed counter into its event, then acknowledge the
 * handled status bits by writing them back.
 */
static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
{
	struct perf_event *event;
	u64 status;
	int i;

	/*
	 * Two counters may be overflowed very close. Always check
	 * whether there are more to handle.
	 */
	while ((status = dmar_readq(iommu_pmu->overflow))) {
		for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
			/*
			 * Find the assigned event of the counter.
			 * Accumulate the value into the event->count.
			 */
			event = iommu_pmu->event_list[i];
			if (!event) {
				pr_warn_once("Cannot find the assigned event for counter %d\n", i);
				continue;
			}
			iommu_pmu_event_update(event);
		}

		/* Write-back clears the bits we just handled. */
		dmar_writeq(iommu_pmu->overflow, status);
	}
}
535 
/* Perfmon overflow interrupt: handle overflows, then clear the status */
static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;

	/* Not our interrupt if no perfmon interrupt status is pending. */
	if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
		return IRQ_NONE;

	iommu_pmu_counter_overflow(iommu->pmu);

	/* Clear the status bit */
	dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);

	return IRQ_HANDLED;
}
550 
/* Fill in the struct pmu callbacks and register with the perf core */
static int __iommu_pmu_register(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	iommu_pmu->pmu.name		= iommu->name;
	iommu_pmu->pmu.task_ctx_nr	= perf_invalid_context;
	iommu_pmu->pmu.event_init	= iommu_pmu_event_init;
	iommu_pmu->pmu.pmu_enable	= iommu_pmu_enable;
	iommu_pmu->pmu.pmu_disable	= iommu_pmu_disable;
	iommu_pmu->pmu.add		= iommu_pmu_add;
	iommu_pmu->pmu.del		= iommu_pmu_del;
	iommu_pmu->pmu.start		= iommu_pmu_start;
	iommu_pmu->pmu.stop		= iommu_pmu_stop;
	iommu_pmu->pmu.read		= iommu_pmu_event_update;
	iommu_pmu->pmu.attr_groups	= iommu_pmu_attr_groups;
	iommu_pmu->pmu.attr_update	= iommu_pmu_attr_update;
	iommu_pmu->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	iommu_pmu->pmu.module		= THIS_MODULE;

	return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
}
572 
573 static inline void __iomem *
574 get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
575 {
576 	u32 off = dmar_readl(iommu->reg + offset);
577 
578 	return iommu->reg + off;
579 }
580 
/*
 * Enumerate the perfmon capabilities of @iommu and allocate its
 * iommu_pmu with the global and per-counter event capability tables.
 * Returns 0 on success (including when perfmon is simply absent),
 * -ENODEV when a required capability is missing, or -ENOMEM.
 */
int alloc_iommu_pmu(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu;
	int i, j, ret;
	u64 perfcap;
	u32 cap;

	if (!ecap_pms(iommu->ecap))
		return 0;

	/* The IOMMU PMU requires the ECMD support as well */
	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
	/* The performance monitoring is not supported. */
	if (!perfcap)
		return -ENODEV;

	/* Sanity check for the number of the counters and event groups */
	if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
		return -ENODEV;

	/* The interrupt on overflow is required */
	if (!pcap_interrupt(perfcap))
		return -ENODEV;

	/* Check required Enhanced Command Capability */
	if (!ecmd_has_pmu_essential(iommu))
		return -ENODEV;

	iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
	if (!iommu_pmu)
		return -ENOMEM;

	/* Clamp to what the driver's per-counter bookkeeping can track. */
	iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
	if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) {
		pr_warn_once("The number of IOMMU counters %d > max(%d), clipping!",
			     iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX);
		iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX;
	}

	iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
	iommu_pmu->filter = pcap_filters_mask(perfcap);
	iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
	iommu_pmu->num_eg = pcap_num_event_group(perfcap);

	iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL);
	if (!iommu_pmu->evcap) {
		ret = -ENOMEM;
		goto free_pmu;
	}

	/* Parse event group capabilities */
	for (i = 0; i < iommu_pmu->num_eg; i++) {
		u64 pcap;

		pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
				  i * IOMMU_PMU_CAP_REGS_STEP);
		iommu_pmu->evcap[i] = pecap_es(pcap);
	}

	iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL);
	if (!iommu_pmu->cntr_evcap) {
		ret = -ENOMEM;
		goto free_pmu_evcap;
	}
	for (i = 0; i < iommu_pmu->num_cntr; i++) {
		iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL);
		if (!iommu_pmu->cntr_evcap[i]) {
			ret = -ENOMEM;
			goto free_pmu_cntr_evcap;
		}
		/*
		 * Set to the global capabilities, will adjust according
		 * to per-counter capabilities later.
		 */
		for (j = 0; j < iommu_pmu->num_eg; j++)
			iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
	}

	iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG);
	iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG);
	iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG);

	/*
	 * Check per-counter capabilities. All counters should have the
	 * same capabilities on Interrupt on Overflow Support and Counter
	 * Width.
	 */
	for (i = 0; i < iommu_pmu->num_cntr; i++) {
		cap = dmar_readl(iommu_pmu->cfg_reg +
				 i * IOMMU_PMU_CFG_OFFSET +
				 IOMMU_PMU_CFG_CNTRCAP_OFFSET);
		if (!iommu_cntrcap_pcc(cap))
			continue;

		/*
		 * It's possible that some counters have a different
		 * capability because of e.g., HW bug. Check the corner
		 * case here and simply drop those counters.
		 */
		if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
		    !iommu_cntrcap_ios(cap)) {
			iommu_pmu->num_cntr = i;
			pr_warn("PMU counter capability inconsistent, counter number reduced to %d\n",
				iommu_pmu->num_cntr);
		}

		/* Clear the pre-defined events group */
		for (j = 0; j < iommu_pmu->num_eg; j++)
			iommu_pmu->cntr_evcap[i][j] = 0;

		/* Override with per-counter event capabilities */
		for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
			cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET +
					 IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
					 (j * IOMMU_PMU_OFF_REGS_STEP));
			iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap);
			/*
			 * Some events may only be supported by a specific counter.
			 * Track them in the evcap as well.
			 */
			iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap);
		}
	}

	iommu_pmu->iommu = iommu;
	iommu->pmu = iommu_pmu;

	return 0;

free_pmu_cntr_evcap:
	for (i = 0; i < iommu_pmu->num_cntr; i++)
		kfree(iommu_pmu->cntr_evcap[i]);
	kfree(iommu_pmu->cntr_evcap);
free_pmu_evcap:
	kfree(iommu_pmu->evcap);
free_pmu:
	kfree(iommu_pmu);

	return ret;
}
724 
/* Free all perfmon state of @iommu allocated by alloc_iommu_pmu() */
void free_iommu_pmu(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	if (iommu_pmu->evcap) {
		int i;

		/*
		 * NOTE(review): assumes cntr_evcap is valid whenever evcap
		 * is — alloc_iommu_pmu() frees partial allocations itself,
		 * so a published iommu->pmu always has both tables.
		 */
		for (i = 0; i < iommu_pmu->num_cntr; i++)
			kfree(iommu_pmu->cntr_evcap[i]);
		kfree(iommu_pmu->cntr_evcap);
	}
	kfree(iommu_pmu->evcap);
	kfree(iommu_pmu);
	iommu->pmu = NULL;
}
743 
/* Allocate and request the perfmon overflow interrupt for @iommu */
static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;
	int irq, ret;

	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0)
		return -EINVAL;

	snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);

	iommu->perf_irq = irq;
	ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
				   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
	if (ret) {
		/* Undo the hwirq allocation on failure. */
		dmar_free_hwirq(irq);
		iommu->perf_irq = 0;
		return ret;
	}
	return 0;
}
765 
766 static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
767 {
768 	if (!iommu->perf_irq)
769 		return;
770 
771 	free_irq(iommu->perf_irq, iommu);
772 	dmar_free_hwirq(iommu->perf_irq);
773 	iommu->perf_irq = 0;
774 }
775 
/*
 * CPU hotplug online callback (multi-instance). The first CPU to come
 * online claims the empty mask and becomes the managing CPU for every
 * IOMMU PMU instance.
 */
static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);

	if (cpumask_empty(&iommu_pmu_cpu_mask))
		cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);

	/* Record the managing CPU for this PMU instance. */
	if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
		iommu_pmu->cpu = cpu;

	return 0;
}
788 
/*
 * CPU hotplug offline callback. When the managing CPU goes down, pick
 * a new one, update the shared mask once, and migrate each PMU's perf
 * context to the new target.
 */
static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
	int target = cpumask_first(&iommu_pmu_cpu_mask);

	/*
	 * The iommu_pmu_cpu_mask has been updated when offline the CPU
	 * for the first iommu_pmu. Migrate the other iommu_pmu to the
	 * new target.
	 */
	if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
		perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
		iommu_pmu->cpu = target;
		return 0;
	}

	/* Nothing to do unless the dying CPU is the managing CPU. */
	if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
		return 0;

	target = cpumask_any_but(cpu_online_mask, cpu);

	if (target < nr_cpu_ids)
		cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
	else
		return 0;

	perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
	iommu_pmu->cpu = target;

	return 0;
}
820 
/* Count of registered IOMMU PMUs sharing the dynamic cpuhp state below */
static int nr_iommu_pmu;
static enum cpuhp_state iommu_cpuhp_slot;
823 
/*
 * Add this PMU as a cpuhp multi-state instance. The dynamic state is
 * created lazily for the first PMU and torn down again if adding the
 * instance fails while it would be the only user.
 */
static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
{
	int ret;

	if (!nr_iommu_pmu) {
		ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
					      "driver/iommu/intel/perfmon:online",
					      iommu_pmu_cpu_online,
					      iommu_pmu_cpu_offline);
		if (ret < 0)
			return ret;
		iommu_cpuhp_slot = ret;
	}

	ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
	if (ret) {
		if (!nr_iommu_pmu)
			cpuhp_remove_multi_state(iommu_cpuhp_slot);
		return ret;
	}
	nr_iommu_pmu++;

	return 0;
}
848 
849 static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
850 {
851 	cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
852 
853 	if (--nr_iommu_pmu)
854 		return;
855 
856 	cpuhp_remove_multi_state(iommu_cpuhp_slot);
857 }
858 
/*
 * Register the IOMMU PMU: perf core, CPU hotplug instance, overflow
 * interrupt. On any failure everything is unwound and the perfmon
 * support for this IOMMU is freed.
 */
void iommu_pmu_register(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	if (__iommu_pmu_register(iommu))
		goto err;

	if (iommu_pmu_cpuhp_setup(iommu_pmu))
		goto unregister;

	/* Set interrupt for overflow */
	if (iommu_pmu_set_interrupt(iommu))
		goto cpuhp_free;

	return;

cpuhp_free:
	iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
	perf_pmu_unregister(&iommu_pmu->pmu);
err:
	pr_err("Failed to register PMU for iommu (seq_id = %d)\n", iommu->seq_id);
	free_iommu_pmu(iommu);
}
886 
887 void iommu_pmu_unregister(struct intel_iommu *iommu)
888 {
889 	struct iommu_pmu *iommu_pmu = iommu->pmu;
890 
891 	if (!iommu_pmu)
892 		return;
893 
894 	iommu_pmu_unset_interrupt(iommu);
895 	iommu_pmu_cpuhp_free(iommu_pmu);
896 	perf_pmu_unregister(&iommu_pmu->pmu);
897 }
898