xref: /linux/drivers/iommu/intel/iommu.c (revision dd093fb0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
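/*
 * For illustration (just the arithmetic above, assuming 4 KiB VT-d pages,
 * i.e. VTD_PAGE_SHIFT == 12): a guest address width of 48 gives
 * __DOMAIN_MAX_PFN(48) == 2^36 - 1 and __DOMAIN_MAX_ADDR(48) == 2^48 - 1.
 */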
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
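/*
 * Rough mapping between these values (following the helpers above):
 * agaw 1 -> 39-bit width, 3-level table; agaw 2 -> 48-bit, 4-level;
 * agaw 3 -> 57-bit, 5-level.
 */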
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
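/*
 * Example (not from the spec, just the arithmetic above): level 1 PTEs are
 * indexed by pfn bits 8:0 and map 4 KiB each; level 2 uses pfn bits 17:9 and
 * a level-2 entry spans level_size(2) == 512 pfns (2 MiB); a level-3 entry
 * spans 1 GiB.
 */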
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must never be larger than MM pages. Otherwise things

115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
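/*
 * E.g. with 4 KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the shift is
 * zero and mm_to_dma_pfn() is an identity; with 64 KiB MM pages one MM pfn
 * corresponds to 16 VT-d pfns.
 */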
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
134  * (used when the kernel is launched with TXT).
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
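/*
 * Sketch of the legacy context-entry bit layout as manipulated by the helpers
 * below (see the VT-d specification for the authoritative definition):
 * lo bit 0 is Present, bit 1 Fault Processing Disable, bits 3:2 Translation
 * Type, bits 63:12 the address root (page-table pointer); hi bits 2:0 Address
 * Width, bits 23:8 Domain ID.
 */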
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237 
238 /*
239  * This domain is a static identity mapping domain.
240  *	1. This domain creates a static 1:1 mapping of all usable memory.
241  *	2. It maps to each iommu if successful.
242  *	3. Each iommu maps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
282 
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285 
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288 
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
293 
294 #define IDENTMAP_GFX		2
295 #define IDENTMAP_AZALIA		4
296 
297 const struct iommu_ops intel_iommu_ops;
298 
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
300 {
301 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302 }
303 
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307 }
308 
309 static void init_translation_status(struct intel_iommu *iommu)
310 {
311 	u32 gsts;
312 
313 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 	if (gsts & DMA_GSTS_TES)
315 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316 }
317 
318 static int __init intel_iommu_setup(char *str)
319 {
320 	if (!str)
321 		return -EINVAL;
322 
323 	while (*str) {
324 		if (!strncmp(str, "on", 2)) {
325 			dmar_disabled = 0;
326 			pr_info("IOMMU enabled\n");
327 		} else if (!strncmp(str, "off", 3)) {
328 			dmar_disabled = 1;
329 			no_platform_optin = 1;
330 			pr_info("IOMMU disabled\n");
331 		} else if (!strncmp(str, "igfx_off", 8)) {
332 			dmar_map_gfx = 0;
333 			pr_info("Disable GFX device mapping\n");
334 		} else if (!strncmp(str, "forcedac", 8)) {
335 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 			iommu_dma_forcedac = true;
337 		} else if (!strncmp(str, "strict", 6)) {
338 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 			iommu_set_dma_strict();
340 		} else if (!strncmp(str, "sp_off", 6)) {
341 			pr_info("Disable supported super page\n");
342 			intel_iommu_superpage = 0;
343 		} else if (!strncmp(str, "sm_on", 5)) {
344 			pr_info("Enable scalable mode if hardware supports\n");
345 			intel_iommu_sm = 1;
346 		} else if (!strncmp(str, "sm_off", 6)) {
347 			pr_info("Scalable mode is disallowed\n");
348 			intel_iommu_sm = 0;
349 		} else if (!strncmp(str, "tboot_noforce", 13)) {
350 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 			intel_iommu_tboot_noforce = 1;
352 		} else {
353 			pr_notice("Unknown option - '%s'\n", str);
354 		}
355 
356 		str += strcspn(str, ",");
357 		while (*str == ',')
358 			str++;
359 	}
360 
361 	return 1;
362 }
363 __setup("intel_iommu=", intel_iommu_setup);
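/*
 * Example (options are comma separated): booting with "intel_iommu=on,sm_on"
 * enables DMA remapping and requests scalable mode if the hardware supports
 * it.
 */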
364 
365 void *alloc_pgtable_page(int node)
366 {
367 	struct page *page;
368 	void *vaddr = NULL;
369 
370 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
371 	if (page)
372 		vaddr = page_address(page);
373 	return vaddr;
374 }
375 
376 void free_pgtable_page(void *vaddr)
377 {
378 	free_page((unsigned long)vaddr);
379 }
380 
381 static inline int domain_type_is_si(struct dmar_domain *domain)
382 {
383 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384 }
385 
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
387 				       unsigned long pfn)
388 {
389 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390 
391 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392 }
393 
394 /*
395  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397  * the returned SAGAW.
398  */
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400 {
401 	unsigned long fl_sagaw, sl_sagaw;
402 
403 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 	sl_sagaw = cap_sagaw(iommu->cap);
405 
406 	/* Second level only. */
407 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 		return sl_sagaw;
409 
410 	/* First level only. */
411 	if (!ecap_slts(iommu->ecap))
412 		return fl_sagaw;
413 
414 	return fl_sagaw & sl_sagaw;
415 }
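/*
 * In the SAGAW encoding used above (VT-d spec 11.4.2), BIT(1) means 3-level
 * (39-bit) tables, BIT(2) 4-level (48-bit) and BIT(3) 5-level (57-bit); e.g.
 * first-level translation without 5-level paging support contributes only
 * BIT(2), i.e. the 4-level/48-bit AGAW.
 */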
416 
417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418 {
419 	unsigned long sagaw;
420 	int agaw;
421 
422 	sagaw = __iommu_calculate_sagaw(iommu);
423 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 		if (test_bit(agaw, &sagaw))
425 			break;
426 	}
427 
428 	return agaw;
429 }
430 
431 /*
432  * Calculate max SAGAW for each iommu.
433  */
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435 {
436 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437 }
438 
439 /*
440  * Calculate the agaw for each iommu.
441  * "SAGAW" may be different across iommus; use a default agaw, and
442  * fall back to a smaller supported agaw for iommus that don't support the default.
443  */
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
445 {
446 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447 }
448 
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450 {
451 	return sm_supported(iommu) ?
452 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453 }
454 
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
456 {
457 	struct iommu_domain_info *info;
458 	struct dmar_drhd_unit *drhd;
459 	struct intel_iommu *iommu;
460 	bool found = false;
461 	unsigned long i;
462 
463 	domain->iommu_coherency = true;
464 	xa_for_each(&domain->iommu_array, i, info) {
465 		found = true;
466 		if (!iommu_paging_structure_coherency(info->iommu)) {
467 			domain->iommu_coherency = false;
468 			break;
469 		}
470 	}
471 	if (found)
472 		return;
473 
474 	/* No hardware attached; use lowest common denominator */
475 	rcu_read_lock();
476 	for_each_active_iommu(iommu, drhd) {
477 		if (!iommu_paging_structure_coherency(iommu)) {
478 			domain->iommu_coherency = false;
479 			break;
480 		}
481 	}
482 	rcu_read_unlock();
483 }
484 
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 					 struct intel_iommu *skip)
487 {
488 	struct dmar_drhd_unit *drhd;
489 	struct intel_iommu *iommu;
490 	int mask = 0x3;
491 
492 	if (!intel_iommu_superpage)
493 		return 0;
494 
495 	/* set iommu_superpage to the smallest common denominator */
496 	rcu_read_lock();
497 	for_each_active_iommu(iommu, drhd) {
498 		if (iommu != skip) {
499 			if (domain && domain->use_first_level) {
500 				if (!cap_fl1gp_support(iommu->cap))
501 					mask = 0x1;
502 			} else {
503 				mask &= cap_super_page_val(iommu->cap);
504 			}
505 
506 			if (!mask)
507 				break;
508 		}
509 	}
510 	rcu_read_unlock();
511 
512 	return fls(mask);
513 }
514 
515 static int domain_update_device_node(struct dmar_domain *domain)
516 {
517 	struct device_domain_info *info;
518 	int nid = NUMA_NO_NODE;
519 	unsigned long flags;
520 
521 	spin_lock_irqsave(&domain->lock, flags);
522 	list_for_each_entry(info, &domain->devices, link) {
523 		/*
524 		 * There could be multiple device NUMA nodes, as devices
525 		 * within the same domain may sit behind different IOMMUs. There
526 		 * is no perfect answer in such a situation, so we use a
527 		 * first-come, first-served policy.
528 		 */
529 		nid = dev_to_node(info->dev);
530 		if (nid != NUMA_NO_NODE)
531 			break;
532 	}
533 	spin_unlock_irqrestore(&domain->lock, flags);
534 
535 	return nid;
536 }
537 
538 static void domain_update_iotlb(struct dmar_domain *domain);
539 
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542 {
543 	unsigned long bitmap = 0;
544 
545 	/*
546 	 * 1-level super page supports page size of 2MiB, 2-level super page
547 	 * supports page size of both 2MiB and 1GiB.
548 	 */
549 	if (domain->iommu_superpage == 1)
550 		bitmap |= SZ_2M;
551 	else if (domain->iommu_superpage == 2)
552 		bitmap |= SZ_2M | SZ_1G;
553 
554 	return bitmap;
555 }
556 
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560 	domain_update_iommu_coherency(domain);
561 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562 
563 	/*
564 	 * If RHSA is missing, we should default to the device numa domain
565 	 * as a fallback.
566 	 */
567 	if (domain->nid == NUMA_NO_NODE)
568 		domain->nid = domain_update_device_node(domain);
569 
570 	/*
571 	 * First-level translation restricts the input-address to a
572 	 * canonical address (i.e., address bits 63:N have the same
573 	 * value as address bit [N-1], where N is 48-bits with 4-level
574 	 * paging and 57-bits with 5-level paging). Hence, skip bit
575 	 * [N-1].
576 	 */
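	/*
	 * E.g. with 4-level first-level paging (N == 48, domain->gaw == 48)
	 * the aperture ends at (1ULL << 47) - 1 instead of (1ULL << 48) - 1.
	 */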
577 	if (domain->use_first_level)
578 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 	else
580 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581 
582 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 	domain_update_iotlb(domain);
584 }
585 
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 					 u8 devfn, int alloc)
588 {
589 	struct root_entry *root = &iommu->root_entry[bus];
590 	struct context_entry *context;
591 	u64 *entry;
592 
593 	/*
594 	 * Unless the caller requested to allocate a new entry,
595 	 * returning a copied context entry makes no sense.
596 	 */
597 	if (!alloc && context_copied(iommu, bus, devfn))
598 		return NULL;
599 
600 	entry = &root->lo;
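	/*
	 * Roughly, per the VT-d scalable-mode root/context table format: the
	 * lower half of the root entry points to a context table for devfn
	 * 0-127 and the upper half to one for devfn 128-255, and each
	 * scalable-mode context entry is 256 bits (twice the size of struct
	 * context_entry), hence the devfn adjustment and doubling below.
	 */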
601 	if (sm_supported(iommu)) {
602 		if (devfn >= 0x80) {
603 			devfn -= 0x80;
604 			entry = &root->hi;
605 		}
606 		devfn *= 2;
607 	}
608 	if (*entry & 1)
609 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 	else {
611 		unsigned long phy_addr;
612 		if (!alloc)
613 			return NULL;
614 
615 		context = alloc_pgtable_page(iommu->node);
616 		if (!context)
617 			return NULL;
618 
619 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 		phy_addr = virt_to_phys((void *)context);
621 		*entry = phy_addr | 1;
622 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
623 	}
624 	return &context[devfn];
625 }
626 
627 /**
628  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629  *				 sub-hierarchy of a candidate PCI-PCI bridge
630  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631  * @bridge: the candidate PCI-PCI bridge
632  *
633  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634  */
635 static bool
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637 {
638 	struct pci_dev *pdev, *pbridge;
639 
640 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 		return false;
642 
643 	pdev = to_pci_dev(dev);
644 	pbridge = to_pci_dev(bridge);
645 
646 	if (pbridge->subordinate &&
647 	    pbridge->subordinate->number <= pdev->bus->number &&
648 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 		return true;
650 
651 	return false;
652 }
653 
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655 {
656 	struct dmar_drhd_unit *drhd;
657 	u32 vtbar;
658 	int rc;
659 
660 	/* We know that this device on this chipset has its own IOMMU.
661 	 * If we find it under a different IOMMU, then the BIOS is lying
662 	 * to us. Hope that the IOMMU for this device is actually
663 	 * disabled, and it needs no translation...
664 	 */
665 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 	if (rc) {
667 		/* "can't" happen */
668 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 		return false;
670 	}
671 	vtbar &= 0xffff0000;
672 
673 	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
674 	drhd = dmar_find_matched_drhd_unit(pdev);
675 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 		return true;
679 	}
680 
681 	return false;
682 }
683 
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685 {
686 	if (!iommu || iommu->drhd->ignored)
687 		return true;
688 
689 	if (dev_is_pci(dev)) {
690 		struct pci_dev *pdev = to_pci_dev(dev);
691 
692 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 		    quirk_ioat_snb_local_iommu(pdev))
695 			return true;
696 	}
697 
698 	return false;
699 }
700 
701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702 {
703 	struct dmar_drhd_unit *drhd = NULL;
704 	struct pci_dev *pdev = NULL;
705 	struct intel_iommu *iommu;
706 	struct device *tmp;
707 	u16 segment = 0;
708 	int i;
709 
710 	if (!dev)
711 		return NULL;
712 
713 	if (dev_is_pci(dev)) {
714 		struct pci_dev *pf_pdev;
715 
716 		pdev = pci_real_dma_dev(to_pci_dev(dev));
717 
718 		/* VFs aren't listed in scope tables; we need to look up
719 		 * the PF instead to find the IOMMU. */
720 		pf_pdev = pci_physfn(pdev);
721 		dev = &pf_pdev->dev;
722 		segment = pci_domain_nr(pdev->bus);
723 	} else if (has_acpi_companion(dev))
724 		dev = &ACPI_COMPANION(dev)->dev;
725 
726 	rcu_read_lock();
727 	for_each_iommu(iommu, drhd) {
728 		if (pdev && segment != drhd->segment)
729 			continue;
730 
731 		for_each_active_dev_scope(drhd->devices,
732 					  drhd->devices_cnt, i, tmp) {
733 			if (tmp == dev) {
734 				/* For a VF use its original BDF# not that of the PF
735 				 * which we used for the IOMMU lookup. Strictly speaking
736 				 * we could do this for all PCI devices; we only need to
737 				 * get the BDF# from the scope table for ACPI matches. */
738 				if (pdev && pdev->is_virtfn)
739 					goto got_pdev;
740 
741 				if (bus && devfn) {
742 					*bus = drhd->devices[i].bus;
743 					*devfn = drhd->devices[i].devfn;
744 				}
745 				goto out;
746 			}
747 
748 			if (is_downstream_to_pci_bridge(dev, tmp))
749 				goto got_pdev;
750 		}
751 
752 		if (pdev && drhd->include_all) {
753 got_pdev:
754 			if (bus && devfn) {
755 				*bus = pdev->bus->number;
756 				*devfn = pdev->devfn;
757 			}
758 			goto out;
759 		}
760 	}
761 	iommu = NULL;
762 out:
763 	if (iommu_is_dummy(iommu, dev))
764 		iommu = NULL;
765 
766 	rcu_read_unlock();
767 
768 	return iommu;
769 }
770 
771 static void domain_flush_cache(struct dmar_domain *domain,
772 			       void *addr, int size)
773 {
774 	if (!domain->iommu_coherency)
775 		clflush_cache_range(addr, size);
776 }
777 
778 static void free_context_table(struct intel_iommu *iommu)
779 {
780 	struct context_entry *context;
781 	int i;
782 
783 	if (!iommu->root_entry)
784 		return;
785 
786 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 		context = iommu_context_addr(iommu, i, 0, 0);
788 		if (context)
789 			free_pgtable_page(context);
790 
791 		if (!sm_supported(iommu))
792 			continue;
793 
794 		context = iommu_context_addr(iommu, i, 0x80, 0);
795 		if (context)
796 			free_pgtable_page(context);
797 	}
798 
799 	free_pgtable_page(iommu->root_entry);
800 	iommu->root_entry = NULL;
801 }
802 
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806 {
807 	struct dma_pte *pte;
808 	int offset;
809 
810 	while (1) {
811 		offset = pfn_level_offset(pfn, level);
812 		pte = &parent[offset];
813 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 			pr_info("PTE not present at level %d\n", level);
815 			break;
816 		}
817 
818 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819 
820 		if (level == 1)
821 			break;
822 
823 		parent = phys_to_virt(dma_pte_addr(pte));
824 		level--;
825 	}
826 }
827 
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 			  unsigned long long addr, u32 pasid)
830 {
831 	struct pasid_dir_entry *dir, *pde;
832 	struct pasid_entry *entries, *pte;
833 	struct context_entry *ctx_entry;
834 	struct root_entry *rt_entry;
835 	int i, dir_index, index, level;
836 	u8 devfn = source_id & 0xff;
837 	u8 bus = source_id >> 8;
838 	struct dma_pte *pgtable;
839 
840 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841 
842 	/* root entry dump */
843 	rt_entry = &iommu->root_entry[bus];
844 	if (!rt_entry) {
845 		pr_info("root table entry is not present\n");
846 		return;
847 	}
848 
849 	if (sm_supported(iommu))
850 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 			rt_entry->hi, rt_entry->lo);
852 	else
853 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
854 
855 	/* context entry dump */
856 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 	if (!ctx_entry) {
858 		pr_info("context table entry is not present\n");
859 		return;
860 	}
861 
862 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 		ctx_entry->hi, ctx_entry->lo);
864 
865 	/* legacy mode does not require PASID entries */
866 	if (!sm_supported(iommu)) {
867 		level = agaw_to_level(ctx_entry->hi & 7);
868 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 		goto pgtable_walk;
870 	}
871 
872 	/* get the pointer to pasid directory entry */
873 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 	if (!dir) {
875 		pr_info("pasid directory entry is not present\n");
876 		return;
877 	}
878 	/* For request-without-pasid, get the pasid from context entry */
879 	if (intel_iommu_sm && pasid == INVALID_IOASID)
880 		pasid = PASID_RID2PASID;
881 
882 	dir_index = pasid >> PASID_PDE_SHIFT;
883 	pde = &dir[dir_index];
884 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885 
886 	/* get the pointer to the pasid table entry */
887 	entries = get_pasid_table_from_pde(pde);
888 	if (!entries) {
889 		pr_info("pasid table entry is not present\n");
890 		return;
891 	}
892 	index = pasid & PASID_PTE_MASK;
893 	pte = &entries[index];
894 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896 
897 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 	} else {
901 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 	}
904 
905 pgtable_walk:
906 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907 }
908 #endif
909 
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 				      unsigned long pfn, int *target_level)
912 {
913 	struct dma_pte *parent, *pte;
914 	int level = agaw_to_level(domain->agaw);
915 	int offset;
916 
917 	BUG_ON(!domain->pgd);
918 
919 	if (!domain_pfn_supported(domain, pfn))
920 		/* Address beyond IOMMU's addressing capabilities. */
921 		return NULL;
922 
923 	parent = domain->pgd;
924 
925 	while (1) {
926 		void *tmp_page;
927 
928 		offset = pfn_level_offset(pfn, level);
929 		pte = &parent[offset];
930 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 			break;
932 		if (level == *target_level)
933 			break;
934 
935 		if (!dma_pte_present(pte)) {
936 			uint64_t pteval;
937 
938 			tmp_page = alloc_pgtable_page(domain->nid);
939 
940 			if (!tmp_page)
941 				return NULL;
942 
943 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 			if (domain->use_first_level)
946 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
947 
948 			if (cmpxchg64(&pte->val, 0ULL, pteval))
949 				/* Someone else set it while we were thinking; use theirs. */
950 				free_pgtable_page(tmp_page);
951 			else
952 				domain_flush_cache(domain, pte, sizeof(*pte));
953 		}
954 		if (level == 1)
955 			break;
956 
957 		parent = phys_to_virt(dma_pte_addr(pte));
958 		level--;
959 	}
960 
961 	if (!*target_level)
962 		*target_level = level;
963 
964 	return pte;
965 }
966 
967 /* return address's pte at specific level */
968 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 					 unsigned long pfn,
970 					 int level, int *large_page)
971 {
972 	struct dma_pte *parent, *pte;
973 	int total = agaw_to_level(domain->agaw);
974 	int offset;
975 
976 	parent = domain->pgd;
977 	while (level <= total) {
978 		offset = pfn_level_offset(pfn, total);
979 		pte = &parent[offset];
980 		if (level == total)
981 			return pte;
982 
983 		if (!dma_pte_present(pte)) {
984 			*large_page = total;
985 			break;
986 		}
987 
988 		if (dma_pte_superpage(pte)) {
989 			*large_page = total;
990 			return pte;
991 		}
992 
993 		parent = phys_to_virt(dma_pte_addr(pte));
994 		total--;
995 	}
996 	return NULL;
997 }
998 
999 /* Clear last level ptes; a TLB flush should follow. */
1000 static void dma_pte_clear_range(struct dmar_domain *domain,
1001 				unsigned long start_pfn,
1002 				unsigned long last_pfn)
1003 {
1004 	unsigned int large_page;
1005 	struct dma_pte *first_pte, *pte;
1006 
1007 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 	BUG_ON(start_pfn > last_pfn);
1010 
1011 	/* we don't need lock here; nobody else touches the iova range */
1012 	do {
1013 		large_page = 1;
1014 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 		if (!pte) {
1016 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017 			continue;
1018 		}
1019 		do {
1020 			dma_clear_pte(pte);
1021 			start_pfn += lvl_to_nr_pages(large_page);
1022 			pte++;
1023 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024 
1025 		domain_flush_cache(domain, first_pte,
1026 				   (void *)pte - (void *)first_pte);
1027 
1028 	} while (start_pfn && start_pfn <= last_pfn);
1029 }
1030 
1031 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 			       int retain_level, struct dma_pte *pte,
1033 			       unsigned long pfn, unsigned long start_pfn,
1034 			       unsigned long last_pfn)
1035 {
1036 	pfn = max(start_pfn, pfn);
1037 	pte = &pte[pfn_level_offset(pfn, level)];
1038 
1039 	do {
1040 		unsigned long level_pfn;
1041 		struct dma_pte *level_pte;
1042 
1043 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044 			goto next;
1045 
1046 		level_pfn = pfn & level_mask(level);
1047 		level_pte = phys_to_virt(dma_pte_addr(pte));
1048 
1049 		if (level > 2) {
1050 			dma_pte_free_level(domain, level - 1, retain_level,
1051 					   level_pte, level_pfn, start_pfn,
1052 					   last_pfn);
1053 		}
1054 
1055 		/*
1056 		 * Free the page table if we're below the level we want to
1057 		 * retain and the range covers the entire table.
1058 		 */
1059 		if (level < retain_level && !(start_pfn > level_pfn ||
1060 		      last_pfn < level_pfn + level_size(level) - 1)) {
1061 			dma_clear_pte(pte);
1062 			domain_flush_cache(domain, pte, sizeof(*pte));
1063 			free_pgtable_page(level_pte);
1064 		}
1065 next:
1066 		pfn += level_size(level);
1067 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068 }
1069 
1070 /*
1071  * clear last level (leaf) ptes and free page table pages below the
1072  * level we wish to keep intact.
1073  */
1074 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 				   unsigned long start_pfn,
1076 				   unsigned long last_pfn,
1077 				   int retain_level)
1078 {
1079 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1080 
1081 	/* We don't need lock here; nobody else touches the iova range */
1082 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083 			   domain->pgd, 0, start_pfn, last_pfn);
1084 
1085 	/* free pgd */
1086 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 		free_pgtable_page(domain->pgd);
1088 		domain->pgd = NULL;
1089 	}
1090 }
1091 
1092 /* When a page at a given level is being unlinked from its parent, we don't
1093    need to *modify* it at all. All we need to do is make a list of all the
1094    pages which can be freed just as soon as we've flushed the IOTLB and we
1095    know the hardware page-walk will no longer touch them.
1096    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097    be freed. */
1098 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099 				    int level, struct dma_pte *pte,
1100 				    struct list_head *freelist)
1101 {
1102 	struct page *pg;
1103 
1104 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105 	list_add_tail(&pg->lru, freelist);
1106 
1107 	if (level == 1)
1108 		return;
1109 
1110 	pte = page_address(pg);
1111 	do {
1112 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 		pte++;
1115 	} while (!first_pte_in_page(pte));
1116 }
1117 
1118 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119 				struct dma_pte *pte, unsigned long pfn,
1120 				unsigned long start_pfn, unsigned long last_pfn,
1121 				struct list_head *freelist)
1122 {
1123 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124 
1125 	pfn = max(start_pfn, pfn);
1126 	pte = &pte[pfn_level_offset(pfn, level)];
1127 
1128 	do {
1129 		unsigned long level_pfn = pfn & level_mask(level);
1130 
1131 		if (!dma_pte_present(pte))
1132 			goto next;
1133 
1134 		/* If range covers entire pagetable, free it */
1135 		if (start_pfn <= level_pfn &&
1136 		    last_pfn >= level_pfn + level_size(level) - 1) {
1137 			/* These subordinate page tables are going away entirely. Don't
1138 			   bother to clear them; we're just going to *free* them. */
1139 			if (level > 1 && !dma_pte_superpage(pte))
1140 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141 
1142 			dma_clear_pte(pte);
1143 			if (!first_pte)
1144 				first_pte = pte;
1145 			last_pte = pte;
1146 		} else if (level > 1) {
1147 			/* Recurse down into a level that isn't *entirely* obsolete */
1148 			dma_pte_clear_level(domain, level - 1,
1149 					    phys_to_virt(dma_pte_addr(pte)),
1150 					    level_pfn, start_pfn, last_pfn,
1151 					    freelist);
1152 		}
1153 next:
1154 		pfn = level_pfn + level_size(level);
1155 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156 
1157 	if (first_pte)
1158 		domain_flush_cache(domain, first_pte,
1159 				   (void *)++last_pte - (void *)first_pte);
1160 }
1161 
1162 /* We can't just free the pages because the IOMMU may still be walking
1163    the page tables, and may have cached the intermediate levels. The
1164    pages can only be freed after the IOTLB flush has been done. */
1165 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166 			 unsigned long last_pfn, struct list_head *freelist)
1167 {
1168 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170 	BUG_ON(start_pfn > last_pfn);
1171 
1172 	/* we don't need lock here; nobody else touches the iova range */
1173 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1175 
1176 	/* free pgd */
1177 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 		struct page *pgd_page = virt_to_page(domain->pgd);
1179 		list_add_tail(&pgd_page->lru, freelist);
1180 		domain->pgd = NULL;
1181 	}
1182 }
1183 
1184 /* iommu handling */
1185 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186 {
1187 	struct root_entry *root;
1188 
1189 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190 	if (!root) {
1191 		pr_err("Allocating root entry for %s failed\n",
1192 			iommu->name);
1193 		return -ENOMEM;
1194 	}
1195 
1196 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1197 	iommu->root_entry = root;
1198 
1199 	return 0;
1200 }
1201 
1202 static void iommu_set_root_entry(struct intel_iommu *iommu)
1203 {
1204 	u64 addr;
1205 	u32 sts;
1206 	unsigned long flag;
1207 
1208 	addr = virt_to_phys(iommu->root_entry);
1209 	if (sm_supported(iommu))
1210 		addr |= DMA_RTADDR_SMT;
1211 
1212 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214 
1215 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216 
1217 	/* Make sure hardware complete it */
1218 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219 		      readl, (sts & DMA_GSTS_RTPS), sts);
1220 
1221 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222 
1223 	/*
1224 	 * Hardware invalidates all DMA remapping hardware translation
1225 	 * caches as part of SRTP flow.
1226 	 */
1227 	if (cap_esrtps(iommu->cap))
1228 		return;
1229 
1230 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231 	if (sm_supported(iommu))
1232 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234 }
1235 
1236 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237 {
1238 	u32 val;
1239 	unsigned long flag;
1240 
1241 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242 		return;
1243 
1244 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246 
1247 	/* Make sure hardware complete it */
1248 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1250 
1251 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252 }
1253 
1254 /* return value determines if we need a write buffer flush */
1255 static void __iommu_flush_context(struct intel_iommu *iommu,
1256 				  u16 did, u16 source_id, u8 function_mask,
1257 				  u64 type)
1258 {
1259 	u64 val = 0;
1260 	unsigned long flag;
1261 
1262 	switch (type) {
1263 	case DMA_CCMD_GLOBAL_INVL:
1264 		val = DMA_CCMD_GLOBAL_INVL;
1265 		break;
1266 	case DMA_CCMD_DOMAIN_INVL:
1267 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268 		break;
1269 	case DMA_CCMD_DEVICE_INVL:
1270 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272 		break;
1273 	default:
1274 		BUG();
1275 	}
1276 	val |= DMA_CCMD_ICC;
1277 
1278 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280 
1281 	/* Make sure hardware complete it */
1282 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284 
1285 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286 }
1287 
1288 /* return value determines if we need a write buffer flush */
1289 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290 				u64 addr, unsigned int size_order, u64 type)
1291 {
1292 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293 	u64 val = 0, val_iva = 0;
1294 	unsigned long flag;
1295 
1296 	switch (type) {
1297 	case DMA_TLB_GLOBAL_FLUSH:
1298 		/* global flush doesn't need to set IVA_REG */
1299 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300 		break;
1301 	case DMA_TLB_DSI_FLUSH:
1302 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 		break;
1304 	case DMA_TLB_PSI_FLUSH:
1305 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 		/* IH bit is passed in as part of address */
1307 		val_iva = size_order | addr;
1308 		break;
1309 	default:
1310 		BUG();
1311 	}
1312 	/* Note: set drain read/write */
1313 #if 0
1314 	/*
1315 	 * This is probably only here to be extra safe. It looks like we
1316 	 * can ignore it without any impact.
1317 	 */
1318 	if (cap_read_drain(iommu->cap))
1319 		val |= DMA_TLB_READ_DRAIN;
1320 #endif
1321 	if (cap_write_drain(iommu->cap))
1322 		val |= DMA_TLB_WRITE_DRAIN;
1323 
1324 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 	/* Note: Only uses first TLB reg currently */
1326 	if (val_iva)
1327 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329 
1330 	/* Make sure hardware complete it */
1331 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333 
1334 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335 
1336 	/* check IOTLB invalidation granularity */
1337 	if (DMA_TLB_IAIG(val) == 0)
1338 		pr_err("Flush IOTLB failed\n");
1339 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 			(unsigned long long)DMA_TLB_IIRG(type),
1342 			(unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344 
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349 	struct device_domain_info *info;
1350 	unsigned long flags;
1351 
1352 	spin_lock_irqsave(&domain->lock, flags);
1353 	list_for_each_entry(info, &domain->devices, link) {
1354 		if (info->iommu == iommu && info->bus == bus &&
1355 		    info->devfn == devfn) {
1356 			spin_unlock_irqrestore(&domain->lock, flags);
1357 			return info;
1358 		}
1359 	}
1360 	spin_unlock_irqrestore(&domain->lock, flags);
1361 
1362 	return NULL;
1363 }
1364 
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367 	struct device_domain_info *info;
1368 	bool has_iotlb_device = false;
1369 	unsigned long flags;
1370 
1371 	spin_lock_irqsave(&domain->lock, flags);
1372 	list_for_each_entry(info, &domain->devices, link) {
1373 		if (info->ats_enabled) {
1374 			has_iotlb_device = true;
1375 			break;
1376 		}
1377 	}
1378 	domain->has_iotlb_device = has_iotlb_device;
1379 	spin_unlock_irqrestore(&domain->lock, flags);
1380 }
1381 
1382 /*
1383  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385  * check because it applies only to the built-in QAT devices and it doesn't
1386  * grant additional privileges.
1387  */
1388 #define BUGGY_QAT_DEVID_MASK 0x4940
1389 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390 {
1391 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392 		return false;
1393 
1394 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395 		return false;
1396 
1397 	return true;
1398 }
1399 
1400 static void iommu_enable_pci_caps(struct device_domain_info *info)
1401 {
1402 	struct pci_dev *pdev;
1403 
1404 	if (!dev_is_pci(info->dev))
1405 		return;
1406 
1407 	pdev = to_pci_dev(info->dev);
1408 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1409 	 * the PFSID to a VF's invalidation descriptors so that the IOMMU HW can
1410 	 * gauge queue depth at the PF level. If DIT is not set, PFSID is treated
1411 	 * as reserved and should be set to 0.
1412 	 */
1413 	if (!ecap_dit(info->iommu->ecap))
1414 		info->pfsid = 0;
1415 	else {
1416 		struct pci_dev *pf_pdev;
1417 
1418 		/* pdev will be returned if device is not a vf */
1419 		pf_pdev = pci_physfn(pdev);
1420 		info->pfsid = pci_dev_id(pf_pdev);
1421 	}
1422 
1423 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1424 	   the device if you enable PASID support after ATS support is
1425 	   undefined. So always enable PASID support on devices which
1426 	   have it, even if we can't yet know if we're ever going to
1427 	   use it. */
1428 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429 		info->pasid_enabled = 1;
1430 
1431 	if (info->pri_supported &&
1432 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1433 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434 		info->pri_enabled = 1;
1435 
1436 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438 		info->ats_enabled = 1;
1439 		domain_update_iotlb(info->domain);
1440 		info->ats_qdep = pci_ats_queue_depth(pdev);
1441 	}
1442 }
1443 
1444 static void iommu_disable_pci_caps(struct device_domain_info *info)
1445 {
1446 	struct pci_dev *pdev;
1447 
1448 	if (!dev_is_pci(info->dev))
1449 		return;
1450 
1451 	pdev = to_pci_dev(info->dev);
1452 
1453 	if (info->ats_enabled) {
1454 		pci_disable_ats(pdev);
1455 		info->ats_enabled = 0;
1456 		domain_update_iotlb(info->domain);
1457 	}
1458 
1459 	if (info->pri_enabled) {
1460 		pci_disable_pri(pdev);
1461 		info->pri_enabled = 0;
1462 	}
1463 
1464 	if (info->pasid_enabled) {
1465 		pci_disable_pasid(pdev);
1466 		info->pasid_enabled = 0;
1467 	}
1468 }
1469 
1470 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471 				    u64 addr, unsigned int mask)
1472 {
1473 	u16 sid, qdep;
1474 
1475 	if (!info || !info->ats_enabled)
1476 		return;
1477 
1478 	sid = info->bus << 8 | info->devfn;
1479 	qdep = info->ats_qdep;
1480 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 			   qdep, addr, mask);
1482 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483 }
1484 
1485 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 				  u64 addr, unsigned mask)
1487 {
1488 	struct device_domain_info *info;
1489 	unsigned long flags;
1490 
1491 	if (!domain->has_iotlb_device)
1492 		return;
1493 
1494 	spin_lock_irqsave(&domain->lock, flags);
1495 	list_for_each_entry(info, &domain->devices, link)
1496 		__iommu_flush_dev_iotlb(info, addr, mask);
1497 	spin_unlock_irqrestore(&domain->lock, flags);
1498 }
1499 
1500 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 				  struct dmar_domain *domain,
1502 				  unsigned long pfn, unsigned int pages,
1503 				  int ih, int map)
1504 {
1505 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 	unsigned int mask = ilog2(aligned_pages);
1507 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 	u16 did = domain_id_iommu(domain, iommu);
1509 
1510 	BUG_ON(pages == 0);
1511 
1512 	if (ih)
1513 		ih = 1 << 6;
1514 
1515 	if (domain->use_first_level) {
1516 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 	} else {
1518 		unsigned long bitmask = aligned_pages - 1;
1519 
1520 		/*
1521 		 * PSI masks the low order bits of the base address. If the
1522 		 * address isn't aligned to the mask, then compute a mask value
1523 		 * needed to ensure the target range is flushed.
1524 		 */
1525 		if (unlikely(bitmask & pfn)) {
1526 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527 
1528 			/*
1529 			 * Since end_pfn <= pfn + bitmask, the only way bits
1530 			 * higher than bitmask can differ in pfn and end_pfn is
1531 			 * by carrying. This means after masking out bitmask,
1532 			 * high bits starting with the first set bit in
1533 			 * shared_bits are all equal in both pfn and end_pfn.
1534 			 */
1535 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 		}
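		/*
		 * Worked example: pfn == 0x3, pages == 2 gives bitmask == 1
		 * and end_pfn == 0x4, so shared_bits == ~(0x3 ^ 0x4) & ~1 and
		 * mask becomes 3; the flush then covers pfns 0-7, which
		 * includes the target range 3-4.
		 */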
1538 
1539 		/*
1540 		 * Fallback to domain selective flush if no PSI support or
1541 		 * the size is too big.
1542 		 */
1543 		if (!cap_pgsel_inv(iommu->cap) ||
1544 		    mask > cap_max_amask_val(iommu->cap))
1545 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 							DMA_TLB_DSI_FLUSH);
1547 		else
1548 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 							DMA_TLB_PSI_FLUSH);
1550 	}
1551 
1552 	/*
1553 	 * In caching mode, changing pages from non-present to present requires a
1554 	 * flush. However, the device IOTLB doesn't need to be flushed in this case.
1555 	 */
1556 	if (!cap_caching_mode(iommu->cap) || !map)
1557 		iommu_flush_dev_iotlb(domain, addr, mask);
1558 }
1559 
1560 /* Notification for newly created mappings */
1561 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 					struct dmar_domain *domain,
1563 					unsigned long pfn, unsigned int pages)
1564 {
1565 	/*
1566 	 * It's a non-present to present mapping. Only flush if caching mode
1567 	 * is enabled and second-level translation is in use.
1568 	 */
1569 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 	else
1572 		iommu_flush_write_buffer(iommu);
1573 }
1574 
1575 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576 {
1577 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 	struct iommu_domain_info *info;
1579 	unsigned long idx;
1580 
1581 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 		struct intel_iommu *iommu = info->iommu;
1583 		u16 did = domain_id_iommu(dmar_domain, iommu);
1584 
1585 		if (dmar_domain->use_first_level)
1586 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 		else
1588 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 						 DMA_TLB_DSI_FLUSH);
1590 
1591 		if (!cap_caching_mode(iommu->cap))
1592 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 	}
1594 }
1595 
1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597 {
1598 	u32 pmen;
1599 	unsigned long flags;
1600 
1601 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 		return;
1603 
1604 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 	pmen &= ~DMA_PMEN_EPM;
1607 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608 
1609 	/* wait for the protected region status bit to clear */
1610 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1612 
1613 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 }
1615 
1616 static void iommu_enable_translation(struct intel_iommu *iommu)
1617 {
1618 	u32 sts;
1619 	unsigned long flags;
1620 
1621 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 	iommu->gcmd |= DMA_GCMD_TE;
1623 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624 
1625 	/* Make sure hardware complete it */
1626 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 		      readl, (sts & DMA_GSTS_TES), sts);
1628 
1629 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631 
1632 static void iommu_disable_translation(struct intel_iommu *iommu)
1633 {
1634 	u32 sts;
1635 	unsigned long flag;
1636 
1637 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 		return;
1640 
1641 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 	iommu->gcmd &= ~DMA_GCMD_TE;
1643 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644 
1645 	/* Make sure hardware complete it */
1646 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1648 
1649 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650 }
1651 
1652 static int iommu_init_domains(struct intel_iommu *iommu)
1653 {
1654 	u32 ndomains;
1655 
1656 	ndomains = cap_ndoms(iommu->cap);
1657 	pr_debug("%s: Number of Domains supported <%d>\n",
1658 		 iommu->name, ndomains);
1659 
1660 	spin_lock_init(&iommu->lock);
1661 
1662 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 	if (!iommu->domain_ids)
1664 		return -ENOMEM;
1665 
1666 	/*
1667 	 * If Caching mode is set, then invalid translations are tagged
1668 	 * with domain-id 0, hence we need to pre-allocate it. We also
1669 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 	 * make sure it is not used for a real domain.
1671 	 */
1672 	set_bit(0, iommu->domain_ids);
1673 
1674 	/*
1675 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1676 	 * entry for first-level or pass-through translation modes should
1677 	 * be programmed with a domain id different from those used for
1678 	 * second-level or nested translation. We reserve a domain id for
1679 	 * this purpose.
1680 	 */
1681 	if (sm_supported(iommu))
1682 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683 
1684 	return 0;
1685 }
1686 
1687 static void disable_dmar_iommu(struct intel_iommu *iommu)
1688 {
1689 	if (!iommu->domain_ids)
1690 		return;
1691 
1692 	/*
1693 	 * All iommu domains must have been detached from the devices,
1694 	 * hence there should be no domain IDs in use.
1695 	 */
1696 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 		    > NUM_RESERVED_DID))
1698 		return;
1699 
1700 	if (iommu->gcmd & DMA_GCMD_TE)
1701 		iommu_disable_translation(iommu);
1702 }
1703 
1704 static void free_dmar_iommu(struct intel_iommu *iommu)
1705 {
1706 	if (iommu->domain_ids) {
1707 		bitmap_free(iommu->domain_ids);
1708 		iommu->domain_ids = NULL;
1709 	}
1710 
1711 	if (iommu->copied_tables) {
1712 		bitmap_free(iommu->copied_tables);
1713 		iommu->copied_tables = NULL;
1714 	}
1715 
1716 	/* free context mapping */
1717 	free_context_table(iommu);
1718 
1719 #ifdef CONFIG_INTEL_IOMMU_SVM
1720 	if (pasid_supported(iommu)) {
1721 		if (ecap_prs(iommu->ecap))
1722 			intel_svm_finish_prq(iommu);
1723 	}
1724 	if (vccap_pasid(iommu->vccap))
1725 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1726 
1727 #endif
1728 }
1729 
1730 /*
1731  * Check and return whether first level is used by default for
1732  * DMA translation.
1733  */
1734 static bool first_level_by_default(unsigned int type)
1735 {
1736 	/* Only SL is available in legacy mode */
1737 	if (!scalable_mode_support())
1738 		return false;
1739 
1740 	/* Only one level (either FL or SL) is available, just use it */
1741 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 		return intel_cap_flts_sanity();
1743 
1744 	/* Both levels are available, decide it based on domain type */
1745 	return type != IOMMU_DOMAIN_UNMANAGED;
1746 }
1747 
1748 static struct dmar_domain *alloc_domain(unsigned int type)
1749 {
1750 	struct dmar_domain *domain;
1751 
1752 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 	if (!domain)
1754 		return NULL;
1755 
1756 	domain->nid = NUMA_NO_NODE;
1757 	if (first_level_by_default(type))
1758 		domain->use_first_level = true;
1759 	domain->has_iotlb_device = false;
1760 	INIT_LIST_HEAD(&domain->devices);
1761 	spin_lock_init(&domain->lock);
1762 	xa_init(&domain->iommu_array);
1763 
1764 	return domain;
1765 }
1766 
1767 static int domain_attach_iommu(struct dmar_domain *domain,
1768 			       struct intel_iommu *iommu)
1769 {
1770 	struct iommu_domain_info *info, *curr;
1771 	unsigned long ndomains;
1772 	int num, ret = -ENOSPC;
1773 
1774 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 	if (!info)
1776 		return -ENOMEM;
1777 
1778 	spin_lock(&iommu->lock);
1779 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 	if (curr) {
1781 		curr->refcnt++;
1782 		spin_unlock(&iommu->lock);
1783 		kfree(info);
1784 		return 0;
1785 	}
1786 
1787 	ndomains = cap_ndoms(iommu->cap);
1788 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 	if (num >= ndomains) {
1790 		pr_err("%s: No free domain ids\n", iommu->name);
1791 		goto err_unlock;
1792 	}
1793 
1794 	set_bit(num, iommu->domain_ids);
1795 	info->refcnt	= 1;
1796 	info->did	= num;
1797 	info->iommu	= iommu;
1798 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 			  NULL, info, GFP_ATOMIC);
1800 	if (curr) {
1801 		ret = xa_err(curr) ? : -EBUSY;
1802 		goto err_clear;
1803 	}
1804 	domain_update_iommu_cap(domain);
1805 
1806 	spin_unlock(&iommu->lock);
1807 	return 0;
1808 
1809 err_clear:
1810 	clear_bit(info->did, iommu->domain_ids);
1811 err_unlock:
1812 	spin_unlock(&iommu->lock);
1813 	kfree(info);
1814 	return ret;
1815 }
1816 
1817 static void domain_detach_iommu(struct dmar_domain *domain,
1818 				struct intel_iommu *iommu)
1819 {
1820 	struct iommu_domain_info *info;
1821 
1822 	spin_lock(&iommu->lock);
1823 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 	if (--info->refcnt == 0) {
1825 		clear_bit(info->did, iommu->domain_ids);
1826 		xa_erase(&domain->iommu_array, iommu->seq_id);
1827 		domain->nid = NUMA_NO_NODE;
1828 		domain_update_iommu_cap(domain);
1829 		kfree(info);
1830 	}
1831 	spin_unlock(&iommu->lock);
1832 }
1833 
1834 static inline int guestwidth_to_adjustwidth(int gaw)
1835 {
1836 	int agaw;
1837 	int r = (gaw - 12) % 9;
1838 
1839 	if (r == 0)
1840 		agaw = gaw;
1841 	else
1842 		agaw = gaw + 9 - r;
1843 	if (agaw > 64)
1844 		agaw = 64;
1845 	return agaw;
1846 }
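/*
 * E.g. a guest width of 39 or 48 is already 12 plus a multiple of 9 and is
 * returned unchanged, while 40 is rounded up to 48 and any result above 64
 * is clamped to 64.
 */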
1847 
1848 static void domain_exit(struct dmar_domain *domain)
1849 {
1850 	if (domain->pgd) {
1851 		LIST_HEAD(freelist);
1852 
1853 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 		put_pages_list(&freelist);
1855 	}
1856 
1857 	if (WARN_ON(!list_empty(&domain->devices)))
1858 		return;
1859 
1860 	kfree(domain);
1861 }
1862 
1863 /*
1864  * Get the PASID directory size for a scalable mode context entry.
1865  * A value of X in the PDTS field of a scalable mode context entry
1866  * indicates a PASID directory with 2^(X + 7) entries.
1867  */
1868 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869 {
1870 	unsigned long pds, max_pde;
1871 
1872 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 	if (pds < 7)
1875 		return 0;
1876 
1877 	return pds - 7;
1878 }
1879 
1880 /*
1881  * Set the RID_PASID field of a scalable mode context entry. The
1882  * IOMMU hardware will use the PASID value set in this field for
1883  * DMA translations of DMA requests without PASID.
1884  */
1885 static inline void
1886 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887 {
1888 	context->hi |= pasid & ((1 << 20) - 1);
1889 }
1890 
1891 /*
1892  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893  * entry.
1894  */
1895 static inline void context_set_sm_dte(struct context_entry *context)
1896 {
1897 	context->lo |= (1 << 2);
1898 }
1899 
1900 /*
1901  * Set the PRE(Page Request Enable) field of a scalable mode context
1902  * entry.
1903  */
1904 static inline void context_set_sm_pre(struct context_entry *context)
1905 {
1906 	context->lo |= (1 << 4);
1907 }
1908 
1909 /* Convert value to context PASID directory size field coding. */
1910 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1911 
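/*
 * Install the context entry for (@bus, @devfn) on @iommu so that DMA
 * from this device is translated by @domain. In scalable mode the entry
 * points at the PASID directory; in legacy mode it points at the
 * domain's second-level page table or selects pass-through.
 */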
1912 static int domain_context_mapping_one(struct dmar_domain *domain,
1913 				      struct intel_iommu *iommu,
1914 				      struct pasid_table *table,
1915 				      u8 bus, u8 devfn)
1916 {
1917 	struct device_domain_info *info =
1918 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 	u16 did = domain_id_iommu(domain, iommu);
1920 	int translation = CONTEXT_TT_MULTI_LEVEL;
1921 	struct context_entry *context;
1922 	int ret;
1923 
1924 	WARN_ON(did == 0);
1925 
1926 	if (hw_pass_through && domain_type_is_si(domain))
1927 		translation = CONTEXT_TT_PASS_THROUGH;
1928 
1929 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931 
1932 	BUG_ON(!domain->pgd);
1933 
1934 	spin_lock(&iommu->lock);
1935 	ret = -ENOMEM;
1936 	context = iommu_context_addr(iommu, bus, devfn, 1);
1937 	if (!context)
1938 		goto out_unlock;
1939 
1940 	ret = 0;
1941 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 		goto out_unlock;
1943 
1944 	/*
1945 	 * For kdump cases, old valid entries may be cached due to the
1946 	 * in-flight DMA and copied pgtable, but there is no unmapping
1947 	 * behaviour for them, thus we need an explicit cache flush for
1948 	 * the newly-mapped device. For kdump, at this point, the device
1949 	 * is supposed to have finished resetting at its driver probe stage,
1950 	 * so no in-flight DMA will exist, and we don't need to worry about
1951 	 * it hereafter.
1952 	 */
1953 	if (context_copied(iommu, bus, devfn)) {
1954 		u16 did_old = context_domain_id(context);
1955 
1956 		if (did_old < cap_ndoms(iommu->cap)) {
1957 			iommu->flush.flush_context(iommu, did_old,
1958 						   (((u16)bus) << 8) | devfn,
1959 						   DMA_CCMD_MASK_NOBIT,
1960 						   DMA_CCMD_DEVICE_INVL);
1961 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 						 DMA_TLB_DSI_FLUSH);
1963 		}
1964 
1965 		clear_context_copied(iommu, bus, devfn);
1966 	}
1967 
1968 	context_clear_entry(context);
1969 
1970 	if (sm_supported(iommu)) {
1971 		unsigned long pds;
1972 
1973 		WARN_ON(!table);
1974 
1975 		/* Setup the PASID DIR pointer: */
1976 		pds = context_get_sm_pds(table);
1977 		context->lo = (u64)virt_to_phys(table->table) |
1978 				context_pdts(pds);
1979 
1980 		/* Setup the RID_PASID field: */
1981 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982 
1983 		/*
1984 		 * Setup the Device-TLB enable bit and Page request
1985 		 * Enable bit:
1986 		 */
1987 		if (info && info->ats_supported)
1988 			context_set_sm_dte(context);
1989 		if (info && info->pri_supported)
1990 			context_set_sm_pre(context);
1991 		if (info && info->pasid_supported)
1992 			context_set_pasid(context);
1993 	} else {
1994 		struct dma_pte *pgd = domain->pgd;
1995 		int agaw;
1996 
1997 		context_set_domain_id(context, did);
1998 
1999 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 			/*
2001 			 * Skip the top levels of the page tables for an IOMMU whose
2002 			 * agaw is smaller than the default. Unnecessary for PT mode.
2003 			 */
2004 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 				ret = -ENOMEM;
2006 				pgd = phys_to_virt(dma_pte_addr(pgd));
2007 				if (!dma_pte_present(pgd))
2008 					goto out_unlock;
2009 			}
2010 
2011 			if (info && info->ats_supported)
2012 				translation = CONTEXT_TT_DEV_IOTLB;
2013 			else
2014 				translation = CONTEXT_TT_MULTI_LEVEL;
2015 
2016 			context_set_address_root(context, virt_to_phys(pgd));
2017 			context_set_address_width(context, agaw);
2018 		} else {
2019 			/*
2020 			 * In pass through mode, AW must be programmed to
2021 			 * indicate the largest AGAW value supported by
2022 			 * hardware. And ASR is ignored by hardware.
2023 			 */
2024 			context_set_address_width(context, iommu->msagaw);
2025 		}
2026 
2027 		context_set_translation_type(context, translation);
2028 	}
2029 
2030 	context_set_fault_enable(context);
2031 	context_set_present(context);
2032 	if (!ecap_coherent(iommu->ecap))
2033 		clflush_cache_range(context, sizeof(*context));
2034 
2035 	/*
2036 	 * It's a non-present to present mapping. If hardware doesn't cache
2037 	 * non-present entries, we only need to flush the write-buffer. If the
2038 	 * hardware _does_ cache non-present entries, then it does so in the special
2039 	 * domain #0, which we have to flush:
2040 	 */
2041 	if (cap_caching_mode(iommu->cap)) {
2042 		iommu->flush.flush_context(iommu, 0,
2043 					   (((u16)bus) << 8) | devfn,
2044 					   DMA_CCMD_MASK_NOBIT,
2045 					   DMA_CCMD_DEVICE_INVL);
2046 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 	} else {
2048 		iommu_flush_write_buffer(iommu);
2049 	}
2050 
2051 	ret = 0;
2052 
2053 out_unlock:
2054 	spin_unlock(&iommu->lock);
2055 
2056 	return ret;
2057 }
2058 
2059 struct domain_context_mapping_data {
2060 	struct dmar_domain *domain;
2061 	struct intel_iommu *iommu;
2062 	struct pasid_table *table;
2063 };
2064 
2065 static int domain_context_mapping_cb(struct pci_dev *pdev,
2066 				     u16 alias, void *opaque)
2067 {
2068 	struct domain_context_mapping_data *data = opaque;
2069 
2070 	return domain_context_mapping_one(data->domain, data->iommu,
2071 					  data->table, PCI_BUS_NUM(alias),
2072 					  alias & 0xff);
2073 }
2074 
2075 static int
2076 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077 {
2078 	struct domain_context_mapping_data data;
2079 	struct pasid_table *table;
2080 	struct intel_iommu *iommu;
2081 	u8 bus, devfn;
2082 
2083 	iommu = device_to_iommu(dev, &bus, &devfn);
2084 	if (!iommu)
2085 		return -ENODEV;
2086 
2087 	table = intel_pasid_get_table(dev);
2088 
2089 	if (!dev_is_pci(dev))
2090 		return domain_context_mapping_one(domain, iommu, table,
2091 						  bus, devfn);
2092 
2093 	data.domain = domain;
2094 	data.iommu = iommu;
2095 	data.table = table;
2096 
2097 	return pci_for_each_dma_alias(to_pci_dev(dev),
2098 				      &domain_context_mapping_cb, &data);
2099 }
2100 
2101 /* Returns the number of VT-d pages, but aligned to MM page size */
2102 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103 					    size_t size)
2104 {
2105 	host_addr &= ~PAGE_MASK;
2106 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107 }
2108 
2109 /* Return largest possible superpage level for a given mapping */
2110 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 					  unsigned long iov_pfn,
2112 					  unsigned long phy_pfn,
2113 					  unsigned long pages)
2114 {
2115 	int support, level = 1;
2116 	unsigned long pfnmerge;
2117 
2118 	support = domain->iommu_superpage;
2119 
2120 	/* To use a large page, the virtual *and* physical addresses
2121 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 	   of them will mean we have to use smaller pages. So just
2123 	   merge them and check both at once. */
2124 	pfnmerge = iov_pfn | phy_pfn;
2125 
2126 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 		pages >>= VTD_STRIDE_SHIFT;
2128 		if (!pages)
2129 			break;
2130 		pfnmerge >>= VTD_STRIDE_SHIFT;
2131 		level++;
2132 		support--;
2133 	}
2134 	return level;
2135 }
2136 
2137 /*
2138  * Ensure that old small page tables are removed to make room for superpage(s).
2139  * We're going to add new large pages, so make sure we don't remove their parent
2140  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141  */
2142 static void switch_to_super_page(struct dmar_domain *domain,
2143 				 unsigned long start_pfn,
2144 				 unsigned long end_pfn, int level)
2145 {
2146 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2147 	struct iommu_domain_info *info;
2148 	struct dma_pte *pte = NULL;
2149 	unsigned long i;
2150 
2151 	while (start_pfn <= end_pfn) {
2152 		if (!pte)
2153 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154 
2155 		if (dma_pte_present(pte)) {
2156 			dma_pte_free_pagetable(domain, start_pfn,
2157 					       start_pfn + lvl_pages - 1,
2158 					       level + 1);
2159 
2160 			xa_for_each(&domain->iommu_array, i, info)
2161 				iommu_flush_iotlb_psi(info->iommu, domain,
2162 						      start_pfn, lvl_pages,
2163 						      0, 0);
2164 		}
2165 
2166 		pte++;
2167 		start_pfn += lvl_pages;
2168 		if (first_pte_in_page(pte))
2169 			pte = NULL;
2170 	}
2171 }
2172 
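/*
 * Map @nr_pages contiguous physical pages starting at @phys_pfn into
 * the domain's IOVA space at @iov_pfn, using superpages whenever the
 * alignment, the remaining size and the hardware capabilities allow it.
 */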
2173 static int
2174 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176 {
2177 	struct dma_pte *first_pte = NULL, *pte = NULL;
2178 	unsigned int largepage_lvl = 0;
2179 	unsigned long lvl_pages = 0;
2180 	phys_addr_t pteval;
2181 	u64 attr;
2182 
2183 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184 
2185 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186 		return -EINVAL;
2187 
2188 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189 	attr |= DMA_FL_PTE_PRESENT;
2190 	if (domain->use_first_level) {
2191 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192 		if (prot & DMA_PTE_WRITE)
2193 			attr |= DMA_FL_PTE_DIRTY;
2194 	}
2195 
2196 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197 
2198 	while (nr_pages > 0) {
2199 		uint64_t tmp;
2200 
2201 		if (!pte) {
2202 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203 					phys_pfn, nr_pages);
2204 
2205 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206 			if (!pte)
2207 				return -ENOMEM;
2208 			first_pte = pte;
2209 
2210 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211 
2212 			/* It is a large page */
2213 			if (largepage_lvl > 1) {
2214 				unsigned long end_pfn;
2215 				unsigned long pages_to_remove;
2216 
2217 				pteval |= DMA_PTE_LARGE_PAGE;
2218 				pages_to_remove = min_t(unsigned long, nr_pages,
2219 							nr_pte_to_next_page(pte) * lvl_pages);
2220 				end_pfn = iov_pfn + pages_to_remove - 1;
2221 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222 			} else {
2223 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224 			}
2225 
2226 		}
2227 		/*
2228 		 * We don't need a lock here; nobody else touches this IOVA range.
2229 		 */
2230 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231 		if (tmp) {
2232 			static int dumps = 5;
2233 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234 				iov_pfn, tmp, (unsigned long long)pteval);
2235 			if (dumps) {
2236 				dumps--;
2237 				debug_dma_dump_mappings(NULL);
2238 			}
2239 			WARN_ON(1);
2240 		}
2241 
2242 		nr_pages -= lvl_pages;
2243 		iov_pfn += lvl_pages;
2244 		phys_pfn += lvl_pages;
2245 		pteval += lvl_pages * VTD_PAGE_SIZE;
2246 
2247 		/* If the next PTE would be the first in a new page, then we
2248 		 * need to flush the cache on the entries we've just written.
2249 		 * And then we'll need to recalculate 'pte', so clear it and
2250 		 * let it get set again in the if (!pte) block above.
2251 		 *
2252 		 * If we're done (!nr_pages) we need to flush the cache too.
2253 		 *
2254 		 * Also if we've been setting superpages, we may need to
2255 		 * recalculate 'pte' and switch back to smaller pages for the
2256 		 * end of the mapping, if the trailing size is not enough to
2257 		 * use another superpage (i.e. nr_pages < lvl_pages).
2258 		 */
2259 		pte++;
2260 		if (!nr_pages || first_pte_in_page(pte) ||
2261 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262 			domain_flush_cache(domain, first_pte,
2263 					   (void *)pte - (void *)first_pte);
2264 			pte = NULL;
2265 		}
2266 	}
2267 
2268 	return 0;
2269 }
2270 
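/*
 * Tear down the context entry for (@bus, @devfn) and invalidate the
 * context, PASID (in scalable mode), IOTLB and device-TLB caches that
 * may still reference it.
 */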
2271 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272 {
2273 	struct intel_iommu *iommu = info->iommu;
2274 	struct context_entry *context;
2275 	u16 did_old;
2276 
2277 	if (!iommu)
2278 		return;
2279 
2280 	spin_lock(&iommu->lock);
2281 	context = iommu_context_addr(iommu, bus, devfn, 0);
2282 	if (!context) {
2283 		spin_unlock(&iommu->lock);
2284 		return;
2285 	}
2286 
2287 	if (sm_supported(iommu)) {
2288 		if (hw_pass_through && domain_type_is_si(info->domain))
2289 			did_old = FLPT_DEFAULT_DID;
2290 		else
2291 			did_old = domain_id_iommu(info->domain, iommu);
2292 	} else {
2293 		did_old = context_domain_id(context);
2294 	}
2295 
2296 	context_clear_entry(context);
2297 	__iommu_flush_cache(iommu, context, sizeof(*context));
2298 	spin_unlock(&iommu->lock);
2299 	iommu->flush.flush_context(iommu,
2300 				   did_old,
2301 				   (((u16)bus) << 8) | devfn,
2302 				   DMA_CCMD_MASK_NOBIT,
2303 				   DMA_CCMD_DEVICE_INVL);
2304 
2305 	if (sm_supported(iommu))
2306 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307 
2308 	iommu->flush.flush_iotlb(iommu, did_old, 0, 0, DMA_TLB_DSI_FLUSH);
2313 
2314 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315 }
2316 
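/*
 * Set up the PASID entry for first-level translation of @dev's DMA,
 * pointing it at the domain's first-level page table and encoding the
 * paging level (4- or 5-level) and the snoop/supervisor flags.
 */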
2317 static int domain_setup_first_level(struct intel_iommu *iommu,
2318 				    struct dmar_domain *domain,
2319 				    struct device *dev,
2320 				    u32 pasid)
2321 {
2322 	struct dma_pte *pgd = domain->pgd;
2323 	int agaw, level;
2324 	int flags = 0;
2325 
2326 	/*
2327 	 * Skip the top levels of the page tables for an IOMMU whose
2328 	 * agaw is smaller than the default. Unnecessary for PT mode.
2329 	 */
2330 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331 		pgd = phys_to_virt(dma_pte_addr(pgd));
2332 		if (!dma_pte_present(pgd))
2333 			return -ENOMEM;
2334 	}
2335 
2336 	level = agaw_to_level(agaw);
2337 	if (level != 4 && level != 5)
2338 		return -EINVAL;
2339 
2340 	if (pasid != PASID_RID2PASID)
2341 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2342 	if (level == 5)
2343 		flags |= PASID_FLAG_FL5LP;
2344 
2345 	if (domain->force_snooping)
2346 		flags |= PASID_FLAG_PAGE_SNOOP;
2347 
2348 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349 					     domain_id_iommu(domain, iommu),
2350 					     flags);
2351 }
2352 
2353 static bool dev_is_real_dma_subdevice(struct device *dev)
2354 {
2355 	return dev && dev_is_pci(dev) &&
2356 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357 }
2358 
2359 static int iommu_domain_identity_map(struct dmar_domain *domain,
2360 				     unsigned long first_vpfn,
2361 				     unsigned long last_vpfn)
2362 {
2363 	/*
2364 	 * The RMRR range might overlap the physical memory range,
2365 	 * so clear it first.
2366 	 */
2367 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368 
2369 	return __domain_mapping(domain, first_vpfn,
2370 				first_vpfn, last_vpfn - first_vpfn + 1,
2371 				DMA_PTE_READ|DMA_PTE_WRITE);
2372 }
2373 
2374 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375 
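/*
 * Build the static identity (si) domain: a 1:1 mapping of all usable
 * system memory plus any RMRR regions, so that devices attached to it
 * keep working. With hardware pass-through (@hw set) no page tables are
 * needed at all.
 */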
2376 static int __init si_domain_init(int hw)
2377 {
2378 	struct dmar_rmrr_unit *rmrr;
2379 	struct device *dev;
2380 	int i, nid, ret;
2381 
2382 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383 	if (!si_domain)
2384 		return -EFAULT;
2385 
2386 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387 		domain_exit(si_domain);
2388 		si_domain = NULL;
2389 		return -EFAULT;
2390 	}
2391 
2392 	if (hw)
2393 		return 0;
2394 
2395 	for_each_online_node(nid) {
2396 		unsigned long start_pfn, end_pfn;
2397 		int i;
2398 
2399 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400 			ret = iommu_domain_identity_map(si_domain,
2401 					mm_to_dma_pfn(start_pfn),
2402 					mm_to_dma_pfn(end_pfn));
2403 			if (ret)
2404 				return ret;
2405 		}
2406 	}
2407 
2408 	/*
2409 	 * Identity map the RMRRs so that devices with RMRRs can also use
2410 	 * the si_domain.
2411 	 */
2412 	for_each_rmrr_units(rmrr) {
2413 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414 					  i, dev) {
2415 			unsigned long long start = rmrr->base_address;
2416 			unsigned long long end = rmrr->end_address;
2417 
2418 			if (WARN_ON(end < start ||
2419 				    end >> agaw_to_width(si_domain->agaw)))
2420 				continue;
2421 
2422 			ret = iommu_domain_identity_map(si_domain,
2423 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2424 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2425 			if (ret)
2426 				return ret;
2427 		}
2428 	}
2429 
2430 	return 0;
2431 }
2432 
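/*
 * Attach @dev to @domain: reserve a domain ID on the device's IOMMU,
 * set up the RID2PASID entry in scalable mode, program the context
 * entry and finally enable the relevant PCI capabilities (ATS etc.).
 */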
2433 static int dmar_domain_attach_device(struct dmar_domain *domain,
2434 				     struct device *dev)
2435 {
2436 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2437 	struct intel_iommu *iommu;
2438 	unsigned long flags;
2439 	u8 bus, devfn;
2440 	int ret;
2441 
2442 	iommu = device_to_iommu(dev, &bus, &devfn);
2443 	if (!iommu)
2444 		return -ENODEV;
2445 
2446 	ret = domain_attach_iommu(domain, iommu);
2447 	if (ret)
2448 		return ret;
2449 	info->domain = domain;
2450 	spin_lock_irqsave(&domain->lock, flags);
2451 	list_add(&info->link, &domain->devices);
2452 	spin_unlock_irqrestore(&domain->lock, flags);
2453 
2454 	/* PASID table is mandatory for a PCI device in scalable mode. */
2455 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456 		/* Setup the PASID entry for requests without PASID: */
2457 		if (hw_pass_through && domain_type_is_si(domain))
2458 			ret = intel_pasid_setup_pass_through(iommu, domain,
2459 					dev, PASID_RID2PASID);
2460 		else if (domain->use_first_level)
2461 			ret = domain_setup_first_level(iommu, domain, dev,
2462 					PASID_RID2PASID);
2463 		else
2464 			ret = intel_pasid_setup_second_level(iommu, domain,
2465 					dev, PASID_RID2PASID);
2466 		if (ret) {
2467 			dev_err(dev, "Setup RID2PASID failed\n");
2468 			device_block_translation(dev);
2469 			return ret;
2470 		}
2471 	}
2472 
2473 	ret = domain_context_mapping(domain, dev);
2474 	if (ret) {
2475 		dev_err(dev, "Domain context map failed\n");
2476 		device_block_translation(dev);
2477 		return ret;
2478 	}
2479 
2480 	iommu_enable_pci_caps(info);
2481 
2482 	return 0;
2483 }
2484 
2485 static bool device_has_rmrr(struct device *dev)
2486 {
2487 	struct dmar_rmrr_unit *rmrr;
2488 	struct device *tmp;
2489 	int i;
2490 
2491 	rcu_read_lock();
2492 	for_each_rmrr_units(rmrr) {
2493 		/*
2494 		 * Return TRUE if this RMRR contains the device that
2495 		 * is passed in.
2496 		 */
2497 		for_each_active_dev_scope(rmrr->devices,
2498 					  rmrr->devices_cnt, i, tmp)
2499 			if (tmp == dev ||
2500 			    is_downstream_to_pci_bridge(dev, tmp)) {
2501 				rcu_read_unlock();
2502 				return true;
2503 			}
2504 	}
2505 	rcu_read_unlock();
2506 	return false;
2507 }
2508 
2509 /**
2510  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2511  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2512  * @dev: device handle
2513  *
2514  * We assume that PCI USB devices with RMRRs have them largely
2515  * for historical reasons and that the RMRR space is not actively used post
2516  * boot.  This exclusion may change if vendors begin to abuse it.
2517  *
2518  * The same exception is made for graphics devices, with the requirement that
2519  * any use of the RMRR regions will be torn down before assigning the device
2520  * to a guest.
2521  *
2522  * Return: true if the RMRR is relaxable, false otherwise
2523  */
2524 static bool device_rmrr_is_relaxable(struct device *dev)
2525 {
2526 	struct pci_dev *pdev;
2527 
2528 	if (!dev_is_pci(dev))
2529 		return false;
2530 
2531 	pdev = to_pci_dev(dev);
2532 	return IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev);
2536 }
2537 
2538 /*
2539  * There are a couple of cases where we need to restrict the functionality of
2540  * devices associated with RMRRs.  The first is when evaluating a device for
2541  * identity mapping because problems exist when devices are moved in and out
2542  * of domains and their respective RMRR information is lost.  This means that
2543  * a device with associated RMRRs will never be in a "passthrough" domain.
2544  * The second is use of the device through the IOMMU API.  This interface
2545  * expects to have full control of the IOVA space for the device.  We cannot
2546  * satisfy both the requirement that RMRR access is maintained and have an
2547  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2548  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549  * We therefore prevent devices associated with an RMRR from participating in
2550  * the IOMMU API, which eliminates them from device assignment.
2551  *
2552  * In both cases, devices which have relaxable RMRRs are not concerned by this
2553  * restriction. See device_rmrr_is_relaxable comment.
2554  */
2555 static bool device_is_rmrr_locked(struct device *dev)
2556 {
2557 	if (!device_has_rmrr(dev))
2558 		return false;
2559 
2560 	if (device_rmrr_is_relaxable(dev))
2561 		return false;
2562 
2563 	return true;
2564 }
2565 
2566 /*
2567  * Return the required default domain type for a specific device.
2568  *
2569  * @dev: the device in question
2571  *
2572  * Returns:
2573  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2574  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2575  *  - 0: both identity and dynamic domains work for this device
2576  */
2577 static int device_def_domain_type(struct device *dev)
2578 {
2579 	if (dev_is_pci(dev)) {
2580 		struct pci_dev *pdev = to_pci_dev(dev);
2581 
2582 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583 			return IOMMU_DOMAIN_IDENTITY;
2584 
2585 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586 			return IOMMU_DOMAIN_IDENTITY;
2587 	}
2588 
2589 	return 0;
2590 }
2591 
2592 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593 {
2594 	/*
2595 	 * Start from a sane IOMMU hardware state.
2596 	 * If queued invalidation was already initialized by us
2597 	 * (for example, while enabling interrupt remapping) then
2598 	 * things are already rolling from a sane state.
2599 	 */
2600 	if (!iommu->qi) {
2601 		/*
2602 		 * Clear any previous faults.
2603 		 */
2604 		dmar_fault(-1, iommu);
2605 		/*
2606 		 * Disable queued invalidation if supported and already enabled
2607 		 * before OS handover.
2608 		 */
2609 		dmar_disable_qi(iommu);
2610 	}
2611 
2612 	if (dmar_enable_qi(iommu)) {
2613 		/*
2614 		 * Queued Invalidate not enabled, use Register Based Invalidate
2615 		 */
2616 		iommu->flush.flush_context = __iommu_flush_context;
2617 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618 		pr_info("%s: Using Register based invalidation\n",
2619 			iommu->name);
2620 	} else {
2621 		iommu->flush.flush_context = qi_flush_context;
2622 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2623 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2624 	}
2625 }
2626 
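/*
 * Copy the context table(s) of one bus from the previous kernel's root
 * entry into freshly allocated pages, recording which domain IDs and
 * context entries have been inherited.
 */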
2627 static int copy_context_table(struct intel_iommu *iommu,
2628 			      struct root_entry *old_re,
2629 			      struct context_entry **tbl,
2630 			      int bus, bool ext)
2631 {
2632 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633 	struct context_entry *new_ce = NULL, ce;
2634 	struct context_entry *old_ce = NULL;
2635 	struct root_entry re;
2636 	phys_addr_t old_ce_phys;
2637 
2638 	tbl_idx = ext ? bus * 2 : bus;
2639 	memcpy(&re, old_re, sizeof(re));
2640 
2641 	for (devfn = 0; devfn < 256; devfn++) {
2642 		/* First calculate the correct index */
2643 		idx = (ext ? devfn * 2 : devfn) % 256;
2644 
2645 		if (idx == 0) {
2646 			/* First save what we may have and clean up */
2647 			if (new_ce) {
2648 				tbl[tbl_idx] = new_ce;
2649 				__iommu_flush_cache(iommu, new_ce,
2650 						    VTD_PAGE_SIZE);
2651 				pos = 1;
2652 			}
2653 
2654 			if (old_ce)
2655 				memunmap(old_ce);
2656 
2657 			ret = 0;
2658 			if (devfn < 0x80)
2659 				old_ce_phys = root_entry_lctp(&re);
2660 			else
2661 				old_ce_phys = root_entry_uctp(&re);
2662 
2663 			if (!old_ce_phys) {
2664 				if (ext && devfn == 0) {
2665 					/* No LCTP, try UCTP */
2666 					devfn = 0x7f;
2667 					continue;
2668 				} else {
2669 					goto out;
2670 				}
2671 			}
2672 
2673 			ret = -ENOMEM;
2674 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675 					MEMREMAP_WB);
2676 			if (!old_ce)
2677 				goto out;
2678 
2679 			new_ce = alloc_pgtable_page(iommu->node);
2680 			if (!new_ce)
2681 				goto out_unmap;
2682 
2683 			ret = 0;
2684 		}
2685 
2686 		/* Now copy the context entry */
2687 		memcpy(&ce, old_ce + idx, sizeof(ce));
2688 
2689 		if (!context_present(&ce))
2690 			continue;
2691 
2692 		did = context_domain_id(&ce);
2693 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2694 			set_bit(did, iommu->domain_ids);
2695 
2696 		set_context_copied(iommu, bus, devfn);
2697 		new_ce[idx] = ce;
2698 	}
2699 
2700 	tbl[tbl_idx + pos] = new_ce;
2701 
2702 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703 
2704 out_unmap:
2705 	memunmap(old_ce);
2706 
2707 out:
2708 	return ret;
2709 }
2710 
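/*
 * Take over the translation structures left enabled by the previous
 * kernel (the kdump case): map its root table, copy every per-bus
 * context table and hook the copies into our own root entries, so that
 * in-flight DMA keeps a valid translation until the devices are reset.
 */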
2711 static int copy_translation_tables(struct intel_iommu *iommu)
2712 {
2713 	struct context_entry **ctxt_tbls;
2714 	struct root_entry *old_rt;
2715 	phys_addr_t old_rt_phys;
2716 	int ctxt_table_entries;
2717 	u64 rtaddr_reg;
2718 	int bus, ret;
2719 	bool new_ext, ext;
2720 
2721 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723 	new_ext    = !!sm_supported(iommu);
2724 
2725 	/*
2726 	 * The RTT bit can only be changed when translation is disabled,
2727 	 * but disabling translation would open a window for data
2728 	 * corruption. So bail out and don't copy anything if we would
2729 	 * have to change the bit.
2730 	 */
2731 	if (new_ext != ext)
2732 		return -EINVAL;
2733 
2734 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735 	if (!iommu->copied_tables)
2736 		return -ENOMEM;
2737 
2738 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739 	if (!old_rt_phys)
2740 		return -EINVAL;
2741 
2742 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743 	if (!old_rt)
2744 		return -ENOMEM;
2745 
2746 	/* This is too big for the stack - allocate it from slab */
2747 	ctxt_table_entries = ext ? 512 : 256;
2748 	ret = -ENOMEM;
2749 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750 	if (!ctxt_tbls)
2751 		goto out_unmap;
2752 
2753 	for (bus = 0; bus < 256; bus++) {
2754 		ret = copy_context_table(iommu, &old_rt[bus],
2755 					 ctxt_tbls, bus, ext);
2756 		if (ret) {
2757 			pr_err("%s: Failed to copy context table for bus %d\n",
2758 				iommu->name, bus);
2759 			continue;
2760 		}
2761 	}
2762 
2763 	spin_lock(&iommu->lock);
2764 
2765 	/* Context tables are copied, now write them to the root_entry table */
2766 	for (bus = 0; bus < 256; bus++) {
2767 		int idx = ext ? bus * 2 : bus;
2768 		u64 val;
2769 
2770 		if (ctxt_tbls[idx]) {
2771 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772 			iommu->root_entry[bus].lo = val;
2773 		}
2774 
2775 		if (!ext || !ctxt_tbls[idx + 1])
2776 			continue;
2777 
2778 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779 		iommu->root_entry[bus].hi = val;
2780 	}
2781 
2782 	spin_unlock(&iommu->lock);
2783 
2784 	kfree(ctxt_tbls);
2785 
2786 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787 
2788 	ret = 0;
2789 
2790 out_unmap:
2791 	memunmap(old_rt);
2792 
2793 	return ret;
2794 }
2795 
2796 #ifdef CONFIG_INTEL_IOMMU_SVM
2797 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798 {
2799 	struct intel_iommu *iommu = data;
2800 	ioasid_t ioasid;
2801 
2802 	if (!iommu)
2803 		return INVALID_IOASID;
2804 	/*
2805 	 * The VT-d virtual command interface always uses the full 20-bit
2806 	 * PASID range. The host can partition the guest PASID range based
2807 	 * on its policies, but that is out of the guest's control.
2808 	 */
2809 	if (min < PASID_MIN || max > intel_pasid_max_id)
2810 		return INVALID_IOASID;
2811 
2812 	if (vcmd_alloc_pasid(iommu, &ioasid))
2813 		return INVALID_IOASID;
2814 
2815 	return ioasid;
2816 }
2817 
2818 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819 {
2820 	struct intel_iommu *iommu = data;
2821 
2822 	if (!iommu)
2823 		return;
2824 	/*
2825 	 * Sanity checking of the ioasid owner is done at the upper layer,
2826 	 * e.g. VFIO. We can only free the PASID when all the devices are unbound.
2827 	 */
2828 	if (ioasid_find(NULL, ioasid, NULL)) {
2829 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2830 		return;
2831 	}
2832 	vcmd_free_pasid(iommu, ioasid);
2833 }
2834 
2835 static void register_pasid_allocator(struct intel_iommu *iommu)
2836 {
2837 	/*
2838 	 * If we are running in the host, there is no need for a custom
2839 	 * allocator since PASIDs are allocated from the host system-wide.
2840 	 */
2841 	if (!cap_caching_mode(iommu->cap))
2842 		return;
2843 
2844 	if (!sm_supported(iommu)) {
2845 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846 		return;
2847 	}
2848 
2849 	/*
2850 	 * Register a custom PASID allocator if we are running in a guest;
2851 	 * guest PASIDs must be obtained via the virtual command interface.
2852 	 * There can be multiple vIOMMUs in each guest but only one allocator
2853 	 * is active. All vIOMMU allocators will eventually call the same
2854 	 * host allocator.
2855 	 */
2856 	if (!vccap_pasid(iommu->vccap))
2857 		return;
2858 
2859 	pr_info("Register custom PASID allocator\n");
2860 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862 	iommu->pasid_allocator.pdata = (void *)iommu;
2863 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865 		/*
2866 		 * Disable scalable mode on this IOMMU if there
2867 		 * is no custom allocator. Mixing SM-capable and
2868 		 * non-SM vIOMMUs is not supported.
2869 		 */
2870 		intel_iommu_sm = 0;
2871 	}
2872 }
2873 #endif
2874 
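/*
 * One-time DMA remapping bring-up: initialize the invalidation queue,
 * domain ID bitmap and root entry for every IOMMU (copying translation
 * tables from the previous kernel where applicable), create the
 * si_domain, and enable fault reporting and the page request queues.
 */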
2875 static int __init init_dmars(void)
2876 {
2877 	struct dmar_drhd_unit *drhd;
2878 	struct intel_iommu *iommu;
2879 	int ret;
2880 
2881 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882 	if (ret)
2883 		goto free_iommu;
2884 
2885 	for_each_iommu(iommu, drhd) {
2886 		if (drhd->ignored) {
2887 			iommu_disable_translation(iommu);
2888 			continue;
2889 		}
2890 
2891 		/*
2892 		 * Find the max PASID size of all IOMMUs in the system.
2893 		 * We need to ensure the system pasid table is no bigger
2894 		 * than the smallest supported.
2895 		 */
2896 		if (pasid_supported(iommu)) {
2897 			u32 temp = 2 << ecap_pss(iommu->ecap);
2898 
2899 			intel_pasid_max_id = min_t(u32, temp,
2900 						   intel_pasid_max_id);
2901 		}
2902 
2903 		intel_iommu_init_qi(iommu);
2904 
2905 		ret = iommu_init_domains(iommu);
2906 		if (ret)
2907 			goto free_iommu;
2908 
2909 		init_translation_status(iommu);
2910 
2911 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912 			iommu_disable_translation(iommu);
2913 			clear_translation_pre_enabled(iommu);
2914 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915 				iommu->name);
2916 		}
2917 
2918 		/*
2919 		 * TBD:
2920 		 * We could share the same root & context tables
2921 		 * among all IOMMUs. Needs to be split out later.
2922 		 */
2923 		ret = iommu_alloc_root_entry(iommu);
2924 		if (ret)
2925 			goto free_iommu;
2926 
2927 		if (translation_pre_enabled(iommu)) {
2928 			pr_info("Translation already enabled - trying to copy translation structures\n");
2929 
2930 			ret = copy_translation_tables(iommu);
2931 			if (ret) {
2932 				/*
2933 				 * We found the IOMMU with translation
2934 				 * enabled - but failed to copy over the
2935 				 * old root-entry table. Try to proceed
2936 				 * by disabling translation now and
2937 				 * allocating a clean root-entry table.
2938 				 * This might cause DMAR faults, but
2939 				 * probably the dump will still succeed.
2940 				 */
2941 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942 				       iommu->name);
2943 				iommu_disable_translation(iommu);
2944 				clear_translation_pre_enabled(iommu);
2945 			} else {
2946 				pr_info("Copied translation tables from previous kernel for %s\n",
2947 					iommu->name);
2948 			}
2949 		}
2950 
2951 		if (!ecap_pass_through(iommu->ecap))
2952 			hw_pass_through = 0;
2953 		intel_svm_check(iommu);
2954 	}
2955 
2956 	/*
2957 	 * Now that QI is enabled on all IOMMUs, set the root entry and flush
2958 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2959 	 * flush_context function will loop forever and the boot hangs.
2960 	 */
2961 	for_each_active_iommu(iommu, drhd) {
2962 		iommu_flush_write_buffer(iommu);
2963 #ifdef CONFIG_INTEL_IOMMU_SVM
2964 		register_pasid_allocator(iommu);
2965 #endif
2966 		iommu_set_root_entry(iommu);
2967 	}
2968 
2969 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970 	dmar_map_gfx = 0;
2971 #endif
2972 
2973 	if (!dmar_map_gfx)
2974 		iommu_identity_mapping |= IDENTMAP_GFX;
2975 
2976 	check_tylersburg_isoch();
2977 
2978 	ret = si_domain_init(hw_pass_through);
2979 	if (ret)
2980 		goto free_iommu;
2981 
2982 	/*
2983 	 * for each drhd
2984 	 *   enable fault log
2985 	 *   global invalidate context cache
2986 	 *   global invalidate iotlb
2987 	 *   enable translation
2988 	 */
2989 	for_each_iommu(iommu, drhd) {
2990 		if (drhd->ignored) {
2991 			/*
2992 			 * we always have to disable PMRs or DMA may fail on
2993 			 * this device
2994 			 */
2995 			if (force_on)
2996 				iommu_disable_protect_mem_regions(iommu);
2997 			continue;
2998 		}
2999 
3000 		iommu_flush_write_buffer(iommu);
3001 
3002 #ifdef CONFIG_INTEL_IOMMU_SVM
3003 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004 			/*
3005 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3006 			 * could cause a lock race condition, so drop the lock here.
3007 			 */
3008 			up_write(&dmar_global_lock);
3009 			ret = intel_svm_enable_prq(iommu);
3010 			down_write(&dmar_global_lock);
3011 			if (ret)
3012 				goto free_iommu;
3013 		}
3014 #endif
3015 		ret = dmar_set_interrupt(iommu);
3016 		if (ret)
3017 			goto free_iommu;
3018 	}
3019 
3020 	return 0;
3021 
3022 free_iommu:
3023 	for_each_active_iommu(iommu, drhd) {
3024 		disable_dmar_iommu(iommu);
3025 		free_dmar_iommu(iommu);
3026 	}
3027 	if (si_domain) {
3028 		domain_exit(si_domain);
3029 		si_domain = NULL;
3030 	}
3031 
3032 	return ret;
3033 }
3034 
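/*
 * Mark DMAR units that cover no devices at all, or only graphics
 * devices while dmar_map_gfx is disabled, as ignored so they are never
 * brought up; graphics-only units are flagged as gfx_dedicated.
 */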
3035 static void __init init_no_remapping_devices(void)
3036 {
3037 	struct dmar_drhd_unit *drhd;
3038 	struct device *dev;
3039 	int i;
3040 
3041 	for_each_drhd_unit(drhd) {
3042 		if (!drhd->include_all) {
3043 			for_each_active_dev_scope(drhd->devices,
3044 						  drhd->devices_cnt, i, dev)
3045 				break;
3046 			/* ignore DMAR unit if no devices exist */
3047 			if (i == drhd->devices_cnt)
3048 				drhd->ignored = 1;
3049 		}
3050 	}
3051 
3052 	for_each_active_drhd_unit(drhd) {
3053 		if (drhd->include_all)
3054 			continue;
3055 
3056 		for_each_active_dev_scope(drhd->devices,
3057 					  drhd->devices_cnt, i, dev)
3058 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059 				break;
3060 		if (i < drhd->devices_cnt)
3061 			continue;
3062 
3063 		/* This IOMMU has *only* gfx devices. Either bypass it or
3064 		   set the gfx_dedicated flag, as appropriate. */
3065 		drhd->gfx_dedicated = 1;
3066 		if (!dmar_map_gfx)
3067 			drhd->ignored = 1;
3068 	}
3069 }
3070 
3071 #ifdef CONFIG_SUSPEND
3072 static int init_iommu_hw(void)
3073 {
3074 	struct dmar_drhd_unit *drhd;
3075 	struct intel_iommu *iommu = NULL;
3076 
3077 	for_each_active_iommu(iommu, drhd)
3078 		if (iommu->qi)
3079 			dmar_reenable_qi(iommu);
3080 
3081 	for_each_iommu(iommu, drhd) {
3082 		if (drhd->ignored) {
3083 			/*
3084 			 * we always have to disable PMRs or DMA may fail on
3085 			 * this device
3086 			 */
3087 			if (force_on)
3088 				iommu_disable_protect_mem_regions(iommu);
3089 			continue;
3090 		}
3091 
3092 		iommu_flush_write_buffer(iommu);
3093 		iommu_set_root_entry(iommu);
3094 		iommu_enable_translation(iommu);
3095 		iommu_disable_protect_mem_regions(iommu);
3096 	}
3097 
3098 	return 0;
3099 }
3100 
3101 static void iommu_flush_all(void)
3102 {
3103 	struct dmar_drhd_unit *drhd;
3104 	struct intel_iommu *iommu;
3105 
3106 	for_each_active_iommu(iommu, drhd) {
3107 		iommu->flush.flush_context(iommu, 0, 0, 0,
3108 					   DMA_CCMD_GLOBAL_INVL);
3109 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110 					 DMA_TLB_GLOBAL_FLUSH);
3111 	}
3112 }
3113 
3114 static int iommu_suspend(void)
3115 {
3116 	struct dmar_drhd_unit *drhd;
3117 	struct intel_iommu *iommu = NULL;
3118 	unsigned long flag;
3119 
3120 	for_each_active_iommu(iommu, drhd) {
3121 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122 					     GFP_KERNEL);
3123 		if (!iommu->iommu_state)
3124 			goto nomem;
3125 	}
3126 
3127 	iommu_flush_all();
3128 
3129 	for_each_active_iommu(iommu, drhd) {
3130 		iommu_disable_translation(iommu);
3131 
3132 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133 
3134 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135 			readl(iommu->reg + DMAR_FECTL_REG);
3136 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137 			readl(iommu->reg + DMAR_FEDATA_REG);
3138 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139 			readl(iommu->reg + DMAR_FEADDR_REG);
3140 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141 			readl(iommu->reg + DMAR_FEUADDR_REG);
3142 
3143 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144 	}
3145 	return 0;
3146 
3147 nomem:
3148 	for_each_active_iommu(iommu, drhd)
3149 		kfree(iommu->iommu_state);
3150 
3151 	return -ENOMEM;
3152 }
3153 
3154 static void iommu_resume(void)
3155 {
3156 	struct dmar_drhd_unit *drhd;
3157 	struct intel_iommu *iommu = NULL;
3158 	unsigned long flag;
3159 
3160 	if (init_iommu_hw()) {
3161 		if (force_on)
3162 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163 		else
3164 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165 		return;
3166 	}
3167 
3168 	for_each_active_iommu(iommu, drhd) {
3169 
3170 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171 
3172 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173 			iommu->reg + DMAR_FECTL_REG);
3174 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175 			iommu->reg + DMAR_FEDATA_REG);
3176 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177 			iommu->reg + DMAR_FEADDR_REG);
3178 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179 			iommu->reg + DMAR_FEUADDR_REG);
3180 
3181 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182 	}
3183 
3184 	for_each_active_iommu(iommu, drhd)
3185 		kfree(iommu->iommu_state);
3186 }
3187 
3188 static struct syscore_ops iommu_syscore_ops = {
3189 	.resume		= iommu_resume,
3190 	.suspend	= iommu_suspend,
3191 };
3192 
3193 static void __init init_iommu_pm_ops(void)
3194 {
3195 	register_syscore_ops(&iommu_syscore_ops);
3196 }
3197 
3198 #else
3199 static inline void init_iommu_pm_ops(void) {}
3200 #endif	/* CONFIG_SUSPEND */
3201 
3202 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203 {
3204 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206 	    rmrr->end_address <= rmrr->base_address ||
3207 	    arch_rmrr_sanity_check(rmrr))
3208 		return -EINVAL;
3209 
3210 	return 0;
3211 }
3212 
3213 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214 {
3215 	struct acpi_dmar_reserved_memory *rmrr;
3216 	struct dmar_rmrr_unit *rmrru;
3217 
3218 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3219 	if (rmrr_sanity_check(rmrr)) {
3220 		pr_warn(FW_BUG
3221 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223 			   rmrr->base_address, rmrr->end_address,
3224 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3225 			   dmi_get_system_info(DMI_BIOS_VERSION),
3226 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3227 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228 	}
3229 
3230 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231 	if (!rmrru)
3232 		goto out;
3233 
3234 	rmrru->hdr = header;
3235 
3236 	rmrru->base_address = rmrr->base_address;
3237 	rmrru->end_address = rmrr->end_address;
3238 
3239 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240 				((void *)rmrr) + rmrr->header.length,
3241 				&rmrru->devices_cnt);
3242 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3243 		goto free_rmrru;
3244 
3245 	list_add(&rmrru->list, &dmar_rmrr_units);
3246 
3247 	return 0;
3248 free_rmrru:
3249 	kfree(rmrru);
3250 out:
3251 	return -ENOMEM;
3252 }
3253 
3254 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255 {
3256 	struct dmar_atsr_unit *atsru;
3257 	struct acpi_dmar_atsr *tmp;
3258 
3259 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260 				dmar_rcu_check()) {
3261 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262 		if (atsr->segment != tmp->segment)
3263 			continue;
3264 		if (atsr->header.length != tmp->header.length)
3265 			continue;
3266 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267 			return atsru;
3268 	}
3269 
3270 	return NULL;
3271 }
3272 
3273 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274 {
3275 	struct acpi_dmar_atsr *atsr;
3276 	struct dmar_atsr_unit *atsru;
3277 
3278 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279 		return 0;
3280 
3281 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282 	atsru = dmar_find_atsr(atsr);
3283 	if (atsru)
3284 		return 0;
3285 
3286 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287 	if (!atsru)
3288 		return -ENOMEM;
3289 
3290 	/*
3291 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3292 	 * copy the memory content because the memory buffer will be freed
3293 	 * on return.
3294 	 */
3295 	atsru->hdr = (void *)(atsru + 1);
3296 	memcpy(atsru->hdr, hdr, hdr->length);
3297 	atsru->include_all = atsr->flags & 0x1;
3298 	if (!atsru->include_all) {
3299 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300 				(void *)atsr + atsr->header.length,
3301 				&atsru->devices_cnt);
3302 		if (atsru->devices_cnt && atsru->devices == NULL) {
3303 			kfree(atsru);
3304 			return -ENOMEM;
3305 		}
3306 	}
3307 
3308 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3309 
3310 	return 0;
3311 }
3312 
3313 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314 {
3315 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316 	kfree(atsru);
3317 }
3318 
3319 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320 {
3321 	struct acpi_dmar_atsr *atsr;
3322 	struct dmar_atsr_unit *atsru;
3323 
3324 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325 	atsru = dmar_find_atsr(atsr);
3326 	if (atsru) {
3327 		list_del_rcu(&atsru->list);
3328 		synchronize_rcu();
3329 		intel_iommu_free_atsr(atsru);
3330 	}
3331 
3332 	return 0;
3333 }
3334 
3335 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336 {
3337 	int i;
3338 	struct device *dev;
3339 	struct acpi_dmar_atsr *atsr;
3340 	struct dmar_atsr_unit *atsru;
3341 
3342 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343 	atsru = dmar_find_atsr(atsr);
3344 	if (!atsru)
3345 		return 0;
3346 
3347 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349 					  i, dev)
3350 			return -EBUSY;
3351 	}
3352 
3353 	return 0;
3354 }
3355 
3356 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357 {
3358 	struct dmar_satc_unit *satcu;
3359 	struct acpi_dmar_satc *tmp;
3360 
3361 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362 				dmar_rcu_check()) {
3363 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364 		if (satc->segment != tmp->segment)
3365 			continue;
3366 		if (satc->header.length != tmp->header.length)
3367 			continue;
3368 		if (memcmp(satc, tmp, satc->header.length) == 0)
3369 			return satcu;
3370 	}
3371 
3372 	return NULL;
3373 }
3374 
3375 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376 {
3377 	struct acpi_dmar_satc *satc;
3378 	struct dmar_satc_unit *satcu;
3379 
3380 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381 		return 0;
3382 
3383 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3384 	satcu = dmar_find_satc(satc);
3385 	if (satcu)
3386 		return 0;
3387 
3388 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389 	if (!satcu)
3390 		return -ENOMEM;
3391 
3392 	satcu->hdr = (void *)(satcu + 1);
3393 	memcpy(satcu->hdr, hdr, hdr->length);
3394 	satcu->atc_required = satc->flags & 0x1;
3395 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396 					      (void *)satc + satc->header.length,
3397 					      &satcu->devices_cnt);
3398 	if (satcu->devices_cnt && !satcu->devices) {
3399 		kfree(satcu);
3400 		return -ENOMEM;
3401 	}
3402 	list_add_rcu(&satcu->list, &dmar_satc_units);
3403 
3404 	return 0;
3405 }
3406 
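/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * running configuration, allocate domain IDs and a root entry, then
 * enable invalidation, fault reporting and translation.
 */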
3407 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408 {
3409 	int sp, ret;
3410 	struct intel_iommu *iommu = dmaru->iommu;
3411 
3412 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413 	if (ret)
3414 		goto out;
3415 
3416 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417 		pr_warn("%s: Doesn't support hardware pass through.\n",
3418 			iommu->name);
3419 		return -ENXIO;
3420 	}
3421 
3422 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424 		pr_warn("%s: Doesn't support large page.\n",
3425 			iommu->name);
3426 		return -ENXIO;
3427 	}
3428 
3429 	/*
3430 	 * Disable translation if already enabled prior to OS handover.
3431 	 */
3432 	if (iommu->gcmd & DMA_GCMD_TE)
3433 		iommu_disable_translation(iommu);
3434 
3435 	ret = iommu_init_domains(iommu);
3436 	if (ret == 0)
3437 		ret = iommu_alloc_root_entry(iommu);
3438 	if (ret)
3439 		goto out;
3440 
3441 	intel_svm_check(iommu);
3442 
3443 	if (dmaru->ignored) {
3444 		/*
3445 		 * we always have to disable PMRs or DMA may fail on this device
3446 		 */
3447 		if (force_on)
3448 			iommu_disable_protect_mem_regions(iommu);
3449 		return 0;
3450 	}
3451 
3452 	intel_iommu_init_qi(iommu);
3453 	iommu_flush_write_buffer(iommu);
3454 
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 		ret = intel_svm_enable_prq(iommu);
3458 		if (ret)
3459 			goto disable_iommu;
3460 	}
3461 #endif
3462 	ret = dmar_set_interrupt(iommu);
3463 	if (ret)
3464 		goto disable_iommu;
3465 
3466 	iommu_set_root_entry(iommu);
3467 	iommu_enable_translation(iommu);
3468 
3469 	iommu_disable_protect_mem_regions(iommu);
3470 	return 0;
3471 
3472 disable_iommu:
3473 	disable_dmar_iommu(iommu);
3474 out:
3475 	free_dmar_iommu(iommu);
3476 	return ret;
3477 }
3478 
3479 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480 {
3481 	int ret = 0;
3482 	struct intel_iommu *iommu = dmaru->iommu;
3483 
3484 	if (!intel_iommu_enabled)
3485 		return 0;
3486 	if (!iommu)
3487 		return -EINVAL;
3488 
3489 	if (insert) {
3490 		ret = intel_iommu_add(dmaru);
3491 	} else {
3492 		disable_dmar_iommu(iommu);
3493 		free_dmar_iommu(iommu);
3494 	}
3495 
3496 	return ret;
3497 }
3498 
3499 static void intel_iommu_free_dmars(void)
3500 {
3501 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502 	struct dmar_atsr_unit *atsru, *atsr_n;
3503 	struct dmar_satc_unit *satcu, *satc_n;
3504 
3505 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506 		list_del(&rmrru->list);
3507 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508 		kfree(rmrru);
3509 	}
3510 
3511 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512 		list_del(&atsru->list);
3513 		intel_iommu_free_atsr(atsru);
3514 	}
3515 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516 		list_del(&satcu->list);
3517 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518 		kfree(satcu);
3519 	}
3520 }
3521 
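/* Find the SATC unit whose device scope contains @dev, if any. */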
3522 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523 {
3524 	struct dmar_satc_unit *satcu;
3525 	struct acpi_dmar_satc *satc;
3526 	struct device *tmp;
3527 	int i;
3528 
3529 	dev = pci_physfn(dev);
3530 	rcu_read_lock();
3531 
3532 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534 		if (satc->segment != pci_domain_nr(dev->bus))
3535 			continue;
3536 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537 			if (to_pci_dev(tmp) == dev)
3538 				goto out;
3539 	}
3540 	satcu = NULL;
3541 out:
3542 	rcu_read_unlock();
3543 	return satcu;
3544 }
3545 
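/*
 * Decide whether ATS may be enabled for @dev: honour the SATC table if
 * the device is listed there, otherwise walk up to the root port and
 * match it against the ATSR device scopes.
 */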
3546 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547 {
3548 	int i, ret = 1;
3549 	struct pci_bus *bus;
3550 	struct pci_dev *bridge = NULL;
3551 	struct device *tmp;
3552 	struct acpi_dmar_atsr *atsr;
3553 	struct dmar_atsr_unit *atsru;
3554 	struct dmar_satc_unit *satcu;
3555 
3556 	dev = pci_physfn(dev);
3557 	satcu = dmar_find_matched_satc_unit(dev);
3558 	if (satcu)
3559 		/*
3560 		 * This device supports ATS as it is in the SATC table.
3561 		 * When the IOMMU is in legacy mode, enabling ATS is done
3562 		 * automatically by HW for any device that requires
3563 		 * ATS, hence the OS should not enable ATS for this device,
3564 		 * to avoid duplicated TLB invalidations.
3565 		 */
3566 		return !(satcu->atc_required && !sm_supported(iommu));
3567 
3568 	for (bus = dev->bus; bus; bus = bus->parent) {
3569 		bridge = bus->self;
3570 		/* If it's an integrated device, allow ATS */
3571 		if (!bridge)
3572 			return 1;
3573 		/* Connected via non-PCIe: no ATS */
3574 		if (!pci_is_pcie(bridge) ||
3575 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576 			return 0;
3577 		/* If we found the root port, look it up in the ATSR */
3578 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579 			break;
3580 	}
3581 
3582 	rcu_read_lock();
3583 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585 		if (atsr->segment != pci_domain_nr(dev->bus))
3586 			continue;
3587 
3588 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589 			if (tmp == &bridge->dev)
3590 				goto out;
3591 
3592 		if (atsru->include_all)
3593 			goto out;
3594 	}
3595 	ret = 0;
3596 out:
3597 	rcu_read_unlock();
3598 
3599 	return ret;
3600 }
3601 
3602 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603 {
3604 	int ret;
3605 	struct dmar_rmrr_unit *rmrru;
3606 	struct dmar_atsr_unit *atsru;
3607 	struct dmar_satc_unit *satcu;
3608 	struct acpi_dmar_atsr *atsr;
3609 	struct acpi_dmar_reserved_memory *rmrr;
3610 	struct acpi_dmar_satc *satc;
3611 
3612 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613 		return 0;
3614 
3615 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616 		rmrr = container_of(rmrru->hdr,
3617 				    struct acpi_dmar_reserved_memory, header);
3618 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620 				((void *)rmrr) + rmrr->header.length,
3621 				rmrr->segment, rmrru->devices,
3622 				rmrru->devices_cnt);
3623 			if (ret < 0)
3624 				return ret;
3625 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626 			dmar_remove_dev_scope(info, rmrr->segment,
3627 				rmrru->devices, rmrru->devices_cnt);
3628 		}
3629 	}
3630 
3631 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632 		if (atsru->include_all)
3633 			continue;
3634 
3635 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638 					(void *)atsr + atsr->header.length,
3639 					atsr->segment, atsru->devices,
3640 					atsru->devices_cnt);
3641 			if (ret > 0)
3642 				break;
3643 			else if (ret < 0)
3644 				return ret;
3645 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646 			if (dmar_remove_dev_scope(info, atsr->segment,
3647 					atsru->devices, atsru->devices_cnt))
3648 				break;
3649 		}
3650 	}
3651 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3652 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655 					(void *)satc + satc->header.length,
3656 					satc->segment, satcu->devices,
3657 					satcu->devices_cnt);
3658 			if (ret > 0)
3659 				break;
3660 			else if (ret < 0)
3661 				return ret;
3662 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663 			if (dmar_remove_dev_scope(info, satc->segment,
3664 					satcu->devices, satcu->devices_cnt))
3665 				break;
3666 		}
3667 	}
3668 
3669 	return 0;
3670 }
3671 
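/*
 * Keep the si_domain in sync with memory hotplug: identity-map memory
 * that is about to come online, and unmap (and flush) memory that goes
 * offline again.
 */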
3672 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673 				       unsigned long val, void *v)
3674 {
3675 	struct memory_notify *mhp = v;
3676 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678 			mhp->nr_pages - 1);
3679 
3680 	switch (val) {
3681 	case MEM_GOING_ONLINE:
3682 		if (iommu_domain_identity_map(si_domain,
3683 					      start_vpfn, last_vpfn)) {
3684 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685 				start_vpfn, last_vpfn);
3686 			return NOTIFY_BAD;
3687 		}
3688 		break;
3689 
3690 	case MEM_OFFLINE:
3691 	case MEM_CANCEL_ONLINE:
3692 		{
3693 			struct dmar_drhd_unit *drhd;
3694 			struct intel_iommu *iommu;
3695 			LIST_HEAD(freelist);
3696 
3697 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3698 
3699 			rcu_read_lock();
3700 			for_each_active_iommu(iommu, drhd)
3701 				iommu_flush_iotlb_psi(iommu, si_domain,
3702 					start_vpfn, mhp->nr_pages,
3703 					list_empty(&freelist), 0);
3704 			rcu_read_unlock();
3705 			put_pages_list(&freelist);
3706 		}
3707 		break;
3708 	}
3709 
3710 	return NOTIFY_OK;
3711 }
3712 
3713 static struct notifier_block intel_iommu_memory_nb = {
3714 	.notifier_call = intel_iommu_memory_notifier,
3715 	.priority = 0
3716 };
3717 
3718 static void intel_disable_iommus(void)
3719 {
3720 	struct intel_iommu *iommu = NULL;
3721 	struct dmar_drhd_unit *drhd;
3722 
3723 	for_each_iommu(iommu, drhd)
3724 		iommu_disable_translation(iommu);
3725 }
3726 
3727 void intel_iommu_shutdown(void)
3728 {
3729 	struct dmar_drhd_unit *drhd;
3730 	struct intel_iommu *iommu = NULL;
3731 
3732 	if (no_iommu || dmar_disabled)
3733 		return;
3734 
3735 	down_write(&dmar_global_lock);
3736 
3737 	/* Disable PMRs explicitly here. */
3738 	for_each_iommu(iommu, drhd)
3739 		iommu_disable_protect_mem_regions(iommu);
3740 
3741 	/* Make sure the IOMMUs are switched off */
3742 	intel_disable_iommus();
3743 
3744 	up_write(&dmar_global_lock);
3745 }
3746 
3747 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748 {
3749 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750 
3751 	return container_of(iommu_dev, struct intel_iommu, iommu);
3752 }
3753 
3754 static ssize_t version_show(struct device *dev,
3755 			    struct device_attribute *attr, char *buf)
3756 {
3757 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759 	return sprintf(buf, "%d:%d\n",
3760 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761 }
3762 static DEVICE_ATTR_RO(version);
3763 
3764 static ssize_t address_show(struct device *dev,
3765 			    struct device_attribute *attr, char *buf)
3766 {
3767 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3769 }
3770 static DEVICE_ATTR_RO(address);
3771 
3772 static ssize_t cap_show(struct device *dev,
3773 			struct device_attribute *attr, char *buf)
3774 {
3775 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776 	return sprintf(buf, "%llx\n", iommu->cap);
3777 }
3778 static DEVICE_ATTR_RO(cap);
3779 
3780 static ssize_t ecap_show(struct device *dev,
3781 			 struct device_attribute *attr, char *buf)
3782 {
3783 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784 	return sprintf(buf, "%llx\n", iommu->ecap);
3785 }
3786 static DEVICE_ATTR_RO(ecap);
3787 
3788 static ssize_t domains_supported_show(struct device *dev,
3789 				      struct device_attribute *attr, char *buf)
3790 {
3791 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793 }
3794 static DEVICE_ATTR_RO(domains_supported);
3795 
3796 static ssize_t domains_used_show(struct device *dev,
3797 				 struct device_attribute *attr, char *buf)
3798 {
3799 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801 						  cap_ndoms(iommu->cap)));
3802 }
3803 static DEVICE_ATTR_RO(domains_used);
3804 
3805 static struct attribute *intel_iommu_attrs[] = {
3806 	&dev_attr_version.attr,
3807 	&dev_attr_address.attr,
3808 	&dev_attr_cap.attr,
3809 	&dev_attr_ecap.attr,
3810 	&dev_attr_domains_supported.attr,
3811 	&dev_attr_domains_used.attr,
3812 	NULL,
3813 };
3814 
3815 static struct attribute_group intel_iommu_group = {
3816 	.name = "intel-iommu",
3817 	.attrs = intel_iommu_attrs,
3818 };
3819 
3820 const struct attribute_group *intel_iommu_groups[] = {
3821 	&intel_iommu_group,
3822 	NULL,
3823 };
3824 
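/* Return true if any PCI device in the system is marked as external-facing. */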
3825 static inline bool has_external_pci(void)
3826 {
3827 	struct pci_dev *pdev = NULL;
3828 
3829 	for_each_pci_dev(pdev)
3830 		if (pdev->external_facing) {
3831 			pci_dev_put(pdev);
3832 			return true;
3833 		}
3834 
3835 	return false;
3836 }
3837 
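/*
 * Enforce the DMAR platform opt-in: when the firmware requests DMA
 * protection and an external-facing PCI port is present, override a
 * command-line disable and keep the IOMMU enabled.  Returns 1 if the
 * IOMMU was force-enabled here, 0 otherwise.
 */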
3838 static int __init platform_optin_force_iommu(void)
3839 {
3840 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841 		return 0;
3842 
3843 	if (no_iommu || dmar_disabled)
3844 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845 
3846 	/*
3847 	 * If Intel-IOMMU is disabled by default, we will apply the identity
3848 	 * map to all devices except those marked as untrusted.
3849 	 */
3850 	if (dmar_disabled)
3851 		iommu_set_default_passthrough(false);
3852 
3853 	dmar_disabled = 0;
3854 	no_iommu = 0;
3855 
3856 	return 1;
3857 }
3858 
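/*
 * Walk the ACPI namespace devices listed in the DRHD device scopes and
 * probe any of their physical companion devices that do not yet belong
 * to an IOMMU group.
 */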
3859 static int __init probe_acpi_namespace_devices(void)
3860 {
3861 	struct dmar_drhd_unit *drhd;
3862 	/* To avoid a -Wunused-but-set-variable warning. */
3863 	struct intel_iommu *iommu __maybe_unused;
3864 	struct device *dev;
3865 	int i, ret = 0;
3866 
3867 	for_each_active_iommu(iommu, drhd) {
3868 		for_each_active_dev_scope(drhd->devices,
3869 					  drhd->devices_cnt, i, dev) {
3870 			struct acpi_device_physical_node *pn;
3871 			struct iommu_group *group;
3872 			struct acpi_device *adev;
3873 
3874 			if (dev->bus != &acpi_bus_type)
3875 				continue;
3876 
3877 			adev = to_acpi_device(dev);
3878 			mutex_lock(&adev->physical_node_lock);
3879 			list_for_each_entry(pn,
3880 					    &adev->physical_node_list, node) {
3881 				group = iommu_group_get(pn->dev);
3882 				if (group) {
3883 					iommu_group_put(group);
3884 					continue;
3885 				}
3886 
3887 				ret = iommu_probe_device(pn->dev);
3888 				if (ret)
3889 					break;
3890 			}
3891 			mutex_unlock(&adev->physical_node_lock);
3892 
3893 			if (ret)
3894 				return ret;
3895 		}
3896 	}
3897 
3898 	return 0;
3899 }
3900 
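/*
 * A TXT/tboot launch requires DMA remapping: override a command-line
 * disable; returns 1 when running under tboot, 0 otherwise.
 */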
3901 static __init int tboot_force_iommu(void)
3902 {
3903 	if (!tboot_enabled())
3904 		return 0;
3905 
3906 	if (no_iommu || dmar_disabled)
3907 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3908 
3909 	dmar_disabled = 0;
3910 	no_iommu = 0;
3911 
3912 	return 1;
3913 }
3914 
3915 int __init intel_iommu_init(void)
3916 {
3917 	int ret = -ENODEV;
3918 	struct dmar_drhd_unit *drhd;
3919 	struct intel_iommu *iommu;
3920 
3921 	/*
3922 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3923 	 * opt-in, so enforce that.
3924 	 */
3925 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926 		    platform_optin_force_iommu();
3927 
3928 	down_write(&dmar_global_lock);
3929 	if (dmar_table_init()) {
3930 		if (force_on)
3931 			panic("tboot: Failed to initialize DMAR table\n");
3932 		goto out_free_dmar;
3933 	}
3934 
3935 	if (dmar_dev_scope_init() < 0) {
3936 		if (force_on)
3937 			panic("tboot: Failed to initialize DMAR device scope\n");
3938 		goto out_free_dmar;
3939 	}
3940 
3941 	up_write(&dmar_global_lock);
3942 
3943 	/*
3944 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3945 	 * complain later when we register it under the lock.
3946 	 */
3947 	dmar_register_bus_notifier();
3948 
3949 	down_write(&dmar_global_lock);
3950 
3951 	if (!no_iommu)
3952 		intel_iommu_debugfs_init();
3953 
3954 	if (no_iommu || dmar_disabled) {
3955 		/*
3956 		 * We exit the function here to ensure that the IOMMU's remapping and
3957 		 * mempool aren't set up, which means that the IOMMU's PMRs
3958 		 * won't be disabled via the call to init_dmars(). So disable
3959 		 * them explicitly here. The PMRs were set up by tboot prior to
3960 		 * calling SENTER, but the kernel is expected to reset/tear
3961 		 * down the PMRs.
3962 		 */
3963 		if (intel_iommu_tboot_noforce) {
3964 			for_each_iommu(iommu, drhd)
3965 				iommu_disable_protect_mem_regions(iommu);
3966 		}
3967 
3968 		/*
3969 		 * Make sure the IOMMUs are switched off, even when we
3970 		 * boot into a kexec kernel and the previous kernel left
3971 		 * them enabled
3972 		 */
3973 		intel_disable_iommus();
3974 		goto out_free_dmar;
3975 	}
3976 
3977 	if (list_empty(&dmar_rmrr_units))
3978 		pr_info("No RMRR found\n");
3979 
3980 	if (list_empty(&dmar_atsr_units))
3981 		pr_info("No ATSR found\n");
3982 
3983 	if (list_empty(&dmar_satc_units))
3984 		pr_info("No SATC found\n");
3985 
3986 	init_no_remapping_devices();
3987 
3988 	ret = init_dmars();
3989 	if (ret) {
3990 		if (force_on)
3991 			panic("tboot: Failed to initialize DMARs\n");
3992 		pr_err("Initialization failed\n");
3993 		goto out_free_dmar;
3994 	}
3995 	up_write(&dmar_global_lock);
3996 
3997 	init_iommu_pm_ops();
3998 
3999 	down_read(&dmar_global_lock);
4000 	for_each_active_iommu(iommu, drhd) {
4001 		/*
4002 		 * The flush queue implementation does not perform
4003 		 * page-selective invalidations that are required for efficient
4004 		 * TLB flushes in virtual environments.  The benefit of batching
4005 		 * is likely to be much lower than the overhead of synchronizing
4006 		 * the virtual and physical IOMMU page-tables.
4007 		 */
4008 		if (cap_caching_mode(iommu->cap)) {
4009 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010 			iommu_set_dma_strict();
4011 		}
4012 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4013 				       intel_iommu_groups,
4014 				       "%s", iommu->name);
4015 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4016 	}
4017 	up_read(&dmar_global_lock);
4018 
4019 	if (si_domain && !hw_pass_through)
4020 		register_memory_notifier(&intel_iommu_memory_nb);
4021 
4022 	down_read(&dmar_global_lock);
4023 	if (probe_acpi_namespace_devices())
4024 		pr_warn("ACPI name space devices didn't probe correctly\n");
4025 
4026 	/* Finally, we enable the DMA remapping hardware. */
4027 	for_each_iommu(iommu, drhd) {
4028 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4029 			iommu_enable_translation(iommu);
4030 
4031 		iommu_disable_protect_mem_regions(iommu);
4032 	}
4033 	up_read(&dmar_global_lock);
4034 
4035 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036 
4037 	intel_iommu_enabled = 1;
4038 
4039 	return 0;
4040 
4041 out_free_dmar:
4042 	intel_iommu_free_dmars();
4043 	up_write(&dmar_global_lock);
4044 	return ret;
4045 }
4046 
4047 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048 {
4049 	struct device_domain_info *info = opaque;
4050 
4051 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052 	return 0;
4053 }
4054 
4055 /*
4056  * NB - intel-iommu lacks any sort of reference counting for the users of
4057  * dependent devices.  If multiple endpoints have intersecting dependent
4058  * devices, unbinding the driver from any one of them will possibly leave
4059  * the others unable to operate.
4060  */
4061 static void domain_context_clear(struct device_domain_info *info)
4062 {
4063 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064 		return;
4065 
4066 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4067 			       &domain_context_clear_one_cb, info);
4068 }
4069 
4070 static void dmar_remove_one_dev_info(struct device *dev)
4071 {
4072 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4073 	struct dmar_domain *domain = info->domain;
4074 	struct intel_iommu *iommu = info->iommu;
4075 	unsigned long flags;
4076 
4077 	if (!dev_is_real_dma_subdevice(info->dev)) {
4078 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4079 			intel_pasid_tear_down_entry(iommu, info->dev,
4080 					PASID_RID2PASID, false);
4081 
4082 		iommu_disable_pci_caps(info);
4083 		domain_context_clear(info);
4084 	}
4085 
4086 	spin_lock_irqsave(&domain->lock, flags);
4087 	list_del(&info->link);
4088 	spin_unlock_irqrestore(&domain->lock, flags);
4089 
4090 	domain_detach_iommu(domain, iommu);
4091 	info->domain = NULL;
4092 }
4093 
4094 /*
4095  * Clear the page table pointer in context or pasid table entries so that
4096  * all DMA requests without PASID from the device are blocked. If the page
4097  * table has been set, clean up the data structures.
4098  */
4099 static void device_block_translation(struct device *dev)
4100 {
4101 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4102 	struct intel_iommu *iommu = info->iommu;
4103 	unsigned long flags;
4104 
4105 	iommu_disable_pci_caps(info);
4106 	if (!dev_is_real_dma_subdevice(dev)) {
4107 		if (sm_supported(iommu))
4108 			intel_pasid_tear_down_entry(iommu, dev,
4109 						    PASID_RID2PASID, false);
4110 		else
4111 			domain_context_clear(info);
4112 	}
4113 
4114 	if (!info->domain)
4115 		return;
4116 
4117 	spin_lock_irqsave(&info->domain->lock, flags);
4118 	list_del(&info->link);
4119 	spin_unlock_irqrestore(&info->domain->lock, flags);
4120 
4121 	domain_detach_iommu(info->domain, iommu);
4122 	info->domain = NULL;
4123 }
4124 
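/*
 * Initialize the address width of a newly allocated domain and allocate
 * its top-level page-table page.
 */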
4125 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126 {
4127 	int adjust_width;
4128 
4129 	/* calculate AGAW */
4130 	domain->gaw = guest_width;
4131 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4132 	domain->agaw = width_to_agaw(adjust_width);
4133 
4134 	domain->iommu_coherency = false;
4135 	domain->iommu_superpage = 0;
4136 	domain->max_addr = 0;
4137 
4138 	/* always allocate the top pgd */
4139 	domain->pgd = alloc_pgtable_page(domain->nid);
4140 	if (!domain->pgd)
4141 		return -ENOMEM;
4142 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143 	return 0;
4144 }
4145 
4146 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147 				      struct device *dev)
4148 {
4149 	device_block_translation(dev);
4150 	return 0;
4151 }
4152 
4153 static struct iommu_domain blocking_domain = {
4154 	.ops = &(const struct iommu_domain_ops) {
4155 		.attach_dev	= blocking_domain_attach_dev,
4156 		.free		= intel_iommu_domain_free
4157 	}
4158 };
4159 
4160 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161 {
4162 	struct dmar_domain *dmar_domain;
4163 	struct iommu_domain *domain;
4164 
4165 	switch (type) {
4166 	case IOMMU_DOMAIN_BLOCKED:
4167 		return &blocking_domain;
4168 	case IOMMU_DOMAIN_DMA:
4169 	case IOMMU_DOMAIN_DMA_FQ:
4170 	case IOMMU_DOMAIN_UNMANAGED:
4171 		dmar_domain = alloc_domain(type);
4172 		if (!dmar_domain) {
4173 			pr_err("Can't allocate dmar_domain\n");
4174 			return NULL;
4175 		}
4176 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177 			pr_err("Domain initialization failed\n");
4178 			domain_exit(dmar_domain);
4179 			return NULL;
4180 		}
4181 
4182 		domain = &dmar_domain->domain;
4183 		domain->geometry.aperture_start = 0;
4184 		domain->geometry.aperture_end   =
4185 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186 		domain->geometry.force_aperture = true;
4187 
4188 		return domain;
4189 	case IOMMU_DOMAIN_IDENTITY:
4190 		return &si_domain->domain;
4191 	case IOMMU_DOMAIN_SVA:
4192 		return intel_svm_domain_alloc();
4193 	default:
4194 		return NULL;
4195 	}
4196 
4197 	return NULL;
4198 }
4199 
4200 static void intel_iommu_domain_free(struct iommu_domain *domain)
4201 {
4202 	if (domain != &si_domain->domain && domain != &blocking_domain)
4203 		domain_exit(to_dmar_domain(domain));
4204 }
4205 
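/*
 * Check that the IOMMU serving @dev can accommodate the domain: fail if
 * enforced snooping or the currently mapped address range exceeds the
 * hardware capabilities, and drop unused top page-table levels when the
 * IOMMU supports fewer levels than the domain was created with.
 */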
4206 static int prepare_domain_attach_device(struct iommu_domain *domain,
4207 					struct device *dev)
4208 {
4209 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 	struct intel_iommu *iommu;
4211 	int addr_width;
4212 
4213 	iommu = device_to_iommu(dev, NULL, NULL);
4214 	if (!iommu)
4215 		return -ENODEV;
4216 
4217 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218 		return -EINVAL;
4219 
4220 	/* check if this iommu agaw is sufficient for max mapped address */
4221 	addr_width = agaw_to_width(iommu->agaw);
4222 	if (addr_width > cap_mgaw(iommu->cap))
4223 		addr_width = cap_mgaw(iommu->cap);
4224 
4225 	if (dmar_domain->max_addr > (1LL << addr_width))
4226 		return -EINVAL;
4227 	dmar_domain->gaw = addr_width;
4228 
4229 	/*
4230 	 * Knock out extra levels of page tables if necessary
4231 	 */
4232 	while (iommu->agaw < dmar_domain->agaw) {
4233 		struct dma_pte *pte;
4234 
4235 		pte = dmar_domain->pgd;
4236 		if (dma_pte_present(pte)) {
4237 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4238 			free_pgtable_page(pte);
4239 		}
4240 		dmar_domain->agaw--;
4241 	}
4242 
4243 	return 0;
4244 }
4245 
4246 static int intel_iommu_attach_device(struct iommu_domain *domain,
4247 				     struct device *dev)
4248 {
4249 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4250 	int ret;
4251 
4252 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253 	    device_is_rmrr_locked(dev)) {
4254 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4255 		return -EPERM;
4256 	}
4257 
4258 	if (info->domain)
4259 		device_block_translation(dev);
4260 
4261 	ret = prepare_domain_attach_device(domain, dev);
4262 	if (ret)
4263 		return ret;
4264 
4265 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266 }
4267 
4268 static int intel_iommu_map(struct iommu_domain *domain,
4269 			   unsigned long iova, phys_addr_t hpa,
4270 			   size_t size, int iommu_prot, gfp_t gfp)
4271 {
4272 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273 	u64 max_addr;
4274 	int prot = 0;
4275 
4276 	if (iommu_prot & IOMMU_READ)
4277 		prot |= DMA_PTE_READ;
4278 	if (iommu_prot & IOMMU_WRITE)
4279 		prot |= DMA_PTE_WRITE;
4280 	if (dmar_domain->set_pte_snp)
4281 		prot |= DMA_PTE_SNP;
4282 
4283 	max_addr = iova + size;
4284 	if (dmar_domain->max_addr < max_addr) {
4285 		u64 end;
4286 
4287 		/* check if minimum agaw is sufficient for mapped address */
4288 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289 		if (end < max_addr) {
4290 			pr_err("%s: iommu width (%d) is not "
4291 			       "sufficient for the mapped address (%llx)\n",
4292 			       __func__, dmar_domain->gaw, max_addr);
4293 			return -EFAULT;
4294 		}
4295 		dmar_domain->max_addr = max_addr;
4296 	}
4297 	/* Round up size to next multiple of PAGE_SIZE, if it and
4298 	   the low bits of hpa would take us onto the next page */
4299 	size = aligned_nrpages(hpa, size);
4300 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301 				hpa >> VTD_PAGE_SHIFT, size, prot);
4302 }
4303 
4304 static int intel_iommu_map_pages(struct iommu_domain *domain,
4305 				 unsigned long iova, phys_addr_t paddr,
4306 				 size_t pgsize, size_t pgcount,
4307 				 int prot, gfp_t gfp, size_t *mapped)
4308 {
4309 	unsigned long pgshift = __ffs(pgsize);
4310 	size_t size = pgcount << pgshift;
4311 	int ret;
4312 
4313 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314 		return -EINVAL;
4315 
4316 	if (!IS_ALIGNED(iova | paddr, pgsize))
4317 		return -EINVAL;
4318 
4319 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320 	if (!ret && mapped)
4321 		*mapped = size;
4322 
4323 	return ret;
4324 }
4325 
4326 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327 				unsigned long iova, size_t size,
4328 				struct iommu_iotlb_gather *gather)
4329 {
4330 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4331 	unsigned long start_pfn, last_pfn;
4332 	int level = 0;
4333 
4334 	/* Cope with horrid API which requires us to unmap more than the
4335 	   size argument if it happens to be a large-page mapping. */
4336 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337 
4338 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340 
4341 	start_pfn = iova >> VTD_PAGE_SHIFT;
4342 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343 
4344 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345 
4346 	if (dmar_domain->max_addr == iova + size)
4347 		dmar_domain->max_addr = iova;
4348 
4349 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350 
4351 	return size;
4352 }
4353 
4354 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355 				      unsigned long iova,
4356 				      size_t pgsize, size_t pgcount,
4357 				      struct iommu_iotlb_gather *gather)
4358 {
4359 	unsigned long pgshift = __ffs(pgsize);
4360 	size_t size = pgcount << pgshift;
4361 
4362 	return intel_iommu_unmap(domain, iova, size, gather);
4363 }
4364 
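/*
 * Flush the range accumulated in @gather on every IOMMU the domain is
 * attached to, then release the page-table pages collected by
 * intel_iommu_unmap().
 */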
4365 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366 				 struct iommu_iotlb_gather *gather)
4367 {
4368 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4370 	size_t size = gather->end - gather->start;
4371 	struct iommu_domain_info *info;
4372 	unsigned long start_pfn;
4373 	unsigned long nrpages;
4374 	unsigned long i;
4375 
4376 	nrpages = aligned_nrpages(gather->start, size);
4377 	start_pfn = mm_to_dma_pfn(iova_pfn);
4378 
4379 	xa_for_each(&dmar_domain->iommu_array, i, info)
4380 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381 				      start_pfn, nrpages,
4382 				      list_empty(&gather->freelist), 0);
4383 
4384 	put_pages_list(&gather->freelist);
4385 }
4386 
4387 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388 					    dma_addr_t iova)
4389 {
4390 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 	struct dma_pte *pte;
4392 	int level = 0;
4393 	u64 phys = 0;
4394 
4395 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396 	if (pte && dma_pte_present(pte))
4397 		phys = dma_pte_addr(pte) +
4398 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4399 						VTD_PAGE_SHIFT) - 1));
4400 
4401 	return phys;
4402 }
4403 
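/*
 * Force-snooping can only be enforced when every IOMMU currently serving
 * the domain supports snoop control (ecap.SC).
 */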
4404 static bool domain_support_force_snooping(struct dmar_domain *domain)
4405 {
4406 	struct device_domain_info *info;
4407 	bool support = true;
4408 
4409 	assert_spin_locked(&domain->lock);
4410 	list_for_each_entry(info, &domain->devices, link) {
4411 		if (!ecap_sc_support(info->iommu->ecap)) {
4412 			support = false;
4413 			break;
4414 		}
4415 	}
4416 
4417 	return support;
4418 }
4419 
4420 static void domain_set_force_snooping(struct dmar_domain *domain)
4421 {
4422 	struct device_domain_info *info;
4423 
4424 	assert_spin_locked(&domain->lock);
4425 	/*
4426 	 * The second-level page table supports per-PTE snoop control. The
4427 	 * iommu_map() interface will handle this by setting the SNP bit.
4428 	 */
4429 	if (!domain->use_first_level) {
4430 		domain->set_pte_snp = true;
4431 		return;
4432 	}
4433 
4434 	list_for_each_entry(info, &domain->devices, link)
4435 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436 						     PASID_RID2PASID);
4437 }
4438 
4439 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440 {
4441 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442 	unsigned long flags;
4443 
4444 	if (dmar_domain->force_snooping)
4445 		return true;
4446 
4447 	spin_lock_irqsave(&dmar_domain->lock, flags);
4448 	if (!domain_support_force_snooping(dmar_domain)) {
4449 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450 		return false;
4451 	}
4452 
4453 	domain_set_force_snooping(dmar_domain);
4454 	dmar_domain->force_snooping = true;
4455 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456 
4457 	return true;
4458 }
4459 
4460 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461 {
4462 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4463 
4464 	switch (cap) {
4465 	case IOMMU_CAP_CACHE_COHERENCY:
4466 		return true;
4467 	case IOMMU_CAP_INTR_REMAP:
4468 		return irq_remapping_enabled == 1;
4469 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470 		return dmar_platform_optin();
4471 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472 		return ecap_sc_support(info->iommu->ecap);
4473 	default:
4474 		return false;
4475 	}
4476 }
4477 
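/*
 * Allocate and populate the per-device IOMMU private data: record the
 * device's bus/devfn/segment, probe ATS/PASID/PRI support and, on
 * scalable-mode IOMMUs, allocate the device's PASID table.
 */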
4478 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479 {
4480 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481 	struct device_domain_info *info;
4482 	struct intel_iommu *iommu;
4483 	u8 bus, devfn;
4484 	int ret;
4485 
4486 	iommu = device_to_iommu(dev, &bus, &devfn);
4487 	if (!iommu || !iommu->iommu.ops)
4488 		return ERR_PTR(-ENODEV);
4489 
4490 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4491 	if (!info)
4492 		return ERR_PTR(-ENOMEM);
4493 
4494 	if (dev_is_real_dma_subdevice(dev)) {
4495 		info->bus = pdev->bus->number;
4496 		info->devfn = pdev->devfn;
4497 		info->segment = pci_domain_nr(pdev->bus);
4498 	} else {
4499 		info->bus = bus;
4500 		info->devfn = devfn;
4501 		info->segment = iommu->segment;
4502 	}
4503 
4504 	info->dev = dev;
4505 	info->iommu = iommu;
4506 	if (dev_is_pci(dev)) {
4507 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4508 		    pci_ats_supported(pdev) &&
4509 		    dmar_ats_supported(pdev, iommu)) {
4510 			info->ats_supported = 1;
4511 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512 		}
4513 		if (sm_supported(iommu)) {
4514 			if (pasid_supported(iommu)) {
4515 				int features = pci_pasid_features(pdev);
4516 
4517 				if (features >= 0)
4518 					info->pasid_supported = features | 1;
4519 			}
4520 
4521 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522 			    pci_pri_supported(pdev))
4523 				info->pri_supported = 1;
4524 		}
4525 	}
4526 
4527 	dev_iommu_priv_set(dev, info);
4528 
4529 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530 		ret = intel_pasid_alloc_table(dev);
4531 		if (ret) {
4532 			dev_err(dev, "PASID table allocation failed\n");
4533 			dev_iommu_priv_set(dev, NULL);
4534 			kfree(info);
4535 			return ERR_PTR(ret);
4536 		}
4537 	}
4538 
4539 	return &iommu->iommu;
4540 }
4541 
4542 static void intel_iommu_release_device(struct device *dev)
4543 {
4544 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4545 
4546 	dmar_remove_one_dev_info(dev);
4547 	intel_pasid_free_table(dev);
4548 	dev_iommu_priv_set(dev, NULL);
4549 	kfree(info);
4550 	set_dma_ops(dev, NULL);
4551 }
4552 
4553 static void intel_iommu_probe_finalize(struct device *dev)
4554 {
4555 	set_dma_ops(dev, NULL);
4556 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4557 }
4558 
4559 static void intel_iommu_get_resv_regions(struct device *device,
4560 					 struct list_head *head)
4561 {
4562 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563 	struct iommu_resv_region *reg;
4564 	struct dmar_rmrr_unit *rmrr;
4565 	struct device *i_dev;
4566 	int i;
4567 
4568 	rcu_read_lock();
4569 	for_each_rmrr_units(rmrr) {
4570 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571 					  i, i_dev) {
4572 			struct iommu_resv_region *resv;
4573 			enum iommu_resv_type type;
4574 			size_t length;
4575 
4576 			if (i_dev != device &&
4577 			    !is_downstream_to_pci_bridge(device, i_dev))
4578 				continue;
4579 
4580 			length = rmrr->end_address - rmrr->base_address + 1;
4581 
4582 			type = device_rmrr_is_relaxable(device) ?
4583 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584 
4585 			resv = iommu_alloc_resv_region(rmrr->base_address,
4586 						       length, prot, type,
4587 						       GFP_ATOMIC);
4588 			if (!resv)
4589 				break;
4590 
4591 			list_add_tail(&resv->list, head);
4592 		}
4593 	}
4594 	rcu_read_unlock();
4595 
4596 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597 	if (dev_is_pci(device)) {
4598 		struct pci_dev *pdev = to_pci_dev(device);
4599 
4600 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602 					IOMMU_RESV_DIRECT_RELAXABLE,
4603 					GFP_KERNEL);
4604 			if (reg)
4605 				list_add_tail(&reg->list, head);
4606 		}
4607 	}
4608 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609 
4610 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4613 	if (!reg)
4614 		return;
4615 	list_add_tail(&reg->list, head);
4616 }
4617 
4618 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619 {
4620 	if (dev_is_pci(dev))
4621 		return pci_device_group(dev);
4622 	return generic_device_group(dev);
4623 }
4624 
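/*
 * SVA requires PASID, PRI and ATS to be enabled on the device; on success
 * the device is added to the IOMMU's I/O page fault queue and an I/O page
 * fault handler is registered for it.
 */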
4625 static int intel_iommu_enable_sva(struct device *dev)
4626 {
4627 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4628 	struct intel_iommu *iommu;
4629 	int ret;
4630 
4631 	if (!info || dmar_disabled)
4632 		return -EINVAL;
4633 
4634 	iommu = info->iommu;
4635 	if (!iommu)
4636 		return -EINVAL;
4637 
4638 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4639 		return -ENODEV;
4640 
4641 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642 		return -EINVAL;
4643 
4644 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645 	if (!ret)
4646 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647 
4648 	return ret;
4649 }
4650 
4651 static int intel_iommu_disable_sva(struct device *dev)
4652 {
4653 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4654 	struct intel_iommu *iommu = info->iommu;
4655 	int ret;
4656 
4657 	ret = iommu_unregister_device_fault_handler(dev);
4658 	if (!ret)
4659 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660 
4661 	return ret;
4662 }
4663 
4664 static int intel_iommu_enable_iopf(struct device *dev)
4665 {
4666 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4667 
4668 	if (info && info->pri_supported)
4669 		return 0;
4670 
4671 	return -ENODEV;
4672 }
4673 
4674 static int
4675 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676 {
4677 	switch (feat) {
4678 	case IOMMU_DEV_FEAT_IOPF:
4679 		return intel_iommu_enable_iopf(dev);
4680 
4681 	case IOMMU_DEV_FEAT_SVA:
4682 		return intel_iommu_enable_sva(dev);
4683 
4684 	default:
4685 		return -ENODEV;
4686 	}
4687 }
4688 
4689 static int
4690 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691 {
4692 	switch (feat) {
4693 	case IOMMU_DEV_FEAT_IOPF:
4694 		return 0;
4695 
4696 	case IOMMU_DEV_FEAT_SVA:
4697 		return intel_iommu_disable_sva(dev);
4698 
4699 	default:
4700 		return -ENODEV;
4701 	}
4702 }
4703 
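/*
 * Defer the default domain attachment for devices whose IOMMU still has
 * translation enabled from the previous kernel (e.g. a kdump kernel using
 * copied translation tables) and that have not been attached to a domain
 * yet.
 */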
4704 static bool intel_iommu_is_attach_deferred(struct device *dev)
4705 {
4706 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4707 
4708 	return translation_pre_enabled(info->iommu) && !info->domain;
4709 }
4710 
4711 /*
4712  * Check that the device does not live on an external-facing PCI port that is
4713  * marked as untrusted. Such devices should not be allowed to apply quirks and
4714  * thus should not be able to bypass the IOMMU restrictions.
4715  */
4716 static bool risky_device(struct pci_dev *pdev)
4717 {
4718 	if (pdev->untrusted) {
4719 		pci_info(pdev,
4720 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721 			 pdev->vendor, pdev->device);
4722 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723 		return true;
4724 	}
4725 	return false;
4726 }
4727 
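/*
 * Called by the IOMMU core after a mapping has been created: notify every
 * IOMMU the domain is attached to about the new range (see
 * __mapping_notify_one()).
 */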
4728 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729 				       unsigned long iova, size_t size)
4730 {
4731 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732 	unsigned long pages = aligned_nrpages(iova, size);
4733 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734 	struct iommu_domain_info *info;
4735 	unsigned long i;
4736 
4737 	xa_for_each(&dmar_domain->iommu_array, i, info)
4738 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739 }
4740 
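/*
 * Detach @pasid from @dev: perform the domain-type specific cleanup
 * (currently only SVA domains) and tear down the PASID table entry so
 * the PASID no longer translates.
 */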
4741 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742 {
4743 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744 	struct iommu_domain *domain;
4745 
4746 	/* Domain type specific cleanup: */
4747 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748 	if (domain) {
4749 		switch (domain->type) {
4750 		case IOMMU_DOMAIN_SVA:
4751 			intel_svm_remove_dev_pasid(dev, pasid);
4752 			break;
4753 		default:
4754 			/* should never reach here */
4755 			WARN_ON(1);
4756 			break;
4757 		}
4758 	}
4759 
4760 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761 }
4762 
4763 const struct iommu_ops intel_iommu_ops = {
4764 	.capable		= intel_iommu_capable,
4765 	.domain_alloc		= intel_iommu_domain_alloc,
4766 	.probe_device		= intel_iommu_probe_device,
4767 	.probe_finalize		= intel_iommu_probe_finalize,
4768 	.release_device		= intel_iommu_release_device,
4769 	.get_resv_regions	= intel_iommu_get_resv_regions,
4770 	.device_group		= intel_iommu_device_group,
4771 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4772 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4773 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4774 	.def_domain_type	= device_def_domain_type,
4775 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4776 	.pgsize_bitmap		= SZ_4K,
4777 #ifdef CONFIG_INTEL_IOMMU_SVM
4778 	.page_response		= intel_svm_page_response,
4779 #endif
4780 	.default_domain_ops = &(const struct iommu_domain_ops) {
4781 		.attach_dev		= intel_iommu_attach_device,
4782 		.map_pages		= intel_iommu_map_pages,
4783 		.unmap_pages		= intel_iommu_unmap_pages,
4784 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4785 		.flush_iotlb_all        = intel_flush_iotlb_all,
4786 		.iotlb_sync		= intel_iommu_tlb_sync,
4787 		.iova_to_phys		= intel_iommu_iova_to_phys,
4788 		.free			= intel_iommu_domain_free,
4789 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790 	}
4791 };
4792 
4793 static void quirk_iommu_igfx(struct pci_dev *dev)
4794 {
4795 	if (risky_device(dev))
4796 		return;
4797 
4798 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799 	dmar_map_gfx = 0;
4800 }
4801 
4802 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810 
4811 /* Broadwell igfx malfunctions with dmar */
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836 
4837 static void quirk_iommu_rwbf(struct pci_dev *dev)
4838 {
4839 	if (risky_device(dev))
4840 		return;
4841 
4842 	/*
4843 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844 	 * but needs it. Same seems to hold for the desktop versions.
4845 	 */
4846 	pci_info(dev, "Forcing write-buffer flush capability\n");
4847 	rwbf_quirk = 1;
4848 }
4849 
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857 
4858 #define GGC 0x52
4859 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4860 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4861 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4862 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4863 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4864 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4865 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4866 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4867 
4868 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869 {
4870 	unsigned short ggc;
4871 
4872 	if (risky_device(dev))
4873 		return;
4874 
4875 	if (pci_read_config_word(dev, GGC, &ggc))
4876 		return;
4877 
4878 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880 		dmar_map_gfx = 0;
4881 	} else if (dmar_map_gfx) {
4882 		/* we have to ensure the gfx device is idle before we flush */
4883 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884 		iommu_set_dma_strict();
4885 	}
4886 }
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891 
4892 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893 {
4894 	unsigned short ver;
4895 
4896 	if (!IS_GFX_DEVICE(dev))
4897 		return;
4898 
4899 	ver = (dev->device >> 8) & 0xff;
4900 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902 	    ver != 0x9a && ver != 0xa7)
4903 		return;
4904 
4905 	if (risky_device(dev))
4906 		return;
4907 
4908 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909 	iommu_skip_te_disable = 1;
4910 }
4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912 
4913 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4914    ISOCH DMAR unit for the Azalia sound device, but not give it any
4915    TLB entries, which causes it to deadlock. Check for that.  We do
4916    this in a function called from init_dmars(), instead of in a PCI
4917    quirk, because we don't want to print the obnoxious "BIOS broken"
4918    message if VT-d is actually disabled.
4919 */
4920 static void __init check_tylersburg_isoch(void)
4921 {
4922 	struct pci_dev *pdev;
4923 	uint32_t vtisochctrl;
4924 
4925 	/* If there's no Azalia in the system anyway, forget it. */
4926 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927 	if (!pdev)
4928 		return;
4929 
4930 	if (risky_device(pdev)) {
4931 		pci_dev_put(pdev);
4932 		return;
4933 	}
4934 
4935 	pci_dev_put(pdev);
4936 
4937 	/* System Management Registers. Might be hidden, in which case
4938 	   we can't do the sanity check. But that's OK, because the
4939 	   known-broken BIOSes _don't_ actually hide it, so far. */
4940 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941 	if (!pdev)
4942 		return;
4943 
4944 	if (risky_device(pdev)) {
4945 		pci_dev_put(pdev);
4946 		return;
4947 	}
4948 
4949 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950 		pci_dev_put(pdev);
4951 		return;
4952 	}
4953 
4954 	pci_dev_put(pdev);
4955 
4956 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957 	if (vtisochctrl & 1)
4958 		return;
4959 
4960 	/* Drop all bits other than the number of TLB entries */
4961 	vtisochctrl &= 0x1c;
4962 
4963 	/* If we have the recommended number of TLB entries (16), fine. */
4964 	if (vtisochctrl == 0x10)
4965 		return;
4966 
4967 	/* Zero TLB entries? You get to ride the short bus to school. */
4968 	if (!vtisochctrl) {
4969 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4972 		     dmi_get_system_info(DMI_BIOS_VERSION),
4973 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4974 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4975 		return;
4976 	}
4977 
4978 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4979 	       vtisochctrl);
4980 }
4981 
4982 /*
4983  * Here we deal with a device TLB defect where a device may inadvertently issue an
4984  * ATS invalidation completion before posted writes that were initiated with a
4985  * translated address and used translations matching the invalidation address
4986  * range, violating the invalidation completion ordering.
4987  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4988  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4989  * under the control of the trusted/privileged host device driver must use this
4990  * quirk.
4991  * Device TLBs are invalidated under the following six conditions:
4992  * 1. Device driver does DMA API unmap IOVA
4993  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4994  * 3. A PASID is torn down after the PASID cache is flushed, e.g. process
4995  *    exit_mmap() due to a crash
4996  * 4. Under SVA usage, called from mmu_notifier.invalidate_range() where
4997  *    the VM has to free pages that were unmapped
4998  * 5. Userspace driver unmaps a DMA buffer
4999  * 6. Cache invalidation in vSVA usage (upcoming)
5000  *
5001  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback to
5003  * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5004  * The dTLB invalidation after PASID cache flush does not need this quirk.
5005  *
5006  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007  */
5008 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009 			       unsigned long address, unsigned long mask,
5010 			       u32 pasid, u16 qdep)
5011 {
5012 	u16 sid;
5013 
5014 	if (likely(!info->dtlb_extra_inval))
5015 		return;
5016 
5017 	sid = PCI_DEVID(info->bus, info->devfn);
5018 	if (pasid == PASID_RID2PASID) {
5019 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020 				   qdep, address, mask);
5021 	} else {
5022 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023 					 pasid, qdep, address, mask);
5024 	}
5025 }
5026