xref: /freebsd/sys/dev/iommu/busdma_iommu.c (revision d0b2dbfa)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 The FreeBSD Foundation
5  *
6  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
7  * under sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/domainset.h>
35 #include <sys/malloc.h>
36 #include <sys/bus.h>
37 #include <sys/conf.h>
38 #include <sys/interrupt.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/lock.h>
42 #include <sys/proc.h>
43 #include <sys/memdesc.h>
44 #include <sys/msan.h>
45 #include <sys/mutex.h>
46 #include <sys/sysctl.h>
47 #include <sys/rman.h>
48 #include <sys/taskqueue.h>
49 #include <sys/tree.h>
50 #include <sys/uio.h>
51 #include <sys/vmem.h>
52 #include <dev/pci/pcireg.h>
53 #include <dev/pci/pcivar.h>
54 #include <vm/vm.h>
55 #include <vm/vm_extern.h>
56 #include <vm/vm_kern.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_map.h>
60 #include <dev/iommu/iommu.h>
61 #include <machine/atomic.h>
62 #include <machine/bus.h>
63 #include <machine/md_var.h>
64 #include <machine/iommu.h>
65 #include <dev/iommu/busdma_iommu.h>
66 
67 /*
68  * busdma_iommu.c, the implementation of the busdma(9) interface using
69  * IOMMU units from Intel VT-d.
70  */
71 
/*
 * Consult the kernel environment to decide whether the PCI function
 * domain:bus:slot:func was requested to use bounce mapping instead of
 * IOMMU translation.
 *
 * "hw.busdma.default" provides the global choice and
 * "hw.busdma.pci<domain>.<bus>.<slot>.<func>" overrides it per function.
 * Recognized values are "bounce", "iommu" and "dmar" (the latter kept for
 * compatibility); any other value leaves the current choice untouched.
 *
 * Returns true when bounce mapping was selected.
 */
static bool
iommu_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
{
	static const char bounce_str[] = "bounce";
	static const char iommu_str[] = "iommu";
	static const char dmar_str[] = "dmar"; /* compatibility */
	char name[128], *val;
	bool bounce;

	/* Global default: only an explicit "bounce" turns bouncing on. */
	bounce = false;
	val = kern_getenv("hw.busdma.default");
	if (val != NULL) {
		bounce = strcmp(val, bounce_str) == 0;
		freeenv(val);
	}

	/* Per-function override, if present and recognized. */
	snprintf(name, sizeof(name), "hw.busdma.pci%d.%d.%d.%d",
	    domain, bus, slot, func);
	val = kern_getenv(name);
	if (val != NULL) {
		if (strcmp(val, bounce_str) == 0)
			bounce = true;
		else if (strcmp(val, iommu_str) == 0 ||
		    strcmp(val, dmar_str) == 0)
			bounce = false;
		freeenv(val);
	}
	return (bounce);
}
108 
/*
 * Given original device, find the requester ID that will be seen by
 * the IOMMU unit and used for page table lookup.  PCI bridges may take
 * ownership of transactions from downstream devices, so it may not be
 * the same as the BSF of the target device.  In those cases, all
 * devices downstream of the bridge must share a single mapping
 * domain, and must collectively be assigned to use either IOMMU or
 * bounce mapping.
 *
 * The computed requester ID is stored in *rid.  The return value is
 * dev itself when no bridge on the path was found to take ownership,
 * otherwise the uppermost bridge that sits directly above a non-PCIe
 * device during the walk.
 */
device_t
iommu_get_requester(device_t dev, uint16_t *rid)
{
	devclass_t pci_class;
	device_t l, pci, pcib, pcip, pcibp, requester;
	int cap_offset;
	uint16_t pcie_flags;
	bool bridge_is_pcie;

	pci_class = devclass_find("pci");
	/* l is the device currently examined; it moves up on each pass. */
	l = requester = dev;

	/* Start from the device's own RID; refined below if needed. */
	*rid = pci_get_rid(dev);

	/*
	 * Walk the bridge hierarchy from the target device to the
	 * host port to find the translating bridge nearest the IOMMU
	 * unit.
	 */
	for (;;) {
		/* pci is the bus l is attached to, pcib the bridge above it. */
		pci = device_get_parent(l);
		KASSERT(pci != NULL, ("iommu_get_requester(%s): NULL parent "
		    "for %s", device_get_name(dev), device_get_name(l)));
		KASSERT(device_get_devclass(pci) == pci_class,
		    ("iommu_get_requester(%s): non-pci parent %s for %s",
		    device_get_name(dev), device_get_name(pci),
		    device_get_name(l)));

		pcib = device_get_parent(pci);
		KASSERT(pcib != NULL, ("iommu_get_requester(%s): NULL bridge "
		    "for %s", device_get_name(dev), device_get_name(pci)));

		/*
		 * The parent of our "bridge" isn't another PCI bus,
		 * so pcib isn't a PCI->PCI bridge but rather a host
		 * port, and the requester ID won't be translated
		 * further.
		 */
		pcip = device_get_parent(pcib);
		if (device_get_devclass(pcip) != pci_class)
			break;
		/* pcibp is the bridge one level above pcib. */
		pcibp = device_get_parent(pcip);

		if (pci_find_cap(l, PCIY_EXPRESS, &cap_offset) == 0) {
			/*
			 * Do not stop the loop even if the target
			 * device is PCIe, because it is possible (but
			 * unlikely) to have a PCI->PCIe bridge
			 * somewhere in the hierarchy.
			 */
			l = pcib;
		} else {
			/*
			 * Device is not PCIe, it cannot be seen as a
			 * requester by IOMMU unit.  Check whether the
			 * bridge is PCIe.
			 */
			bridge_is_pcie = pci_find_cap(pcib, PCIY_EXPRESS,
			    &cap_offset) == 0;
			requester = pcib;

			/*
			 * Check for a buggy PCIe/PCI bridge that
			 * doesn't report the express capability.  If
			 * the bridge above it is express but isn't a
			 * PCI bridge, then we know pcib is actually a
			 * PCIe/PCI bridge.
			 */
			if (!bridge_is_pcie && pci_find_cap(pcibp,
			    PCIY_EXPRESS, &cap_offset) == 0) {
				pcie_flags = pci_read_config(pcibp,
				    cap_offset + PCIER_FLAGS, 2);
				if ((pcie_flags & PCIEM_FLAGS_TYPE) !=
				    PCIEM_TYPE_PCI_BRIDGE)
					bridge_is_pcie = true;
			}

			if (bridge_is_pcie) {
				/*
				 * The current device is not PCIe, but
				 * the bridge above it is.  This is a
				 * PCIe->PCI bridge.  Assume that the
				 * requester ID will be the secondary
				 * bus number with slot and function
				 * set to zero.
				 *
				 * XXX: Doesn't handle the case where
				 * the bridge is PCIe->PCI-X, and the
				 * bridge will only take ownership of
				 * requests in some cases.  We should
				 * provide context entries with the
				 * same page tables for taken and
				 * non-taken transactions.
				 */
				*rid = PCI_RID(pci_get_bus(l), 0, 0);
				/* Continue the walk from the bridge above pcib. */
				l = pcibp;
			} else {
				/*
				 * Neither the device nor the bridge
				 * above it are PCIe.  This is a
				 * conventional PCI->PCI bridge, which
				 * will use the bridge's BSF as the
				 * requester ID.
				 */
				*rid = pci_get_rid(pcib);
				l = pcib;
			}
		}
	}
	return (requester);
}
229 
230 struct iommu_ctx *
231 iommu_instantiate_ctx(struct iommu_unit *unit, device_t dev, bool rmrr)
232 {
233 	device_t requester;
234 	struct iommu_ctx *ctx;
235 	bool disabled;
236 	uint16_t rid;
237 
238 	requester = iommu_get_requester(dev, &rid);
239 
240 	/*
241 	 * If the user requested the IOMMU disabled for the device, we
242 	 * cannot disable the IOMMU unit, due to possibility of other
243 	 * devices on the same IOMMU unit still requiring translation.
244 	 * Instead provide the identity mapping for the device
245 	 * context.
246 	 */
247 	disabled = iommu_bus_dma_is_dev_disabled(pci_get_domain(requester),
248 	    pci_get_bus(requester), pci_get_slot(requester),
249 	    pci_get_function(requester));
250 	ctx = iommu_get_ctx(unit, requester, rid, disabled, rmrr);
251 	if (ctx == NULL)
252 		return (NULL);
253 	if (disabled) {
254 		/*
255 		 * Keep the first reference on context, release the
256 		 * later refs.
257 		 */
258 		IOMMU_LOCK(unit);
259 		if ((ctx->flags & IOMMU_CTX_DISABLED) == 0) {
260 			ctx->flags |= IOMMU_CTX_DISABLED;
261 			IOMMU_UNLOCK(unit);
262 		} else {
263 			iommu_free_ctx_locked(unit, ctx);
264 		}
265 		ctx = NULL;
266 	}
267 	return (ctx);
268 }
269 
270 struct iommu_ctx *
271 iommu_get_dev_ctx(device_t dev)
272 {
273 	struct iommu_unit *unit;
274 
275 	unit = iommu_find(dev, bootverbose);
276 	/* Not in scope of any IOMMU ? */
277 	if (unit == NULL)
278 		return (NULL);
279 	if (!unit->dma_enabled)
280 		return (NULL);
281 
282 #if defined(__amd64__) || defined(__i386__)
283 	dmar_quirks_pre_use(unit);
284 	dmar_instantiate_rmrr_ctxs(unit);
285 #endif
286 
287 	return (iommu_instantiate_ctx(unit, dev, false));
288 }
289 
290 bus_dma_tag_t
291 iommu_get_dma_tag(device_t dev, device_t child)
292 {
293 	struct iommu_ctx *ctx;
294 	bus_dma_tag_t res;
295 
296 	ctx = iommu_get_dev_ctx(child);
297 	if (ctx == NULL)
298 		return (NULL);
299 
300 	res = (bus_dma_tag_t)ctx->tag;
301 	return (res);
302 }
303 
304 bool
305 bus_dma_iommu_set_buswide(device_t dev)
306 {
307 	struct iommu_unit *unit;
308 	device_t parent;
309 	u_int busno, slot, func;
310 
311 	parent = device_get_parent(dev);
312 	if (device_get_devclass(parent) != devclass_find("pci"))
313 		return (false);
314 	unit = iommu_find(dev, bootverbose);
315 	if (unit == NULL)
316 		return (false);
317 	busno = pci_get_bus(dev);
318 	slot = pci_get_slot(dev);
319 	func = pci_get_function(dev);
320 	if (slot != 0 || func != 0) {
321 		if (bootverbose) {
322 			device_printf(dev,
323 			    "iommu%d pci%d:%d:%d requested buswide busdma\n",
324 			    unit->unit, busno, slot, func);
325 		}
326 		return (false);
327 	}
328 	iommu_set_buswide_ctx(unit, busno);
329 	return (true);
330 }
331 
332 void
333 iommu_set_buswide_ctx(struct iommu_unit *unit, u_int busno)
334 {
335 
336 	MPASS(busno <= PCI_BUSMAX);
337 	IOMMU_LOCK(unit);
338 	unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] |=
339 	    1 << (busno % (NBBY * sizeof(uint32_t)));
340 	IOMMU_UNLOCK(unit);
341 }
342 
343 bool
344 iommu_is_buswide_ctx(struct iommu_unit *unit, u_int busno)
345 {
346 
347 	MPASS(busno <= PCI_BUSMAX);
348 	return ((unit->buswide_ctxs[busno / NBBY / sizeof(uint32_t)] &
349 	    (1U << (busno % (NBBY * sizeof(uint32_t))))) != 0);
350 }
351 
/*
 * Malloc type used in this file for DMA map structures and for the
 * per-tag segment arrays.
 */
static MALLOC_DEFINE(M_IOMMU_DMAMAP, "iommu_dmamap", "IOMMU DMA Map");

/*
 * Forward declaration: queues a map on the unit's delayed-load list so
 * the load is retried from the unit's taskqueue.
 */
static void iommu_bus_schedule_dmamap(struct iommu_unit *unit,
    struct bus_dmamap_iommu *map);
356 
357 static int
358 iommu_bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
359     bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
360     bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize,
361     int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
362     void *lockfuncarg, bus_dma_tag_t *dmat)
363 {
364 	struct bus_dma_tag_iommu *newtag, *oldtag;
365 	int error;
366 
367 	*dmat = NULL;
368 	error = common_bus_dma_tag_create(parent != NULL ?
369 	    &((struct bus_dma_tag_iommu *)parent)->common : NULL, alignment,
370 	    boundary, lowaddr, highaddr, filter, filterarg, maxsize,
371 	    nsegments, maxsegsz, flags, lockfunc, lockfuncarg,
372 	    sizeof(struct bus_dma_tag_iommu), (void **)&newtag);
373 	if (error != 0)
374 		goto out;
375 
376 	oldtag = (struct bus_dma_tag_iommu *)parent;
377 	newtag->common.impl = &bus_dma_iommu_impl;
378 	newtag->ctx = oldtag->ctx;
379 	newtag->owner = oldtag->owner;
380 
381 	*dmat = (bus_dma_tag_t)newtag;
382 out:
383 	CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
384 	    __func__, newtag, (newtag != NULL ? newtag->common.flags : 0),
385 	    error);
386 	return (error);
387 }
388 
389 static int
390 iommu_bus_dma_tag_set_domain(bus_dma_tag_t dmat)
391 {
392 
393 	return (0);
394 }
395 
/*
 * busdma tag_destroy method: drop one reference on dmat1 and free it
 * when that was the last one, then repeat for each ancestor whose
 * child was just freed.
 *
 * Returns EBUSY, leaving everything untouched, if maps are still
 * attached to the tag (map_count != 0); returns 0 otherwise, including
 * for a NULL tag.
 */
static int
iommu_bus_dma_tag_destroy(bus_dma_tag_t dmat1)
{
	struct bus_dma_tag_iommu *dmat, *parent;
	struct bus_dma_tag_iommu *dmat_copy __unused;
	int error;

	error = 0;
	/* dmat_copy keeps the original pointer for the trace at "out". */
	dmat_copy = dmat = (struct bus_dma_tag_iommu *)dmat1;

	if (dmat != NULL) {
		if (dmat->map_count != 0) {
			error = EBUSY;
			goto out;
		}
		while (dmat != NULL) {
			/* Read the parent before dmat may be freed. */
			parent = (struct bus_dma_tag_iommu *)dmat->common.parent;
			/* The fetchadd returns the old count; 1 means last ref. */
			if (atomic_fetchadd_int(&dmat->common.ref_count, -1) ==
			    1) {
				/* The context's own tag also releases the context. */
				if (dmat == dmat->ctx->tag)
					iommu_free_ctx(dmat->ctx);
				free(dmat->segments, M_IOMMU_DMAMAP);
				free(dmat, M_DEVBUF);
				/* The freed tag held a reference on its parent. */
				dmat = parent;
			} else
				dmat = NULL;
		}
	}
out:
	CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error);
	return (error);
}
428 
429 static bool
430 iommu_bus_dma_id_mapped(bus_dma_tag_t dmat, vm_paddr_t buf, bus_size_t buflen)
431 {
432 
433 	return (false);
434 }
435 
436 static int
437 iommu_bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
438 {
439 	struct bus_dma_tag_iommu *tag;
440 	struct bus_dmamap_iommu *map;
441 
442 	tag = (struct bus_dma_tag_iommu *)dmat;
443 	map = malloc_domainset(sizeof(*map), M_IOMMU_DMAMAP,
444 	    DOMAINSET_PREF(tag->common.domain), M_NOWAIT | M_ZERO);
445 	if (map == NULL) {
446 		*mapp = NULL;
447 		return (ENOMEM);
448 	}
449 	if (tag->segments == NULL) {
450 		tag->segments = malloc_domainset(sizeof(bus_dma_segment_t) *
451 		    tag->common.nsegments, M_IOMMU_DMAMAP,
452 		    DOMAINSET_PREF(tag->common.domain), M_NOWAIT);
453 		if (tag->segments == NULL) {
454 			free(map, M_IOMMU_DMAMAP);
455 			*mapp = NULL;
456 			return (ENOMEM);
457 		}
458 	}
459 	IOMMU_DMAMAP_INIT(map);
460 	TAILQ_INIT(&map->map_entries);
461 	map->tag = tag;
462 	map->locked = true;
463 	map->cansleep = false;
464 	tag->map_count++;
465 	*mapp = (bus_dmamap_t)map;
466 
467 	return (0);
468 }
469 
470 static int
471 iommu_bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map1)
472 {
473 	struct bus_dma_tag_iommu *tag;
474 	struct bus_dmamap_iommu *map;
475 
476 	tag = (struct bus_dma_tag_iommu *)dmat;
477 	map = (struct bus_dmamap_iommu *)map1;
478 	if (map != NULL) {
479 		IOMMU_DMAMAP_LOCK(map);
480 		if (!TAILQ_EMPTY(&map->map_entries)) {
481 			IOMMU_DMAMAP_UNLOCK(map);
482 			return (EBUSY);
483 		}
484 		IOMMU_DMAMAP_DESTROY(map);
485 		free(map, M_IOMMU_DMAMAP);
486 	}
487 	tag->map_count--;
488 	return (0);
489 }
490 
491 
492 static int
493 iommu_bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
494     bus_dmamap_t *mapp)
495 {
496 	struct bus_dma_tag_iommu *tag;
497 	struct bus_dmamap_iommu *map;
498 	int error, mflags;
499 	vm_memattr_t attr;
500 
501 	error = iommu_bus_dmamap_create(dmat, flags, mapp);
502 	if (error != 0)
503 		return (error);
504 
505 	mflags = (flags & BUS_DMA_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK;
506 	mflags |= (flags & BUS_DMA_ZERO) != 0 ? M_ZERO : 0;
507 	attr = (flags & BUS_DMA_NOCACHE) != 0 ? VM_MEMATTR_UNCACHEABLE :
508 	    VM_MEMATTR_DEFAULT;
509 
510 	tag = (struct bus_dma_tag_iommu *)dmat;
511 	map = (struct bus_dmamap_iommu *)*mapp;
512 
513 	if (tag->common.maxsize < PAGE_SIZE &&
514 	    tag->common.alignment <= tag->common.maxsize &&
515 	    attr == VM_MEMATTR_DEFAULT) {
516 		*vaddr = malloc_domainset(tag->common.maxsize, M_DEVBUF,
517 		    DOMAINSET_PREF(tag->common.domain), mflags);
518 		map->flags |= BUS_DMAMAP_IOMMU_MALLOC;
519 	} else {
520 		*vaddr = kmem_alloc_attr_domainset(
521 		    DOMAINSET_PREF(tag->common.domain), tag->common.maxsize,
522 		    mflags, 0ul, BUS_SPACE_MAXADDR, attr);
523 		map->flags |= BUS_DMAMAP_IOMMU_KMEM_ALLOC;
524 	}
525 	if (*vaddr == NULL) {
526 		iommu_bus_dmamap_destroy(dmat, *mapp);
527 		*mapp = NULL;
528 		return (ENOMEM);
529 	}
530 	return (0);
531 }
532 
533 static void
534 iommu_bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map1)
535 {
536 	struct bus_dma_tag_iommu *tag;
537 	struct bus_dmamap_iommu *map;
538 
539 	tag = (struct bus_dma_tag_iommu *)dmat;
540 	map = (struct bus_dmamap_iommu *)map1;
541 
542 	if ((map->flags & BUS_DMAMAP_IOMMU_MALLOC) != 0) {
543 		free(vaddr, M_DEVBUF);
544 		map->flags &= ~BUS_DMAMAP_IOMMU_MALLOC;
545 	} else {
546 		KASSERT((map->flags & BUS_DMAMAP_IOMMU_KMEM_ALLOC) != 0,
547 		    ("iommu_bus_dmamem_free for non alloced map %p", map));
548 		kmem_free(vaddr, tag->common.maxsize);
549 		map->flags &= ~BUS_DMAMAP_IOMMU_KMEM_ALLOC;
550 	}
551 
552 	iommu_bus_dmamap_destroy(dmat, map1);
553 }
554 
/*
 * Map the buffer described by the page array ma, starting offset bytes
 * into the first page and spanning buflen bytes, into the tag's IOMMU
 * domain, one busdma segment per iteration.
 *
 * Segments are written to segs (or to tag->segments when segs is NULL)
 * starting after index *segp.  Every map entry created is appended to
 * entries, also when the function fails part-way, so that the caller
 * can dispose of them.
 *
 * Returns 0 and updates *segp to the last segment index used; returns
 * EFBIG when the tag's segment limit is reached, or the error from
 * iommu_gas_map(), leaving *segp unchanged.
 */
static int
iommu_bus_dmamap_load_something1(struct bus_dma_tag_iommu *tag,
    struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
    int flags, bus_dma_segment_t *segs, int *segp,
    struct iommu_map_entries_tailq *entries)
{
	struct iommu_ctx *ctx;
	struct iommu_domain *domain;
	struct iommu_map_entry *entry;
	bus_size_t buflen1;
	int error, e_flags, idx, gas_flags, seg;

	KASSERT(offset < IOMMU_PAGE_SIZE, ("offset %d", offset));
	if (segs == NULL)
		segs = tag->segments;
	ctx = tag->ctx;
	domain = ctx->domain;
	/* Always readable; writable unless BUS_DMA_NOWRITE was given. */
	e_flags = IOMMU_MAP_ENTRY_READ |
	    ((flags & BUS_DMA_NOWRITE) == 0 ? IOMMU_MAP_ENTRY_WRITE : 0);
	seg = *segp;
	error = 0;
	/* idx is the index into ma of the page the next segment starts in. */
	idx = 0;
	while (buflen > 0) {
		seg++;
		if (seg >= tag->common.nsegments) {
			error = EFBIG;
			break;
		}
		/* Clamp this segment to the tag's maximum segment size. */
		buflen1 = buflen > tag->common.maxsegsz ?
		    tag->common.maxsegsz : buflen;

		/*
		 * (Too) optimistically allow split if there is more
		 * than one segment left.
		 */
		gas_flags = map->cansleep ? IOMMU_MF_CANWAIT : 0;
		if (seg + 1 < tag->common.nsegments)
			gas_flags |= IOMMU_MF_CANSPLIT;

		error = iommu_gas_map(domain, &tag->common, buflen1,
		    offset, e_flags, gas_flags, ma + idx, &entry);
		if (error != 0)
			break;
		/* Update buflen1 in case buffer split. */
		if (buflen1 > entry->end - entry->start - offset)
			buflen1 = entry->end - entry->start - offset;

		KASSERT(vm_addr_align_ok(entry->start + offset,
		    tag->common.alignment),
		    ("alignment failed: ctx %p start 0x%jx offset %x "
		    "align 0x%jx", ctx, (uintmax_t)entry->start, offset,
		    (uintmax_t)tag->common.alignment));
		KASSERT(entry->end <= tag->common.lowaddr ||
		    entry->start >= tag->common.highaddr,
		    ("entry placement failed: ctx %p start 0x%jx end 0x%jx "
		    "lowaddr 0x%jx highaddr 0x%jx", ctx,
		    (uintmax_t)entry->start, (uintmax_t)entry->end,
		    (uintmax_t)tag->common.lowaddr,
		    (uintmax_t)tag->common.highaddr));
		KASSERT(vm_addr_bound_ok(entry->start + offset, buflen1,
		    tag->common.boundary),
		    ("boundary failed: ctx %p start 0x%jx end 0x%jx "
		    "boundary 0x%jx", ctx, (uintmax_t)entry->start,
		    (uintmax_t)entry->end, (uintmax_t)tag->common.boundary));
		KASSERT(buflen1 <= tag->common.maxsegsz,
		    ("segment too large: ctx %p start 0x%jx end 0x%jx "
		    "buflen1 0x%jx maxsegsz 0x%jx", ctx,
		    (uintmax_t)entry->start, (uintmax_t)entry->end,
		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));

		KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0,
		    ("entry %p missing IOMMU_MAP_ENTRY_MAP", entry));
		TAILQ_INSERT_TAIL(entries, entry, dmamap_link);

		segs[seg].ds_addr = entry->start + offset;
		segs[seg].ds_len = buflen1;

		/* Advance past the pages consumed; keep the in-page offset. */
		idx += OFF_TO_IDX(offset + buflen1);
		offset += buflen1;
		offset &= IOMMU_PAGE_MASK;
		buflen -= buflen1;
	}
	/* *segp is only published when the whole buffer was mapped. */
	if (error == 0)
		*segp = seg;
	return (error);
}
641 
642 static int
643 iommu_bus_dmamap_load_something(struct bus_dma_tag_iommu *tag,
644     struct bus_dmamap_iommu *map, vm_page_t *ma, int offset, bus_size_t buflen,
645     int flags, bus_dma_segment_t *segs, int *segp)
646 {
647 	struct iommu_ctx *ctx;
648 	struct iommu_domain *domain;
649 	struct iommu_map_entries_tailq entries;
650 	int error;
651 
652 	ctx = tag->ctx;
653 	domain = ctx->domain;
654 	atomic_add_long(&ctx->loads, 1);
655 
656 	TAILQ_INIT(&entries);
657 	error = iommu_bus_dmamap_load_something1(tag, map, ma, offset,
658 	    buflen, flags, segs, segp, &entries);
659 	if (error == 0) {
660 		IOMMU_DMAMAP_LOCK(map);
661 		TAILQ_CONCAT(&map->map_entries, &entries, dmamap_link);
662 		IOMMU_DMAMAP_UNLOCK(map);
663 	} else if (!TAILQ_EMPTY(&entries)) {
664 		/*
665 		 * The busdma interface does not allow us to report
666 		 * partial buffer load, so unfortunately we have to
667 		 * revert all work done.
668 		 */
669 		IOMMU_DOMAIN_LOCK(domain);
670 		TAILQ_CONCAT(&domain->unload_entries, &entries, dmamap_link);
671 		IOMMU_DOMAIN_UNLOCK(domain);
672 		taskqueue_enqueue(domain->iommu->delayed_taskqueue,
673 		    &domain->unload_task);
674 	}
675 
676 	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
677 	    !map->cansleep)
678 		error = EINPROGRESS;
679 	if (error == EINPROGRESS)
680 		iommu_bus_schedule_dmamap(domain->iommu, map);
681 	return (error);
682 }
683 
684 static int
685 iommu_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map1,
686     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
687     bus_dma_segment_t *segs, int *segp)
688 {
689 	struct bus_dma_tag_iommu *tag;
690 	struct bus_dmamap_iommu *map;
691 
692 	tag = (struct bus_dma_tag_iommu *)dmat;
693 	map = (struct bus_dmamap_iommu *)map1;
694 	return (iommu_bus_dmamap_load_something(tag, map, ma, ma_offs, tlen,
695 	    flags, segs, segp));
696 }
697 
698 static int
699 iommu_bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map1,
700     vm_paddr_t buf, bus_size_t buflen, int flags, bus_dma_segment_t *segs,
701     int *segp)
702 {
703 	struct bus_dma_tag_iommu *tag;
704 	struct bus_dmamap_iommu *map;
705 	vm_page_t *ma, fma;
706 	vm_paddr_t pstart, pend, paddr;
707 	int error, i, ma_cnt, mflags, offset;
708 
709 	tag = (struct bus_dma_tag_iommu *)dmat;
710 	map = (struct bus_dmamap_iommu *)map1;
711 	pstart = trunc_page(buf);
712 	pend = round_page(buf + buflen);
713 	offset = buf & PAGE_MASK;
714 	ma_cnt = OFF_TO_IDX(pend - pstart);
715 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
716 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
717 	if (ma == NULL)
718 		return (ENOMEM);
719 	fma = NULL;
720 	for (i = 0; i < ma_cnt; i++) {
721 		paddr = pstart + ptoa(i);
722 		ma[i] = PHYS_TO_VM_PAGE(paddr);
723 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
724 			/*
725 			 * If PHYS_TO_VM_PAGE() returned NULL or the
726 			 * vm_page was not initialized we'll use a
727 			 * fake page.
728 			 */
729 			if (fma == NULL) {
730 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
731 				    M_DEVBUF, M_ZERO | mflags);
732 				if (fma == NULL) {
733 					free(ma, M_DEVBUF);
734 					return (ENOMEM);
735 				}
736 			}
737 			vm_page_initfake(&fma[i], pstart + ptoa(i),
738 			    VM_MEMATTR_DEFAULT);
739 			ma[i] = &fma[i];
740 		}
741 	}
742 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
743 	    flags, segs, segp);
744 	free(fma, M_DEVBUF);
745 	free(ma, M_DEVBUF);
746 	return (error);
747 }
748 
749 static int
750 iommu_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map1, void *buf,
751     bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs,
752     int *segp)
753 {
754 	struct bus_dma_tag_iommu *tag;
755 	struct bus_dmamap_iommu *map;
756 	vm_page_t *ma, fma;
757 	vm_paddr_t pstart, pend, paddr;
758 	int error, i, ma_cnt, mflags, offset;
759 
760 	tag = (struct bus_dma_tag_iommu *)dmat;
761 	map = (struct bus_dmamap_iommu *)map1;
762 	pstart = trunc_page((vm_offset_t)buf);
763 	pend = round_page((vm_offset_t)buf + buflen);
764 	offset = (vm_offset_t)buf & PAGE_MASK;
765 	ma_cnt = OFF_TO_IDX(pend - pstart);
766 	mflags = map->cansleep ? M_WAITOK : M_NOWAIT;
767 	ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, mflags);
768 	if (ma == NULL)
769 		return (ENOMEM);
770 	fma = NULL;
771 	for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) {
772 		if (pmap == kernel_pmap)
773 			paddr = pmap_kextract(pstart);
774 		else
775 			paddr = pmap_extract(pmap, pstart);
776 		ma[i] = PHYS_TO_VM_PAGE(paddr);
777 		if (ma[i] == NULL || VM_PAGE_TO_PHYS(ma[i]) != paddr) {
778 			/*
779 			 * If PHYS_TO_VM_PAGE() returned NULL or the
780 			 * vm_page was not initialized we'll use a
781 			 * fake page.
782 			 */
783 			if (fma == NULL) {
784 				fma = malloc(sizeof(struct vm_page) * ma_cnt,
785 				    M_DEVBUF, M_ZERO | mflags);
786 				if (fma == NULL) {
787 					free(ma, M_DEVBUF);
788 					return (ENOMEM);
789 				}
790 			}
791 			vm_page_initfake(&fma[i], paddr, VM_MEMATTR_DEFAULT);
792 			ma[i] = &fma[i];
793 		}
794 	}
795 	error = iommu_bus_dmamap_load_something(tag, map, ma, offset, buflen,
796 	    flags, segs, segp);
797 	free(ma, M_DEVBUF);
798 	free(fma, M_DEVBUF);
799 	return (error);
800 }
801 
802 static void
803 iommu_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map1,
804     struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
805 {
806 	struct bus_dmamap_iommu *map;
807 
808 	if (map1 == NULL)
809 		return;
810 	map = (struct bus_dmamap_iommu *)map1;
811 	map->mem = *mem;
812 	map->tag = (struct bus_dma_tag_iommu *)dmat;
813 	map->callback = callback;
814 	map->callback_arg = callback_arg;
815 }
816 
817 static bus_dma_segment_t *
818 iommu_bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map1,
819     bus_dma_segment_t *segs, int nsegs, int error)
820 {
821 	struct bus_dma_tag_iommu *tag;
822 	struct bus_dmamap_iommu *map;
823 
824 	tag = (struct bus_dma_tag_iommu *)dmat;
825 	map = (struct bus_dmamap_iommu *)map1;
826 
827 	if (!map->locked) {
828 		KASSERT(map->cansleep,
829 		    ("map not locked and not sleepable context %p", map));
830 
831 		/*
832 		 * We are called from the delayed context.  Relock the
833 		 * driver.
834 		 */
835 		(tag->common.lockfunc)(tag->common.lockfuncarg, BUS_DMA_LOCK);
836 		map->locked = true;
837 	}
838 
839 	if (segs == NULL)
840 		segs = tag->segments;
841 	return (segs);
842 }
843 
844 /*
845  * The limitations of busdma KPI forces the iommu to perform the actual
846  * unload, consisting of the unmapping of the map entries page tables,
847  * from the delayed context on i386, since page table page mapping
 * might require a sleep to be successful.  The unfortunate
849  * consequence is that the DMA requests can be served some time after
850  * the bus_dmamap_unload() call returned.
851  *
852  * On amd64, we assume that sf allocation cannot fail.
853  */
854 static void
855 iommu_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map1)
856 {
857 	struct bus_dma_tag_iommu *tag;
858 	struct bus_dmamap_iommu *map;
859 	struct iommu_ctx *ctx;
860 	struct iommu_domain *domain;
861 	struct iommu_map_entries_tailq entries;
862 
863 	tag = (struct bus_dma_tag_iommu *)dmat;
864 	map = (struct bus_dmamap_iommu *)map1;
865 	ctx = tag->ctx;
866 	domain = ctx->domain;
867 	atomic_add_long(&ctx->unloads, 1);
868 
869 	TAILQ_INIT(&entries);
870 	IOMMU_DMAMAP_LOCK(map);
871 	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
872 	IOMMU_DMAMAP_UNLOCK(map);
873 #if defined(IOMMU_DOMAIN_UNLOAD_SLEEP)
874 	IOMMU_DOMAIN_LOCK(domain);
875 	TAILQ_CONCAT(&domain->unload_entries, &entries, dmamap_link);
876 	IOMMU_DOMAIN_UNLOCK(domain);
877 	taskqueue_enqueue(domain->iommu->delayed_taskqueue,
878 	    &domain->unload_task);
879 #else
880 	THREAD_NO_SLEEPING();
881 	iommu_domain_unload(domain, &entries, false);
882 	THREAD_SLEEPING_OK();
883 	KASSERT(TAILQ_EMPTY(&entries), ("lazy iommu_ctx_unload %p", ctx));
884 #endif
885 }
886 
887 static void
888 iommu_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map1,
889     bus_dmasync_op_t op)
890 {
891 	struct bus_dmamap_iommu *map __unused;
892 
893 	map = (struct bus_dmamap_iommu *)map1;
894 	kmsan_bus_dmamap_sync(&map->kmsan_mem, op);
895 }
896 
#ifdef KMSAN
/*
 * busdma load_kmsan method: keep a copy of the memory descriptor in the
 * map for use at sync time.  A NULL map is ignored.
 */
static void
iommu_bus_dmamap_load_kmsan(bus_dmamap_t map1, struct memdesc *mem)
{
	struct bus_dmamap_iommu *m = (struct bus_dmamap_iommu *)map1;

	if (m != NULL)
		memcpy(&m->kmsan_mem, mem, sizeof(struct memdesc));
}
#endif
909 
/*
 * Method table that plugs the functions above into busdma.  Tags
 * created by iommu_bus_dma_tag_create() point at this table through
 * common.impl, and bus_dma_iommu_load_ident() compares against it to
 * recognize IOMMU-backed tags.
 */
struct bus_dma_impl bus_dma_iommu_impl = {
	.tag_create = iommu_bus_dma_tag_create,
	.tag_destroy = iommu_bus_dma_tag_destroy,
	.tag_set_domain = iommu_bus_dma_tag_set_domain,
	.id_mapped = iommu_bus_dma_id_mapped,
	.map_create = iommu_bus_dmamap_create,
	.map_destroy = iommu_bus_dmamap_destroy,
	.mem_alloc = iommu_bus_dmamem_alloc,
	.mem_free = iommu_bus_dmamem_free,
	.load_phys = iommu_bus_dmamap_load_phys,
	.load_buffer = iommu_bus_dmamap_load_buffer,
	.load_ma = iommu_bus_dmamap_load_ma,
	.map_waitok = iommu_bus_dmamap_waitok,
	.map_complete = iommu_bus_dmamap_complete,
	.map_unload = iommu_bus_dmamap_unload,
	.map_sync = iommu_bus_dmamap_sync,
#ifdef KMSAN
	/* Only present in KMSAN kernels. */
	.load_kmsan = iommu_bus_dmamap_load_kmsan,
#endif
};
930 
/*
 * Taskqueue handler for deferred loads.  arg is the iommu unit; pending
 * is not used.
 *
 * Each map queued on unit->delayed_maps is removed and its load is
 * repeated through bus_dmamap_load_mem() with BUS_DMA_WAITOK, using the
 * memory descriptor and callback saved in the map.  The unit lock is
 * dropped around each load.
 */
static void
iommu_bus_task_dmamap(void *arg, int pending)
{
	struct bus_dma_tag_iommu *tag;
	struct bus_dmamap_iommu *map;
	struct iommu_unit *unit;

	unit = arg;
	IOMMU_LOCK(unit);
	while ((map = TAILQ_FIRST(&unit->delayed_maps)) != NULL) {
		TAILQ_REMOVE(&unit->delayed_maps, map, delay_link);
		IOMMU_UNLOCK(unit);
		tag = map->tag;
		/* Sleeping is allowed for the duration of this load. */
		map->cansleep = true;
		map->locked = false;
		bus_dmamap_load_mem((bus_dma_tag_t)tag, (bus_dmamap_t)map,
		    &map->mem, map->callback, map->callback_arg,
		    BUS_DMA_WAITOK);
		map->cansleep = false;
		/*
		 * locked set means iommu_bus_dmamap_complete() took the
		 * driver's lock during the load; release it here.
		 * Otherwise just restore the flag to its resting value.
		 */
		if (map->locked) {
			(tag->common.lockfunc)(tag->common.lockfuncarg,
			    BUS_DMA_UNLOCK);
		} else
			map->locked = true;
		map->cansleep = false;
		IOMMU_LOCK(unit);
	}
	IOMMU_UNLOCK(unit);
}
960 
/*
 * Queue map for a deferred load on unit: mark it unlocked, append it to
 * unit->delayed_maps under the unit lock, and kick the unit's delayed
 * taskqueue, whose handler is iommu_bus_task_dmamap().
 */
static void
iommu_bus_schedule_dmamap(struct iommu_unit *unit, struct bus_dmamap_iommu *map)
{

	/* Cleared before the map becomes visible on the queue. */
	map->locked = false;
	IOMMU_LOCK(unit);
	TAILQ_INSERT_TAIL(&unit->delayed_maps, map, delay_link);
	IOMMU_UNLOCK(unit);
	taskqueue_enqueue(unit->delayed_taskqueue, &unit->dmamap_load_task);
}
971 
972 int
973 iommu_init_busdma(struct iommu_unit *unit)
974 {
975 	int error;
976 
977 	unit->dma_enabled = 1;
978 	error = TUNABLE_INT_FETCH("hw.iommu.dma", &unit->dma_enabled);
979 	if (error == 0) /* compatibility */
980 		TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
981 	TAILQ_INIT(&unit->delayed_maps);
982 	TASK_INIT(&unit->dmamap_load_task, 0, iommu_bus_task_dmamap, unit);
983 	unit->delayed_taskqueue = taskqueue_create("iommu", M_WAITOK,
984 	    taskqueue_thread_enqueue, &unit->delayed_taskqueue);
985 	taskqueue_start_threads(&unit->delayed_taskqueue, 1, PI_DISK,
986 	    "iommu%d busdma taskq", unit->unit);
987 	return (0);
988 }
989 
990 void
991 iommu_fini_busdma(struct iommu_unit *unit)
992 {
993 
994 	if (unit->delayed_taskqueue == NULL)
995 		return;
996 
997 	taskqueue_drain(unit->delayed_taskqueue, &unit->dmamap_load_task);
998 	taskqueue_free(unit->delayed_taskqueue);
999 	unit->delayed_taskqueue = NULL;
1000 }
1001 
1002 int
1003 bus_dma_iommu_load_ident(bus_dma_tag_t dmat, bus_dmamap_t map1,
1004     vm_paddr_t start, vm_size_t length, int flags)
1005 {
1006 	struct bus_dma_tag_common *tc;
1007 	struct bus_dma_tag_iommu *tag;
1008 	struct bus_dmamap_iommu *map;
1009 	struct iommu_ctx *ctx;
1010 	struct iommu_domain *domain;
1011 	struct iommu_map_entry *entry;
1012 	vm_page_t *ma;
1013 	vm_size_t i;
1014 	int error;
1015 	bool waitok;
1016 
1017 	MPASS((start & PAGE_MASK) == 0);
1018 	MPASS((length & PAGE_MASK) == 0);
1019 	MPASS(length > 0);
1020 	MPASS(start + length >= start);
1021 	MPASS((flags & ~(BUS_DMA_NOWAIT | BUS_DMA_NOWRITE)) == 0);
1022 
1023 	tc = (struct bus_dma_tag_common *)dmat;
1024 	if (tc->impl != &bus_dma_iommu_impl)
1025 		return (0);
1026 
1027 	tag = (struct bus_dma_tag_iommu *)dmat;
1028 	ctx = tag->ctx;
1029 	domain = ctx->domain;
1030 	map = (struct bus_dmamap_iommu *)map1;
1031 	waitok = (flags & BUS_DMA_NOWAIT) != 0;
1032 
1033 	entry = iommu_gas_alloc_entry(domain, waitok ? 0 : IOMMU_PGF_WAITOK);
1034 	if (entry == NULL)
1035 		return (ENOMEM);
1036 	entry->start = start;
1037 	entry->end = start + length;
1038 	ma = malloc(sizeof(vm_page_t) * atop(length), M_TEMP, waitok ?
1039 	    M_WAITOK : M_NOWAIT);
1040 	if (ma == NULL) {
1041 		iommu_gas_free_entry(entry);
1042 		return (ENOMEM);
1043 	}
1044 	for (i = 0; i < atop(length); i++) {
1045 		ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
1046 		    VM_MEMATTR_DEFAULT);
1047 	}
1048 	error = iommu_gas_map_region(domain, entry, IOMMU_MAP_ENTRY_READ |
1049 	    ((flags & BUS_DMA_NOWRITE) ? 0 : IOMMU_MAP_ENTRY_WRITE) |
1050 	    IOMMU_MAP_ENTRY_MAP, waitok ? IOMMU_MF_CANWAIT : 0, ma);
1051 	if (error == 0) {
1052 		IOMMU_DMAMAP_LOCK(map);
1053 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
1054 		IOMMU_DMAMAP_UNLOCK(map);
1055 	} else {
1056 		iommu_gas_free_entry(entry);
1057 	}
1058 	for (i = 0; i < atop(length); i++)
1059 		vm_page_putfake(ma[i]);
1060 	free(ma, M_TEMP);
1061 	return (error);
1062 }
1063 
1064 static void
1065 iommu_domain_unload_task(void *arg, int pending)
1066 {
1067 	struct iommu_domain *domain;
1068 	struct iommu_map_entries_tailq entries;
1069 
1070 	domain = arg;
1071 	TAILQ_INIT(&entries);
1072 
1073 	for (;;) {
1074 		IOMMU_DOMAIN_LOCK(domain);
1075 		TAILQ_SWAP(&domain->unload_entries, &entries,
1076 		    iommu_map_entry, dmamap_link);
1077 		IOMMU_DOMAIN_UNLOCK(domain);
1078 		if (TAILQ_EMPTY(&entries))
1079 			break;
1080 		iommu_domain_unload(domain, &entries, true);
1081 	}
1082 }
1083 
1084 void
1085 iommu_domain_init(struct iommu_unit *unit, struct iommu_domain *domain,
1086     const struct iommu_domain_map_ops *ops)
1087 {
1088 
1089 	domain->ops = ops;
1090 	domain->iommu = unit;
1091 
1092 	TASK_INIT(&domain->unload_task, 0, iommu_domain_unload_task, domain);
1093 	RB_INIT(&domain->rb_root);
1094 	TAILQ_INIT(&domain->unload_entries);
1095 	mtx_init(&domain->lock, "iodom", NULL, MTX_DEF);
1096 }
1097 
1098 void
1099 iommu_domain_fini(struct iommu_domain *domain)
1100 {
1101 
1102 	mtx_destroy(&domain->lock);
1103 }
1104