1 // SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
2 /*
3  * NPU - NVlink and OpenCAPI
4  *
5  * Copyright 2013-2019 IBM Corp.
6  */
7 
8 #include <skiboot.h>
9 #include <io.h>
10 #include <timebase.h>
11 #include <pci-cfg.h>
12 #include <pci.h>
13 #include <pci-slot.h>
14 #include <pci-virt.h>
15 #include <opal.h>
16 #include <opal-api.h>
17 #include <cpu.h>
18 #include <device.h>
19 #include <ccan/str/str.h>
20 #include <ccan/array_size/array_size.h>
21 #include <affinity.h>
22 #include <npu2.h>
23 #include <lock.h>
24 #include <xscom.h>
25 #include <bitutils.h>
26 #include <chip.h>
27 #include <phys-map.h>
28 #include <nvram.h>
29 #include <xscom-p9-regs.h>
30 #include <phb4.h>
31 #include <cache-p9.h>
32 
33 #define VENDOR_CAP_START    0x80
34 #define VENDOR_CAP_END      0x90
35 #define VENDOR_CAP_LEN      0x10
36 #define VENDOR_CAP_VERSION  0x01
37 #define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
38 
39 /*
40  * NPU2 BAR layout definition. We have 3 stacks and each of them
41  * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2
42  * PHY BARs and each of them is shared by 3 bricks. Every brick has
43  * one NTL BAR and two bricks share one GENID BAR. There is also a
44  * global MMIO BAR. We only expose DL and GENID BARs to the OS and all
45  * other BARs will be hidden in skiboot.
46  *
47  * Before the global MMIO BAR is configured, scom is the only way to
48  * access the BAR registers. At NPU2 PHB probing time, we rely on scom
49  * to assign all BARs until the global MMIO BAR is established.
50  *
51  * We need to access 4 SM registers in the same stack in order to
52  * configure one particular BAR.
53  */
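
/*
 * A rough sketch of the layout described above (one NPU2), for
 * reference only -- see assign_mmio_bars() for the register offsets
 * actually used:
 *
 *   stack 0: PHY BAR register, NTL0 BAR, NTL1 BAR, GENID BAR
 *   stack 1: PHY BAR register, NTL0 BAR, NTL1 BAR, GENID BAR
 *   stack 2: PHY BAR register, NTL0 BAR, NTL1 BAR, GENID BAR
 *
 * i.e. 6 NTL BARs (one per brick) and 3 GENID BARs (one per brick
 * pair); the three per-stack PHY BAR registers hold the two PHY BARs
 * plus the global MMIO BAR.
 */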
54 
55 /* Set a specific flag in the vendor config space */
void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag)
57 {
58 	ndev->nvlink.link_flags |= flag;
59 	PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
60 			     VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
61 }
62 
void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag)
64 {
65 	ndev->nvlink.link_flags &= ~flag;
66 	PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
67 			     VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
68 }
69 
static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table,
71 				uint32_t index, bool autoinc)
72 {
73 	out_be64(p->regs + NPU2_ATS_IODA_TBL,
74 		 (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul)	|
75 		 SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table)	|
76 		 SETFIELD(NPU2_ATS_IODA_TBL_INDEX,  0ul, index));
77 }
78 
static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p,
80 					uint32_t bdfn)
81 {
82 	struct pci_virt_device *pvd;
83 
84 	/* All emulated devices are attached to root bus */
85 	if (bdfn & ~0xff)
86 		return NULL;
87 
88 	pvd = pci_virt_find_device(&p->phb_nvlink, bdfn);
89 	if (pvd)
90 		return pvd->data;
91 
92 	return NULL;
93 }
94 
static inline void npu2_get_bar(uint32_t gcid, struct npu2_bar *bar)
96 {
97 	phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size);
98 }
99 
static void npu2_read_bar(struct npu2 *p, struct npu2_bar *bar)
101 {
102 	uint64_t reg, val;
103 	int enabled;
104 
105 	reg = NPU2_REG_OFFSET(0, NPU2_BLOCK_SM_0, bar->reg);
106 	val = npu2_read(p, reg);
107 
108 	switch (NPU2_REG(bar->reg)) {
109 	case NPU2_PHY_BAR:
110 		bar->base = GETFIELD(NPU2_PHY_BAR_ADDR, val) << 21;
111 		enabled = GETFIELD(NPU2_PHY_BAR_ENABLE, val);
112 
113 		if (NPU2_REG_STACK(reg) == NPU2_STACK_STCK_2)
114 			/* This is the global MMIO BAR */
115 			bar->size = 0x1000000;
116 		else
117 			bar->size = 0x200000;
118 		break;
119 	case NPU2_NTL0_BAR:
120 	case NPU2_NTL1_BAR:
121 		bar->base = GETFIELD(NPU2_NTL_BAR_ADDR, val) << 16;
122 		enabled = GETFIELD(NPU2_NTL_BAR_ENABLE, val);
123 		bar->size = 0x10000 << GETFIELD(NPU2_NTL_BAR_SIZE, val);
124 		break;
125 	case NPU2_GENID_BAR:
126 		bar->base = GETFIELD(NPU2_GENID_BAR_ADDR, val) << 16;
127 		enabled = GETFIELD(NPU2_GENID_BAR_ENABLE, val);
128 		bar->size = 0x20000;
129 		break;
130 	default:
131 		bar->base = 0ul;
132 		enabled = 0;
133 		bar->size = 0;
134 		break;
135 	}
136 
137 	bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, bar->flags, enabled);
138 }
139 
static void npu2_write_bar(struct npu2 *p,
141 			   struct npu2_bar *bar,
142 			   uint32_t gcid,
143 			   uint32_t scom)
144 {
145 	uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED);
146 	int block;
147 
148 	switch (NPU2_REG(bar->reg)) {
149 	case NPU2_PHY_BAR:
150 		val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21);
151 		val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable);
152 		break;
153 	case NPU2_NTL0_BAR:
154 	case NPU2_NTL1_BAR:
155 		val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 16);
156 		val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable);
157 		val = SETFIELD(NPU2_NTL_BAR_SIZE, val, 1);
158 		break;
159 	case NPU2_GENID_BAR:
160 		val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 16);
161 		val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable);
162 		break;
163 	default:
164 		val = 0ul;
165 	}
166 
167 	for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
168 		reg = NPU2_REG_OFFSET(0, block, bar->reg);
169 		if (p)
170 			npu2_write(p, reg, val);
171 		else
172 			npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val);
173 	}
174 }
175 
176 /* Trap for PCI command (0x4) to enable or disable device's BARs */
static int64_t npu2_cfg_write_cmd(void *dev,
178 				  struct pci_cfg_reg_filter *pcrf __unused,
179 				  uint32_t offset, uint32_t size,
180 				  uint32_t *data, bool write)
181 {
182 	struct pci_virt_device *pvd = dev;
183 	struct npu2_dev *ndev = pvd->data;
184 	struct npu2_bar *ntl_npu_bar, *genid_npu_bar;
185 	bool enabled;
186 
187 	if (!write)
188 		return OPAL_PARTIAL;
189 
190 	if (offset != PCI_CFG_CMD)
191 		return OPAL_PARAMETER;
192 	if (size != 1 && size != 2 && size != 4)
193 		return OPAL_PARAMETER;
194 
195 	/*
196 	 * Enable or disable NTL and GENID BAR. Two bricks share
197 	 * one GENID BAR, which is exposed via the first brick.
198 	 */
199 	enabled = !!(*data & PCI_CFG_CMD_MEM_EN);
200 	ntl_npu_bar = &ndev->bars[0].npu2_bar;
201 	genid_npu_bar = &ndev->bars[1].npu2_bar;
202 
203 	ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled);
204 	npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0);
205 
206 	/*
207 	 * Enable/disable the GENID BAR. Two bricks share one GENID
208 	 * BAR which is exposed via the first brick so we need to
209 	 * track the enables separately.
210 	 */
211 	if (NPU2DEV_BRICK(ndev))
212 		genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags,
213 						enabled);
214 	else
215 		genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags,
216 						enabled);
217 
218 	/* Enable the BAR if either device requests it enabled, otherwise disable it */
219 	genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags,
220 					!!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 |
221 								   NPU2_BAR_FLAG_ENABLED1)));
222 	npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0);
223 
224 	return OPAL_PARTIAL;
225 }
226 
static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused,
228 				 struct pci_cfg_reg_filter *pcrf,
229 				 uint32_t offset, uint32_t size,
230 				 uint32_t *data)
231 {
232 	struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
233 
234 	if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED))
235 		return OPAL_PARTIAL;
236 
237 	if ((size != 4) ||
238 	    (offset != pcrf->start && offset != pcrf->start + 4))
239 		return OPAL_PARAMETER;
240 
241 	if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI)
242 		*data = bar->npu2_bar.size >> 32;
243 	else
244 		*data = bar->npu2_bar.size;
245 	bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI);
246 
247 	return OPAL_SUCCESS;
248 }
249 
static int64_t npu2_cfg_write_bar(struct npu2_dev *dev,
251 				  struct pci_cfg_reg_filter *pcrf,
252 				  uint32_t offset, uint32_t size,
253 				  uint32_t data)
254 {
255 	struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
256 	struct npu2_bar old_bar, *npu2_bar = &bar->npu2_bar;
257 
258 	if ((size != 4) ||
259 	    (offset != pcrf->start && offset != pcrf->start + 4))
260 		return OPAL_PARAMETER;
261 
262 	/* Return BAR size on next read */
263 	if (data == 0xffffffff) {
264 		bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED;
265 		if (offset == pcrf->start + 4)
266 			bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI;
267 
268 		return OPAL_SUCCESS;
269 	}
270 
271 	if (offset == pcrf->start) {
272 		npu2_bar->base &= 0xffffffff00000000UL;
273 		npu2_bar->base |= (data & 0xfffffff0);
274 	} else {
275 		npu2_bar->base &= 0x00000000ffffffffUL;
276 		npu2_bar->base |= ((uint64_t)data << 32);
277 
278 		if (NPU2_REG(npu2_bar->reg) == NPU2_GENID_BAR && NPU2DEV_BRICK(dev))
279 			npu2_bar->base -= 0x10000;
280 
281 		old_bar.reg = npu2_bar->reg;
282 		npu2_read_bar(dev->npu, &old_bar);
283 
284 		/* Only allow changing the base address if the BAR is not enabled */
285 		if ((npu2_bar->flags & NPU2_BAR_FLAG_ENABLED) &&
286 		    (npu2_bar->base != old_bar.base)) {
287 			npu2_bar->base = old_bar.base;
288 			return OPAL_HARDWARE;
289 		}
290 
291 		npu2_write_bar(dev->npu, &bar->npu2_bar, 0, 0);
292 	}
293 
294 	/* To update the config cache */
295 	return OPAL_PARTIAL;
296 }
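
/*
 * Illustrative config-space sequence handled by npu2_cfg_write_bar()
 * and npu2_cfg_read_bar() above -- a sketch of the standard BAR probe
 * the OS performs, not additional functionality:
 *
 *   cfg_write(BAR, 0xffffffff);      // sets NPU2_PCIE_BAR_FLAG_TRAPPED
 *   size = cfg_read(BAR);            // returns the BAR size, clears the flag
 *   cfg_write(BAR, new_base_lo);     // caches the low half of the base
 *   cfg_write(BAR + 4, new_base_hi); // writes the hardware BAR, only
 *                                    // permitted while the BAR is disabled
 */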
297 
static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
299 				uint32_t offset, uint32_t len, uint32_t *data,
300 				bool write)
301 {
302 	struct pci_virt_device *pvd = dev;
303 	struct npu2_dev *ndev = (struct npu2_dev *) pvd->data;
304 
305 	if (write)
306 		return npu2_cfg_write_bar(ndev, pcrf, offset, len, *data);
307 
308 	return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
309 }
310 
static int64_t npu2_dev_cfg_exp_devcap(void *dev,
312 		struct pci_cfg_reg_filter *pcrf __unused,
313 		uint32_t offset, uint32_t size,
314 		uint32_t *data, bool write)
315 {
316 	struct pci_virt_device *pvd = dev;
317 	struct npu2_dev *ndev = pvd->data;
318 	int rc;
319 
320 	assert(write);
321 
322 	if ((size != 2) || (offset & 1)) {
323 		/* Short config writes are not supported */
324 		prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
325 		      ndev->nvlink.phb->opal_id);
326 		return OPAL_PARAMETER;
327 	}
328 
329 	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
330 		npu2_dev_procedure_reset(ndev);
331 
332 	rc = purge_l2_l3_caches();
333 	if (rc)
334 		return rc;
335 
336 	return OPAL_PARTIAL;
337 }
338 
339 #define NPU2_CFG_READ(size, type)					\
340 static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn,	\
341 				   uint32_t offset, type *data)		\
342 {									\
343 	uint32_t val;							\
344 	int64_t ret;							\
345 									\
346 	ret = pci_virt_cfg_read(phb, bdfn, offset,			\
347 				sizeof(*data), &val);			\
348 	*data = (type)val;						\
349         return ret;							\
350 }
351 #define NPU2_CFG_WRITE(size, type)					\
352 static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn,	\
353 				    uint32_t offset, type data)		\
354 {									\
355 	uint32_t val = data;						\
356 	int64_t ret;							\
357 									\
358 	ret = pci_virt_cfg_write(phb, bdfn, offset,			\
359 				 sizeof(data), val);			\
360 	return ret;							\
361 }
362 
363 NPU2_CFG_READ(8, u8);
364 NPU2_CFG_READ(16, u16);
365 NPU2_CFG_READ(32, u32);
366 NPU2_CFG_WRITE(8, u8);
367 NPU2_CFG_WRITE(16, u16);
368 NPU2_CFG_WRITE(32, u32);
369 
static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
371 				  struct pci_device *pd,
372 				  void *data)
373 {
374 	struct npu2_dev *dev = data;
375 	struct dt_node *pci_dt_node;
376 	char *pcislot;
377 
378 	/* Ignore non-nvidia PCI devices */
379 	if ((pd->vdid & 0xffff) != 0x10de)
380 		return 0;
381 
382 	/* Find the PCI device's slot location */
383 	for (pci_dt_node = pd->dn;
384 	     pci_dt_node && !dt_find_property(pci_dt_node, "ibm,loc-code");
385 	     pci_dt_node = pci_dt_node->parent);
386 
387 	if (!pci_dt_node)
388 		return 0;
389 
390 	pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,loc-code");
391 
392 	NPU2DEVDBG(dev, "Comparing GPU '%s' and NPU2 '%s'\n",
393 		   pcislot, dev->nvlink.slot_label);
394 
395 	if (streq(pcislot, dev->nvlink.slot_label))
396 		return 1;
397 
398 	return 0;
399 }
400 
static int64_t npu2_gpu_bridge_sec_bus_reset(void *dev,
402 		struct pci_cfg_reg_filter *pcrf __unused,
403 		uint32_t offset, uint32_t len,
404 		uint32_t *data, bool write)
405 {
406 	struct pci_device *pd = dev;
407 	struct pci_device *gpu;
408 	struct phb *npphb;
409 	struct npu2 *npu;
410 	struct dt_node *np;
411 	struct npu2_dev	*ndev;
412 	int i;
413 
414 	assert(write);
415 
416 	if ((len != 2) || (offset & 1)) {
417 		/* Short config writes are not supported */
418 		PCIERR(pd->phb, pd->bdfn,
419 		       "Unsupported write to bridge control register\n");
420 		return OPAL_PARAMETER;
421 	}
422 
423 	gpu = list_top(&pd->children, struct pci_device, link);
424 	if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) {
425 		int64_t rc;
426 
427 		dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") {
428 			npphb = pci_get_phb(dt_prop_get_cell(np,
429 					"ibm,opal-phbid", 1));
430 			if (!npphb || npphb->phb_type != phb_type_npu_v2)
431 				continue;
432 
433 			npu = phb_to_npu2_nvlink(npphb);
434 			for (i = 0; i < npu->total_devices; ++i) {
435 				ndev = &npu->devices[i];
436 				if (ndev->nvlink.pd == gpu)
437 					npu2_dev_procedure_reset(ndev);
438 			}
439 		}
440 
441 		rc = purge_l2_l3_caches();
442 		if (rc)
443 			return rc;
444 	}
445 
446 	return OPAL_PARTIAL;
447 }
448 
static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
450 {
451 	struct phb *phb;
452 	uint32_t i;
453 
454 	if (dev->nvlink.pd)
455 		return;
456 
457 	for (i = 0; i < 64; i++) {
458 		if (dev->npu->phb_nvlink.opal_id == i)
459 			continue;
460 
461 		phb = pci_get_phb(i);
462 		if (!phb)
463 			continue;
464 
465 		dev->nvlink.pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev);
466 		if (dev->nvlink.pd) {
467 			dev->nvlink.phb = phb;
468 			/* Found the device, set the bit in config space */
469 			npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED);
470 
471 			/*
472 			 * We define a custom sec bus reset handler for a slot
473 			 * with an NVLink-connected GPU to prevent HMIs which
			 * will otherwise happen if we reset the GPU before
			 * resetting the NVLinks.
476 			 */
477 			if (dev->nvlink.pd->parent &&
478 			    dev->nvlink.pd->parent->slot)
479 				pci_add_cfg_reg_filter(dev->nvlink.pd->parent,
480 						PCI_CFG_BRCTL, 2,
481 						PCI_REG_FLAG_WRITE,
482 						npu2_gpu_bridge_sec_bus_reset);
483 			return;
484 		}
485 	}
486 
487 	NPU2DEVINF(dev, "No PCI device found for slot '%s'\n",
488 		   dev->nvlink.slot_label);
489 }
490 
491 static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
492 
static void npu2_append_phandle(struct dt_node *dn,
494 				u32 phandle)
495 {
496 	struct dt_property *prop;
497 	uint32_t *npu_phandles;
498 	size_t len;
499 
500 	/*
501 	 * Use a lock to make sure no one else has a reference to an
502 	 * ibm,npu property (this assumes this is the only function
503 	 * that holds a reference to it)
504 	 */
505 	lock(&pci_npu_phandle_lock);
506 
507 	/* This function shouldn't be called unless ibm,npu exists */
508 	prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1);
509 
510 	/* Need to append to the properties */
511 	len = prop->len + sizeof(*npu_phandles);
512 	dt_resize_property(&prop, len);
513 
514 	npu_phandles = (uint32_t *)prop->prop;
515 	npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle;
516 	unlock(&pci_npu_phandle_lock);
517 }
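
/*
 * For illustration, once two NVLink bricks have been bound to the same
 * GPU, the GPU's device tree node ends up with something like (phandle
 * values are made up):
 *
 *   ibm,npu = <0x68 0x69>;	// one phandle per attached NPU2 link device
 */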
518 
static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size)
520 {
521 	struct dt_node *mem;
522 	static u32 chip_id = 255;
523 
524 	mem = dt_find_by_name_addr(dt_root, "memory", addr);
525 	if (mem)
526 		return mem;
527 
528 	mem = dt_new_addr(dt_root, "memory", addr);
529 	if (!mem)
530 		return NULL;
531 	dt_add_property_string(mem, "device_type", "memory");
532 	dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
533 	dt_add_property_u64s(mem, "reg", addr, size);
534 	dt_add_property_cells(mem, "ibm,chip-id", chip_id);
535 	dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
536 	dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id);
537 	chip_id--;
538 
539 	assert(chip_id);
540 	return mem;
541 }
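
/*
 * Example of the node created above, with purely hypothetical values
 * for a GPU memory region at 0x200000000000, 128GB in size:
 *
 *   memory@200000000000 {
 *           device_type = "memory";
 *           compatible = "ibm,coherent-device-memory";
 *           reg = <0x2000 0x0 0x20 0x0>;
 *           ibm,chip-id = <255>;
 *           linux,usable-memory = <0x2000 0x0 0x0 0x0>;
 *           ibm,associativity = <4 255 255 255 255>;
 *   };
 */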
542 
/* There are potentially multiple links per GPU, so look up the GPU memory
 * based on the bdfn. */
static void npu2_get_gpu_base(struct npu2_dev *ndev, uint64_t *addr, uint64_t *size)
546 {
547 	struct npu2 *p = ndev->npu;
548 	int group;
549 
550 	group = PCI_DEV(ndev->bdfn);
551 	phys_map_get(ndev->npu->chip_id, p->gpu_map_type, group, addr, size);
552 }
553 
static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, struct npu2_dev *ndev)
555 {
556 	uint64_t gpu_base, gpu_size, gta;
557 	struct dt_node *mem_dn;
558 
559 	npu2_get_gpu_base(ndev, &gpu_base, &gpu_size);
560 	mem_dn = npu2_create_memory_dn(gpu_base, gpu_size);
561 	assert(mem_dn);
562 	dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle);
563 
	/* Coral mode address compression. This is documented in Figure 3.5,
	 * "P9->GPU RA Compression (Coral)", of the NPU2 workbook. */
566 	gta  = ((gpu_base >> 42) & 0x1) << 42;
567 	gta |= ((gpu_base >> 45) & 0x3) << 43;
568 	gta |= ((gpu_base >> 49) & 0x3) << 45;
569 	gta |= gpu_base & ((1UL << 43) - 1);
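	/*
	 * In other words (an illustrative summary, not a quote from the
	 * workbook): RA bit 42 stays at bit 42, RA bits 45:46 move down
	 * to bits 43:44, RA bits 49:50 move down to bits 45:46 and bits
	 * 0:41 pass through unchanged, so the sparse real address is
	 * packed into a contiguous 47-bit target address.
	 */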
570 
571 	dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta);
572 }
573 
static int npu2_assign_gmb(struct npu2_dev *ndev)
575 {
576 	struct npu2 *p = ndev->npu;
577 	int peers, mode;
578 	uint32_t bdfn;
579 	uint64_t base, size, reg, val, gmb;
580 
	/* Need to work out the number of link peers. This amounts to
	 * working out the maximum function number, so start at the
	 * highest bdfn (fn = 6) and count back until we find an
	 * npu2_dev. */
585 	for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP;
586 	     PCI_FUNC(bdfn) != 0x7; bdfn = (bdfn & ~0x7) | (PCI_FUNC(bdfn) - 1))
587 		if (npu2_bdf_to_dev(p, bdfn))
588 			break;
589 	peers = PCI_FUNC(bdfn);
590 
591 	npu2_get_gpu_base(ndev, &base, &size);
592 
593 	NPU2DBG(p, "Setting BAR region dt:%llx\n", base);
594 	val = SETFIELD(NPU2_MEM_BAR_EN, 0ULL, 1);
595 	val = SETFIELD(NPU2_MEM_BAR_SEL_MEM, val, base >> (63-14));
596 	val = SETFIELD(NPU2_MEM_BAR_GROUP, val, base >> (63-18));
597 	val = SETFIELD(NPU2_MEM_BAR_CHIP, val, base >> (63-21));
598 	val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, base >> (63-33));
599 	val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
600 	val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);
601 
602 	/* We don't know how much memory the GPU has, so we may as well just
603 	 * pass the whole aperture through at this point. */
604 	val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));
605 
606 	switch (peers) {
607 	case 0:
608 		mode = 0;
609 		break;
610 	case 1:
611 		mode = 1;
612 		break;
613 	case 2:
614 		mode = 3;
615 		break;
616 	case 3:
617 		mode = 6;
618 		break;
619 	case 5:
620 		mode = 10;
621 		break;
622 	default:
623 		/* Hardware does not support this configuration */
624 		assert(0);
625 	}
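	/*
	 * The base mode chosen above is then offset by the function
	 * number below. As an illustrative example, a 3-link GPU
	 * (peers == 2) ends up with its bricks using modes 3, 4 and 5,
	 * while a 6-link GPU (peers == 5) uses modes 10 through 15.
	 */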
626 
627 	mode += PCI_FUNC(ndev->bdfn);
628 	val = SETFIELD(NPU2_MEM_BAR_MODE, val, mode);
629 
630 	gmb = NPU2_GPU0_MEM_BAR;
631 	if (NPU2DEV_BRICK(ndev))
632 		gmb = NPU2_GPU1_MEM_BAR;
633 
634 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
635 			      NPU2_BLOCK_SM_0, gmb);
636 
637 	npu2_write(p, reg, val);
638 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
639 			      NPU2_BLOCK_SM_1, gmb);
640 	npu2_write(p, reg, val);
641 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
642 			      NPU2_BLOCK_SM_2, gmb);
643 	npu2_write(p, reg, val);
644 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
645 			      NPU2_BLOCK_SM_3, gmb);
646 	npu2_write(p, reg, val);
647 
648 	return 0;
649 }
650 
static int npu2_dn_fixup(struct phb *phb,
652 			 struct pci_device *pd,
653 			 void *data __unused)
654 {
655 	struct npu2 *p = phb_to_npu2_nvlink(phb);
656 	struct npu2_dev *dev;
657 	uint32_t speed;
658 	const char *label;
659 
660 	dev = npu2_bdf_to_dev(p, pd->bdfn);
661 	assert(dev);
662 	if (dev->nvlink.phb || dev->nvlink.pd)
663 		return 0;
664 
665 	npu2_assign_gmb(dev);
666 	npu2_dn_fixup_gmb(pd->dn, dev);
667 	dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);
668 
669 	/*
670 	 * NVLink supports multiple speeds and device drivers need to know what
	 * speed has been set by firmware. Hostboot does the inits that set the
	 * link speed and tells us via HDAT, and we need to copy that from the
	 * link node.
674 	 */
675 	speed = dt_prop_get_u32_def(dev->dt_node, "nvidia,link-speed", 0xff);
676 	if (speed != 0xff)
677 		dt_add_property_cells(pd->dn, "ibm,nvlink-speed", speed);
678 
679 	/*
680 	 * NPU2 devices have a slot label that indicates which GPU slot
681 	 * this NPU is connected to. Add a location code to the NVlink
682 	 * device node based on the slot label.
683 	 */
684 	label = dt_prop_get_def(dev->dt_node, "ibm,slot-label", NULL);
685 	if (!label) {
686 		/**
687 		 * @fwts-label NPUNoPHBSlotLabel
688 		 * @fwts-advice No GPU/NPU2 slot information was found.
689 		 * NVLink2 functionality will not work.
690 		 */
691 		prlog(PR_ERR, "NPU: Cannot find GPU slot information\n");
692 		return 0;
693 	}
694 	dt_add_property_string(pd->dn, "ibm,loc-code", label);
695 
696 	dev->nvlink.slot_label = label;
697 
698 	/*
699 	 * Bind the emulated PCI device with the real one, which can't
700 	 * be done until the PCI devices are populated. Once the real
	 * PCI device is identified, we also need to fix up the device
	 * tree for it.
703 	 */
704 	npu2_dev_bind_pci_dev(dev);
705 	if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
706 		if (dt_find_property(dev->nvlink.pd->dn, "ibm,npu"))
707 			npu2_append_phandle(dev->nvlink.pd->dn, pd->dn->phandle);
708 		else
709 			dt_add_property_cells(dev->nvlink.pd->dn, "ibm,npu", pd->dn->phandle);
710 
711 		dt_add_property_cells(pd->dn, "ibm,gpu", dev->nvlink.pd->dn->phandle);
712 		dev->nvlink.gpu_bdfn = dev->nvlink.pd->bdfn;
713 	}
714 
715 	return 0;
716 }
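
/*
 * To illustrate the cross-links set up above (all names and phandles
 * hypothetical), a bound NVLink/GPU pair ends up with:
 *
 *   emulated NPU device node:  ibm,loc-code = "GPU slot label";
 *                              ibm,nvlink = <link_phandle>;
 *                              ibm,nvlink-speed = <speed>;
 *                              memory-region = <gpu_mem_phandle>;
 *                              ibm,device-tgt-addr = <compressed RA>;
 *                              ibm,gpu = <gpu_phandle>;
 *   real GPU device node:      ibm,npu = <npu_phandle ...>;
 */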
717 
static int npu2_links_per_gpu(struct phb *phb,
719 			      struct pci_device *pd,
720 			      void *data)
721 {
722 	struct npu2 *p = phb_to_npu2_nvlink(phb);
723 	struct npu2_dev *dev;
724 	int *nlinks = (int *)data;
725 
726 	dev = npu2_bdf_to_dev(p, pd->bdfn);
727 	assert(dev);
728 
729 	if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
730 		const struct dt_property *prop;
731 		int n;
732 
733 		/* The link count is the number of phandles in "ibm,npu" */
734 		prop = dt_find_property(dev->nvlink.pd->dn, "ibm,npu");
735 		if (!prop)
736 			return 0;
737 
738 		/* Count could vary by gpu, so find the max */
739 		n = prop->len / sizeof(uint32_t);
740 		if (n > *nlinks)
741 			*nlinks = n;
742 	}
743 
744 	return 0;
745 }
746 
static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
748 {
749 	uint32_t gcid = dt_get_chip_id(dn);
750 	uint64_t val, mask;
751 
752 	/*
753 	 * MRBSP settings for 2- and 3-link GPU systems. These can improve
754 	 * GPU peer-to-peer fully ordered write performance.
755 	 */
756 	if (links_per_gpu == 3) {
757 		val = PPC_BIT(30) | PPC_BIT(34) | PPC_BIT(36) | PPC_BIT(37) |
758 		      PPC_BIT(44) | PPC_BIT(45);
759 		mask = PPC_BITMASK(28,39) | PPC_BITMASK(44,47);
760 	} else if (links_per_gpu == 2) {
761 		val = PPC_BIT(46) | PPC_BIT(47);
762 		mask = PPC_BITMASK(44,47);
763 	} else
764 		return;
765 
766 	xscom_write_mask(gcid, 0x50110c0, val, mask);
767 	xscom_write_mask(gcid, 0x50112c0, val, mask);
768 	xscom_write_mask(gcid, 0x50114c0, val, mask);
769 }
770 
static void npu2_phb_final_fixup(struct phb *phb)
772 {
773 	int links_per_gpu = 0;
774 	struct dt_node *np;
775 
776 	pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);
777 
778 	/*
779 	 * Now that the emulated devices are bound to the real ones, we can
780 	 * determine links_per_gpu and do some final init.
781 	 */
782 	pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
783 	dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
784 		npu2_phb_fixup_scominit(np, links_per_gpu);
785 }
786 
static void npu2_init_ioda_cache(struct npu2 *p)
788 {
789 	/* TVT */
790 	memset(p->tve_cache, 0, sizeof(p->tve_cache));
791 }
792 
static int64_t npu2_ioda_reset(struct phb *phb, bool purge)
794 {
795 	struct npu2 *p = phb_to_npu2_nvlink(phb);
796 	uint32_t i;
797 
798 	if (purge) {
799 		NPU2DBG(p, "Purging all IODA tables...\n");
800 		npu2_init_ioda_cache(p);
801 	}
802 
803 	/* TVT */
804 	npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true);
805 	for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
806 		out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]);
807 
808 	return OPAL_SUCCESS;
809 }
810 
static void npu2_write_mcd(struct npu2 *p, uint64_t pcb_addr, uint64_t addr,
812 			   uint64_t size)
813 {
814 	uint64_t val;
815 
816 	NPU2DBG(p, "Setting MCD addr:%llx\n", pcb_addr);
817 	assert(is_pow2(size));
818 
819 	val = MCD_BANK_CN_VALID;
820 	val = SETFIELD(MCD_BANK_CN_SIZE, val, (size >> 25) - 1);
821 	val = SETFIELD(MCD_BANK_CN_ADDR, val, addr >> 25);
822 	xscom_write(p->chip_id, pcb_addr, val);
823 }
824 
static void npu2_mcd_init(struct npu2 *p)
826 {
827 	int i;
828 	uint64_t size, addr, gpu_min_addr, gpu_max_addr, total_size;
829 
830 	/* Init memory cache directory (MCD) registers. */
831 	phys_map_get(p->chip_id, p->gpu_map_type, NPU2_LINKS_PER_CHIP - 1,
832 			&gpu_min_addr, NULL);
833 	phys_map_get(p->chip_id, p->gpu_map_type, 0, &gpu_max_addr, &size);
834 	gpu_max_addr += size;
835 
	/* We assume GPU memory is contiguous from the first possible GPU to the
	 * last, and that every GPU is assigned the same size, so best to check that. */
838 	for (i = 0; i < NPU2_LINKS_PER_CHIP; i++) {
839 		uint64_t tmp;
840 		phys_map_get(p->chip_id, p->gpu_map_type, i, &addr, &tmp);
841 		assert((addr >= gpu_min_addr) && (addr + tmp <= gpu_max_addr));
842 		assert(tmp == size);
843 	}
844 
	/* We have two MCDs, so if necessary we can split the region
	 * covered across both when total_size is not a power of two. */
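	/*
	 * Worked example with hypothetical sizes: with three GPUs of
	 * 128GB each, total_size is 384GB. The first MCD bank then
	 * covers the top 256GB and the second covers the remaining
	 * 128GB below it, both power-of-two sized as npu2_write_mcd()
	 * requires.
	 */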
847 	total_size = gpu_max_addr - gpu_min_addr;
848 	size = 1ull << ilog2(total_size);
849 
850 	/* Allocate the biggest chunk first as we assume gpu_max_addr has the
851 	 * highest alignment. */
852 	addr = gpu_max_addr - size;
853 	npu2_write_mcd(p, MCD0_BANK0_CN3, addr, size);
854 	total_size -= size;
855 	if (total_size) {
		/* total_size was not a power of two, but the remainder
		 * should be if all GPUs were assigned the same size. */
858 		assert(is_pow2(total_size));
859 		size = 1ull << ilog2(total_size);
860 		addr -= size;
861 		assert(addr <= gpu_min_addr);
862 		npu2_write_mcd(p, MCD1_BANK0_CN3, addr, size);
863 	}
864 }
865 
static void npu2_hw_init(struct npu2 *p)
867 {
868 	uint64_t reg, val;
869 	int s, b;
870 
871 	npu2_ioda_reset(&p->phb_nvlink, false);
872 
873 	/* Enable XTS retry mode */
874 	val = npu2_read(p, NPU2_XTS_CFG);
875 	npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_MMIOSD | NPU2_XTS_CFG_TRY_ATR_RO);
876 
877 	val = npu2_read(p, NPU2_XTS_CFG2);
878 	npu2_write(p, NPU2_XTS_CFG2, val | NPU2_XTS_CFG2_NO_FLUSH_ENA);
879 
880 	/*
881 	 * There are three different ways we configure the MCD and memory map.
882 	 * 1) Old way
883 	 *    Skiboot configures the MCD and puts GPUs at 4TB and below
884 	 * 2) New way with MCD
885 	 *    Hostboot configures the MCD and skiboot puts GPU at 4TB and above
886 	 * 3) New way without MCD
887 	 *    No one configures the MCD and skiboot puts GPU at 4TB and below
888 	 *
	 * 1) Will go away eventually as it's a configuration that can
890 	 *    cause an xstop or data integrity problems. We are keeping
891 	 *    it around to support existing hostboot. Print error
892 	 *    message if used.
893 	 * 2) Is for smaller memory configurations and will be used
	 *    initially for GPUs on Witherspoon. Supports only up to
	 *    512GB of memory and 4 GPUs per socket.
	 * 3) Is for fully populated configurations of 4TB of memory
	 *    and 6 GPUs per socket. May have performance impacts.
898 	 *
899 	 * The different configurations can be detected via the following scoms:
900 	 * 1) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 0
901 	 * 2) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 7
902 	 * 3) 0x5011c0c bit 2 = 0, 0x5011c0a bits 42:48 = 0
903 	 */
904 
905 	/* Get 0x05011c0c bit 2 = 1 */
906 	xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &val);
907 	if ((val & PB_CFG_CHG_RATE_GP_MASTER) != 0) {
908 		/* Get 0x05011c0a bits 42:48 */
909 		xscom_read(p->chip_id, PB_CENT_MODE, &val);
910 		if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0) {
911 			/* 1) */
912 			NPU2DBG(p, "Using old memory map + MCD enabled in skiboot\n");
913 			NPU2ERR(p, "!!! Old firmware detected. Update hostboot for new MCD mapping !!!\n");
914 			p->gpu_map_type = GPU_MEM_4T_DOWN;
915 			npu2_mcd_init(p);
916 		} else if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 7) {
917 			/* 2) */
918 			NPU2DBG(p, "Using small memory map + MCD enabled\n");
919 			p->gpu_map_type = GPU_MEM_4T_UP;
920 		} else
921 			NPU2ERR(p, "!!! Unsupported NPU2 configuration. "
922 				"0x%llx!!!\n", val);
923 	} else {
924 		/* 3) */
925 		NPU2DBG(p, "Using large memory map + MCD disabled\n");
926 		p->gpu_map_type = GPU_MEM_4T_DOWN;
927 	}
928 
929 	/* Static initialization of every relaxed-ordering cfg[2] register */
930 	val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
931 	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
932 	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
933 	      NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
934 	      NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
935 	      NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
936 	      NPU2_RELAXED_ORDERING_SOURCE4_RDENA;
937 
938 	for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
939 		for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
940 			reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));
941 			npu2_write(p, reg, val);
942 		}
943 	}
944 }
945 
static int64_t npu2_map_pe_dma_window_real(struct phb *phb,
947 					   uint64_t pe_num,
948 					   uint16_t window_id,
949 					   uint64_t pci_start_addr __unused,
950 					   uint64_t pci_mem_size __unused)
951 {
952 	struct npu2 *p = phb_to_npu2_nvlink(phb);
953 	uint64_t tve;
954 
955 	/* Sanity check. Each PE has one corresponding TVE */
956 	if (pe_num >= NPU2_MAX_PE_NUM ||
957 	    window_id != pe_num)
958 		return OPAL_PARAMETER;
959 
960 	if (pci_mem_size) {
961 		/* GPUs need to be able to access the MMIO memory space as well.
962 		 * On POWER9 this is above the top of ram so disable the TVT
963 		 * range check allowing access to all memory addresses. */
964 		tve = 0;
965 	} else {
966 		/* Disable */
967 		tve = PPC_BIT(51);
968 	}
969 
970 	npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
971 	out_be64(p->regs + NPU2_ATS_IODA_DATA, tve);
972 	p->tve_cache[window_id] = tve;
973 
974 	return OPAL_SUCCESS;
975 }
976 
static int64_t npu2_map_pe_dma_window(struct phb *phb,
978 				      uint64_t pe_num,
979 				      uint16_t window_id,
980 				      uint16_t tce_levels,
981 				      uint64_t tce_table_addr,
982 				      uint64_t tce_table_size,
983 				      uint64_t tce_page_size)
984 {
985 	struct npu2 *p = phb_to_npu2_nvlink(phb);
986 	uint64_t tts_encoded;
987 	uint64_t data64 = 0;
988 
989 	/* Sanity check. Each PE has one corresponding TVE */
990 	if (pe_num >= NPU2_MAX_PE_NUM ||
991 	    window_id != pe_num)
992 		return OPAL_PARAMETER;
993 
994 	/*
995 	 * Special condition, zero TCE table size used to disable
996 	 * the TVE.
997 	 */
998 	if (!tce_table_size) {
999 		npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
1000 		out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul);
1001 		p->tve_cache[window_id] = 0ul;
1002 		return OPAL_SUCCESS;
1003 	}
1004 
1005 	/* Additional arguments validation */
1006 	if (tce_levels < 1 ||
1007 	    tce_levels > 4 ||
1008 	    !is_pow2(tce_table_size) ||
1009 	    tce_table_size < 0x1000)
1010 		return OPAL_PARAMETER;
1011 
1012 	/* TCE table size */
1013 	data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12);
1014 	tts_encoded = ilog2(tce_table_size) - 11;
1015 	if (tts_encoded > 39)
1016 		return OPAL_PARAMETER;
1017 	data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded);
1018 
1019 	/* TCE page size */
1020 	switch (tce_page_size) {
1021 	case 0x10000:		/* 64K */
1022 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5);
1023 		break;
1024 	case 0x1000000:		/* 16M */
1025 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13);
1026 		break;
1027 	case 0x10000000:	/* 256M */
1028 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17);
1029 		break;
1030 	case 0x1000:		/* 4K */
1031 	default:
1032 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1);
1033 	}
1034 
1035 	/* Number of levels */
1036 	data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1);
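	/*
	 * Illustrative example (values assumed, not from a real setup):
	 * a single-level, 64K-page, 8KB TCE table at 0x1000 encodes as
	 * TTA = 0x1000 >> 12 = 1, SIZE = ilog2(0x2000) - 11 = 2,
	 * PSIZE = 5 and LEVEL = 0 in the TVE written below.
	 */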
1037 
1038 	/* Update to hardware */
1039 	npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
1040 	out_be64(p->regs + NPU2_ATS_IODA_DATA, data64);
1041 	p->tve_cache[window_id] = data64;
1042 
1043 	return OPAL_SUCCESS;
1044 }
1045 
static int64_t npu2_set_pe(struct phb *phb,
1047 			   uint64_t pe_num,
1048 			   uint64_t bdfn,
1049 			   uint8_t bcompare,
1050 			   uint8_t dcompare,
1051 			   uint8_t fcompare,
1052 			   uint8_t action)
1053 {
1054 	struct npu2 *p;
1055 	struct npu2_dev *dev;
1056 	uint64_t reg, val;
1057 
1058 	/* Sanity check */
1059 	if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
1060 		return OPAL_PARAMETER;
1061 	if (pe_num >= NPU2_MAX_PE_NUM)
1062 		return OPAL_PARAMETER;
1063 	if (bdfn >> 8)
1064 		return OPAL_PARAMETER;
1065 	if (bcompare != OpalPciBusAll ||
1066 	    dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
1067 	    fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
1068 		return OPAL_UNSUPPORTED;
1069 	if (phb->phb_type != phb_type_npu_v2)
1070 		return OPAL_PARAMETER;
1071 
1072 	p = phb_to_npu2_nvlink(phb);
1073 	if (!p)
1074 		return OPAL_PARAMETER;
1075 
1076 	dev = npu2_bdf_to_dev(p, bdfn);
1077 	if (!dev)
1078 		return OPAL_PARAMETER;
1079 
1080 	val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
1081 	val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num);
1082 	val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
1083 
1084 	if (!NPU2DEV_BRICK(dev))
1085 		reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
1086 				      NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0);
1087 	else
1088 		reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
1089 				      NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0);
1090 
1091 	npu2_write(p, reg, val);
1092 	val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
1093 	val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num);
1094 	val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
1095 	reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
1096 			      NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->brick_index * 0x18));
1097 	npu2_write(p, reg, val);
1098 
1099 	return OPAL_SUCCESS;
1100 }
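
/*
 * For example (register names as above, numbers hypothetical): mapping
 * PE 4 for brick 3 writes NPU2_CQ_BRICK1_BDF2PE_MAP0 in stack 1
 * (brick_index / 2 == 1) plus the MISC copy at
 * NPU2_MISC_BRICK0_BDF2PE_MAP0 + 3 * 0x18, both carrying the GPU bdfn
 * and the PE number.
 */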
1101 
static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
1103 {
1104 	/*
	 * As we're emulating all the PCI stuff, the link bandwidth
	 * isn't a big deal anyway.
1107 	 */
1108 	*val = OPAL_SHPC_LINK_UP_x1;
1109 	return OPAL_SUCCESS;
1110 }
1111 
static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
1113 {
1114 	*val = PCI_SLOT_POWER_ON;
1115 	return OPAL_SUCCESS;
1116 }
1117 
static int64_t npu2_hreset(struct pci_slot *slot __unused)
1119 {
1120 	struct npu2 *p;
1121 	int i;
1122 	struct npu2_dev *ndev;
1123 
1124 	p = phb_to_npu2_nvlink(slot->phb);
1125 	NPU2INF(p, "Hreset PHB state\n");
1126 
1127 	for (i = 0; i < p->total_devices; i++) {
1128 		ndev = &p->devices[i];
1129 		if (ndev) {
1130 			NPU2DEVINF(ndev, "Resetting device\n");
1131 			reset_ntl(ndev);
1132 		}
1133 	}
1134 	return purge_l2_l3_caches();
1135 }
1136 
static int64_t npu2_freset(struct pci_slot *slot __unused)
1138 {
1139 	return OPAL_SUCCESS;
1140 }
1141 
static int64_t npu2_creset(struct pci_slot *slot)
1143 {
1144 	struct npu2 *p;
1145 	int i;
1146 	struct npu2_dev *ndev;
1147 
1148 	p = phb_to_npu2_nvlink(slot->phb);
1149 	NPU2INF(p, "Creset PHB state\n");
1150 
1151 	for (i = 0; i < p->total_devices; i++) {
1152 		ndev = &p->devices[i];
1153 		if (ndev) {
1154 			NPU2DEVINF(ndev, "Resetting device\n");
1155 			reset_ntl(ndev);
1156 		}
1157 	}
1158 	return OPAL_SUCCESS;
1159 }
1160 
static struct pci_slot *npu2_slot_create(struct phb *phb)
1162 {
1163 	struct pci_slot *slot;
1164 
1165 	slot = pci_slot_alloc(phb, NULL);
1166 	if (!slot)
1167 		return slot;
1168 
1169 	/* Elementary functions */
1170 	slot->ops.get_presence_state  = NULL;
1171 	slot->ops.get_link_state      = npu2_get_link_state;
1172 	slot->ops.get_power_state     = npu2_get_power_state;
1173 	slot->ops.get_attention_state = NULL;
1174 	slot->ops.get_latch_state     = NULL;
1175 	slot->ops.set_power_state     = NULL;
1176 	slot->ops.set_attention_state = NULL;
1177 
1178 	slot->ops.prepare_link_change = NULL;
1179 	slot->ops.poll_link           = NULL;
1180 	slot->ops.hreset              = npu2_hreset;
1181 	slot->ops.freset              = npu2_freset;
1182 	slot->ops.creset              = npu2_creset;
1183 
1184 	return slot;
1185 }
1186 
int64_t npu2_freeze_status(struct phb *phb __unused,
1188 			   uint64_t pe_number __unused,
1189 			   uint8_t *freeze_state,
1190 			   uint16_t *pci_error_type,
1191 			   uint16_t *severity)
1192 {
1193 	/*
	 * FIXME: When it's called by the skiboot PCI config accessor,
	 * the PE number is fixed to 0, which is incorrect. We need to
	 * introduce another PHB callback to translate it. For now,
	 * this keeps the skiboot PCI enumeration going.
1198 	 */
1199 	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
1200 	*pci_error_type = OPAL_EEH_NO_ERROR;
1201 	if (severity)
1202 		*severity = OPAL_EEH_SEV_NO_ERROR;
1203 
1204 	return OPAL_SUCCESS;
1205 }
1206 
static int64_t npu2_eeh_next_error(struct phb *phb,
1208 				   uint64_t *first_frozen_pe,
1209 				   uint16_t *pci_error_type,
1210 				   uint16_t *severity)
1211 {
1212 	struct npu2 *p = phb_to_npu2_nvlink(phb);
1213 	int i;
1214 	uint64_t result = 0;
1215 
1216 	if (!first_frozen_pe || !pci_error_type || !severity)
1217 		return OPAL_PARAMETER;
1218 
1219 	*first_frozen_pe = -1;
1220 	*pci_error_type = OPAL_EEH_NO_ERROR;
1221 	*severity = OPAL_EEH_SEV_NO_ERROR;
1222 
1223 	for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
1224 		result = npu2_read(p, NPU2_MISC_PESTB(i));
1225 		if (result > 0) {
1226 			*first_frozen_pe = i;
1227 			*pci_error_type = OPAL_EEH_PE_ERROR;
1228 			*severity = OPAL_EEH_SEV_PE_ER;
1229 			break;
1230 		}
1231 	}
1232 
1233 	return OPAL_SUCCESS;
1234 }
1235 
static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
1237 			     uint64_t pe_number, uint32_t tce_size,
1238 			     uint64_t dma_addr, uint32_t npages)
1239 {
1240 	struct npu2 *npu = phb_to_npu2_nvlink(phb);
1241 	uint32_t tce_page_size;
1242 	uint64_t val;
1243 
1244 	if (pe_number > NPU2_MAX_PE_NUM)
1245 		return OPAL_PARAMETER;
1246 
1247 	sync();
1248 	switch(kill_type) {
1249 	case OPAL_PCI_TCE_KILL_PAGES:
1250 		tce_page_size = 1ULL << (
1251 				11 + GETFIELD(npu->tve_cache[pe_number],
1252 					NPU2_ATS_IODA_TBL_TVT_PSIZE));
1253 		if (tce_page_size != tce_size) {
1254 			NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n",
1255 				tce_size, tce_page_size);
1256 			return OPAL_PARAMETER;
1257 		}
1258 
1259 		if (npages < 128) {
1260 			while (npages--) {
1261 				val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number);
1262 				npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val);
1263 				dma_addr += tce_size;
1264 			}
1265 			break;
1266 		}
1267 		/*
		 * For a large number of TCEs, do not bother with the loop above
		 * and simply flush everything; it is going to be a lot faster.
1270 		 */
1271 		/* Fall through */
1272 	case OPAL_PCI_TCE_KILL_PE:
1273 		/*
1274 		 * NPU2 doesn't support killing a PE so fall through
1275 		 * and do a kill all instead.
1276 		 */
1277 	case OPAL_PCI_TCE_KILL_ALL:
1278 		npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL);
1279 		break;
1280 	default:
1281 		return OPAL_PARAMETER;
1282 	}
1283 
1284 	return OPAL_SUCCESS;
1285 }
1286 
1287 static const struct phb_ops npu_ops = {
1288 	.cfg_read8		= npu2_cfg_read8,
1289 	.cfg_read16		= npu2_cfg_read16,
1290 	.cfg_read32		= npu2_cfg_read32,
1291 	.cfg_write8		= npu2_cfg_write8,
1292 	.cfg_write16		= npu2_cfg_write16,
1293 	.cfg_write32		= npu2_cfg_write32,
1294 	.device_init		= NULL,
1295 	.phb_final_fixup	= npu2_phb_final_fixup,
1296 	.ioda_reset		= npu2_ioda_reset,
1297 	.papr_errinjct_reset	= NULL,
1298 	.pci_reinit		= NULL,
1299 	.set_phb_mem_window	= NULL,
1300 	.phb_mmio_enable	= NULL,
1301 	.map_pe_mmio_window	= NULL,
1302 	.map_pe_dma_window	= npu2_map_pe_dma_window,
1303 	.map_pe_dma_window_real	= npu2_map_pe_dma_window_real,
1304 	.pci_msi_eoi		= NULL,
1305 	.set_xive_pe		= NULL,
1306 	.get_msi_32		= NULL,
1307 	.get_msi_64		= NULL,
1308 	.set_pe			= npu2_set_pe,
1309 	.set_peltv		= NULL,
1310 	.eeh_freeze_status	= npu2_freeze_status,
1311 	.eeh_freeze_clear	= NULL,
1312 	.eeh_freeze_set		= NULL,
1313 	.next_error		= npu2_eeh_next_error,
1314 	.err_inject		= NULL,
1315 	.get_diag_data2		= NULL,
1316 	.set_capi_mode		= NULL,
1317 	.set_capp_recovery	= NULL,
1318 	.tce_kill		= npu2_tce_kill,
1319 };
1320 
static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2])
1322 {
1323 	uint32_t i;
1324 	struct npu2_bar *bar;
1325 	struct npu2_bar npu2_bars[] = {
1326 		/* NPU_REGS must be first in this list */
1327 		{ .type = NPU_REGS, .index = 0,
1328 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
1329 		  .flags = NPU2_BAR_FLAG_ENABLED },
1330 		{ .type = NPU_PHY, .index = 0,
1331 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
1332 		  .flags = NPU2_BAR_FLAG_ENABLED },
1333 		{ .type = NPU_PHY, .index = 1,
1334 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
1335 		  .flags = NPU2_BAR_FLAG_ENABLED },
1336 		{ .type = NPU_NTL, .index = 0,
1337 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) },
1338 		{ .type = NPU_NTL, .index = 1,
1339 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) },
1340 		{ .type = NPU_NTL, .index = 2,
1341 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) },
1342 		{ .type = NPU_NTL, .index = 3,
1343 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) },
1344 		{ .type = NPU_NTL, .index = 4,
1345 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) },
1346 		{ .type = NPU_NTL, .index = 5,
1347 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) },
1348 		{ .type = NPU_GENID, .index = 0,
1349 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) },
1350 		{ .type = NPU_GENID, .index = 1,
1351 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) },
1352 		{ .type = NPU_GENID, .index = 2,
1353 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) },
1354 	};
1355 
1356 	for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
1357 		bar = &npu2_bars[i];
1358 		npu2_get_bar(gcid, bar);
1359 		npu2_write_bar(NULL, bar, gcid, scom);
1360 	}
1361 
1362 	/* Global MMIO BAR */
1363 	reg[0] = npu2_bars[0].base;
1364 	reg[1] = npu2_bars[0].size;
1365 
	/* NTL and GENID BARs are exposed to the kernel via the mm
	 * window */
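	/* That is, mm_win spans from the base of the first NTL BAR
	 * (npu2_bars[3]) to the end of the last GENID BAR, including any
	 * gaps the physical memory map leaves between individual BARs. */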
1368 	mm_win[0] = npu2_bars[3].base;
1369 	mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base +
1370 		    npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size -
1371 		    mm_win[0];
1372 }
1373 
1374 /*
1375  * Set up NPU for NVLink and create PCI root device node
1376  * accordingly.
1377  */
int npu2_nvlink_init_npu(struct npu2 *npu)
1379 {
1380 	struct dt_node *np;
1381 	uint64_t reg[2], mm_win[2], val, mask;
1382 
1383 	/* TODO: Clean this up with register names, etc. when we get
1384 	 * time. This just turns NVLink mode on in each brick and should
1385 	 * get replaced with a patch from ajd once we've worked out how
1386 	 * things are going to work there.
1387 	 *
1388 	 * Obviously if the year is now 2020 that didn't happen and you
1389 	 * should fix this :-) */
1390 
1391 	val = PPC_BIT(58);
1392 	mask = PPC_BIT(58) | /* CONFIG_NVLINK_MODE */
1393 	       PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
1394 
1395 	/*
	 * V100 GPUs are known to violate the NVLink2 protocol if some GPU memory
	 * mapped by a CPU was also "linear-block" mapped by a GPU. When this
	 * happens, it breaks the NPU2 cache coherency state machine and
	 * throws a machine checkstop. Disabling snarfing fixes this, so let's
	 * disable it by default.
1401 	 */
1402 	if (nvram_query_eq_dangerous("opal-npu2-snarf-cpm", "enable")) {
1403 		prlog(PR_WARNING, "NPU2#%d: enabling Probe.I.MO snarfing, a bad GPU driver may crash the system!\n",
1404 				npu->index);
1405 		val |= PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
1406 	}
1407 
1408 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM0_MISC_CONFIG0,
1409 			 val, mask);
1410 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM1_MISC_CONFIG0,
1411 			 val, mask);
1412 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM2_MISC_CONFIG0,
1413 			 val, mask);
1414 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM3_MISC_CONFIG0,
1415 			 val, mask);
1416 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM0_MISC_CONFIG0,
1417 			 val, mask);
1418 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM1_MISC_CONFIG0,
1419 			 val, mask);
1420 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM2_MISC_CONFIG0,
1421 			 val, mask);
1422 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM3_MISC_CONFIG0,
1423 			 val, mask);
1424 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM0_MISC_CONFIG0,
1425 			 val, mask);
1426 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM1_MISC_CONFIG0,
1427 			 val, mask);
1428 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM2_MISC_CONFIG0,
1429 			 val, mask);
1430 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM3_MISC_CONFIG0,
1431 			 val, mask);
1432 
1433 	xscom_write_mask(npu->chip_id, 0x50110c0, PPC_BIT(53), PPC_BIT(53));
1434 	xscom_write_mask(npu->chip_id, 0x50112c0, PPC_BIT(53), PPC_BIT(53));
1435 	xscom_write_mask(npu->chip_id, 0x50114c0, PPC_BIT(53), PPC_BIT(53));
1436 	xscom_write_mask(npu->chip_id, 0x50110f1, PPC_BIT(41), PPC_BIT(41));
1437 	xscom_write_mask(npu->chip_id, 0x50112f1, PPC_BIT(41), PPC_BIT(41));
1438 	xscom_write_mask(npu->chip_id, 0x50114f1, PPC_BIT(41), PPC_BIT(41));
1439 
1440 	val = NPU2_NTL_MISC_CFG2_BRICK_ENABLE |
1441 	      NPU2_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
1442 	      NPU2_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
1443 	      NPU2_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
1444 	xscom_write_mask(npu->chip_id, 0x5011110, val, val);
1445 	xscom_write_mask(npu->chip_id, 0x5011130, val, val);
1446 	xscom_write_mask(npu->chip_id, 0x5011310, val, val);
1447 	xscom_write_mask(npu->chip_id, 0x5011330, val, val);
1448 	xscom_write_mask(npu->chip_id, 0x5011510, val, val);
1449 	xscom_write_mask(npu->chip_id, 0x5011530, val, val);
1450 
1451 	val = PPC_BIT(6) | PPC_BIT(7) | PPC_BIT(11);
1452 	xscom_write_mask(npu->chip_id, 0x5011009, val, PPC_BITMASK(6,11));
1453 	xscom_write_mask(npu->chip_id, 0x5011039, val, PPC_BITMASK(6,11));
1454 	xscom_write_mask(npu->chip_id, 0x5011069, val, PPC_BITMASK(6,11));
1455 	xscom_write_mask(npu->chip_id, 0x5011099, val, PPC_BITMASK(6,11));
1456 	xscom_write_mask(npu->chip_id, 0x5011209, val, PPC_BITMASK(6,11));
1457 	xscom_write_mask(npu->chip_id, 0x5011239, val, PPC_BITMASK(6,11));
1458 	xscom_write_mask(npu->chip_id, 0x5011269, val, PPC_BITMASK(6,11));
1459 	xscom_write_mask(npu->chip_id, 0x5011299, val, PPC_BITMASK(6,11));
1460 	xscom_write_mask(npu->chip_id, 0x5011409, val, PPC_BITMASK(6,11));
1461 	xscom_write_mask(npu->chip_id, 0x5011439, val, PPC_BITMASK(6,11));
1462 	xscom_write_mask(npu->chip_id, 0x5011469, val, PPC_BITMASK(6,11));
1463 	xscom_write_mask(npu->chip_id, 0x5011499, val, PPC_BITMASK(6,11));
1464 
1465 	/* Reassign the BARs */
1466 	assign_mmio_bars(npu->chip_id, npu->xscom_base, reg, mm_win);
1467 	npu->regs = (void *)reg[0];
1468 	npu->mm_base = mm_win[0];
1469 	npu->mm_size = mm_win[1];
1470 
1471 	if (reg[0] && reg[1])
1472 		prlog(PR_INFO, "   Global MMIO BAR:  %016llx (%lldMB)\n",
1473 		      reg[0], reg[1] >> 20);
1474 	else
1475 		prlog(PR_ERR, "    Global MMIO BAR: Disabled\n");
1476 
1477 	/* Populate PCI root device node */
1478 	np = dt_new_addr(dt_root, "pciex", reg[0]);
1479 	assert(np);
1480 	dt_add_property_strings(np,
1481 				"compatible",
1482 				"ibm,power9-npu-pciex",
1483 				"ibm,ioda2-npu2-phb");
1484 	dt_add_property_strings(np, "device_type", "pciex");
1485 	dt_add_property(np, "reg", reg, sizeof(reg));
1486 	dt_add_property_cells(np, "ibm,phb-index", npu2_get_phb_index(0));
1487 	dt_add_property_cells(np, "ibm,npu-index", npu->index);
1488 	dt_add_property_cells(np, "ibm,chip-id", npu->chip_id);
1489 	dt_add_property_cells(np, "ibm,xscom-base", npu->xscom_base);
1490 	dt_add_property_cells(np, "ibm,npcq", npu->dt_node->phandle);
1491 	dt_add_property_cells(np, "ibm,links", npu->total_devices);
1492 	dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
1493 	dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);
1494 
1495 	/* Disable fast reboot - not currently supported */
1496 	disable_fast_reboot("NVLink device enabled");
1497 
1498 	npu2_nvlink_create_phb(npu, np);
1499 
1500 	return 0;
1501 }
1502 
static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev,
1504 				       uint32_t start,
1505 				       uint32_t prev_cap)
1506 {
1507 	struct pci_virt_device *pvd = dev->nvlink.pvd;
1508 	uint32_t val;
1509 
1510 	/* Add capability list */
1511 	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
1512 	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
1513 
1514 	/* 0x00 - ID/PCIE capability */
1515 	val = PCI_CFG_CAP_ID_EXP;
1516 	val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
1517 	PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
1518 
1519 	/* 0x04 - Device capability
1520 	 *
	 * We should support FLR. Otherwise, there may be problems
	 * passing the device through to userland via the Linux
	 * VFIO infrastructure.
1524 	 */
1525 	val = ((PCIE_MPSS_128) |
1526 	       (PCIE_PHANTOM_NONE << 3) |
1527 	       (PCIE_L0SL_MAX_NO_LIMIT << 6) |
1528 	       (PCIE_L1L_MAX_NO_LIMIT << 9) |
1529 	       (PCICAP_EXP_DEVCAP_FUNC_RESET));
1530 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
1531 
1532 	pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
1533 			    PCI_REG_FLAG_WRITE,
1534 			    npu2_dev_cfg_exp_devcap, NULL);
1535 
1536 	/* 0x08 - Device control and status */
1537 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
1538 			  0xffff0000, 0x000f0000);
1539 
1540 	/* 0x0c - Link capability */
1541 	val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
1542 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
1543 
1544 	/* 0x10 - Link control and status */
1545 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
1546 			 0xfffff000, 0xc0000000);
1547 
1548 	/* 0x14 - Slot capability */
1549 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
1550 
1551 	/* 0x18 - Slot control and status */
1552 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
1553 
1554 	/* 0x1c - Root control and capability */
1555 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
1556 			  0xffffffe0, 0x00000000);
1557 
1558 	/* 0x20 - Root status */
1559 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
1560 			 0xffffffff, 0x00010000);
1561 
1562 	/* 0x24 - Device capability 2 */
1563 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
1564 
1565 	/* 0x28 - Device Control and status 2 */
1566 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
1567 			 0xffff0000, 0x00000000);
1568 
1569 	/* 0x2c - Link capability 2 */
1570 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
1571 
1572 	/* 0x30 - Link control and status 2 */
1573 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
1574 			 0xffff0000, 0x00200000);
1575 
1576 	/* 0x34 - Slot capability 2 */
1577 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
1578 
1579 	/* 0x38 - Slot control and status 2 */
1580 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
1581 
1582 	return start + PCICAP_EXP_SCTL2 + 8;
1583 }
1584 
static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev,
1586 					 uint32_t start,
1587 					 uint32_t prev_cap)
1588 {
1589 	struct pci_virt_device *pvd = dev->nvlink.pvd;
1590 
	/* Capability list */
1592 	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
1593 	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
1594 
1595 	/* Length and version */
1596 	PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
1597 	PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
1598 
1599 	/*
1600 	 * Defaults when the trap can't handle the read/write (e.g. due
1601 	 * to reading/writing less than 4 bytes).
1602 	 */
1603 	PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
1604 	PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
1605 
1606 	/* Add NVLink2 PHY procedures trap */
1607 	pci_virt_add_filter(pvd, start + 4, 8,
1608 			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1609 			    npu2_dev_procedure,
1610 			    NULL);
1611 
1612 	/* Link index */
1613 	PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->link_index);
1614 
1615 	return start + VENDOR_CAP_LEN;
1616 }
1617 
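/*
 * Build the emulated config space for an NVLink brick: a standard
 * type 0 header exposing the NTL BAR at BAR0/1 and the GENID BAR at
 * BAR2/3, plus the PCIe and vendor-specific capabilities built by
 * the helpers above.
 */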
1618 static void npu2_populate_cfg(struct npu2_dev *dev)
1619 {
1620 	struct pci_virt_device *pvd = dev->nvlink.pvd;
1621 	struct npu2_pcie_bar *bar;
1622 	uint32_t pos;
1623 
1624 	/* 0x00 - Vendor/Device ID */
1625 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
1626 
1627 	/* 0x04 - Command/Status */
1628 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
1629 			  0xf9000000);
1630 
1631 	pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
1632 			    npu2_cfg_write_cmd, NULL);
1633 
1634 	/* 0x08 - Rev/Class/Cache */
1635 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800101);
1636 
1637 	/* 0x0c - CLS/Latency Timer/Header/BIST */
1638 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
1639 
1640 	/* 0x10/14 - BAR#0, NTL BAR */
1641 	bar = &dev->bars[0];
1642 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
1643 			  (bar->npu2_bar.base & 0xfffffff0) | (bar->flags & 0xF),
1644 			  0x0000000f, 0x00000000);
1645 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->npu2_bar.base >> 32),
1646 			  0x00000000, 0x00000000);
1647 	pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
1648 			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1649 			    npu2_dev_cfg_bar, bar);
1650 
1651 	/* 0x18/1c - BAR#1, GENID BAR */
1652 	bar = &dev->bars[1];
1653 	if (NPU2DEV_BRICK(dev) == 0)
1654 		PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->npu2_bar.base & 0xfffffff0) |
1655 				  (bar->flags & 0xF),
1656 				  0x0000000f, 0x00000000);
1657 	else
1658 		/* Brick 1 gets the upper portion of the generation id register */
1659 		PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, ((bar->npu2_bar.base + 0x10000) & 0xfffffff0) |
1660 				  (bar->flags & 0xF),
1661 				  0x0000000f, 0x00000000);
1662 
1663 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->npu2_bar.base >> 32), 0x00000000,
1664 			  0x00000000);
1665 	pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
1666 			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1667 			    npu2_dev_cfg_bar, bar);
1668 
1669 	/* 0x20/0x24 - BARs, disabled */
1670 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
1671 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
1672 
1673 	/* 0x28 - Cardbus CIS pointer */
1674 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
1675 
1676 	/* 0x2c - Subsystem ID */
1677 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
1678 
1679 	/* 0x30 - ROM BAR, zero sized */
1680 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
1681 
1682 	/* 0x34 - PCI Capability */
1683 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
1684 
1685 	/* 0x38 - Reserved */
1686 	PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
1687 
1688 	/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
1689 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
1690 
1691 	/* PCIE and vendor specific capability */
1692 	pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
1693 	pos = npu2_populate_vendor_cap(dev, pos, 0x41);
1694 	PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
1695 }
1696 
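/*
 * Allocate a bdfn for a link: the group ID becomes the PCI device
 * number (bits 7:3) and every link already allocated in the same
 * group bumps the function number (bits 2:0). For example, the
 * second link in group 1 ends up as bdfn 0x09 (device 1, function 1).
 */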
1697 static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group)
1698 {
1699 	int i;
1700 	int bdfn = (group << 3);
1701 
1702 	for (i = 0; i < p->total_devices; i++) {
1703 		if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8))
1704 			bdfn++;
1705 	}
1706 
1707 	return bdfn;
1708 }
1709 
1710 static void npu2_populate_devices(struct npu2 *p,
1711 				  struct dt_node *dn)
1712 {
1713 	struct npu2_dev *dev;
1714 	struct dt_node *npu2_dn, *link;
1715 	uint32_t npu_phandle, index = 0;
1716 	int stack;
1717 
1718 	/*
1719 	 * Get the npu node that holds the links, which we expand here
1720 	 * into PCI-like devices attached to our emulated PHB.
1721 	 */
1722 	npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
1723 	npu2_dn = dt_find_by_phandle(dt_root, npu_phandle);
1724 	assert(npu2_dn);
1725 
1726 	/* Walk the link@x nodes to initialize devices */
1727 	p->total_devices = 0;
1728 	p->phb_nvlink.scan_map = 0;
1729 	dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
1730 		uint32_t group_id;
1731 		struct npu2_bar *npu2_bar;
1732 
1733 		dev = &p->devices[index];
1734 		dev->type = NPU2_DEV_TYPE_NVLINK;
1735 		dev->npu = p;
1736 		dev->dt_node = link;
1737 		dev->link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
1738 		dev->brick_index = dev->link_index;
1739 
1740 		group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
1741 		dev->bdfn = npu_allocate_bdfn(p, group_id);
1742 
1743 		/* This must be done after calling npu_allocate_bdfn(),
1744 		 * which only scans the devices counted so far. */
1745 		p->total_devices++;
1746 		p->phb_nvlink.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
1747 
1748 		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
1749 		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
1750 
1751 		/* Populate BARs. BAR0/1 is the NTL bar. */
1752 		stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev);
1753 		npu2_bar = &dev->bars[0].npu2_bar;
1754 		npu2_bar->type = NPU_NTL;
1755 		npu2_bar->index = dev->brick_index;
1756 		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ?
1757 						NPU2_NTL0_BAR : NPU2_NTL1_BAR);
1758 		npu2_get_bar(p->chip_id, npu2_bar);
1759 
1760 		dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
1761 
1762 		/* BAR2/3 is the GENID bar. */
1763 		npu2_bar = &dev->bars[1].npu2_bar;
1764 		npu2_bar->type = NPU_GENID;
1765 		npu2_bar->index = NPU2DEV_STACK(dev);
1766 		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR);
1767 		npu2_get_bar(p->chip_id, npu2_bar);
1768 
1769 		/* The GENID is a single physical BAR that we split
1770 		 * for each emulated device */
1771 		npu2_bar->size = 0x10000;
1772 		if (NPU2DEV_BRICK(dev))
1773 			npu2_bar->base += 0x10000;
1774 		dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
1775 
1776 		/* Initialize PCI virtual device */
1777 		dev->nvlink.pvd = pci_virt_add_device(&p->phb_nvlink, dev->bdfn, 0x100, dev);
1778 		if (dev->nvlink.pvd)
1779 			npu2_populate_cfg(dev);
1780 
1781 		index++;
1782 	}
1783 }
1784 
1785 static void npu2_add_interrupt_map(struct npu2 *p,
1786 				  struct dt_node *dn)
1787 {
1788 	struct dt_node *npu2_dn, *link, *phb_dn;
1789 	uint32_t npu2_phandle, index = 0, i;
1790 	uint32_t icsp = get_ics_phandle();
1791 	uint32_t *map;
1792 	size_t map_size;
1793 	uint32_t mask[] = {0xff00, 0x0, 0x0, 0x7};
1794 
1795 	assert(p->phb_nvlink.dt_node);
1796 	phb_dn = p->phb_nvlink.dt_node;
1797 
1798 	npu2_phandle = dt_prop_get_u32(dn, "ibm,npcq");
1799 	npu2_dn = dt_find_by_phandle(dt_root, npu2_phandle);
1800 	assert(npu2_dn);
1801 	map_size = 7 * sizeof(*map) * p->total_devices;
1802 	map = malloc(map_size);
1803 	index = 0;
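	/*
	 * Each interrupt-map entry is 7 cells: a 3-cell child unit
	 * address (bdfn in the top byte of the first cell), the child
	 * interrupt pin (INT A), the interrupt parent phandle and a
	 * 2-cell parent specifier (LSI number and trigger sense).
	 */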
1804 	dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
1805 		i = index * 7;
1806 		map[i + 0] = (p->devices[index].bdfn << 8);
1807 		map[i + 1] = 0;
1808 		map[i + 2] = 0;
1809 
1810 		map[i + 3] = 1; /* INT A */
1811 		map[i + 4] = icsp; /* interrupt-parent */
1812 		map[i + 5] = p->base_lsi + (index * 2) + 1; /* NDL No-Stall Event */
1813 		map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL. */
1814 		index++;
1815 	}
1816 	dt_add_property(phb_dn, "interrupt-map", map, map_size);
1817 	free(map);
1818 	dt_add_property(phb_dn, "interrupt-map-mask", mask, sizeof(mask));
1819 }
1820 
1821 static void npu2_add_phb_properties(struct npu2 *p)
1822 {
1823 	struct dt_node *np = p->phb_nvlink.dt_node;
1824 	uint32_t icsp = get_ics_phandle();
1825 	uint64_t mm_base, mm_size;
1826 
1827 	/*
1828 	 * Add various properties that HB (hostboot) doesn't have to
1829 	 * add, some of them simply because they result from
1830 	 * policy decisions made in skiboot rather than in HB,
1831 	 * such as the MMIO windows going to PCI, interrupts,
1832 	 * etc.
1833 	 */
1834 	dt_add_property_cells(np, "#address-cells", 3);
1835 	dt_add_property_cells(np, "#size-cells", 2);
1836 	dt_add_property_cells(np, "#interrupt-cells", 1);
1837 	dt_add_property_cells(np, "bus-range", 0, 0xff);
1838 	dt_add_property_cells(np, "clock-frequency", 0x200, 0);
1839 	dt_add_property_cells(np, "interrupt-parent", icsp);
1840 
1841 	/* NPU2 PHB properties */
1842 	dt_add_property_cells(np, "ibm,opal-num-pes",
1843 			      NPU2_MAX_PE_NUM);
1844 	dt_add_property_cells(np, "ibm,opal-reserved-pe",
1845 			      NPU2_RESERVED_PE_NUM);
1846 	dt_add_property_cells(np, "ibm,supported-tce-sizes",
1847 			      12, // 4K
1848 			      16, // 64K
1849 			      24, // 16M
1850 			      28); // 256M
1851 
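	/*
	 * Addresses of the eight MMIO ATSD (address translation
	 * shootdown) launch registers, exposed so the OS can launch
	 * shootdowns against the NPU directly.
	 */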
1852 	dt_add_property_u64s(np, "ibm,mmio-atsd",
1853 			MMIO_ATSD_ADDR(p->regs, 0),
1854 			MMIO_ATSD_ADDR(p->regs, 1),
1855 			MMIO_ATSD_ADDR(p->regs, 2),
1856 			MMIO_ATSD_ADDR(p->regs, 3),
1857 			MMIO_ATSD_ADDR(p->regs, 4),
1858 			MMIO_ATSD_ADDR(p->regs, 5),
1859 			MMIO_ATSD_ADDR(p->regs, 6),
1860 			MMIO_ATSD_ADDR(p->regs, 7));
1861 
1862 	/*
1863 	 * The memory window is exposed as a 64-bit non-prefetchable
1864 	 * one because 64-bit prefetchable windows get special
1865 	 * treatment from the kernel.
1866 	 */
1867 	mm_base = p->mm_base;
1868 	mm_size = p->mm_size;
1869 	dt_add_property_cells(np, "ranges", 0x02000000,
1870 			      hi32(mm_base), lo32(mm_base),
1871 			      hi32(mm_base), lo32(mm_base),
1872 			      hi32(mm_size), lo32(mm_size));
1873 }
1874 
1875 void npu2_nvlink_create_phb(struct npu2 *npu, struct dt_node *dn)
1876 {
1877 	struct pci_slot *slot;
1878 
1879 	/* Generic PHB */
1880 	npu->phb_nvlink.dt_node = dn;
1881 	npu->phb_nvlink.ops = &npu_ops;
1882 	npu->phb_nvlink.phb_type = phb_type_npu_v2;
1883 	init_lock(&npu->lock);
1884 	init_lock(&npu->phb_nvlink.lock);
1885 	list_head_init(&npu->phb_nvlink.devices);
1886 	list_head_init(&npu->phb_nvlink.virt_devices);
1887 
1888 	npu2_populate_devices(npu, dn);
1889 	npu2_add_interrupt_map(npu, dn);
1890 	npu2_add_phb_properties(npu);
1891 
1892 	slot = npu2_slot_create(&npu->phb_nvlink);
1893 	if (!slot)
1894 	{
1895 		/**
1896 		 * @fwts-label NPUCannotCreatePHBSlot
1897 		 * @fwts-advice Firmware probably ran out of memory creating
1898 		 * NPU2 slot. NVLink functionality could be broken.
1899 		 */
1900 		prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
1901 	}
1902 
1903 	pci_register_phb(&npu->phb_nvlink, OPAL_DYNAMIC_PHB_ID);
1904 
1905 	npu2_init_ioda_cache(npu);
1906 	npu2_hw_init(npu);
1907 }
1908 
1909 /*
1910  * Search a table for an entry whose value matches *value under mask.
1911  * Returns the matching index (or -1 if none) and the full entry in *value.
1912  */
1913 static int npu_table_search(struct npu2 *p, uint64_t table_addr, int stride,
1914 			    int table_size, uint64_t *value, uint64_t mask)
1915 {
1916 	int i;
1917 	uint64_t val;
1918 
1919 	assert(value);
1920 
1921 	for (i = 0; i < table_size; i++) {
1922 		val = npu2_read(p, table_addr + i*stride);
1923 		if ((val & mask) == *value) {
1924 			*value = val;
1925 			return i;
1926 		}
1927 	}
1928 
1929 	return -1;
1930 }
1931 
1932 /*
1933  * Allocate a context ID and initialise the tables with the relevant
1934  * information. Returns the ID on success, or an error if one couldn't
1935  * be allocated.
1936  */
1937 #define NPU2_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
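/*
 * Presumably reached from the host kernel via the OPAL_NPU_INIT_CONTEXT
 * call, dispatched to this PHB from outside this file; the matching
 * teardown path is npu2_destroy_context() below.
 */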
1938 int64_t npu2_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
1939 {
1940 	struct npu2 *p;
1941 	uint64_t xts_bdf, old_xts_bdf_pid, xts_bdf_pid;
1942 	int id;
1943 
1944 	/*
1945 	 * MSR bits should be masked by the caller to allow for future
1946 	 * expansion if required.
1947 	 */
1948 	if (msr & ~NPU2_VALID_ATS_MSR_BITS)
1949 		return OPAL_UNSUPPORTED;
1950 
1951 	/*
1952 	 * Need to get the LPARSHORT assigned to this BDF by npu2_map_lpar().
1953 	 */
1954 	p = phb_to_npu2_nvlink(phb);
1955 	lock(&p->lock);
1956 	xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
1957 	if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
1958 			     &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
1959 		NPU2ERR(p, "LPARID not associated with any GPU\n");
1960 		id = OPAL_PARAMETER;
1961 		goto out;
1962 	}
1963 
1964 	id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
1965 	NPU2DBG(p, "Found LPARSHORT = 0x%x for BDF = 0x%03llx\n", id, bdf);
1966 
1967 	/* Enable this mapping for both real and virtual addresses */
1968 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA0, 0UL, 1);
1969 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA1, xts_bdf_pid, 1);
1970 
1971 	/* Enables TLBIE/MMIOSD forwarding for this entry */
1972 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATSD, xts_bdf_pid, 1);
1973 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_LPARSHORT, xts_bdf_pid, id);
1974 
1975 	/* Set the relevant MSR bits */
1976 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_DR, xts_bdf_pid,
1977 			       !!(msr & MSR_DR));
1978 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_HV, xts_bdf_pid,
1979 			       !!(msr & MSR_HV));
1980 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_PR, xts_bdf_pid,
1981 			       !!(msr & MSR_PR));
1982 
1983 	/* We don't support anything other than 64-bit so we can safely hardcode
1984 	 * it here */
1985 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_SF, xts_bdf_pid, 1);
1986 
1987 	/*
1988 	 * Throw an error if the wildcard entry for this bdf is already set
1989 	 * with different msr bits.
1990 	 */
1991 	old_xts_bdf_pid = npu2_read(p, NPU2_XTS_PID_MAP + id*0x20);
1992 	if (old_xts_bdf_pid) {
1993 		if (GETFIELD(NPU2_XTS_PID_MAP_MSR, old_xts_bdf_pid) !=
1994 		    GETFIELD(NPU2_XTS_PID_MAP_MSR, xts_bdf_pid)) {
1995 			NPU2ERR(p, "%s: Unexpected MSR value\n", __func__);
1996 			id = OPAL_PARAMETER;
1997 			goto out;
1998 		} else if (!p->ctx_ref[id]) {
1999 			NPU2ERR(p, "%s: Unexpected mapping\n", __func__);
2000 			id = OPAL_INTERNAL_ERROR;
2001 			goto out;
2002 		}
2003 	}
2004 
2005 	/* Write the entry */
2006 	if (!p->ctx_ref[id]) {
2007 		NPU2DBG(p, "XTS_PID_MAP[%03d] = 0x%08llx\n", id, xts_bdf_pid);
2008 		npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, xts_bdf_pid);
2009 
2010 		if (!GETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf)) {
2011 			xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf, 1);
2012 			npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf);
2013 		}
2014 	}
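	/*
	 * ctx_ref[] counts how many contexts share this wildcard
	 * entry; npu2_destroy_context() only clears the entry once
	 * the last reference is dropped.
	 */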
2015 	++p->ctx_ref[id];
2016 
2017 out:
2018 	unlock(&p->lock);
2019 	return id;
2020 }
2021 
2022 int64_t npu2_destroy_context(struct phb *phb, uint64_t bdf)
2023 {
2024 	struct npu2 *p;
2025 	uint64_t xts_bdf;
2026 	int rc = OPAL_PARAMETER, id;
2027 
2028 	p = phb_to_npu2_nvlink(phb);
2029 	lock(&p->lock);
2030 
2031 	/* Need to find lparshort for this bdf */
2032 	xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
2033 	if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2034 			     &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
2035 		NPU2ERR(p, "LPARID not associated with any GPU\n");
2036 	} else {
2037 		/*
2038 		 * The bdf/pid table contains wildcard entries and MSR bits
2039 		 * which we need to clear between switching a device from
2040 		 * a host to a guest or vice versa.
2041 		 */
2042 		id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
2043 		if (p->ctx_ref[id]) {
2044 			--p->ctx_ref[id];
2045 			if (!p->ctx_ref[id]) {
2046 				NPU2DBG(p, "XTS_PID_MAP[%03d] = 0 (destroy)\n",
2047 					id);
2048 				npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
2049 			}
2050 			rc = OPAL_SUCCESS;
2051 		}
2052 	}
2053 	unlock(&p->lock);
2054 	return rc;
2055 }
2056 
2057 /*
2058  * Map the given virtual bdf to lparid with given lpcr.
2059  */
2060 int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
2061 		      uint64_t lpcr)
2062 {
2063 	struct npu2 *p;
2064 	struct npu2_dev *ndev = NULL;
2065 	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
2066 	int i;
2067 	int id;
2068 	static uint64_t atsd_lpar_regs[] = {
2069 		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
2070 		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
2071 		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
2072 		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID
2073 	};
2074 
2075 	if (lpcr)
2076 		/* The LPCR bits are only required for hash based ATS,
2077 		 * which we don't currently support but may need to in
2078 		 * future. */
2079 		return OPAL_UNSUPPORTED;
2080 
2081 	p = phb_to_npu2_nvlink(phb);
2082 	lock(&p->lock);
2083 
2084 	/* Find any existing entries and update them */
2085 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0L, bdf);
2086 	id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2087 			      &xts_bdf_lpar, NPU2_XTS_BDF_MAP_BDF);
2088 	if (id < 0) {
2089 		/* No existing mapping found, find space for a new one */
2090 		xts_bdf_lpar = 0;
2091 		id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2092 				      &xts_bdf_lpar, -1UL);
2093 	}
2094 
2095 	if (id < 0) {
2096 		/* Unable to find a free mapping */
2097 		NPU2ERR(p, "No free XTS_BDF[] entry\n");
2098 		rc = OPAL_RESOURCE;
2099 		goto out;
2100 	}
2101 
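	/*
	 * The table index found (or allocated) above doubles as the
	 * LPARSHORT value that npu2_init_context() later looks up.
	 */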
2102 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_UNFILT, 0UL, 1);
2103 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, xts_bdf_lpar, bdf);
2104 
2105 	/* We only support radix for the moment */
2106 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_XLAT, xts_bdf_lpar, 0x3);
2107 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
2108 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
2109 
2110 	/* Need to find an NVLink to send the ATSDs for this device over */
2111 	for (i = 0; i < p->total_devices; i++) {
2112 		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
2113 			ndev = &p->devices[i];
2114 			break;
2115 		}
2116 	}
2117 
2118 	if (!ndev) {
2119 		NPU2ERR(p, "Unable to find nvlink for bdf %llx\n", bdf);
2120 		rc = OPAL_PARAMETER;
2121 		goto out;
2122 	}
2123 
2124 	/*
2125 	 * We need to allocate an ATSD per NVLink bridge if possible;
2126 	 * use the ibm,npu-link-index property for that.
2127 	 */
2128 	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
2129 	if (!lparid)
2130 		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
2131 
2132 	if (ndev->link_index < ARRAY_SIZE(atsd_lpar_regs))
2133 		npu2_write(p, atsd_lpar_regs[ndev->link_index], atsd_lpar);
2134 	else
2135 		NPU2ERR(p, "Unable to assign ATSD for link index %u\n",
2136 				ndev->link_index);
2137 
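	/*
	 * The stack select is a one-hot field: bricks 0/1 live on
	 * stack 0 (0b100), bricks 2/3 on stack 1 (0b010) and bricks
	 * 4/5 on stack 2 (0b001); the brick field then picks the odd
	 * or even brick within that stack.
	 */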
2138 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_STACK, xts_bdf_lpar,
2139 				0x4 >> (ndev->brick_index / 2));
2140 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BRICK, xts_bdf_lpar,
2141 				(ndev->brick_index % 2));
2142 
2143 	NPU2DBG(p, "XTS_BDF_MAP[%03d] = 0x%08llx\n", id, xts_bdf_lpar);
2144 	npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf_lpar);
2145 
2146 	/* Reset wildcard in the PID map and the refcounter */
2147 	if (npu2_read(p, NPU2_XTS_PID_MAP + id*0x20) || p->ctx_ref[id]) {
2148 		prlog(PR_INFO, "Resetting PID MAP for LPID %lld\n", lparid);
2149 		p->ctx_ref[id] = 0;
2150 		npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
2151 	}
2152 
2153 out:
2154 	unlock(&p->lock);
2155 	return rc;
2156 }
2157 
2158 static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
2159 {
2160 	if (gcid & ~0x1b)
2161 		return OPAL_PARAMETER;
2162 
2163 	/* Repack 0bGGGGCCC to 0bGGCC */
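	/* e.g. gcid 0b11010 (group 3, chip 2) repacks to 0b1110 */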
2164 	return ((gcid & 0x18) >> 1) | (gcid & 0x3);
2165 }
2166 
2167 static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
2168 {
2169 	uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));
2170 
2171 	return npu2_read(ndev->npu, reg);
2172 }
2173 
2174 static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
2175 					    uint64_t val)
2176 {
2177 	uint64_t reg;
2178 	int sm;
2179 
2180 	/* Set every register on our stack */
2181 	for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
2182 		reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
2183 		npu2_write(ndev->npu, reg, val);
2184 	}
2185 }
2186 
2187 /*
2188  * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
2189  * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
2190  * Returns 0 if unset.
2191  */
2192 static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
2193 						  int pec)
2194 {
2195 	uint32_t src, grpchp;
2196 	uint64_t mask;
2197 	int i;
2198 
2199 	for (i = 0; i < 2; i++) {
2200 		mask = NPU2_RELAXED_ORDERING_SOURCE(i);
2201 		src = GETFIELD(mask, val);
2202 
2203 		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
2204 			continue;
2205 
2206 		if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
2207 			continue;
2208 
2209 		grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
2210 		if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
2211 			return mask;
2212 
2213 		if (grpchp == 0xf) /* match all */
2214 			return mask;
2215 	}
2216 
2217 	return 0;
2218 }
2219 
2220 static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
2221 					int pec)
2222 {
2223 	uint64_t val, mask;
2224 	uint32_t src;
2225 	int rc = OPAL_RESOURCE;
2226 	int i;
2227 
2228 	NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
2229 	lock(&ndev->npu->lock);
2230 
2231 	for (i = 0; i < 2; i++) {
2232 		val = npu2_relaxed_ordering_cfg_read(ndev, i);
2233 		if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
2234 			continue;
2235 
2236 		/* Already enabled */
2237 		rc = OPAL_SUCCESS;
2238 		goto out;
2239 	}
2240 
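	/*
	 * Build a SOURCE entry enabling relaxed ordering for both
	 * reads and writes from this chip/PEC, with the write range
	 * set to 0-23 and the read range to 0-47.
	 */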
2241 	src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
2242 	      NPU2_RELAXED_ORDERING_SOURCE_RDENA;
2243 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
2244 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
2245 		       npu2_relaxed_ordering_source_grpchp(gcid));
2246 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
2247 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
2248 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
2249 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);
2250 
2251 	/* Find somewhere to write this config */
2252 	for (i = 0; i < 2; i++) {
2253 		val = npu2_relaxed_ordering_cfg_read(ndev, i);
2254 
2255 		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
2256 			mask = NPU2_RELAXED_ORDERING_SOURCE(0);
2257 		else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
2258 			mask = NPU2_RELAXED_ORDERING_SOURCE(1);
2259 		else
2260 			continue;
2261 
2262 		val = SETFIELD(mask, val, src);
2263 		npu2_relaxed_ordering_cfg_write(ndev, i, val);
2264 
2265 		rc = OPAL_SUCCESS;
2266 		break;
2267 	}
2268 
2269 out:
2270 	unlock(&ndev->npu->lock);
2271 	return rc;
2272 }
2273 
2274 static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
2275 					  int pec)
2276 {
2277 	uint64_t val, mask;
2278 	int i;
2279 
2280 	NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
2281 	lock(&ndev->npu->lock);
2282 
2283 	for (i = 0; i < 2; i++) {
2284 		val = npu2_relaxed_ordering_cfg_read(ndev, i);
2285 
2286 		mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
2287 		if (!mask)
2288 			continue;
2289 
2290 		val = SETFIELD(mask, val, 0);
2291 		npu2_relaxed_ordering_cfg_write(ndev, i, val);
2292 	}
2293 
2294 	unlock(&ndev->npu->lock);
2295 }
2296 
2297 /*
2298  * Enable or disable relaxed ordering on all nvlinks for a given PEC. May leave
2299  * relaxed ordering partially enabled if there are insufficient HW resources to
2300  * enable it on all links.
2301  */
2302 int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
2303 			       bool enable)
2304 {
2305 	struct npu2 *npu = phb_to_npu2_nvlink(phb);
2306 	struct npu2_dev *ndev;
2307 	int64_t rc = OPAL_SUCCESS;
2308 
2309 	for (int i = 0; i < npu->total_devices; i++) {
2310 		ndev = &npu->devices[i];
2311 		if (enable)
2312 			rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
2313 		else
2314 			npu2_disable_relaxed_ordering(ndev, gcid, pec);
2315 
2316 		if (rc != OPAL_SUCCESS) {
2317 			NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
2318 			return OPAL_RESOURCE;
2319 		}
2320 	}
2321 
2322 	return OPAL_SUCCESS;
2323 }
2324