// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
/*
 * NPU - NVlink and OpenCAPI
 *
 * Copyright 2013-2019 IBM Corp.
 */

#include <skiboot.h>
#include <io.h>
#include <timebase.h>
#include <pci-cfg.h>
#include <pci.h>
#include <pci-slot.h>
#include <pci-virt.h>
#include <opal.h>
#include <opal-api.h>
#include <cpu.h>
#include <device.h>
#include <ccan/str/str.h>
#include <ccan/array_size/array_size.h>
#include <affinity.h>
#include <npu2.h>
#include <lock.h>
#include <xscom.h>
#include <bitutils.h>
#include <chip.h>
#include <phys-map.h>
#include <nvram.h>
#include <xscom-p9-regs.h>
#include <phb4.h>
#include <cache-p9.h>

#define VENDOR_CAP_START          0x80
#define VENDOR_CAP_END            0x90
#define VENDOR_CAP_LEN            0x10
#define VENDOR_CAP_VERSION        0x01
#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d

/*
 * NPU2 BAR layout definition. We have 3 stacks and each of them
 * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2
 * PHY BARs and each of them is shared by 3 bricks. Every brick has
 * one NTL BAR and two bricks share one GENID BAR. There is also a
 * global MMIO BAR. We only expose DL and GENID BARs to the OS and all
 * other BARs will be hidden in skiboot.
 *
 * Before the global MMIO BAR is configured, scom is the only way to
 * access the BAR registers. At NPU2 PHB probing time, we rely on scom
 * to assign all BARs until the global MMIO BAR is established.
 *
 * We need to access 4 SM registers in the same stack in order to
 * configure one particular BAR.
 */

/* Set a specific flag in the vendor config space */
void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag)
{
        ndev->nvlink.link_flags |= flag;
        PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
                             VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
}

void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag)
{
        ndev->nvlink.link_flags &= ~flag;
        PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
                             VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
}

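/*
 * Select which IODA table (and starting index) subsequent accesses to
 * NPU2_ATS_IODA_DATA will hit. With autoinc set, the hardware advances
 * the index after every data register access.
 */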
static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table,
                                 uint32_t index, bool autoinc)
{
        out_be64(p->regs + NPU2_ATS_IODA_TBL,
                 (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul) |
                 SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table) |
                 SETFIELD(NPU2_ATS_IODA_TBL_INDEX, 0ul, index));
}

static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p,
                                        uint32_t bdfn)
{
        struct pci_virt_device *pvd;

        /* All emulated devices are attached to root bus */
        if (bdfn & ~0xff)
                return NULL;

        pvd = pci_virt_find_device(&p->phb_nvlink, bdfn);
        if (pvd)
                return pvd->data;

        return NULL;
}

static inline void npu2_get_bar(uint32_t gcid, struct npu2_bar *bar)
{
        phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size);
}

static void npu2_read_bar(struct npu2 *p, struct npu2_bar *bar)
{
        uint64_t reg, val;
        int enabled;

        reg = NPU2_REG_OFFSET(0, NPU2_BLOCK_SM_0, bar->reg);
        val = npu2_read(p, reg);

        switch (NPU2_REG(bar->reg)) {
        case NPU2_PHY_BAR:
                bar->base = GETFIELD(NPU2_PHY_BAR_ADDR, val) << 21;
                enabled = GETFIELD(NPU2_PHY_BAR_ENABLE, val);

                if (NPU2_REG_STACK(reg) == NPU2_STACK_STCK_2)
                        /* This is the global MMIO BAR */
                        bar->size = 0x1000000;
                else
                        bar->size = 0x200000;
                break;
        case NPU2_NTL0_BAR:
        case NPU2_NTL1_BAR:
                bar->base = GETFIELD(NPU2_NTL_BAR_ADDR, val) << 16;
                enabled = GETFIELD(NPU2_NTL_BAR_ENABLE, val);
                bar->size = 0x10000 << GETFIELD(NPU2_NTL_BAR_SIZE, val);
                break;
        case NPU2_GENID_BAR:
                bar->base = GETFIELD(NPU2_GENID_BAR_ADDR, val) << 16;
                enabled = GETFIELD(NPU2_GENID_BAR_ENABLE, val);
                bar->size = 0x20000;
                break;
        default:
                bar->base = 0ul;
                enabled = 0;
                bar->size = 0;
                break;
        }

        bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, bar->flags, enabled);
}

static void npu2_write_bar(struct npu2 *p,
                           struct npu2_bar *bar,
                           uint32_t gcid,
                           uint32_t scom)
{
        uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED);
        int block;

        switch (NPU2_REG(bar->reg)) {
        case NPU2_PHY_BAR:
                val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21);
                val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable);
                break;
        case NPU2_NTL0_BAR:
        case NPU2_NTL1_BAR:
                val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 16);
                val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable);
                val = SETFIELD(NPU2_NTL_BAR_SIZE, val, 1);
                break;
        case NPU2_GENID_BAR:
                val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 16);
                val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable);
                break;
        default:
                val = 0ul;
        }

        for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
                reg = NPU2_REG_OFFSET(0, block, bar->reg);
                if (p)
                        npu2_write(p, reg, val);
                else
                        npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val);
        }
}

/* Trap for PCI command (0x4) to enable or disable device's BARs */
static int64_t npu2_cfg_write_cmd(void *dev,
                                  struct pci_cfg_reg_filter *pcrf __unused,
                                  uint32_t offset, uint32_t size,
                                  uint32_t *data, bool write)
{
        struct pci_virt_device *pvd = dev;
        struct npu2_dev *ndev = pvd->data;
        struct npu2_bar *ntl_npu_bar, *genid_npu_bar;
        bool enabled;

        if (!write)
                return OPAL_PARTIAL;

        if (offset != PCI_CFG_CMD)
                return OPAL_PARAMETER;
        if (size != 1 && size != 2 && size != 4)
                return OPAL_PARAMETER;

        /*
         * Enable or disable NTL and GENID BAR. Two bricks share
         * one GENID BAR, which is exposed via the first brick.
         */
        enabled = !!(*data & PCI_CFG_CMD_MEM_EN);
        ntl_npu_bar = &ndev->bars[0].npu2_bar;
        genid_npu_bar = &ndev->bars[1].npu2_bar;

        ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled);
        npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0);

        /*
         * Enable/disable the GENID BAR. Two bricks share one GENID
         * BAR which is exposed via the first brick so we need to
         * track the enables separately.
         */
        if (NPU2DEV_BRICK(ndev))
                genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags,
                                                enabled);
        else
                genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags,
                                                enabled);

        /* Enable the BAR if either device requests it enabled, otherwise disable it */
        genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags,
                                        !!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 |
                                                                   NPU2_BAR_FLAG_ENABLED1)));
        npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0);

        return OPAL_PARTIAL;
}

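/*
 * Trap BAR accesses to emulate the standard PCI sizing protocol: a
 * config write of 0xffffffff flags the BAR as trapped so that the next
 * read returns the BAR size instead of the base address.
 */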
static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused,
                                 struct pci_cfg_reg_filter *pcrf,
                                 uint32_t offset, uint32_t size,
                                 uint32_t *data)
{
        struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;

        if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED))
                return OPAL_PARTIAL;

        if ((size != 4) ||
            (offset != pcrf->start && offset != pcrf->start + 4))
                return OPAL_PARAMETER;

        if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI)
                *data = bar->npu2_bar.size >> 32;
        else
                *data = bar->npu2_bar.size;
        bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI);

        return OPAL_SUCCESS;
}

static int64_t npu2_cfg_write_bar(struct npu2_dev *dev,
                                  struct pci_cfg_reg_filter *pcrf,
                                  uint32_t offset, uint32_t size,
                                  uint32_t data)
{
        struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
        struct npu2_bar old_bar, *npu2_bar = &bar->npu2_bar;

        if ((size != 4) ||
            (offset != pcrf->start && offset != pcrf->start + 4))
                return OPAL_PARAMETER;

        /* Return BAR size on next read */
        if (data == 0xffffffff) {
                bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED;
                if (offset == pcrf->start + 4)
                        bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI;

                return OPAL_SUCCESS;
        }

        if (offset == pcrf->start) {
                npu2_bar->base &= 0xffffffff00000000UL;
                npu2_bar->base |= (data & 0xfffffff0);
        } else {
                npu2_bar->base &= 0x00000000ffffffffUL;
                npu2_bar->base |= ((uint64_t)data << 32);

                if (NPU2_REG(npu2_bar->reg) == NPU2_GENID_BAR && NPU2DEV_BRICK(dev))
                        npu2_bar->base -= 0x10000;

                old_bar.reg = npu2_bar->reg;
                npu2_read_bar(dev->npu, &old_bar);

                /* Only allow changing the base address if the BAR is not enabled */
                if ((npu2_bar->flags & NPU2_BAR_FLAG_ENABLED) &&
                    (npu2_bar->base != old_bar.base)) {
                        npu2_bar->base = old_bar.base;
                        return OPAL_HARDWARE;
                }

                npu2_write_bar(dev->npu, &bar->npu2_bar, 0, 0);
        }

        /* To update the config cache */
        return OPAL_PARTIAL;
}

static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
                                uint32_t offset, uint32_t len, uint32_t *data,
                                bool write)
{
        struct pci_virt_device *pvd = dev;
        struct npu2_dev *ndev = (struct npu2_dev *) pvd->data;

        if (write)
                return npu2_cfg_write_bar(ndev, pcrf, offset, len, *data);

        return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
}

static int64_t npu2_dev_cfg_exp_devcap(void *dev,
                                       struct pci_cfg_reg_filter *pcrf __unused,
                                       uint32_t offset, uint32_t size,
                                       uint32_t *data, bool write)
{
        struct pci_virt_device *pvd = dev;
        struct npu2_dev *ndev = pvd->data;
        int rc;

        assert(write);

        if ((size != 2) || (offset & 1)) {
                /* Short config writes are not supported */
                prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
                      ndev->nvlink.phb->opal_id);
                return OPAL_PARAMETER;
        }

        if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
                npu2_dev_procedure_reset(ndev);

        rc = purge_l2_l3_caches();
        if (rc)
                return rc;

        return OPAL_PARTIAL;
}

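/*
 * Generate the fixed-width config space accessors required by struct
 * phb_ops. Each one simply proxies to the generic virtual config space
 * and truncates or widens the data to the requested size.
 */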
#define NPU2_CFG_READ(size, type)                                       \
static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn,      \
                                   uint32_t offset, type *data)         \
{                                                                       \
        uint32_t val;                                                   \
        int64_t ret;                                                    \
                                                                        \
        ret = pci_virt_cfg_read(phb, bdfn, offset,                      \
                                sizeof(*data), &val);                   \
        *data = (type)val;                                              \
        return ret;                                                     \
}
#define NPU2_CFG_WRITE(size, type)                                      \
static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn,     \
                                    uint32_t offset, type data)         \
{                                                                       \
        uint32_t val = data;                                            \
        int64_t ret;                                                    \
                                                                        \
        ret = pci_virt_cfg_write(phb, bdfn, offset,                     \
                                 sizeof(data), val);                    \
        return ret;                                                     \
}

NPU2_CFG_READ(8, u8);
NPU2_CFG_READ(16, u16);
NPU2_CFG_READ(32, u32);
NPU2_CFG_WRITE(8, u8);
NPU2_CFG_WRITE(16, u16);
NPU2_CFG_WRITE(32, u32);

static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
                                   struct pci_device *pd,
                                   void *data)
{
        struct npu2_dev *dev = data;
        struct dt_node *pci_dt_node;
        char *pcislot;

        /* Ignore non-nvidia PCI devices */
        if ((pd->vdid & 0xffff) != 0x10de)
                return 0;

        /* Find the PCI device's slot location */
        for (pci_dt_node = pd->dn;
             pci_dt_node && !dt_find_property(pci_dt_node, "ibm,loc-code");
             pci_dt_node = pci_dt_node->parent);

        if (!pci_dt_node)
                return 0;

        pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,loc-code");

        NPU2DEVDBG(dev, "Comparing GPU '%s' and NPU2 '%s'\n",
                   pcislot, dev->nvlink.slot_label);

        if (streq(pcislot, dev->nvlink.slot_label))
                return 1;

        return 0;
}

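/*
 * Secondary bus reset trap for the bridge above an NVLink-attached GPU:
 * reset every NVLink brick wired to that GPU before the GPU itself is
 * reset, otherwise the stale link state can raise HMIs.
 */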
static int64_t npu2_gpu_bridge_sec_bus_reset(void *dev,
                struct pci_cfg_reg_filter *pcrf __unused,
                uint32_t offset, uint32_t len,
                uint32_t *data, bool write)
{
        struct pci_device *pd = dev;
        struct pci_device *gpu;
        struct phb *npphb;
        struct npu2 *npu;
        struct dt_node *np;
        struct npu2_dev *ndev;
        int i;

        assert(write);

        if ((len != 2) || (offset & 1)) {
                /* Short config writes are not supported */
                PCIERR(pd->phb, pd->bdfn,
                       "Unsupported write to bridge control register\n");
                return OPAL_PARAMETER;
        }

        gpu = list_top(&pd->children, struct pci_device, link);
        if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) {
                int64_t rc;

                dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") {
                        npphb = pci_get_phb(dt_prop_get_cell(np,
                                        "ibm,opal-phbid", 1));
                        if (!npphb || npphb->phb_type != phb_type_npu_v2)
                                continue;

                        npu = phb_to_npu2_nvlink(npphb);
                        for (i = 0; i < npu->total_devices; ++i) {
                                ndev = &npu->devices[i];
                                if (ndev->nvlink.pd == gpu)
                                        npu2_dev_procedure_reset(ndev);
                        }
                }

                rc = purge_l2_l3_caches();
                if (rc)
                        return rc;
        }

        return OPAL_PARTIAL;
}

static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
{
        struct phb *phb;
        uint32_t i;

        if (dev->nvlink.pd)
                return;

        for (i = 0; i < 64; i++) {
                if (dev->npu->phb_nvlink.opal_id == i)
                        continue;

                phb = pci_get_phb(i);
                if (!phb)
                        continue;

                dev->nvlink.pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev);
                if (dev->nvlink.pd) {
                        dev->nvlink.phb = phb;
                        /* Found the device, set the bit in config space */
                        npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED);

                        /*
                         * We define a custom sec bus reset handler for a slot
                         * with an NVLink-connected GPU to prevent HMIs which
                         * will otherwise happen if we reset the GPU before
                         * resetting the NVLinks.
                         */
                        if (dev->nvlink.pd->parent &&
                            dev->nvlink.pd->parent->slot)
                                pci_add_cfg_reg_filter(dev->nvlink.pd->parent,
                                                PCI_CFG_BRCTL, 2,
                                                PCI_REG_FLAG_WRITE,
                                                npu2_gpu_bridge_sec_bus_reset);
                        return;
                }
        }

        NPU2DEVINF(dev, "No PCI device found for slot '%s'\n",
                   dev->nvlink.slot_label);
}

static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;

static void npu2_append_phandle(struct dt_node *dn,
                                u32 phandle)
{
        struct dt_property *prop;
        uint32_t *npu_phandles;
        size_t len;

        /*
         * Use a lock to make sure no one else has a reference to an
         * ibm,npu property (this assumes this is the only function
         * that holds a reference to it)
         */
        lock(&pci_npu_phandle_lock);

        /* This function shouldn't be called unless ibm,npu exists */
        prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1);

        /* Need to append to the properties */
        len = prop->len + sizeof(*npu_phandles);
        dt_resize_property(&prop, len);

        npu_phandles = (uint32_t *)prop->prop;
        npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle;
        unlock(&pci_npu_phandle_lock);
}

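/*
 * Create a device-tree memory node for GPU memory. The nodes get fake
 * chip IDs counting down from 255 so they stay distinct from real
 * chips, and a zero-sized "linux,usable-memory" keeps the OS from
 * using the region as normal RAM at boot.
 */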
static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size)
{
        struct dt_node *mem;
        static u32 chip_id = 255;

        mem = dt_find_by_name_addr(dt_root, "memory", addr);
        if (mem)
                return mem;

        mem = dt_new_addr(dt_root, "memory", addr);
        if (!mem)
                return NULL;
        dt_add_property_string(mem, "device_type", "memory");
        dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
        dt_add_property_u64s(mem, "reg", addr, size);
        dt_add_property_cells(mem, "ibm,chip-id", chip_id);
        dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
        dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id);
        chip_id--;

        assert(chip_id);
        return mem;
}

/* There are potentially multiple links per GPU, so look up the GPU memory
 * based on bdfn. */
static void npu2_get_gpu_base(struct npu2_dev *ndev, uint64_t *addr, uint64_t *size)
{
        struct npu2 *p = ndev->npu;
        int group;

        group = PCI_DEV(ndev->bdfn);
        phys_map_get(ndev->npu->chip_id, p->gpu_map_type, group, addr, size);
}

static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, struct npu2_dev *ndev)
{
        uint64_t gpu_base, gpu_size, gta;
        struct dt_node *mem_dn;

        npu2_get_gpu_base(ndev, &gpu_base, &gpu_size);
        mem_dn = npu2_create_memory_dn(gpu_base, gpu_size);
        assert(mem_dn);
        dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle);

        /* Coral mode address compression. This is documented in Figure 3.5
         * "P9->GPU RA Compression (Coral)" of the NPU2 workbook. */
        gta = ((gpu_base >> 42) & 0x1) << 42;
        gta |= ((gpu_base >> 45) & 0x3) << 43;
        gta |= ((gpu_base >> 49) & 0x3) << 45;
        gta |= gpu_base & ((1UL << 43) - 1);

        dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta);
}

static int npu2_assign_gmb(struct npu2_dev *ndev)
{
        struct npu2 *p = ndev->npu;
        int peers, mode;
        uint32_t bdfn;
        uint64_t base, size, reg, val, gmb;

        /* Need to work out the number of link peers. This amounts to
         * working out the maximum function number. So start at the
         * highest bdfn (fn = 6) and count back until we find an
         * npu2_dev. */
        for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP;
             PCI_FUNC(bdfn) != 0x7; bdfn = (bdfn & ~0x7) | (PCI_FUNC(bdfn) - 1))
                if (npu2_bdf_to_dev(p, bdfn))
                        break;
        peers = PCI_FUNC(bdfn);

        npu2_get_gpu_base(ndev, &base, &size);

        NPU2DBG(p, "Setting BAR region dt:%llx\n", base);
        val = SETFIELD(NPU2_MEM_BAR_EN, 0ULL, 1);
        val = SETFIELD(NPU2_MEM_BAR_SEL_MEM, val, base >> (63-14));
        val = SETFIELD(NPU2_MEM_BAR_GROUP, val, base >> (63-18));
        val = SETFIELD(NPU2_MEM_BAR_CHIP, val, base >> (63-21));
        val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, base >> (63-33));
        val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
        val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);

        /* We don't know how much memory the GPU has, so we may as well just
         * pass the whole aperture through at this point. */
        val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));

        switch (peers) {
        case 0:
                mode = 0;
                break;
        case 1:
                mode = 1;
                break;
        case 2:
                mode = 3;
                break;
        case 3:
                mode = 6;
                break;
        case 5:
                mode = 10;
                break;
        default:
                /* Hardware does not support this configuration */
                assert(0);
        }

        mode += PCI_FUNC(ndev->bdfn);
        val = SETFIELD(NPU2_MEM_BAR_MODE, val, mode);

        gmb = NPU2_GPU0_MEM_BAR;
        if (NPU2DEV_BRICK(ndev))
                gmb = NPU2_GPU1_MEM_BAR;

        reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
                              NPU2_BLOCK_SM_0, gmb);

        npu2_write(p, reg, val);
        reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
                              NPU2_BLOCK_SM_1, gmb);
        npu2_write(p, reg, val);
        reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
                              NPU2_BLOCK_SM_2, gmb);
        npu2_write(p, reg, val);
        reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
                              NPU2_BLOCK_SM_3, gmb);
        npu2_write(p, reg, val);

        return 0;
}

static int npu2_dn_fixup(struct phb *phb,
                         struct pci_device *pd,
                         void *data __unused)
{
        struct npu2 *p = phb_to_npu2_nvlink(phb);
        struct npu2_dev *dev;
        uint32_t speed;
        const char *label;

        dev = npu2_bdf_to_dev(p, pd->bdfn);
        assert(dev);
        if (dev->nvlink.phb || dev->nvlink.pd)
                return 0;

        npu2_assign_gmb(dev);
        npu2_dn_fixup_gmb(pd->dn, dev);
        dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);

        /*
         * NVLink supports multiple speeds and device drivers need to know what
         * speed has been set by firmware. Hostboot does the inits that set the
         * link speed and tells us via HDAT, and we need to copy that from the
         * link node.
         */
        speed = dt_prop_get_u32_def(dev->dt_node, "nvidia,link-speed", 0xff);
        if (speed != 0xff)
                dt_add_property_cells(pd->dn, "ibm,nvlink-speed", speed);

        /*
         * NPU2 devices have a slot label that indicates which GPU slot
         * this NPU is connected to. Add a location code to the NVlink
         * device node based on the slot label.
         */
        label = dt_prop_get_def(dev->dt_node, "ibm,slot-label", NULL);
        if (!label) {
                /**
                 * @fwts-label NPUNoPHBSlotLabel
                 * @fwts-advice No GPU/NPU2 slot information was found.
                 * NVLink2 functionality will not work.
                 */
                prlog(PR_ERR, "NPU: Cannot find GPU slot information\n");
                return 0;
        }
        dt_add_property_string(pd->dn, "ibm,loc-code", label);

        dev->nvlink.slot_label = label;

        /*
         * Bind the emulated PCI device with the real one, which can't
         * be done until the PCI devices are populated. Once the real
         * PCI device is identified, we also need to fix the device-tree
         * for it.
         */
        npu2_dev_bind_pci_dev(dev);
        if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
                if (dt_find_property(dev->nvlink.pd->dn, "ibm,npu"))
                        npu2_append_phandle(dev->nvlink.pd->dn, pd->dn->phandle);
                else
                        dt_add_property_cells(dev->nvlink.pd->dn, "ibm,npu", pd->dn->phandle);

                dt_add_property_cells(pd->dn, "ibm,gpu", dev->nvlink.pd->dn->phandle);
                dev->nvlink.gpu_bdfn = dev->nvlink.pd->bdfn;
        }

        return 0;
}

static int npu2_links_per_gpu(struct phb *phb,
                              struct pci_device *pd,
                              void *data)
{
        struct npu2 *p = phb_to_npu2_nvlink(phb);
        struct npu2_dev *dev;
        int *nlinks = (int *)data;

        dev = npu2_bdf_to_dev(p, pd->bdfn);
        assert(dev);

        if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
                const struct dt_property *prop;
                int n;

                /* The link count is the number of phandles in "ibm,npu" */
                prop = dt_find_property(dev->nvlink.pd->dn, "ibm,npu");
                if (!prop)
                        return 0;

                /* Count could vary by gpu, so find the max */
                n = prop->len / sizeof(uint32_t);
                if (n > *nlinks)
                        *nlinks = n;
        }

        return 0;
}

static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
{
        uint32_t gcid = dt_get_chip_id(dn);
        uint64_t val, mask;

        /*
         * MRBSP settings for 2- and 3-link GPU systems. These can improve
         * GPU peer-to-peer fully ordered write performance.
         */
        if (links_per_gpu == 3) {
                val = PPC_BIT(30) | PPC_BIT(34) | PPC_BIT(36) | PPC_BIT(37) |
                      PPC_BIT(44) | PPC_BIT(45);
                mask = PPC_BITMASK(28,39) | PPC_BITMASK(44,47);
        } else if (links_per_gpu == 2) {
                val = PPC_BIT(46) | PPC_BIT(47);
                mask = PPC_BITMASK(44,47);
        } else
                return;

        xscom_write_mask(gcid, 0x50110c0, val, mask);
        xscom_write_mask(gcid, 0x50112c0, val, mask);
        xscom_write_mask(gcid, 0x50114c0, val, mask);
}

static void npu2_phb_final_fixup(struct phb *phb)
{
        int links_per_gpu = 0;
        struct dt_node *np;

        pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);

        /*
         * Now that the emulated devices are bound to the real ones, we can
         * determine links_per_gpu and do some final init.
         */
        pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
        dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
                npu2_phb_fixup_scominit(np, links_per_gpu);
}

static void npu2_init_ioda_cache(struct npu2 *p)
{
        /* TVT */
        memset(p->tve_cache, 0, sizeof(p->tve_cache));
}

static int64_t npu2_ioda_reset(struct phb *phb, bool purge)
{
        struct npu2 *p = phb_to_npu2_nvlink(phb);
        uint32_t i;

        if (purge) {
                NPU2DBG(p, "Purging all IODA tables...\n");
                npu2_init_ioda_cache(p);
        }

        /* TVT */
        npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true);
        for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
                out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]);

        return OPAL_SUCCESS;
}

static void npu2_write_mcd(struct npu2 *p, uint64_t pcb_addr, uint64_t addr,
                           uint64_t size)
{
        uint64_t val;

        NPU2DBG(p, "Setting MCD addr:%llx\n", pcb_addr);
        assert(is_pow2(size));

        val = MCD_BANK_CN_VALID;
        val = SETFIELD(MCD_BANK_CN_SIZE, val, (size >> 25) - 1);
        val = SETFIELD(MCD_BANK_CN_ADDR, val, addr >> 25);
        xscom_write(p->chip_id, pcb_addr, val);
}

static void npu2_mcd_init(struct npu2 *p)
{
        int i;
        uint64_t size, addr, gpu_min_addr, gpu_max_addr, total_size;

        /* Init memory cache directory (MCD) registers. */
        phys_map_get(p->chip_id, p->gpu_map_type, NPU2_LINKS_PER_CHIP - 1,
                     &gpu_min_addr, NULL);
        phys_map_get(p->chip_id, p->gpu_map_type, 0, &gpu_max_addr, &size);
        gpu_max_addr += size;

        /* We assume GPU memory is contiguous from the first possible GPU to
         * the last and that the size is the same, so best to check that. */
        for (i = 0; i < NPU2_LINKS_PER_CHIP; i++) {
                uint64_t tmp;
                phys_map_get(p->chip_id, p->gpu_map_type, i, &addr, &tmp);
                assert((addr >= gpu_min_addr) && (addr + tmp <= gpu_max_addr));
                assert(tmp == size);
        }

        /* We have two MCDs, so if necessary we can split the region covered
         * across both if total_size is not a power of two. */
        total_size = gpu_max_addr - gpu_min_addr;
        size = 1ull << ilog2(total_size);

        /* Allocate the biggest chunk first as we assume gpu_max_addr has the
         * highest alignment. */
        addr = gpu_max_addr - size;
        npu2_write_mcd(p, MCD0_BANK0_CN3, addr, size);
        total_size -= size;
        if (total_size) {
                /* total_size was not a power of two, but the remainder should
                 * be if all GPUs were assigned the same size. */
                assert(is_pow2(total_size));
                size = 1ull << ilog2(total_size);
                addr -= size;
                assert(addr <= gpu_min_addr);
                npu2_write_mcd(p, MCD1_BANK0_CN3, addr, size);
        }
}

static void npu2_hw_init(struct npu2 *p)
{
        uint64_t reg, val;
        int s, b;

        npu2_ioda_reset(&p->phb_nvlink, false);

        /* Enable XTS retry mode */
        val = npu2_read(p, NPU2_XTS_CFG);
        npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_MMIOSD | NPU2_XTS_CFG_TRY_ATR_RO);

        val = npu2_read(p, NPU2_XTS_CFG2);
        npu2_write(p, NPU2_XTS_CFG2, val | NPU2_XTS_CFG2_NO_FLUSH_ENA);

        /*
         * There are three different ways we configure the MCD and memory map.
         * 1) Old way
         *    Skiboot configures the MCD and puts GPUs at 4TB and below
         * 2) New way with MCD
         *    Hostboot configures the MCD and skiboot puts GPU at 4TB and above
         * 3) New way without MCD
         *    No one configures the MCD and skiboot puts GPU at 4TB and below
         *
         * 1) Will go away eventually as it's a configuration that can
         *    cause an xstop or data integrity problems. We are keeping
         *    it around to support existing hostboot. Print error
         *    message if used.
         * 2) Is for smaller memory configurations and will be used
         *    initially for GPUs on Witherspoon. Supports only up to
         *    512GB of memory and 4 GPUs per socket.
         * 3) Is for fully populated configurations of 4TB of memory
         *    and 6 GPUs per socket. May have performance impacts.
         *
         * The different configurations can be detected via the following scoms:
         * 1) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 0
         * 2) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 7
         * 3) 0x5011c0c bit 2 = 0, 0x5011c0a bits 42:48 = 0
         */

        /* Get 0x05011c0c bit 2 = 1 */
        xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &val);
        if ((val & PB_CFG_CHG_RATE_GP_MASTER) != 0) {
                /* Get 0x05011c0a bits 42:48 */
                xscom_read(p->chip_id, PB_CENT_MODE, &val);
                if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0) {
                        /* 1) */
                        NPU2DBG(p, "Using old memory map + MCD enabled in skiboot\n");
                        NPU2ERR(p, "!!! Old firmware detected. Update hostboot for new MCD mapping !!!\n");
                        p->gpu_map_type = GPU_MEM_4T_DOWN;
                        npu2_mcd_init(p);
                } else if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 7) {
                        /* 2) */
                        NPU2DBG(p, "Using small memory map + MCD enabled\n");
                        p->gpu_map_type = GPU_MEM_4T_UP;
                } else
                        NPU2ERR(p, "!!! Unsupported NPU2 configuration. "
                                "0x%llx!!!\n", val);
        } else {
                /* 3) */
                NPU2DBG(p, "Using large memory map + MCD disabled\n");
                p->gpu_map_type = GPU_MEM_4T_DOWN;
        }

        /* Static initialization of every relaxed-ordering cfg[2] register */
        val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
              NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
              NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
              NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
              NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
              NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
              NPU2_RELAXED_ORDERING_SOURCE4_RDENA;

        for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
                for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
                        reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));
                        npu2_write(p, reg, val);
                }
        }
}

static int64_t npu2_map_pe_dma_window_real(struct phb *phb,
                                           uint64_t pe_num,
                                           uint16_t window_id,
                                           uint64_t pci_start_addr __unused,
                                           uint64_t pci_mem_size)
{
        struct npu2 *p = phb_to_npu2_nvlink(phb);
        uint64_t tve;

        /* Sanity check. Each PE has one corresponding TVE */
        if (pe_num >= NPU2_MAX_PE_NUM ||
            window_id != pe_num)
                return OPAL_PARAMETER;

        if (pci_mem_size) {
                /* GPUs need to be able to access the MMIO memory space as well.
                 * On POWER9 this is above the top of ram so disable the TVT
                 * range check allowing access to all memory addresses. */
                tve = 0;
        } else {
                /* Disable */
                tve = PPC_BIT(51);
        }

        npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
        out_be64(p->regs + NPU2_ATS_IODA_DATA, tve);
        p->tve_cache[window_id] = tve;

        return OPAL_SUCCESS;
}

static int64_t npu2_map_pe_dma_window(struct phb *phb,
                                      uint64_t pe_num,
                                      uint16_t window_id,
                                      uint16_t tce_levels,
                                      uint64_t tce_table_addr,
                                      uint64_t tce_table_size,
                                      uint64_t tce_page_size)
{
        struct npu2 *p = phb_to_npu2_nvlink(phb);
        uint64_t tts_encoded;
        uint64_t data64 = 0;

        /* Sanity check. Each PE has one corresponding TVE */
        if (pe_num >= NPU2_MAX_PE_NUM ||
            window_id != pe_num)
                return OPAL_PARAMETER;

        /*
         * Special condition, zero TCE table size used to disable
         * the TVE.
         */
        if (!tce_table_size) {
                npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
                out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul);
                p->tve_cache[window_id] = 0ul;
                return OPAL_SUCCESS;
        }

        /* Additional arguments validation */
        if (tce_levels < 1 ||
            tce_levels > 4 ||
            !is_pow2(tce_table_size) ||
            tce_table_size < 0x1000)
                return OPAL_PARAMETER;

        /* TCE table size */
        data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12);
        tts_encoded = ilog2(tce_table_size) - 11;
        if (tts_encoded > 39)
                return OPAL_PARAMETER;
        data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded);

        /* TCE page size */
        switch (tce_page_size) {
        case 0x10000:           /* 64K */
                data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5);
                break;
        case 0x1000000:         /* 16M */
                data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13);
                break;
        case 0x10000000:        /* 256M */
                data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17);
                break;
        case 0x1000:            /* 4K */
        default:
                data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1);
        }

        /* Number of levels */
        data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1);

        /* Update to hardware */
        npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
        out_be64(p->regs + NPU2_ATS_IODA_DATA, data64);
        p->tve_cache[window_id] = data64;

        return OPAL_SUCCESS;
}

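/*
 * Program the BDF-to-PE mapping for a brick. The same mapping is
 * written twice: once in the brick's CQ control block and once in the
 * MISC block, which keeps one BDF2PE register set per brick.
 */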
static int64_t npu2_set_pe(struct phb *phb,
                           uint64_t pe_num,
                           uint64_t bdfn,
                           uint8_t bcompare,
                           uint8_t dcompare,
                           uint8_t fcompare,
                           uint8_t action)
{
        struct npu2 *p;
        struct npu2_dev *dev;
        uint64_t reg, val;

        /* Sanity check */
        if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
                return OPAL_PARAMETER;
        if (pe_num >= NPU2_MAX_PE_NUM)
                return OPAL_PARAMETER;
        if (bdfn >> 8)
                return OPAL_PARAMETER;
        if (bcompare != OpalPciBusAll ||
            dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
            fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
                return OPAL_UNSUPPORTED;
        if (phb->phb_type != phb_type_npu_v2)
                return OPAL_PARAMETER;

        p = phb_to_npu2_nvlink(phb);
        if (!p)
                return OPAL_PARAMETER;

        dev = npu2_bdf_to_dev(p, bdfn);
        if (!dev)
                return OPAL_PARAMETER;

        val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
        val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num);
        val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);

        if (!NPU2DEV_BRICK(dev))
                reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
                                      NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0);
        else
                reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
                                      NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0);

        npu2_write(p, reg, val);
        val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
        val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num);
        val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
        reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
                              NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->brick_index * 0x18));
        npu2_write(p, reg, val);

        return OPAL_SUCCESS;
}

static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
{
        /*
         * As we're emulating all PCI stuff, the link bandwidth
         * isn't a big deal anyway.
         */
        *val = OPAL_SHPC_LINK_UP_x1;
        return OPAL_SUCCESS;
}

static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
{
        *val = PCI_SLOT_POWER_ON;
        return OPAL_SUCCESS;
}

static int64_t npu2_hreset(struct pci_slot *slot)
{
        struct npu2 *p;
        int i;
        struct npu2_dev *ndev;

        p = phb_to_npu2_nvlink(slot->phb);
        NPU2INF(p, "Hreset PHB state\n");

        for (i = 0; i < p->total_devices; i++) {
                ndev = &p->devices[i];
                if (ndev) {
                        NPU2DEVINF(ndev, "Resetting device\n");
                        reset_ntl(ndev);
                }
        }
        return purge_l2_l3_caches();
}

static int64_t npu2_freset(struct pci_slot *slot __unused)
{
        return OPAL_SUCCESS;
}

static int64_t npu2_creset(struct pci_slot *slot)
{
        struct npu2 *p;
        int i;
        struct npu2_dev *ndev;

        p = phb_to_npu2_nvlink(slot->phb);
        NPU2INF(p, "Creset PHB state\n");

        for (i = 0; i < p->total_devices; i++) {
                ndev = &p->devices[i];
                if (ndev) {
                        NPU2DEVINF(ndev, "Resetting device\n");
                        reset_ntl(ndev);
                }
        }
        return OPAL_SUCCESS;
}

static struct pci_slot *npu2_slot_create(struct phb *phb)
{
        struct pci_slot *slot;

        slot = pci_slot_alloc(phb, NULL);
        if (!slot)
                return slot;

        /* Elementary functions */
        slot->ops.get_presence_state  = NULL;
        slot->ops.get_link_state      = npu2_get_link_state;
        slot->ops.get_power_state     = npu2_get_power_state;
        slot->ops.get_attention_state = NULL;
        slot->ops.get_latch_state     = NULL;
        slot->ops.set_power_state     = NULL;
        slot->ops.set_attention_state = NULL;

        slot->ops.prepare_link_change = NULL;
        slot->ops.poll_link           = NULL;
        slot->ops.hreset              = npu2_hreset;
        slot->ops.freset              = npu2_freset;
        slot->ops.creset              = npu2_creset;

        return slot;
}

int64_t npu2_freeze_status(struct phb *phb __unused,
                           uint64_t pe_number __unused,
                           uint8_t *freeze_state,
                           uint16_t *pci_error_type,
                           uint16_t *severity)
{
        /*
         * FIXME: When it's called by the skiboot PCI config accessor,
         * the PE number is fixed to 0, which is incorrect. We need to
         * introduce another PHB callback to translate it. For now,
         * it keeps the skiboot PCI enumeration going.
         */
        *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
        *pci_error_type = OPAL_EEH_NO_ERROR;
        if (severity)
                *severity = OPAL_EEH_SEV_NO_ERROR;

        return OPAL_SUCCESS;
}

static int64_t npu2_eeh_next_error(struct phb *phb,
                                   uint64_t *first_frozen_pe,
                                   uint16_t *pci_error_type,
                                   uint16_t *severity)
{
        struct npu2 *p = phb_to_npu2_nvlink(phb);
        int i;
        uint64_t result = 0;

        if (!first_frozen_pe || !pci_error_type || !severity)
                return OPAL_PARAMETER;

        *first_frozen_pe = -1;
        *pci_error_type = OPAL_EEH_NO_ERROR;
        *severity = OPAL_EEH_SEV_NO_ERROR;

        for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
                result = npu2_read(p, NPU2_MISC_PESTB(i));
                if (result > 0) {
                        *first_frozen_pe = i;
                        *pci_error_type = OPAL_EEH_PE_ERROR;
                        *severity = OPAL_EEH_SEV_PE_ER;
                        break;
                }
        }

        return OPAL_SUCCESS;
}

static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
                             uint64_t pe_number, uint32_t tce_size,
                             uint64_t dma_addr, uint32_t npages)
{
        struct npu2 *npu = phb_to_npu2_nvlink(phb);
        uint32_t tce_page_size;
        uint64_t val;

        if (pe_number >= NPU2_MAX_PE_NUM)
                return OPAL_PARAMETER;

        sync();
        switch(kill_type) {
        case OPAL_PCI_TCE_KILL_PAGES:
                tce_page_size = 1ULL << (
                                11 + GETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE,
                                              npu->tve_cache[pe_number]));
                if (tce_page_size != tce_size) {
                        NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n",
                                tce_size, tce_page_size);
                        return OPAL_PARAMETER;
                }

                if (npages < 128) {
                        while (npages--) {
                                val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number);
                                npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val);
                                dma_addr += tce_size;
                        }
                        break;
                }
                /*
                 * For too many TCEs do not bother with the loop above and
                 * simply flush everything; it is going to be a lot faster.
                 */
                /* Fall through */
        case OPAL_PCI_TCE_KILL_PE:
                /*
                 * NPU2 doesn't support killing a PE so fall through
                 * and do a kill all instead.
                 */
        case OPAL_PCI_TCE_KILL_ALL:
                npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL);
                break;
        default:
                return OPAL_PARAMETER;
        }

        return OPAL_SUCCESS;
}

static const struct phb_ops npu_ops = {
        .cfg_read8              = npu2_cfg_read8,
        .cfg_read16             = npu2_cfg_read16,
        .cfg_read32             = npu2_cfg_read32,
        .cfg_write8             = npu2_cfg_write8,
        .cfg_write16            = npu2_cfg_write16,
        .cfg_write32            = npu2_cfg_write32,
        .device_init            = NULL,
        .phb_final_fixup        = npu2_phb_final_fixup,
        .ioda_reset             = npu2_ioda_reset,
        .papr_errinjct_reset    = NULL,
        .pci_reinit             = NULL,
        .set_phb_mem_window     = NULL,
        .phb_mmio_enable        = NULL,
        .map_pe_mmio_window     = NULL,
        .map_pe_dma_window      = npu2_map_pe_dma_window,
        .map_pe_dma_window_real = npu2_map_pe_dma_window_real,
        .pci_msi_eoi            = NULL,
        .set_xive_pe            = NULL,
        .get_msi_32             = NULL,
        .get_msi_64             = NULL,
        .set_pe                 = npu2_set_pe,
        .set_peltv              = NULL,
        .eeh_freeze_status      = npu2_freeze_status,
        .eeh_freeze_clear       = NULL,
        .eeh_freeze_set         = NULL,
        .next_error             = npu2_eeh_next_error,
        .err_inject             = NULL,
        .get_diag_data2         = NULL,
        .set_capi_mode          = NULL,
        .set_capp_recovery      = NULL,
        .tce_kill               = npu2_tce_kill,
};

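/*
 * Read the default BAR assignments from the physical memory map and
 * program them via SCOM, since the global MMIO BAR itself may not be
 * live yet. Returns the global MMIO window in reg[] and the combined
 * NTL/GENID window (what the OS will map) in mm_win[].
 */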
static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2])
{
        uint32_t i;
        struct npu2_bar *bar;
        struct npu2_bar npu2_bars[] = {
                /* NPU_REGS must be first in this list */
                { .type = NPU_REGS, .index = 0,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
                  .flags = NPU2_BAR_FLAG_ENABLED },
                { .type = NPU_PHY, .index = 0,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
                  .flags = NPU2_BAR_FLAG_ENABLED },
                { .type = NPU_PHY, .index = 1,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
                  .flags = NPU2_BAR_FLAG_ENABLED },
                { .type = NPU_NTL, .index = 0,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) },
                { .type = NPU_NTL, .index = 1,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) },
                { .type = NPU_NTL, .index = 2,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) },
                { .type = NPU_NTL, .index = 3,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) },
                { .type = NPU_NTL, .index = 4,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) },
                { .type = NPU_NTL, .index = 5,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) },
                { .type = NPU_GENID, .index = 0,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) },
                { .type = NPU_GENID, .index = 1,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) },
                { .type = NPU_GENID, .index = 2,
                  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) },
        };

        for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
                bar = &npu2_bars[i];
                npu2_get_bar(gcid, bar);
                npu2_write_bar(NULL, bar, gcid, scom);
        }

        /* Global MMIO BAR */
        reg[0] = npu2_bars[0].base;
        reg[1] = npu2_bars[0].size;

        /* NTL and GENID BARs are exposed to kernel via the mm
         * window */
        mm_win[0] = npu2_bars[3].base;
        mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base +
                    npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size -
                    mm_win[0];
}

/*
 * Set up NPU for NVLink and create PCI root device node
 * accordingly.
 */
int npu2_nvlink_init_npu(struct npu2 *npu)
{
        struct dt_node *np;
        uint64_t reg[2], mm_win[2], val, mask;

        /* TODO: Clean this up with register names, etc. when we get
         * time. This just turns NVLink mode on in each brick and should
         * get replaced with a patch from ajd once we've worked out how
         * things are going to work there.
         *
         * Obviously if the year is now 2020 that didn't happen and you
         * should fix this :-) */

        val = PPC_BIT(58);
        mask = PPC_BIT(58) | /* CONFIG_NVLINK_MODE */
               PPC_BIT(40);  /* CONFIG_ENABLE_SNARF_CPM */

        /*
         * V100 GPUs are known to violate NVLink2 protocol if some GPU memory
         * mapped by a CPU was also "linear-block" mapped by a GPU. When this
         * happens, it breaks the NPU2 cache coherency state machine and
         * throws a machine checkstop. Disabling snarfing fixes this, so let's
         * disable it by default.
         */
        if (nvram_query_eq_dangerous("opal-npu2-snarf-cpm", "enable")) {
                prlog(PR_WARNING, "NPU2#%d: enabling Probe.I.MO snarfing, a bad GPU driver may crash the system!\n",
                      npu->index);
                val |= PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
        }

        xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM0_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM1_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM2_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM3_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM0_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM1_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM2_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM3_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM0_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM1_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM2_MISC_CONFIG0,
                         val, mask);
        xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM3_MISC_CONFIG0,
                         val, mask);

        xscom_write_mask(npu->chip_id, 0x50110c0, PPC_BIT(53), PPC_BIT(53));
        xscom_write_mask(npu->chip_id, 0x50112c0, PPC_BIT(53), PPC_BIT(53));
        xscom_write_mask(npu->chip_id, 0x50114c0, PPC_BIT(53), PPC_BIT(53));
        xscom_write_mask(npu->chip_id, 0x50110f1, PPC_BIT(41), PPC_BIT(41));
        xscom_write_mask(npu->chip_id, 0x50112f1, PPC_BIT(41), PPC_BIT(41));
        xscom_write_mask(npu->chip_id, 0x50114f1, PPC_BIT(41), PPC_BIT(41));

        val = NPU2_NTL_MISC_CFG2_BRICK_ENABLE |
              NPU2_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
              NPU2_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
              NPU2_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
        xscom_write_mask(npu->chip_id, 0x5011110, val, val);
        xscom_write_mask(npu->chip_id, 0x5011130, val, val);
        xscom_write_mask(npu->chip_id, 0x5011310, val, val);
        xscom_write_mask(npu->chip_id, 0x5011330, val, val);
        xscom_write_mask(npu->chip_id, 0x5011510, val, val);
        xscom_write_mask(npu->chip_id, 0x5011530, val, val);

        val = PPC_BIT(6) | PPC_BIT(7) | PPC_BIT(11);
        xscom_write_mask(npu->chip_id, 0x5011009, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011039, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011069, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011099, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011209, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011239, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011269, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011299, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011409, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011439, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011469, val, PPC_BITMASK(6,11));
        xscom_write_mask(npu->chip_id, 0x5011499, val, PPC_BITMASK(6,11));

        /* Reassign the BARs */
        assign_mmio_bars(npu->chip_id, npu->xscom_base, reg, mm_win);
        npu->regs = (void *)reg[0];
        npu->mm_base = mm_win[0];
        npu->mm_size = mm_win[1];

        if (reg[0] && reg[1])
                prlog(PR_INFO, " Global MMIO BAR: %016llx (%lldMB)\n",
                      reg[0], reg[1] >> 20);
        else
                prlog(PR_ERR, " Global MMIO BAR: Disabled\n");

        /* Populate PCI root device node */
        np = dt_new_addr(dt_root, "pciex", reg[0]);
        assert(np);
        dt_add_property_strings(np,
                                "compatible",
                                "ibm,power9-npu-pciex",
                                "ibm,ioda2-npu2-phb");
        dt_add_property_strings(np, "device_type", "pciex");
        dt_add_property(np, "reg", reg, sizeof(reg));
        dt_add_property_cells(np, "ibm,phb-index", npu2_get_phb_index(0));
        dt_add_property_cells(np, "ibm,npu-index", npu->index);
        dt_add_property_cells(np, "ibm,chip-id", npu->chip_id);
        dt_add_property_cells(np, "ibm,xscom-base", npu->xscom_base);
        dt_add_property_cells(np, "ibm,npcq", npu->dt_node->phandle);
        dt_add_property_cells(np, "ibm,links", npu->total_devices);
        dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
        dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);

        /* Disable fast reboot - not currently supported */
        disable_fast_reboot("NVLink device enabled");

        npu2_nvlink_create_phb(npu, np);

        return 0;
}

static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev,
                                       uint32_t start,
                                       uint32_t prev_cap)
{
        struct pci_virt_device *pvd = dev->nvlink.pvd;
        uint32_t val;

        /* Add capability list */
        PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
        PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);

        /* 0x00 - ID/PCIE capability */
        val = PCI_CFG_CAP_ID_EXP;
        val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
        PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);

        /* 0x04 - Device capability
         *
         * We should support FLR. Otherwise, it might have
         * problems passing it through to userland via the Linux
         * VFIO infrastructure.
         */
        val = ((PCIE_MPSS_128) |
               (PCIE_PHANTOM_NONE << 3) |
               (PCIE_L0SL_MAX_NO_LIMIT << 6) |
               (PCIE_L1L_MAX_NO_LIMIT << 9) |
               (PCICAP_EXP_DEVCAP_FUNC_RESET));
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);

        pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
                            PCI_REG_FLAG_WRITE,
                            npu2_dev_cfg_exp_devcap, NULL);

        /* 0x08 - Device control and status */
        PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
                          0xffff0000, 0x000f0000);

        /* 0x0c - Link capability */
        val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);

        /* 0x10 - Link control and status */
        PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
                          0xfffff000, 0xc0000000);

        /* 0x14 - Slot capability */
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);

        /* 0x18 - Slot control and status */
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);

        /* 0x1c - Root control and capability */
        PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
                          0xffffffe0, 0x00000000);

        /* 0x20 - Root status */
        PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
                          0xffffffff, 0x00010000);

        /* 0x24 - Device capability 2 */
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);

        /* 0x28 - Device Control and status 2 */
        PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
                          0xffff0000, 0x00000000);

        /* 0x2c - Link capability 2 */
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);

        /* 0x30 - Link control and status 2 */
        PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
                          0xffff0000, 0x00200000);

        /* 0x34 - Slot capability 2 */
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);

        /* 0x38 - Slot control and status 2 */
        PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);

        return start + PCICAP_EXP_SCTL2 + 8;
}

static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev,
                                         uint32_t start,
                                         uint32_t prev_cap)
{
        struct pci_virt_device *pvd = dev->nvlink.pvd;

        /* Capability list */
        PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
        PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);

        /* Length and version */
        PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
        PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);

        /*
         * Defaults when the trap can't handle the read/write (eg. due
         * to reading/writing less than 4 bytes).
         */
        PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
        PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);

        /* Add NVLink2 PHY procedures trap */
        pci_virt_add_filter(pvd, start + 4, 8,
                            PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
                            npu2_dev_procedure,
                            NULL);

        /* Link index */
        PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->link_index);

        return start + VENDOR_CAP_LEN;
}

npu2_populate_cfg(struct npu2_dev * dev)1618 static void npu2_populate_cfg(struct npu2_dev *dev)
1619 {
1620 struct pci_virt_device *pvd = dev->nvlink.pvd;
1621 struct npu2_pcie_bar *bar;
1622 uint32_t pos;
1623
1624 /* 0x00 - Vendor/Device ID */
1625 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
1626
1627 /* 0x04 - Command/Status */
1628 PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
1629 0xf9000000);
1630
1631 pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
1632 npu2_cfg_write_cmd, NULL);
1633
1634 /* 0x08 - Rev/Class/Cache */
1635 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800101);
1636
1637 /* 0x0c - CLS/Latency Timer/Header/BIST */
1638 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
1639
1640 /* 0x10/14 - BAR#0, NTL BAR */
1641 bar = &dev->bars[0];
1642 PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
1643 (bar->npu2_bar.base & 0xfffffff0) | (bar->flags & 0xF),
1644 0x0000000f, 0x00000000);
1645 PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->npu2_bar.base >> 32),
1646 0x00000000, 0x00000000);
1647 pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
1648 PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1649 npu2_dev_cfg_bar, bar);
1650
1651 /* 0x18/1c - BAR#1, GENID BAR */
1652 bar = &dev->bars[1];
1653 if (NPU2DEV_BRICK(dev) == 0)
1654 PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->npu2_bar.base & 0xfffffff0) |
1655 (bar->flags & 0xF),
1656 0x0000000f, 0x00000000);
1657 else
1658 /* Brick 1 gets the upper portion of the generation id register */
1659 PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, ((bar->npu2_bar.base + 0x10000) & 0xfffffff0) |
1660 (bar->flags & 0xF),
1661 0x0000000f, 0x00000000);
1662
1663 PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->npu2_bar.base >> 32), 0x00000000,
1664 0x00000000);
1665 pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
1666 PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1667 npu2_dev_cfg_bar, bar);
1668
1669 /* 0x20/0x24 - BARs, disabled */
1670 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
1671 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
1672
1673 /* 0x28 - Cardbus CIS pointer */
1674 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
1675
1676 /* 0x2c - Subsystem ID */
1677 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
1678
1679 /* 0x30 - ROM BAR, zero sized */
1680 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
1681
1682 /* 0x34 - PCI Capability */
1683 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
1684
1685 /* 0x38 - Reserved */
1686 PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
1687
1688 /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
1689 PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
1690
1691 /* PCIE and vendor specific capability */
1692 pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
1693 pos = npu2_populate_vendor_cap(dev, pos, 0x41);
1694 PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
1695 }

static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group)
{
	int i;
	int bdfn = (group << 3);

	for (i = 0; i < p->total_devices; i++) {
		if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8))
			bdfn++;
	}

	return bdfn;
}
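
/*
 * Worked example (hypothetical values): for group 2 the base bdfn is
 * 2 << 3 = 0x10 (bus 0, device 2, function 0). If two links were
 * already assigned to group 2 (bdfns 0x10 and 0x11), the loop above
 * bumps the function number twice and the new device becomes
 * bdfn 0x12, i.e. device 2 function 2 on the emulated root bus.
 */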

static void npu2_populate_devices(struct npu2 *p,
				  struct dt_node *dn)
{
	struct npu2_dev *dev;
	struct dt_node *npu2_dn, *link;
	uint32_t npu_phandle, index = 0;
	int stack;

	/*
	 * Get the npu node containing the links, which we expand here
	 * into PCI-like devices attached to our emulated PHB.
	 */
	npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
	npu2_dn = dt_find_by_phandle(dt_root, npu_phandle);
	assert(npu2_dn);

	/* Walk the link@x nodes to initialize devices */
	p->total_devices = 0;
	p->phb_nvlink.scan_map = 0;
	dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
		uint32_t group_id;
		struct npu2_bar *npu2_bar;

		dev = &p->devices[index];
		dev->type = NPU2_DEV_TYPE_NVLINK;
		dev->npu = p;
		dev->dt_node = link;
		dev->link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
		dev->brick_index = dev->link_index;

		group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
		dev->bdfn = npu_allocate_bdfn(p, group_id);

		/* This must be done after calling
		 * npu_allocate_bdfn() */
		p->total_devices++;
		p->phb_nvlink.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);

		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");

		/* Populate BARs. BAR0/1 is the NTL bar. */
		stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev);
		npu2_bar = &dev->bars[0].npu2_bar;
		npu2_bar->type = NPU_NTL;
		npu2_bar->index = dev->brick_index;
		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ?
						NPU2_NTL0_BAR : NPU2_NTL1_BAR);
		npu2_get_bar(p->chip_id, npu2_bar);

		dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;

		/* BAR2/3 is the GENID bar. */
		npu2_bar = &dev->bars[1].npu2_bar;
		npu2_bar->type = NPU_GENID;
		npu2_bar->index = NPU2DEV_STACK(dev);
		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR);
		npu2_get_bar(p->chip_id, npu2_bar);

		/* The GENID is a single physical BAR that we split
		 * for each emulated device */
		npu2_bar->size = 0x10000;
		if (NPU2DEV_BRICK(dev))
			npu2_bar->base += 0x10000;
		dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;

		/* Initialize PCI virtual device */
		dev->nvlink.pvd = pci_virt_add_device(&p->phb_nvlink, dev->bdfn, 0x100, dev);
		if (dev->nvlink.pvd)
			npu2_populate_cfg(dev);

		index++;
	}
}
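
/*
 * Sketch of the resulting GENID split (addresses hypothetical): if
 * phys_map_get() returns a GENID BAR at 0x6030200000000 for a stack,
 * brick 0 of that stack advertises 0x6030200000000 and brick 1
 * advertises 0x6030200010000, each as a 64KB (0x10000) window over
 * the same physical BAR.
 */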

static void npu2_add_interrupt_map(struct npu2 *p,
				   struct dt_node *dn)
{
	struct dt_node *npu2_dn, *link, *phb_dn;
	uint32_t npu2_phandle, index = 0, i;
	uint32_t icsp = get_ics_phandle();
	uint32_t *map;
	size_t map_size;
	uint32_t mask[] = {0xff00, 0x0, 0x0, 0x7};

	assert(p->phb_nvlink.dt_node);
	phb_dn = p->phb_nvlink.dt_node;

	npu2_phandle = dt_prop_get_u32(dn, "ibm,npcq");
	npu2_dn = dt_find_by_phandle(dt_root, npu2_phandle);
	assert(npu2_dn);
	map_size = 7 * sizeof(*map) * p->total_devices;
	map = malloc(map_size);
	index = 0;
	dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
		i = index * 7;
		map[i + 0] = (p->devices[index].bdfn << 8);
		map[i + 1] = 0;
		map[i + 2] = 0;

		map[i + 3] = 1; /* INT A */
		map[i + 4] = icsp; /* interrupt-parent */
		map[i + 5] = p->base_lsi + (index * 2) + 1; /* NDL No-Stall Event */
		map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL. */
		index++;
	}
	dt_add_property(phb_dn, "interrupt-map", map, map_size);
	free(map);
	dt_add_property(phb_dn, "interrupt-map-mask", mask, sizeof(mask));
}
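
/*
 * For illustration (hypothetical values): with base_lsi 0x80 and a
 * first device at bdfn 0x08, the first 7-cell entry built above is
 *
 *   < 0x0800 0 0  1  icsp  0x81  0 >
 *
 * i.e. (bdfn << 8), two zero address cells, INT A, the ICS phandle,
 * the device's NDL no-stall LSI and an edge trigger; the
 * interrupt-map-mask of <0xff00 0 0 0x7> matches entries on the
 * device number and the INT pin.
 */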

static void npu2_add_phb_properties(struct npu2 *p)
{
	struct dt_node *np = p->phb_nvlink.dt_node;
	uint32_t icsp = get_ics_phandle();
	uint64_t mm_base, mm_size;

	/*
	 * Add various properties that HB doesn't have to add, some of
	 * them simply because they result from policy decisions made
	 * in skiboot rather than in HB, such as the MMIO windows going
	 * to PCI, interrupts, etc.
	 */
	dt_add_property_cells(np, "#address-cells", 3);
	dt_add_property_cells(np, "#size-cells", 2);
	dt_add_property_cells(np, "#interrupt-cells", 1);
	dt_add_property_cells(np, "bus-range", 0, 0xff);
	dt_add_property_cells(np, "clock-frequency", 0x200, 0);
	dt_add_property_cells(np, "interrupt-parent", icsp);

	/* NPU2 PHB properties */
	dt_add_property_cells(np, "ibm,opal-num-pes",
			      NPU2_MAX_PE_NUM);
	dt_add_property_cells(np, "ibm,opal-reserved-pe",
			      NPU2_RESERVED_PE_NUM);
	dt_add_property_cells(np, "ibm,supported-tce-sizes",
			      12, // 4K
			      16, // 64K
			      24, // 16M
			      28); // 256M

	dt_add_property_u64s(np, "ibm,mmio-atsd",
			     MMIO_ATSD_ADDR(p->regs, 0),
			     MMIO_ATSD_ADDR(p->regs, 1),
			     MMIO_ATSD_ADDR(p->regs, 2),
			     MMIO_ATSD_ADDR(p->regs, 3),
			     MMIO_ATSD_ADDR(p->regs, 4),
			     MMIO_ATSD_ADDR(p->regs, 5),
			     MMIO_ATSD_ADDR(p->regs, 6),
			     MMIO_ATSD_ADDR(p->regs, 7));

	/*
	 * The memory window is exposed as a 64-bit non-prefetchable
	 * one, because the kernel treats 64-bit prefetchable windows
	 * specially.
	 */
	mm_base = p->mm_base;
	mm_size = p->mm_size;
	dt_add_property_cells(np, "ranges", 0x02000000,
			      hi32(mm_base), lo32(mm_base),
			      hi32(mm_base), lo32(mm_base),
			      hi32(mm_size), lo32(mm_size));
}
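
/*
 * The resulting "ranges" entry is an identity mapping: a
 * non-prefetchable memory window (type code 0x02000000) whose PCI
 * and CPU addresses are both mm_base, of length mm_size. With
 * hypothetical values mm_base = 0x6030200000000 and mm_size =
 * 0x18000000000, the property reads
 * <0x02000000 0x60302 0x0 0x60302 0x0 0x180 0x0>.
 */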

void npu2_nvlink_create_phb(struct npu2 *npu, struct dt_node *dn)
{
	struct pci_slot *slot;

	/* Generic PHB */
	npu->phb_nvlink.dt_node = dn;
	npu->phb_nvlink.ops = &npu_ops;
	npu->phb_nvlink.phb_type = phb_type_npu_v2;
	init_lock(&npu->lock);
	init_lock(&npu->phb_nvlink.lock);
	list_head_init(&npu->phb_nvlink.devices);
	list_head_init(&npu->phb_nvlink.virt_devices);

	npu2_populate_devices(npu, dn);
	npu2_add_interrupt_map(npu, dn);
	npu2_add_phb_properties(npu);

	slot = npu2_slot_create(&npu->phb_nvlink);
	if (!slot) {
		/**
		 * @fwts-label NPUCannotCreatePHBSlot
		 * @fwts-advice Firmware probably ran out of memory creating
		 * NPU2 slot. NVLink functionality could be broken.
		 */
		prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
	}

	pci_register_phb(&npu->phb_nvlink, OPAL_DYNAMIC_PHB_ID);

	npu2_init_ioda_cache(npu);
	npu2_hw_init(npu);
}

/*
 * Search a table for an entry with a matching value under mask.
 * Returns the index of the match and passes the full entry back in
 * *value; returns -1 if no match is found.
 */
static int npu_table_search(struct npu2 *p, uint64_t table_addr, int stride,
			    int table_size, uint64_t *value, uint64_t mask)
{
	int i;
	uint64_t val;

	assert(value);

	for (i = 0; i < table_size; i++) {
		val = npu2_read(p, table_addr + i*stride);
		if ((val & mask) == *value) {
			*value = val;
			return i;
		}
	}

	return -1;
}
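
/*
 * Usage sketch (values hypothetical): to look up the XTS_BDF_MAP
 * entry for bdf 0x8, the callers below seed *value with just the BDF
 * field and mask with NPU2_XTS_BDF_MAP_BDF:
 *
 *   xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, 0x8);
 *   id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8,
 *                         NPU2_XTS_BDF_MAP_SIZE, &xts_bdf,
 *                         NPU2_XTS_BDF_MAP_BDF);
 *
 * On success id is the LPARSHORT slot and xts_bdf holds the whole
 * entry; searching with *value == 0 and mask -1UL instead finds the
 * first free (all-zero) entry.
 */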

/*
 * Allocate a context ID and initialise the tables with the relevant
 * information. Returns the ID on success or an error code if one
 * couldn't be allocated.
 */
#define NPU2_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
int64_t npu2_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
{
	struct npu2 *p;
	uint64_t xts_bdf, old_xts_bdf_pid, xts_bdf_pid;
	int id;

	/*
	 * MSR bits should be masked by the caller to allow for future
	 * expansion if required.
	 */
	if (msr & ~NPU2_VALID_ATS_MSR_BITS)
		return OPAL_UNSUPPORTED;

	/*
	 * Need to get LPARSHORT.
	 */
	p = phb_to_npu2_nvlink(phb);
	lock(&p->lock);
	xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
	if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
			     &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
		NPU2ERR(p, "LPARID not associated with any GPU\n");
		id = OPAL_PARAMETER;
		goto out;
	}

	id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
	NPU2DBG(p, "Found LPARSHORT = 0x%x for BDF = 0x%03llx\n", id, bdf);

	/* Enable this mapping for both real and virtual addresses */
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA0, 0UL, 1);
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA1, xts_bdf_pid, 1);

	/* Enables TLBIE/MMIOSD forwarding for this entry */
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATSD, xts_bdf_pid, 1);
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_LPARSHORT, xts_bdf_pid, id);

	/* Set the relevant MSR bits */
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_DR, xts_bdf_pid,
			       !!(msr & MSR_DR));
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_HV, xts_bdf_pid,
			       !!(msr & MSR_HV));
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_PR, xts_bdf_pid,
			       !!(msr & MSR_PR));

	/* We don't support anything other than 64-bit so we can safely
	 * hardcode it here */
	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_SF, xts_bdf_pid, 1);

	/*
	 * Throw an error if the wildcard entry for this bdf is already set
	 * with different msr bits.
	 */
	old_xts_bdf_pid = npu2_read(p, NPU2_XTS_PID_MAP + id*0x20);
	if (old_xts_bdf_pid) {
		if (GETFIELD(NPU2_XTS_PID_MAP_MSR, old_xts_bdf_pid) !=
		    GETFIELD(NPU2_XTS_PID_MAP_MSR, xts_bdf_pid)) {
			NPU2ERR(p, "%s: Unexpected MSR value\n", __func__);
			id = OPAL_PARAMETER;
			goto out;
		} else if (!p->ctx_ref[id]) {
			NPU2ERR(p, "%s: Unexpected mapping\n", __func__);
			id = OPAL_INTERNAL_ERROR;
			goto out;
		}
	}

	/* Write the entry */
	if (!p->ctx_ref[id]) {
		NPU2DBG(p, "XTS_PID_MAP[%03d] = 0x%08llx\n", id, xts_bdf_pid);
		npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, xts_bdf_pid);

		if (!GETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf)) {
			xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf, 1);
			npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf);
		}
	}
	++p->ctx_ref[id];

out:
	unlock(&p->lock);
	return id;
}
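
/*
 * Refcount lifecycle sketch (hypothetical sequence): two
 * npu2_init_context() calls for the same bdf leave ctx_ref[id] at 2
 * but write the XTS_PID_MAP entry only once; the matching two
 * npu2_destroy_context() calls below bring it back to 0, at which
 * point the wildcard entry is actually cleared from the hardware.
 */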

int64_t npu2_destroy_context(struct phb *phb, uint64_t bdf)
{
	struct npu2 *p;
	uint64_t xts_bdf;
	int rc = OPAL_PARAMETER, id;

	p = phb_to_npu2_nvlink(phb);
	lock(&p->lock);

	/* Need to find lparshort for this bdf */
	xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
	if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
			     &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
		NPU2ERR(p, "LPARID not associated with any GPU\n");
	} else {
		/*
		 * The bdf/pid table contains wildcard entries and MSR bits
		 * which we need to clear between switching a device from
		 * a host to a guest or vice versa.
		 */
		id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
		if (p->ctx_ref[id]) {
			--p->ctx_ref[id];
			if (!p->ctx_ref[id]) {
				NPU2DBG(p, "XTS_PID_MAP[%03d] = 0 (destroy)\n",
					id);
				npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
			}
			rc = OPAL_SUCCESS;
		}
	}
	unlock(&p->lock);
	return rc;
}

/*
 * Map the given virtual bdf to lparid with given lpcr.
 */
int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
		      uint64_t lpcr)
{
	struct npu2 *p;
	struct npu2_dev *ndev = NULL;
	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
	int i;
	int id;
	static uint64_t atsd_lpar_regs[] = {
		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID
	};

	if (lpcr)
		/* The LPCR bits are only required for hash based ATS,
		 * which we don't currently support but may need to in
		 * future. */
		return OPAL_UNSUPPORTED;

	p = phb_to_npu2_nvlink(phb);
	lock(&p->lock);

	/* Find any existing entries and update them */
	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0L, bdf);
	id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
			      &xts_bdf_lpar, NPU2_XTS_BDF_MAP_BDF);
	if (id < 0) {
		/* No existing mapping found, find space for a new one */
		xts_bdf_lpar = 0;
		id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
				      &xts_bdf_lpar, -1UL);
	}

	if (id < 0) {
		/* Unable to find a free mapping */
		NPU2ERR(p, "No free XTS_BDF[] entry\n");
		rc = OPAL_RESOURCE;
		goto out;
	}

	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_UNFILT, 0UL, 1);
	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, xts_bdf_lpar, bdf);

	/* We only support radix for the moment */
	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_XLAT, xts_bdf_lpar, 0x3);
	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);

	/* Need to find an NVLink to send the ATSDs for this device over */
	for (i = 0; i < p->total_devices; i++) {
		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
			ndev = &p->devices[i];
			break;
		}
	}

	if (!ndev) {
		NPU2ERR(p, "Unable to find nvlink for bdf %llx\n", bdf);
		rc = OPAL_PARAMETER;
		goto out;
	}

	/*
	 * We need to allocate an ATSD per NVLink bridge if possible;
	 * use the ibm,npu-link-index property for that.
	 */
	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
	if (!lparid)
		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);

	if (ndev->link_index < ARRAY_SIZE(atsd_lpar_regs))
		npu2_write(p, atsd_lpar_regs[ndev->link_index], atsd_lpar);
	else
		NPU2ERR(p, "Unable to assign ATSD for link index %u\n",
			ndev->link_index);

	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_STACK, xts_bdf_lpar,
				0x4 >> (ndev->brick_index / 2));
	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BRICK, xts_bdf_lpar,
				(ndev->brick_index % 2));

	NPU2DBG(p, "XTS_BDF_MAP[%03d] = 0x%08llx\n", id, xts_bdf_lpar);
	npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf_lpar);

	/* Reset wildcard in the PID map and the refcounter */
	if (npu2_read(p, NPU2_XTS_PID_MAP + id*0x20) || p->ctx_ref[id]) {
		prlog(PR_INFO, "Resetting PID MAP for LPID %lld\n", lparid);
		p->ctx_ref[id] = 0;
		npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
	}

out:
	unlock(&p->lock);
	return rc;
}
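
/*
 * Worked example (hypothetical brick): for brick_index 3 the code
 * above sets the STACK field to 0x4 >> (3 / 2) = 0x2 (a one-hot
 * selector where 0x4 is stack 0, 0x2 is stack 1 and 0x1 is stack 2)
 * and the BRICK field to 3 % 2 = 1, i.e. the second brick of the
 * second stack.
 */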

static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
{
	if (gcid & ~0x1b)
		return OPAL_PARAMETER;

	/* Repack 0bGGGGCCC to 0bGGCC */
	return ((gcid & 0x18) >> 1) | (gcid & 0x3);
}
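
/*
 * Worked example (hypothetical gcid): chip ID 0b01010 (group 1,
 * chip 2) passes the ~0x1b check and repacks to
 * ((0x0a & 0x18) >> 1) | (0x0a & 0x3) = 0x4 | 0x2 = 0x6, i.e.
 * 0bGGCC = 0b0110. A gcid with group or chip bits outside that
 * range (e.g. 0x20) fails the check and returns OPAL_PARAMETER.
 */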

static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
{
	uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));

	return npu2_read(ndev->npu, reg);
}

static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
					    uint64_t val)
{
	uint64_t reg;
	int sm;

	/* Set every register on our stack */
	for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
		reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
		npu2_write(ndev->npu, reg, val);
	}
}

/*
 * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
 * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
 * Returns 0 if unset.
 */
static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
						  int pec)
{
	uint32_t src, grpchp;
	uint64_t mask;
	int i;

	for (i = 0; i < 2; i++) {
		mask = NPU2_RELAXED_ORDERING_SOURCE(i);
		src = GETFIELD(mask, val);

		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
			continue;

		if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
			continue;

		grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
		if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
			return mask;

		if (grpchp == 0xf) /* match all */
			return mask;
	}

	return 0;
}
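
/*
 * Layout note, inferred from the GETFIELD(... << 32, ...) test in
 * npu2_enable_relaxed_ordering() below: each 64-bit CFG register
 * holds two independent source slots, SOURCE0 in the upper 32 bits
 * and SOURCE1 in the lower 32 bits, so the two CFG registers provide
 * four possible relaxed-ordering sources per brick.
 */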

static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
					int pec)
{
	uint64_t val, mask;
	uint32_t src;
	int rc = OPAL_RESOURCE;
	int i;

	NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
	lock(&ndev->npu->lock);

	for (i = 0; i < 2; i++) {
		val = npu2_relaxed_ordering_cfg_read(ndev, i);
		if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
			continue;

		/* Already enabled */
		rc = OPAL_SUCCESS;
		goto out;
	}

	src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
	      NPU2_RELAXED_ORDERING_SOURCE_RDENA;
	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
		       npu2_relaxed_ordering_source_grpchp(gcid));
	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);

	/* Find somewhere to write this config */
	for (i = 0; i < 2; i++) {
		val = npu2_relaxed_ordering_cfg_read(ndev, i);

		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
			mask = NPU2_RELAXED_ORDERING_SOURCE(0);
		else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
			mask = NPU2_RELAXED_ORDERING_SOURCE(1);
		else
			continue;

		val = SETFIELD(mask, val, src);
		npu2_relaxed_ordering_cfg_write(ndev, i, val);

		rc = OPAL_SUCCESS;
		break;
	}

out:
	unlock(&ndev->npu->lock);
	return rc;
}

static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
					  int pec)
{
	uint64_t val, mask;
	int i;

	NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
	lock(&ndev->npu->lock);

	for (i = 0; i < 2; i++) {
		val = npu2_relaxed_ordering_cfg_read(ndev, i);

		mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
		if (!mask)
			continue;

		val = SETFIELD(mask, val, 0);
		npu2_relaxed_ordering_cfg_write(ndev, i, val);
	}

	unlock(&ndev->npu->lock);
}

/*
 * Enable or disable relaxed ordering on all nvlinks for a given PEC. May leave
 * relaxed ordering partially enabled if there are insufficient HW resources to
 * enable it on all links.
 */
int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
			       bool enable)
{
	struct npu2 *npu = phb_to_npu2_nvlink(phb);
	struct npu2_dev *ndev;
	int64_t rc = OPAL_SUCCESS;

	for (int i = 0; i < npu->total_devices; i++) {
		ndev = &npu->devices[i];
		if (enable)
			rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
		else
			npu2_disable_relaxed_ordering(ndev, gcid, pec);

		if (rc != OPAL_SUCCESS) {
			NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
			return OPAL_RESOURCE;
		}
	}

	return OPAL_SUCCESS;
}
