1 /* Copyright 2013-2018 IBM Corp.
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *      http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12  * implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include <skiboot.h>
17 #include <io.h>
18 #include <timebase.h>
19 #include <pci-cfg.h>
20 #include <pci.h>
21 #include <pci-slot.h>
22 #include <pci-virt.h>
23 #include <opal.h>
24 #include <opal-api.h>
25 #include <cpu.h>
26 #include <device.h>
27 #include <ccan/str/str.h>
28 #include <ccan/array_size/array_size.h>
29 #include <affinity.h>
30 #include <npu2.h>
31 #include <lock.h>
32 #include <xscom.h>
33 #include <bitutils.h>
34 #include <chip.h>
35 #include <phys-map.h>
36 #include <nvram.h>
37 #include <xscom-p9-regs.h>
38 #include <phb4.h>
39 
40 #define VENDOR_CAP_START    0x80
41 #define VENDOR_CAP_END      0x90
42 #define VENDOR_CAP_LEN      0x10
43 #define VENDOR_CAP_VERSION  0x01
44 #define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
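/*
 * The emulated devices carry a vendor-specific capability at
 * [VENDOR_CAP_START, VENDOR_CAP_END); the byte at offset
 * VENDOR_CAP_PCI_DEV_OFFSET within it holds the per-link flags
 * updated by npu2_set_link_flag()/npu2_clear_link_flag() below.
 */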
45 
46 /*
47  * NPU2 BAR layout definition. We have 3 stacks and each of them
48  * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2
49  * PHY BARs and each of them is shared by 3 bricks. Every brick has
50  * one NTL BAR and two bricks share one GENID BAR. There is also a
51  * global MMIO BAR. We only expose DL and GENID BARs to the OS and all
52  * other BARs will be hidden in skiboot.
53  *
54  * Before the global MMIO BAR is configured, scom is the only way to
55  * access the BAR registers. At NPU2 PHB probing time, we rely on scom
56  * to assign all BARs until the global MMIO BAR is established.
57  *
58  * We need to access 4 SM registers in the same stack in order to
59  * configure one particular BAR.
60  */
61 
62 /* Set a specific flag in the vendor config space */
63 void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag)
64 {
65 	ndev->nvlink.link_flags |= flag;
66 	PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
67 			     VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
68 }
69 
70 void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag)
71 {
72 	ndev->nvlink.link_flags &= ~flag;
73 	PCI_VIRT_CFG_INIT_RO(ndev->nvlink.pvd, VENDOR_CAP_START +
74 			     VENDOR_CAP_PCI_DEV_OFFSET, 1, ndev->nvlink.link_flags);
75 }
76 
77 static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table,
78 				uint32_t index, bool autoinc)
79 {
80 	out_be64(p->regs + NPU2_ATS_IODA_TBL,
81 		 (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul)	|
82 		 SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table)	|
83 		 SETFIELD(NPU2_ATS_IODA_TBL_INDEX,  0ul, index));
84 }
85 
86 static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p,
87 					uint32_t bdfn)
88 {
89 	struct pci_virt_device *pvd;
90 
91 	/* All emulated devices are attached to root bus */
92 	if (bdfn & ~0xff)
93 		return NULL;
94 
95 	pvd = pci_virt_find_device(&p->phb_nvlink, bdfn);
96 	if (pvd)
97 		return pvd->data;
98 
99 	return NULL;
100 }
101 
102 static inline void npu2_get_bar(uint32_t gcid, struct npu2_bar *bar)
103 {
104 	phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size);
105 }
106 
107 static void npu2_read_bar(struct npu2 *p, struct npu2_bar *bar)
108 {
109 	uint64_t reg, val;
110 	int enabled;
111 
112 	reg = NPU2_REG_OFFSET(0, NPU2_BLOCK_SM_0, bar->reg);
113 	val = npu2_read(p, reg);
114 
115 	switch (NPU2_REG(bar->reg)) {
116 	case NPU2_PHY_BAR:
117 		bar->base = GETFIELD(NPU2_PHY_BAR_ADDR, val) << 21;
118 		enabled = GETFIELD(NPU2_PHY_BAR_ENABLE, val);
119 
120 		if (NPU2_REG_STACK(reg) == NPU2_STACK_STCK_2)
121 			/* This is the global MMIO BAR */
122 			bar->size = 0x1000000;
123 		else
124 			bar->size = 0x200000;
125 		break;
126 	case NPU2_NTL0_BAR:
127 	case NPU2_NTL1_BAR:
128 		bar->base = GETFIELD(NPU2_NTL_BAR_ADDR, val) << 16;
129 		enabled = GETFIELD(NPU2_NTL_BAR_ENABLE, val);
130 		bar->size = 0x10000 << GETFIELD(NPU2_NTL_BAR_SIZE, val);
131 		break;
132 	case NPU2_GENID_BAR:
133 		bar->base = GETFIELD(NPU2_GENID_BAR_ADDR, val) << 16;
134 		enabled = GETFIELD(NPU2_GENID_BAR_ENABLE, val);
135 		bar->size = 0x20000;
136 		break;
137 	default:
138 		bar->base = 0ul;
139 		enabled = 0;
140 		bar->size = 0;
141 		break;
142 	}
143 
144 	bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, bar->flags, enabled);
145 }
146 
147 static void npu2_write_bar(struct npu2 *p,
148 			   struct npu2_bar *bar,
149 			   uint32_t gcid,
150 			   uint32_t scom)
151 {
152 	uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED);
153 	int block;
154 
155 	switch (NPU2_REG(bar->reg)) {
156 	case NPU2_PHY_BAR:
157 		val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21);
158 		val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable);
159 		break;
160 	case NPU2_NTL0_BAR:
161 	case NPU2_NTL1_BAR:
162 		val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 16);
163 		val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable);
164 		val = SETFIELD(NPU2_NTL_BAR_SIZE, val, 1);
165 		break;
166 	case NPU2_GENID_BAR:
167 		val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 16);
168 		val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable);
169 		break;
170 	default:
171 		val = 0ul;
172 	}
173 
174 	for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
175 		reg = NPU2_REG_OFFSET(0, block, bar->reg);
176 		if (p)
177 			npu2_write(p, reg, val);
178 		else
179 			npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val);
180 	}
181 }
182 
183 /* Trap for PCI command (0x4) to enable or disable device's BARs */
184 static int64_t npu2_cfg_write_cmd(void *dev,
185 				  struct pci_cfg_reg_filter *pcrf __unused,
186 				  uint32_t offset, uint32_t size,
187 				  uint32_t *data, bool write)
188 {
189 	struct pci_virt_device *pvd = dev;
190 	struct npu2_dev *ndev = pvd->data;
191 	struct npu2_bar *ntl_npu_bar, *genid_npu_bar;
192 	bool enabled;
193 
194 	if (!write)
195 		return OPAL_PARTIAL;
196 
197 	if (offset != PCI_CFG_CMD)
198 		return OPAL_PARAMETER;
199 	if (size != 1 && size != 2 && size != 4)
200 		return OPAL_PARAMETER;
201 
202 	/*
203 	 * Enable or disable the NTL BAR for this brick; the shared
204 	 * GENID BAR is handled separately below.
205 	 */
206 	enabled = !!(*data & PCI_CFG_CMD_MEM_EN);
207 	ntl_npu_bar = &ndev->bars[0].npu2_bar;
208 	genid_npu_bar = &ndev->bars[1].npu2_bar;
209 
210 	ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled);
211 	npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0);
212 
213 	/*
214 	 * Enable/disable the GENID BAR. Two bricks share one GENID
215 	 * BAR which is exposed via the first brick so we need to
216 	 * track the enables separately.
217 	 */
218 	if (NPU2DEV_BRICK(ndev))
219 		genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags,
220 						enabled);
221 	else
222 		genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags,
223 						enabled);
224 
225 	/* Enable the BAR if either device requests it enabled, otherwise disable it */
226 	genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags,
227 					!!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 |
228 								   NPU2_BAR_FLAG_ENABLED1)));
229 	npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0);
230 
231 	return OPAL_PARTIAL;
232 }
233 
234 static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused,
235 				 struct pci_cfg_reg_filter *pcrf,
236 				 uint32_t offset, uint32_t size,
237 				 uint32_t *data)
238 {
239 	struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
240 
241 	if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED))
242 		return OPAL_PARTIAL;
243 
244 	if ((size != 4) ||
245 	    (offset != pcrf->start && offset != pcrf->start + 4))
246 		return OPAL_PARAMETER;
247 
248 	if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI)
249 		*data = bar->npu2_bar.size >> 32;
250 	else
251 		*data = bar->npu2_bar.size;
252 	bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI);
253 
254 	return OPAL_SUCCESS;
255 }
256 
257 static int64_t npu2_cfg_write_bar(struct npu2_dev *dev,
258 				  struct pci_cfg_reg_filter *pcrf,
259 				  uint32_t offset, uint32_t size,
260 				  uint32_t data)
261 {
262 	struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
263 	struct npu2_bar old_bar, *npu2_bar = &bar->npu2_bar;
264 
265 	if ((size != 4) ||
266 	    (offset != pcrf->start && offset != pcrf->start + 4))
267 		return OPAL_PARAMETER;
268 
269 	/* Return BAR size on next read */
270 	if (data == 0xffffffff) {
271 		bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED;
272 		if (offset == pcrf->start + 4)
273 			bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI;
274 
275 		return OPAL_SUCCESS;
276 	}
277 
278 	if (offset == pcrf->start) {
279 		npu2_bar->base &= 0xffffffff00000000UL;
280 		npu2_bar->base |= (data & 0xfffffff0);
281 	} else {
282 		npu2_bar->base &= 0x00000000ffffffffUL;
283 		npu2_bar->base |= ((uint64_t)data << 32);
284 
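		/*
		 * The 128KB GENID BAR is shared by the two bricks of a
		 * stack: the even brick uses the first 64KB and the odd
		 * brick the second, so for the odd brick the underlying
		 * BAR base sits 64KB below the address written by the OS.
		 */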
285 		if (NPU2_REG(npu2_bar->reg) == NPU2_GENID_BAR && NPU2DEV_BRICK(dev))
286 			npu2_bar->base -= 0x10000;
287 
288 		old_bar.reg = npu2_bar->reg;
289 		npu2_read_bar(dev->npu, &old_bar);
290 
291 		/* Only allow changing the base address if the BAR is not enabled */
292 		if ((npu2_bar->flags & NPU2_BAR_FLAG_ENABLED) &&
293 		    (npu2_bar->base != old_bar.base)) {
294 			npu2_bar->base = old_bar.base;
295 			return OPAL_HARDWARE;
296 		}
297 
298 		npu2_write_bar(dev->npu, &bar->npu2_bar, 0, 0);
299 	}
300 
301 	/* To update the config cache */
302 	return OPAL_PARTIAL;
303 }
304 
305 static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
306 				uint32_t offset, uint32_t len, uint32_t *data,
307 				bool write)
308 {
309 	struct pci_virt_device *pvd = dev;
310 	struct npu2_dev *ndev = (struct npu2_dev *) pvd->data;
311 
312 	if (write)
313 		return npu2_cfg_write_bar(ndev, pcrf, offset, len, *data);
314 
315 	return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
316 }
317 
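/*
 * Helpers to purge the L2 and L3 caches on every core. The GPU reset
 * paths below use these, presumably so no stale cache lines covering
 * GPU memory survive a link reset.
 */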
318 static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
319 {
320 	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
321 	int rc;
322 
323 	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
324 			      L2_PRD_PURGE_CMD_TYPE_MASK);
325 	if (!rc)
326 		rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
327 			      L2_PRD_PURGE_CMD_TRIGGER);
328 	if (rc)
329 		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
330 		      "failed %i\n", core_id, rc);
331 	return rc;
332 }
333 
334 static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
335 {
336 	uint64_t val;
337 	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
338 	unsigned long now = mftb();
339 	unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
340 	int rc;
341 
342 	while (1) {
343 		rc = xscom_read(chip_id, addr, &val);
344 		if (rc) {
345 			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
346 			      "failed %i\n", core_id, rc);
347 			break;
348 		}
349 		if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
350 			break;
351 		now = mftb();
352 		if (tb_compare(now, end) == TB_AAFTERB) {
353 			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
354 			      core_id, rc);
355 			return OPAL_BUSY;
356 		}
357 	}
358 
359 	/* We have to clear the trigger bit ourselves */
360 	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
361 	rc = xscom_write(chip_id, addr, val);
362 	if (rc)
363 		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
364 		      core_id, rc);
365 	return rc;
366 }
367 
368 static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
369 {
370 	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
371 	int rc;
372 
373 	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
374 			      L3_PRD_PURGE_TTYPE_MASK);
375 	if (!rc)
376 		rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
377 			      L3_PRD_PURGE_REQ);
378 	if (rc)
379 		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
380 		      "failed %i\n", core_id, rc);
381 	return rc;
382 }
383 
384 static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
385 {
386 	uint64_t val;
387 	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
388 	unsigned long now = mftb();
389 	unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
390 	int rc;
391 
392 	/* Trigger bit is automatically set to zero when flushing is done */
393 	while (1) {
394 		rc = xscom_read(chip_id, addr, &val);
395 		if (rc) {
396 			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
397 			      "failed %i\n", core_id, rc);
398 			break;
399 		}
400 		if (!(val & L3_PRD_PURGE_REQ))
401 			break;
402 		now = mftb();
403 		if (tb_compare(now, end) == TB_AAFTERB) {
404 			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
405 			      core_id, rc);
406 			return OPAL_BUSY;
407 		}
408 	}
409 	return rc;
410 }
411 
412 static int64_t purge_l2_l3_caches(void)
413 {
414 	struct cpu_thread *t;
415 	uint64_t core_id, prev_core_id = (uint64_t)-1;
416 	int rc;
417 	unsigned long now = mftb();
418 
419 	for_each_ungarded_cpu(t) {
420 		/* Only need to do it once per core chiplet */
421 		core_id = pir_to_core_id(t->pir);
422 		if (prev_core_id == core_id)
423 			continue;
424 		prev_core_id = core_id;
425 		rc = start_l2_purge(t->chip_id, core_id);
426 		if (rc)
427 			goto trace_exit;
428 		rc = start_l3_purge(t->chip_id, core_id);
429 		if (rc)
430 			goto trace_exit;
431 	}
432 
433 	prev_core_id = (uint64_t)-1;
434 	for_each_ungarded_cpu(t) {
435 		/* Only need to do it once per core chiplet */
436 		core_id = pir_to_core_id(t->pir);
437 		if (prev_core_id == core_id)
438 			continue;
439 		prev_core_id = core_id;
440 
441 		rc = wait_l2_purge(t->chip_id, core_id);
442 		if (rc)
443 			goto trace_exit;
444 		rc = wait_l3_purge(t->chip_id, core_id);
445 		if (rc)
446 			goto trace_exit;
447 	}
448 
449 trace_exit:
450 	prlog(PR_TRACE, "L2/L3 purging took %ldus\n",
451 			tb_to_usecs(mftb() - now));
452 
453 	return rc;
454 }
455 
456 static int64_t npu2_dev_cfg_exp_devcap(void *dev,
457 		struct pci_cfg_reg_filter *pcrf __unused,
458 		uint32_t offset, uint32_t size,
459 		uint32_t *data, bool write)
460 {
461 	struct pci_virt_device *pvd = dev;
462 	struct npu2_dev *ndev = pvd->data;
463 	int rc;
464 
465 	assert(write);
466 
467 	if ((size != 2) || (offset & 1)) {
468 		/* Short config writes are not supported */
469 		prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
470 		      ndev->nvlink.phb->opal_id);
471 		return OPAL_PARAMETER;
472 	}
473 
474 	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
475 		npu2_dev_procedure_reset(ndev);
476 
477 	rc = purge_l2_l3_caches();
478 	if (rc)
479 		return rc;
480 
481 	return OPAL_PARTIAL;
482 }
483 
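/*
 * Generate the 8/16/32-bit config space accessors for the emulated
 * devices; they all forward to pci_virt_cfg_read()/pci_virt_cfg_write().
 */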
484 #define NPU2_CFG_READ(size, type)					\
485 static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn,	\
486 				   uint32_t offset, type *data)		\
487 {									\
488 	uint32_t val;							\
489 	int64_t ret;							\
490 									\
491 	ret = pci_virt_cfg_read(phb, bdfn, offset,			\
492 				sizeof(*data), &val);			\
493 	*data = (type)val;						\
494         return ret;							\
495 }
496 #define NPU2_CFG_WRITE(size, type)					\
497 static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn,	\
498 				    uint32_t offset, type data)		\
499 {									\
500 	uint32_t val = data;						\
501 	int64_t ret;							\
502 									\
503 	ret = pci_virt_cfg_write(phb, bdfn, offset,			\
504 				 sizeof(data), val);			\
505 	return ret;							\
506 }
507 
508 NPU2_CFG_READ(8, u8);
509 NPU2_CFG_READ(16, u16);
510 NPU2_CFG_READ(32, u32);
511 NPU2_CFG_WRITE(8, u8);
512 NPU2_CFG_WRITE(16, u16);
513 NPU2_CFG_WRITE(32, u32);
514 
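/*
 * pci_walk_dev() callback: returns 1 when the candidate PCI device is
 * an NVIDIA GPU whose slot location code matches this NPU2 device's
 * slot label, i.e. the GPU physically wired to this link.
 */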
515 static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
516 				  struct pci_device *pd,
517 				  void *data)
518 {
519 	struct npu2_dev *dev = data;
520 	struct dt_node *pci_dt_node;
521 	char *pcislot;
522 
523 	/* Ignore non-nvidia PCI devices */
524 	if ((pd->vdid & 0xffff) != 0x10de)
525 		return 0;
526 
527 	/* Find the PCI device's slot location */
528 	for (pci_dt_node = pd->dn;
529 	     pci_dt_node && !dt_find_property(pci_dt_node, "ibm,loc-code");
530 	     pci_dt_node = pci_dt_node->parent);
531 
532 	if (!pci_dt_node)
533 		return 0;
534 
535 	pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,loc-code");
536 
537 	NPU2DEVDBG(dev, "Comparing GPU '%s' and NPU2 '%s'\n",
538 		   pcislot, dev->nvlink.slot_label);
539 
540 	if (streq(pcislot, dev->nvlink.slot_label))
541 		return 1;
542 
543 	return 0;
544 }
545 
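/*
 * Config write filter on the bridge above an NVLink-attached GPU: when
 * the OS requests a secondary bus reset, first reset every NPU2 brick
 * attached to that GPU and purge the caches, to avoid the HMIs that a
 * GPU reset with live NVLinks can otherwise cause (see below).
 */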
546 static int64_t npu2_gpu_bridge_sec_bus_reset(void *dev,
547 		struct pci_cfg_reg_filter *pcrf __unused,
548 		uint32_t offset, uint32_t len,
549 		uint32_t *data, bool write)
550 {
551 	struct pci_device *pd = dev;
552 	struct pci_device *gpu;
553 	struct phb *npphb;
554 	struct npu2 *npu;
555 	struct dt_node *np;
556 	struct npu2_dev	*ndev;
557 	int i;
558 
559 	assert(write);
560 
561 	if ((len != 2) || (offset & 1)) {
562 		/* Short config writes are not supported */
563 		PCIERR(pd->phb, pd->bdfn,
564 		       "Unsupported write to bridge control register\n");
565 		return OPAL_PARAMETER;
566 	}
567 
568 	gpu = list_top(&pd->children, struct pci_device, link);
569 	if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) {
570 		int64_t rc;
571 
572 		dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") {
573 			npphb = pci_get_phb(dt_prop_get_cell(np,
574 					"ibm,opal-phbid", 1));
575 			if (!npphb || npphb->phb_type != phb_type_npu_v2)
576 				continue;
577 
578 			npu = phb_to_npu2_nvlink(npphb);
579 			for (i = 0; i < npu->total_devices; ++i) {
580 				ndev = &npu->devices[i];
581 				if (ndev->nvlink.pd == gpu)
582 					npu2_dev_procedure_reset(ndev);
583 			}
584 		}
585 
586 		rc = purge_l2_l3_caches();
587 		if (rc)
588 			return rc;
589 	}
590 
591 	return OPAL_PARTIAL;
592 }
593 
594 static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
595 {
596 	struct phb *phb;
597 	uint32_t i;
598 
599 	if (dev->nvlink.pd)
600 		return;
601 
602 	for (i = 0; i < 64; i++) {
603 		if (dev->npu->phb_nvlink.opal_id == i)
604 			continue;
605 
606 		phb = pci_get_phb(i);
607 		if (!phb)
608 			continue;
609 
610 		dev->nvlink.pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev);
611 		if (dev->nvlink.pd) {
612 			dev->nvlink.phb = phb;
613 			/* Found the device, set the bit in config space */
614 			npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED);
615 
616 			/*
617 			 * We define a custom sec bus reset handler for a slot
618 			 * with an NVLink-connected GPU to prevent HMIs which
619 			 * will otherwise happen if we reset the GPU before
620 			 * resetting the NVLinks.
621 			 */
622 			if (dev->nvlink.pd->parent &&
623 			    dev->nvlink.pd->parent->slot)
624 				pci_add_cfg_reg_filter(dev->nvlink.pd->parent,
625 						PCI_CFG_BRCTL, 2,
626 						PCI_REG_FLAG_WRITE,
627 						npu2_gpu_bridge_sec_bus_reset);
628 			return;
629 		}
630 	}
631 
632 	NPU2DEVINF(dev, "No PCI device found for slot '%s'\n",
633 		   dev->nvlink.slot_label);
634 }
635 
636 static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
637 
638 static void npu2_append_phandle(struct dt_node *dn,
639 				u32 phandle)
640 {
641 	struct dt_property *prop;
642 	uint32_t *npu_phandles;
643 	size_t len;
644 
645 	/*
646 	 * Use a lock to make sure no one else has a reference to an
647 	 * ibm,npu property (this assumes this is the only function
648 	 * that holds a reference to it)
649 	 */
650 	lock(&pci_npu_phandle_lock);
651 
652 	/* This function shouldn't be called unless ibm,npu exists */
653 	prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1);
654 
655 	/* Need to append to the properties */
656 	len = prop->len + sizeof(*npu_phandles);
657 	dt_resize_property(&prop, len);
658 	prop->len = len;
659 
660 	npu_phandles = (uint32_t *)prop->prop;
661 	npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle;
662 	unlock(&pci_npu_phandle_lock);
663 }
664 
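/*
 * GPU memory is described by a "memory" device node with a fake
 * chip-id counting down from 255 and zero usable memory, so the OS
 * does not treat it as ordinary RAM at boot.
 */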
665 static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size)
666 {
667 	struct dt_node *mem;
668 	static u32 chip_id = 255;
669 
670 	mem = dt_find_by_name_addr(dt_root, "memory", addr);
671 	if (mem)
672 		return mem;
673 
674 	mem = dt_new_addr(dt_root, "memory", addr);
675 	if (!mem)
676 		return NULL;
677 	dt_add_property_string(mem, "device_type", "memory");
678 	dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
679 	dt_add_property_u64s(mem, "reg", addr, size);
680 	dt_add_property_cells(mem, "ibm,chip-id", chip_id);
681 	dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
682 	dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id);
683 	chip_id--;
684 
685 	assert(chip_id);
686 	return mem;
687 }
688 
689 /* There are potentially multiple links per GPU, so look up the GPU memory based
690  * on bdfn. */
691 static void npu2_get_gpu_base(struct npu2_dev *ndev, uint64_t *addr, uint64_t *size)
692 {
693 	struct npu2 *p = ndev->npu;
694 	int group;
695 
696 	group = (ndev->bdfn >> 3) & 0x1f;
697 	phys_map_get(ndev->npu->chip_id, p->gpu_map_type, group, addr, size);
698 }
699 
700 static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, struct npu2_dev *ndev)
701 {
702 	uint64_t gpu_base, gpu_size, gta;
703 	struct dt_node *mem_dn;
704 
705 	npu2_get_gpu_base(ndev, &gpu_base, &gpu_size);
706 	mem_dn = npu2_create_memory_dn(gpu_base, gpu_size);
707 	assert(mem_dn);
708 	dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle);
709 
710 	/* Coral mode address compression. This is documented in Figure 3.5
711 	 * "P9->GPU RA Compression (Coral)" of the NPU2 workbook. */
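	/*
	 * In effect: real-address bits 0:42 pass through, RA bits 45:46
	 * map to target-address bits 43:44 and RA bits 49:50 to bits
	 * 45:46 (little-endian bit numbering).
	 */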
712 	gta  = ((gpu_base >> 42) & 0x1) << 42;
713 	gta |= ((gpu_base >> 45) & 0x3) << 43;
714 	gta |= ((gpu_base >> 49) & 0x3) << 45;
715 	gta |= gpu_base & ((1UL << 43) - 1);
716 
717 	dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta);
718 }
719 
720 static int npu2_assign_gmb(struct npu2_dev *ndev)
721 {
722 	struct npu2 *p = ndev->npu;
723 	int peers, mode;
724 	uint32_t bdfn;
725 	uint64_t base, size, reg, val, gmb;
726 
727 	/* Need to work out the number of link peers. This amounts to
728 	 * working out the maximum function number, so start at
729 	 * the highest bdfn (fn = 6) and count back until we find an
730 	 * npu2_dev. */
731 	for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP;
732 	     (bdfn & 0x7) != 0x7; bdfn = (bdfn & ~0x7) | ((bdfn & 0x7) - 1))
733 		if (npu2_bdf_to_dev(p, bdfn))
734 			break;
735 	peers = bdfn & 0x7;
736 
737 	npu2_get_gpu_base(ndev, &base, &size);
738 
739 	NPU2DBG(p, "Setting BAR region dt:%llx\n", base);
740 	val = SETFIELD(NPU2_MEM_BAR_EN, 0ULL, 1);
741 	val = SETFIELD(NPU2_MEM_BAR_SEL_MEM, val, base >> (63-14));
742 	val = SETFIELD(NPU2_MEM_BAR_GROUP, val, base >> (63-18));
743 	val = SETFIELD(NPU2_MEM_BAR_CHIP, val, base >> (63-21));
744 	val = SETFIELD(NPU2_MEM_BAR_NODE_ADDR, val, base >> (63-33));
745 	val = SETFIELD(NPU2_MEM_BAR_POISON, val, 1);
746 	val = SETFIELD(NPU2_MEM_BAR_GRANULE, val, 0);
747 
748 	/* We don't know how much memory the GPU has, so we may as well just
749 	 * pass the whole aperture through at this point. */
750 	val = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, val, ilog2(size >> 30));
751 
752 	switch (peers) {
753 	case 0:
754 		mode = 0;
755 		break;
756 	case 1:
757 		mode = 1;
758 		break;
759 	case 2:
760 		mode = 3;
761 		break;
762 	case 3:
763 		mode = 6;
764 		break;
765 	case 5:
766 		mode = 10;
767 		break;
768 	default:
769 		/* Hardware does not support this configuration */
770 		assert(0);
771 	}
772 
773 	mode += ndev->bdfn & 0x7;
774 	val = SETFIELD(NPU2_MEM_BAR_MODE, val, mode);
775 
776 	gmb = NPU2_GPU0_MEM_BAR;
777 	if (NPU2DEV_BRICK(ndev))
778 		gmb = NPU2_GPU1_MEM_BAR;
779 
780 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
781 			      NPU2_BLOCK_SM_0, gmb);
782 
783 	npu2_write(p, reg, val);
784 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
785 			      NPU2_BLOCK_SM_1, gmb);
786 	npu2_write(p, reg, val);
787 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
788 			      NPU2_BLOCK_SM_2, gmb);
789 	npu2_write(p, reg, val);
790 	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
791 			      NPU2_BLOCK_SM_3, gmb);
792 	npu2_write(p, reg, val);
793 
794 	return 0;
795 }
796 
797 static int npu2_dn_fixup(struct phb *phb,
798 			 struct pci_device *pd,
799 			 void *data __unused)
800 {
801 	struct npu2 *p = phb_to_npu2_nvlink(phb);
802 	struct npu2_dev *dev;
803 	uint32_t speed;
804 	const char *label;
805 
806 	dev = npu2_bdf_to_dev(p, pd->bdfn);
807 	assert(dev);
808 	if (dev->nvlink.phb || dev->nvlink.pd)
809 		return 0;
810 
811 	npu2_assign_gmb(dev);
812 	npu2_dn_fixup_gmb(pd->dn, dev);
813 	dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);
814 
815 	/*
816 	 * NVLink supports multiple speeds and device drivers need to know what
817 	 * speed has been set by firmware. Hostboot does the inits that set the
818 	 * link speed and tells us via HDAT, and we need to copy that from the
819 	 * link node.
820 	 */
821 	speed = dt_prop_get_u32_def(dev->dt_node, "nvidia,link-speed", 0xff);
822 	if (speed != 0xff)
823 		dt_add_property_cells(pd->dn, "ibm,nvlink-speed", speed);
824 
825 	/*
826 	 * NPU2 devices have a slot label that indicates which GPU slot
827 	 * this NPU is connected to. Add a location code to the NVlink
828 	 * device node based on the slot label.
829 	 */
830 	label = dt_prop_get_def(dev->dt_node, "ibm,slot-label", NULL);
831 	if (!label) {
832 		/**
833 		 * @fwts-label NPUNoPHBSlotLabel
834 		 * @fwts-advice No GPU/NPU2 slot information was found.
835 		 * NVLink2 functionality will not work.
836 		 */
837 		prlog(PR_ERR, "NPU: Cannot find GPU slot information\n");
838 		return 0;
839 	}
840 	dt_add_property_string(pd->dn, "ibm,loc-code", label);
841 
842 	dev->nvlink.slot_label = label;
843 
844 	/*
845 	 * Bind the emulated PCI device with the real one, which can't
846 	 * be done until the PCI devices are populated. Once the real
847 	 * PCI device is identified, we also need to fix the device
848 	 * tree for it.
849 	 */
850 	npu2_dev_bind_pci_dev(dev);
851 	if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
852 		if (dt_find_property(dev->nvlink.pd->dn, "ibm,npu"))
853 			npu2_append_phandle(dev->nvlink.pd->dn, pd->dn->phandle);
854 		else
855 			dt_add_property_cells(dev->nvlink.pd->dn, "ibm,npu", pd->dn->phandle);
856 
857 		dt_add_property_cells(pd->dn, "ibm,gpu", dev->nvlink.pd->dn->phandle);
858 		dev->nvlink.gpu_bdfn = dev->nvlink.pd->bdfn;
859 	}
860 
861 	return 0;
862 }
863 
864 static int npu2_links_per_gpu(struct phb *phb,
865 			      struct pci_device *pd,
866 			      void *data)
867 {
868 	struct npu2 *p = phb_to_npu2_nvlink(phb);
869 	struct npu2_dev *dev;
870 	int *nlinks = (int *)data;
871 
872 	dev = npu2_bdf_to_dev(p, pd->bdfn);
873 	assert(dev);
874 
875 	if (dev->nvlink.phb && dev->nvlink.pd && dev->nvlink.pd->dn) {
876 		const struct dt_property *prop;
877 		int n;
878 
879 		/* The link count is the number of phandles in "ibm,npu" */
880 		prop = dt_find_property(dev->nvlink.pd->dn, "ibm,npu");
881 		if (!prop)
882 			return 0;
883 
884 		/* Count could vary by gpu, so find the max */
885 		n = prop->len / sizeof(uint32_t);
886 		if (n > *nlinks)
887 			*nlinks = n;
888 	}
889 
890 	return 0;
891 }
892 
893 static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
894 {
895 	uint32_t gcid = dt_get_chip_id(dn);
896 	uint64_t val, mask;
897 
898 	/*
899 	 * MRBSP settings for 2- and 3-link GPU systems. These can improve
900 	 * GPU peer-to-peer fully ordered write performance.
901 	 */
902 	if (links_per_gpu == 3) {
903 		val = PPC_BIT(30) | PPC_BIT(34) | PPC_BIT(36) | PPC_BIT(37) |
904 		      PPC_BIT(44) | PPC_BIT(45);
905 		mask = PPC_BITMASK(28,39) | PPC_BITMASK(44,47);
906 	} else if (links_per_gpu == 2) {
907 		val = PPC_BIT(46) | PPC_BIT(47);
908 		mask = PPC_BITMASK(44,47);
909 	} else
910 		return;
911 
912 	xscom_write_mask(gcid, 0x50110c0, val, mask);
913 	xscom_write_mask(gcid, 0x50112c0, val, mask);
914 	xscom_write_mask(gcid, 0x50114c0, val, mask);
915 }
916 
917 static void npu2_phb_final_fixup(struct phb *phb)
918 {
919 	int links_per_gpu = 0;
920 	struct dt_node *np;
921 
922 	pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);
923 
924 	/*
925 	 * Now that the emulated devices are bound to the real ones, we can
926 	 * determine links_per_gpu and do some final init.
927 	 */
928 	pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
929 	dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
930 		npu2_phb_fixup_scominit(np, links_per_gpu);
931 }
932 
933 static void npu2_init_ioda_cache(struct npu2 *p)
934 {
935 	/* TVT */
936 	memset(p->tve_cache, 0, sizeof(p->tve_cache));
937 }
938 
939 static int64_t npu2_ioda_reset(struct phb *phb, bool purge)
940 {
941 	struct npu2 *p = phb_to_npu2_nvlink(phb);
942 	uint32_t i;
943 
944 	if (purge) {
945 		NPU2DBG(p, "Purging all IODA tables...\n");
946 		npu2_init_ioda_cache(p);
947 	}
948 
949 	/* TVT */
950 	npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true);
951 	for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
952 		out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]);
953 
954 	return OPAL_SUCCESS;
955 }
956 
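/*
 * Program one MCD bank: the covered region must be a power-of-two
 * size; the hardware takes the address in 32MB (1 << 25) units and
 * the size as a count of 32MB granules minus one.
 */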
957 static void npu2_write_mcd(struct npu2 *p, uint64_t pcb_addr, uint64_t addr,
958 			   uint64_t size)
959 {
960 	uint64_t val;
961 
962 	NPU2DBG(p, "Setting MCD addr:%llx\n", pcb_addr);
963 	assert(is_pow2(size));
964 
965 	val = MCD_BANK_CN_VALID;
966 	val = SETFIELD(MCD_BANK_CN_SIZE, val, (size >> 25) - 1);
967 	val = SETFIELD(MCD_BANK_CN_ADDR, val, addr >> 25);
968 	xscom_write(p->chip_id, pcb_addr, val);
969 }
970 
971 static void npu2_mcd_init(struct npu2 *p)
972 {
973 	int i;
974 	uint64_t size, addr, gpu_min_addr, gpu_max_addr, total_size;
975 
976 	/* Init memory cache directory (MCD) registers. */
977 	phys_map_get(p->chip_id, p->gpu_map_type, NPU2_LINKS_PER_CHIP - 1,
978 			&gpu_min_addr, NULL);
979 	phys_map_get(p->chip_id, p->gpu_map_type, 0, &gpu_max_addr, &size);
980 	gpu_max_addr += size;
981 
982 	/* We assume GPU memory is contiguous from the first possible GPU to the
983 	 * last and that each GPU is assigned the same size, so best to check that. */
984 	for (i = 0; i < NPU2_LINKS_PER_CHIP; i++) {
985 		uint64_t tmp;
986 		phys_map_get(p->chip_id, p->gpu_map_type, i, &addr, &tmp);
987 		assert((addr >= gpu_min_addr) && (addr + tmp <= gpu_max_addr));
988 		assert(tmp == size);
989 	}
990 
991 	/* We have two MCDs, so we can split the region covered across
992 	 * both if total_size is not a power of two. */
993 	total_size = gpu_max_addr - gpu_min_addr;
994 	size = 1ull << ilog2(total_size);
995 
996 	/* Allocate the biggest chunk first as we assume gpu_max_addr has the
997 	 * highest alignment. */
998 	addr = gpu_max_addr - size;
999 	npu2_write_mcd(p, MCD0_BANK0_CN3, addr, size);
1000 	total_size -= size;
1001 	if (total_size) {
1002 		/* total_size was not a power of two, but the remainder should
1003 		 * be if all GPUs were assigned the same size. */
1004 		assert(is_pow2(total_size));
1005 		size = 1ull << ilog2(total_size);
1006 		addr -= size;
1007 		assert(addr <= gpu_min_addr);
1008 		npu2_write_mcd(p, MCD1_BANK0_CN3, addr, size);
1009 	}
1010 }
1011 
1012 static void npu2_hw_init(struct npu2 *p)
1013 {
1014 	uint64_t reg, val;
1015 	int s, b;
1016 
1017 	npu2_ioda_reset(&p->phb_nvlink, false);
1018 
1019 	/* Enable XTS retry mode */
1020 	val = npu2_read(p, NPU2_XTS_CFG);
1021 	npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_MMIOSD | NPU2_XTS_CFG_TRY_ATR_RO);
1022 
1023 	val = npu2_read(p, NPU2_XTS_CFG2);
1024 	npu2_write(p, NPU2_XTS_CFG2, val | NPU2_XTS_CFG2_NO_FLUSH_ENA);
1025 
1026 	/*
1027 	 * There are three different ways we configure the MCD and memory map.
1028 	 * 1) Old way
1029 	 *    Skiboot configures the MCD and puts GPUs at 4TB and below
1030 	 * 2) New way with MCD
1031 	 *    Hostboot configures the MCD and skiboot puts GPU at 4TB and above
1032 	 * 3) New way without MCD
1033 	 *    No one configures the MCD and skiboot puts GPU at 4TB and below
1034 	 *
1035 	 * 1) Will go away eventually as it's a configuration that can
1036 	 *    cause an xstop or data integrity problems. We are keeping
1037 	 *    it around to support existing hostboot. Print an error
1038 	 *    message if used.
1039 	 * 2) Is for smaller memory configurations and will be used
1040 	 *    initially for GPUs on Witherspoon. Supports only up to
1041 	 *    512GB of memory and 4 GPUs per socket.
1042 	 * 3) Is for fully populated configurations of 4TB of memory
1043 	 *    and 6 GPUs per socket. May have performance impacts.
1044 	 *
1045 	 * The different configurations can be detected via the following scoms:
1046 	 * 1) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 0
1047 	 * 2) 0x5011c0c bit 2 = 1, 0x5011c0a bits 42:48 = 7
1048 	 * 3) 0x5011c0c bit 2 = 0, 0x5011c0a bits 42:48 = 0
1049 	 */
1050 
1051 	/* Get 0x05011c0c bit 2 = 1 */
1052 	xscom_read(p->chip_id, PB_CENT_HP_MODE_CURR, &val);
1053 	if ((val & PB_CFG_CHG_RATE_GP_MASTER) != 0) {
1054 		/* Get 0x05011c0a bits 42:48 */
1055 		xscom_read(p->chip_id, PB_CENT_MODE, &val);
1056 		if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 0) {
1057 			/* 1) */
1058 			NPU2DBG(p, "Using old memory map + MCD enabled in skiboot\n");
1059 			NPU2ERR(p, "!!! Old firmware detected. Update hostboot for new MCD mapping !!!\n");
1060 			p->gpu_map_type = GPU_MEM_4T_DOWN;
1061 			npu2_mcd_init(p);
1062 		} else if (GETFIELD(PB_CFG_CHIP_ADDR_EXTENSION_MASK_CENT, val) == 7) {
1063 			/* 2) */
1064 			NPU2DBG(p, "Using small memory map + MCD enabled\n");
1065 			p->gpu_map_type = GPU_MEM_4T_UP;
1066 		} else
1067 			NPU2ERR(p, "!!! Unsupported NPU2 configuration. "
1068 				"0x%llx!!!\n", val);
1069 	} else {
1070 		/* 3) */
1071 		NPU2DBG(p, "Using large memory map + MCD disabled\n");
1072 		p->gpu_map_type = GPU_MEM_4T_DOWN;
1073 	}
1074 
1075 	/* Static initialization of every relaxed-ordering cfg[2] register */
1076 	val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
1077 	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
1078 	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
1079 	      NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
1080 	      NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
1081 	      NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
1082 	      NPU2_RELAXED_ORDERING_SOURCE4_RDENA;
1083 
1084 	for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
1085 		for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
1086 			reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));
1087 			npu2_write(p, reg, val);
1088 		}
1089 	}
1090 }
1091 
1092 static int64_t npu2_map_pe_dma_window_real(struct phb *phb,
1093 					   uint64_t pe_num,
1094 					   uint16_t window_id,
1095 					   uint64_t pci_start_addr __unused,
1096 					   uint64_t pci_mem_size __unused)
1097 {
1098 	struct npu2 *p = phb_to_npu2_nvlink(phb);
1099 	uint64_t tve;
1100 
1101 	/* Sanity check. Each PE has one corresponding TVE */
1102 	if (pe_num >= NPU2_MAX_PE_NUM ||
1103 	    window_id != pe_num)
1104 		return OPAL_PARAMETER;
1105 
1106 	if (pci_mem_size) {
1107 		/* GPUs need to be able to access the MMIO memory space as well.
1108 		 * On POWER9 this is above the top of RAM, so disable the TVT
1109 		 * range check, allowing access to all memory addresses. */
1110 		tve = 0;
1111 	} else {
1112 		/* Disable */
1113 		tve = PPC_BIT(51);
1114 	}
1115 
1116 	npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
1117 	out_be64(p->regs + NPU2_ATS_IODA_DATA, tve);
1118 	p->tve_cache[window_id] = tve;
1119 
1120 	return OPAL_SUCCESS;
1121 }
1122 
1123 static int64_t npu2_map_pe_dma_window(struct phb *phb,
1124 				      uint64_t pe_num,
1125 				      uint16_t window_id,
1126 				      uint16_t tce_levels,
1127 				      uint64_t tce_table_addr,
1128 				      uint64_t tce_table_size,
1129 				      uint64_t tce_page_size)
1130 {
1131 	struct npu2 *p = phb_to_npu2_nvlink(phb);
1132 	uint64_t tts_encoded;
1133 	uint64_t data64 = 0;
1134 
1135 	/* Sanity check. Each PE has one corresponding TVE */
1136 	if (pe_num >= NPU2_MAX_PE_NUM ||
1137 	    window_id != pe_num)
1138 		return OPAL_PARAMETER;
1139 
1140 	/*
1141 	 * Special case: a zero TCE table size is used to disable
1142 	 * the TVE.
1143 	 */
1144 	if (!tce_table_size) {
1145 		npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
1146 		out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul);
1147 		p->tve_cache[window_id] = 0ul;
1148 		return OPAL_SUCCESS;
1149 	}
1150 
1151 	/* Additional arguments validation */
1152 	if (tce_levels < 1 ||
1153 	    tce_levels > 4 ||
1154 	    !is_pow2(tce_table_size) ||
1155 	    tce_table_size < 0x1000)
1156 		return OPAL_PARAMETER;
1157 
1158 	/* TCE table size */
1159 	data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12);
1160 	tts_encoded = ilog2(tce_table_size) - 11;
1161 	if (tts_encoded > 39)
1162 		return OPAL_PARAMETER;
1163 	data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded);
1164 
1165 	/* TCE page size */
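	/* The PSIZE field encodes log2(page size) - 11: 4K=1, 64K=5, 16M=13, 256M=17 */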
1166 	switch (tce_page_size) {
1167 	case 0x10000:		/* 64K */
1168 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5);
1169 		break;
1170 	case 0x1000000:		/* 16M */
1171 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13);
1172 		break;
1173 	case 0x10000000:	/* 256M */
1174 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17);
1175 		break;
1176 	case 0x1000:		/* 4K */
1177 	default:
1178 		data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1);
1179 	}
1180 
1181 	/* Number of levels */
1182 	data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1);
1183 
1184 	/* Update to hardware */
1185 	npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
1186 	out_be64(p->regs + NPU2_ATS_IODA_DATA, data64);
1187 	p->tve_cache[window_id] = data64;
1188 
1189 	return OPAL_SUCCESS;
1190 }
1191 
1192 static int64_t npu2_set_pe(struct phb *phb,
1193 			   uint64_t pe_num,
1194 			   uint64_t bdfn,
1195 			   uint8_t bcompare,
1196 			   uint8_t dcompare,
1197 			   uint8_t fcompare,
1198 			   uint8_t action)
1199 {
1200 	struct npu2 *p;
1201 	struct npu2_dev *dev;
1202 	uint64_t reg, val;
1203 
1204 	/* Sanity check */
1205 	if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
1206 		return OPAL_PARAMETER;
1207 	if (pe_num >= NPU2_MAX_PE_NUM)
1208 		return OPAL_PARAMETER;
1209 	if (bdfn >> 8)
1210 		return OPAL_PARAMETER;
1211 	if (bcompare != OpalPciBusAll ||
1212 	    dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
1213 	    fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
1214 		return OPAL_UNSUPPORTED;
1215 	if (phb->phb_type != phb_type_npu_v2)
1216 		return OPAL_PARAMETER;
1217 
1218 	p = phb_to_npu2_nvlink(phb);
1219 	if (!p)
1220 		return OPAL_PARAMETER;
1221 
1222 	dev = npu2_bdf_to_dev(p, bdfn);
1223 	if (!dev)
1224 		return OPAL_PARAMETER;
1225 
1226 	val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
1227 	val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num);
1228 	val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
1229 
1230 	if (!NPU2DEV_BRICK(dev))
1231 		reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
1232 				      NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0);
1233 	else
1234 		reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->brick_index/2,
1235 				      NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0);
1236 
1237 	npu2_write(p, reg, val);
1238 	val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
1239 	val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num);
1240 	val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->nvlink.gpu_bdfn);
1241 	reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
1242 			      NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->brick_index * 0x18));
1243 	npu2_write(p, reg, val);
1244 
1245 	return OPAL_SUCCESS;
1246 }
1247 
1248 static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
1249 {
1250 	/*
1251 	 * As we're emulating all the PCI stuff, the link bandwidth
1252 	 * isn't a big deal anyway.
1253 	 */
1254 	*val = OPAL_SHPC_LINK_UP_x1;
1255 	return OPAL_SUCCESS;
1256 }
1257 
1258 static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
1259 {
1260 	*val = PCI_SLOT_POWER_ON;
1261 	return OPAL_SUCCESS;
1262 }
1263 
1264 static int64_t npu2_hreset(struct pci_slot *slot __unused)
1265 {
1266 	struct npu2 *p;
1267 	int i;
1268 	struct npu2_dev *ndev;
1269 
1270 	p = phb_to_npu2_nvlink(slot->phb);
1271 	NPU2INF(p, "Hreset PHB state\n");
1272 
1273 	for (i = 0; i < p->total_devices; i++) {
1274 		ndev = &p->devices[i];
1275 		if (ndev) {
1276 			NPU2DEVINF(ndev, "Resetting device\n");
1277 			reset_ntl(ndev);
1278 		}
1279 	}
1280 	return purge_l2_l3_caches();
1281 }
1282 
1283 static int64_t npu2_freset(struct pci_slot *slot __unused)
1284 {
1285 	return OPAL_SUCCESS;
1286 }
1287 
1288 static int64_t npu2_creset(struct pci_slot *slot)
1289 {
1290 	struct npu2 *p;
1291 	int i;
1292 	struct npu2_dev *ndev;
1293 
1294 	p = phb_to_npu2_nvlink(slot->phb);
1295 	NPU2INF(p, "Creset PHB state\n");
1296 
1297 	for (i = 0; i < p->total_devices; i++) {
1298 		ndev = &p->devices[i];
1299 		if (ndev) {
1300 			NPU2DEVINF(ndev, "Resetting device\n");
1301 			reset_ntl(ndev);
1302 		}
1303 	}
1304 	return OPAL_SUCCESS;
1305 }
1306 
1307 static struct pci_slot *npu2_slot_create(struct phb *phb)
1308 {
1309 	struct pci_slot *slot;
1310 
1311 	slot = pci_slot_alloc(phb, NULL);
1312 	if (!slot)
1313 		return slot;
1314 
1315 	/* Elementary functions */
1316 	slot->ops.get_presence_state  = NULL;
1317 	slot->ops.get_link_state      = npu2_get_link_state;
1318 	slot->ops.get_power_state     = npu2_get_power_state;
1319 	slot->ops.get_attention_state = NULL;
1320 	slot->ops.get_latch_state     = NULL;
1321 	slot->ops.set_power_state     = NULL;
1322 	slot->ops.set_attention_state = NULL;
1323 
1324 	slot->ops.prepare_link_change = NULL;
1325 	slot->ops.poll_link           = NULL;
1326 	slot->ops.hreset              = npu2_hreset;
1327 	slot->ops.freset              = npu2_freset;
1328 	slot->ops.creset              = npu2_creset;
1329 
1330 	return slot;
1331 }
1332 
1333 int64_t npu2_freeze_status(struct phb *phb __unused,
1334 			   uint64_t pe_number __unused,
1335 			   uint8_t *freeze_state,
1336 			   uint16_t *pci_error_type,
1337 			   uint16_t *severity)
1338 {
1339 	/*
1340 	 * FIXME: When it's called by the skiboot PCI config accessor,
1341 	 * the PE number is fixed to 0, which is incorrect. We need to
1342 	 * introduce another PHB callback to translate it. For now,
1343 	 * it keeps the skiboot PCI enumeration going.
1344 	 */
1345 	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
1346 	*pci_error_type = OPAL_EEH_NO_ERROR;
1347 	if (severity)
1348 		*severity = OPAL_EEH_SEV_NO_ERROR;
1349 
1350 	return OPAL_SUCCESS;
1351 }
1352 
1353 static int64_t npu2_eeh_next_error(struct phb *phb,
1354 				   uint64_t *first_frozen_pe,
1355 				   uint16_t *pci_error_type,
1356 				   uint16_t *severity)
1357 {
1358 	struct npu2 *p = phb_to_npu2_nvlink(phb);
1359 	int i;
1360 	uint64_t result = 0;
1361 
1362 	if (!first_frozen_pe || !pci_error_type || !severity)
1363 		return OPAL_PARAMETER;
1364 
1365 	*first_frozen_pe = -1;
1366 	*pci_error_type = OPAL_EEH_NO_ERROR;
1367 	*severity = OPAL_EEH_SEV_NO_ERROR;
1368 
1369 	for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
1370 		result = npu2_read(p, NPU2_MISC_PESTB(i));
1371 		if (result > 0) {
1372 			*first_frozen_pe = i;
1373 			*pci_error_type = OPAL_EEH_PE_ERROR;
1374 			*severity = OPAL_EEH_SEV_PE_ER;
1375 			break;
1376 		}
1377 	}
1378 
1379 	return OPAL_SUCCESS;
1380 }
1381 
1382 static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
1383 			     uint64_t pe_number, uint32_t tce_size,
1384 			     uint64_t dma_addr, uint32_t npages)
1385 {
1386 	struct npu2 *npu = phb_to_npu2_nvlink(phb);
1387 	uint32_t tce_page_size;
1388 	uint64_t val;
1389 
1390 	if (pe_number > NPU2_MAX_PE_NUM)
1391 		return OPAL_PARAMETER;
1392 
1393 	sync();
1394 	switch(kill_type) {
1395 	case OPAL_PCI_TCE_KILL_PAGES:
1396 		tce_page_size = 1ULL << (
1397 				11 + GETFIELD(npu->tve_cache[pe_number],
1398 					NPU2_ATS_IODA_TBL_TVT_PSIZE));
1399 		if (tce_page_size != tce_size) {
1400 			NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n",
1401 				tce_size, tce_page_size);
1402 			return OPAL_PARAMETER;
1403 		}
1404 
1405 		while (npages--) {
1406 			val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number);
1407 			npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val);
1408 			dma_addr += tce_size;
1409 		}
1410 		break;
1411 	case OPAL_PCI_TCE_KILL_PE:
1412 		/*
1413 		 * NPU2 doesn't support killing a PE so fall through
1414 		 * and do a kill all instead.
1415 		 */
1416 	case OPAL_PCI_TCE_KILL_ALL:
1417 		npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL);
1418 		break;
1419 	default:
1420 		return OPAL_PARAMETER;
1421 	}
1422 
1423 	return OPAL_SUCCESS;
1424 }
1425 
1426 static const struct phb_ops npu_ops = {
1427 	.cfg_read8		= npu2_cfg_read8,
1428 	.cfg_read16		= npu2_cfg_read16,
1429 	.cfg_read32		= npu2_cfg_read32,
1430 	.cfg_write8		= npu2_cfg_write8,
1431 	.cfg_write16		= npu2_cfg_write16,
1432 	.cfg_write32		= npu2_cfg_write32,
1433 	.choose_bus		= NULL,
1434 	.device_init		= NULL,
1435 	.phb_final_fixup	= npu2_phb_final_fixup,
1436 	.ioda_reset		= npu2_ioda_reset,
1437 	.papr_errinjct_reset	= NULL,
1438 	.pci_reinit		= NULL,
1439 	.set_phb_mem_window	= NULL,
1440 	.phb_mmio_enable	= NULL,
1441 	.map_pe_mmio_window	= NULL,
1442 	.map_pe_dma_window	= npu2_map_pe_dma_window,
1443 	.map_pe_dma_window_real	= npu2_map_pe_dma_window_real,
1444 	.pci_msi_eoi		= NULL,
1445 	.set_xive_pe		= NULL,
1446 	.get_msi_32		= NULL,
1447 	.get_msi_64		= NULL,
1448 	.set_pe			= npu2_set_pe,
1449 	.set_peltv		= NULL,
1450 	.eeh_freeze_status	= npu2_freeze_status,
1451 	.eeh_freeze_clear	= NULL,
1452 	.eeh_freeze_set		= NULL,
1453 	.next_error		= npu2_eeh_next_error,
1454 	.err_inject		= NULL,
1455 	.get_diag_data2		= NULL,
1456 	.set_capi_mode		= NULL,
1457 	.set_capp_recovery	= NULL,
1458 	.tce_kill		= npu2_tce_kill,
1459 };
1460 
1461 static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2])
1462 {
1463 	uint32_t i;
1464 	struct npu2_bar *bar;
1465 	struct npu2_bar npu2_bars[] = {
1466 		/* NPU_REGS must be first in this list */
1467 		{ .type = NPU_REGS, .index = 0,
1468 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
1469 		  .flags = NPU2_BAR_FLAG_ENABLED },
1470 		{ .type = NPU_PHY, .index = 0,
1471 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
1472 		  .flags = NPU2_BAR_FLAG_ENABLED },
1473 		{ .type = NPU_PHY, .index = 1,
1474 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
1475 		  .flags = NPU2_BAR_FLAG_ENABLED },
1476 		{ .type = NPU_NTL, .index = 0,
1477 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) },
1478 		{ .type = NPU_NTL, .index = 1,
1479 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) },
1480 		{ .type = NPU_NTL, .index = 2,
1481 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) },
1482 		{ .type = NPU_NTL, .index = 3,
1483 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) },
1484 		{ .type = NPU_NTL, .index = 4,
1485 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) },
1486 		{ .type = NPU_NTL, .index = 5,
1487 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) },
1488 		{ .type = NPU_GENID, .index = 0,
1489 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) },
1490 		{ .type = NPU_GENID, .index = 1,
1491 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) },
1492 		{ .type = NPU_GENID, .index = 2,
1493 		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) },
1494 	};
1495 
1496 	for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
1497 		bar = &npu2_bars[i];
1498 		npu2_get_bar(gcid, bar);
1499 		npu2_write_bar(NULL, bar, gcid, scom);
1500 	}
1501 
1502 	/* Global MMIO BAR */
1503 	reg[0] = npu2_bars[0].base;
1504 	reg[1] = npu2_bars[0].size;
1505 
1506 	/* NTL and GENID BARs are exposed to the kernel via the mm
1507 	 * window */
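	/* i.e. from the first NTL BAR up to the end of the last GENID BAR
	 * in npu2_bars[] above */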
1508 	mm_win[0] = npu2_bars[3].base;
1509 	mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base +
1510 		    npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size -
1511 		    mm_win[0];
1512 }
1513 
1514 /*
1515  * Set up NPU for NVLink and create PCI root device node
1516  * accordingly.
1517  */
1518 int npu2_nvlink_init_npu(struct npu2 *npu)
1519 {
1520 	struct dt_node *np;
1521 	uint64_t reg[2], mm_win[2], val, mask;
1522 
1523 	/* TODO: Clean this up with register names, etc. when we get
1524 	 * time. This just turns NVLink mode on in each brick and should
1525 	 * get replaced with a patch from ajd once we've worked out how
1526 	 * things are going to work there.
1527 	 *
1528 	 * Obviously if the year is now 2020 that didn't happen and you
1529 	 * should fix this :-) */
1530 
1531 	val = PPC_BIT(58);
1532 	mask = PPC_BIT(58) | /* CONFIG_NVLINK_MODE */
1533 	       PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
1534 
1535 	/*
1536 	 * V100 GPUs are known to violate NVLink2 protocol if some GPU memory
1537 	 * mapped by a CPU was also "linear-block" mapped by a GPU. When this
1538 	 * happens, it breaks the NPU2 cache coherency state machine and
1539 	 * it throws a machine checkstop. Disabling snarfing fixes this, so let's
1540 	 * disable it by default.
1541 	 */
1542 	if (nvram_query_eq_dangerous("opal-npu2-snarf-cpm", "enable")) {
1543 		prlog(PR_WARNING, "NPU2#%d: enabling Probe.I.MO snarfing, a bad GPU driver may crash the system!\n",
1544 				npu->index);
1545 		val |= PPC_BIT(40); /* CONFIG_ENABLE_SNARF_CPM */
1546 	}
1547 
1548 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM0_MISC_CONFIG0,
1549 			 val, mask);
1550 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM1_MISC_CONFIG0,
1551 			 val, mask);
1552 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM2_MISC_CONFIG0,
1553 			 val, mask);
1554 	xscom_write_mask(npu->chip_id, NPU_STCK0_CS_SM3_MISC_CONFIG0,
1555 			 val, mask);
1556 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM0_MISC_CONFIG0,
1557 			 val, mask);
1558 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM1_MISC_CONFIG0,
1559 			 val, mask);
1560 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM2_MISC_CONFIG0,
1561 			 val, mask);
1562 	xscom_write_mask(npu->chip_id, NPU_STCK1_CS_SM3_MISC_CONFIG0,
1563 			 val, mask);
1564 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM0_MISC_CONFIG0,
1565 			 val, mask);
1566 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM1_MISC_CONFIG0,
1567 			 val, mask);
1568 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM2_MISC_CONFIG0,
1569 			 val, mask);
1570 	xscom_write_mask(npu->chip_id, NPU_STCK2_CS_SM3_MISC_CONFIG0,
1571 			 val, mask);
1572 
1573 	xscom_write_mask(npu->chip_id, 0x50110c0, PPC_BIT(53), PPC_BIT(53));
1574 	xscom_write_mask(npu->chip_id, 0x50112c0, PPC_BIT(53), PPC_BIT(53));
1575 	xscom_write_mask(npu->chip_id, 0x50114c0, PPC_BIT(53), PPC_BIT(53));
1576 	xscom_write_mask(npu->chip_id, 0x50110f1, PPC_BIT(41), PPC_BIT(41));
1577 	xscom_write_mask(npu->chip_id, 0x50112f1, PPC_BIT(41), PPC_BIT(41));
1578 	xscom_write_mask(npu->chip_id, 0x50114f1, PPC_BIT(41), PPC_BIT(41));
1579 
1580 	val = NPU2_NTL_MISC_CFG2_BRICK_ENABLE |
1581 	      NPU2_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
1582 	      NPU2_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
1583 	      NPU2_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
1584 	xscom_write_mask(npu->chip_id, 0x5011110, val, val);
1585 	xscom_write_mask(npu->chip_id, 0x5011130, val, val);
1586 	xscom_write_mask(npu->chip_id, 0x5011310, val, val);
1587 	xscom_write_mask(npu->chip_id, 0x5011330, val, val);
1588 	xscom_write_mask(npu->chip_id, 0x5011510, val, val);
1589 	xscom_write_mask(npu->chip_id, 0x5011530, val, val);
1590 
1591 	val = PPC_BIT(6) | PPC_BIT(7) | PPC_BIT(11);
1592 	xscom_write_mask(npu->chip_id, 0x5011009, val, PPC_BITMASK(6,11));
1593 	xscom_write_mask(npu->chip_id, 0x5011039, val, PPC_BITMASK(6,11));
1594 	xscom_write_mask(npu->chip_id, 0x5011069, val, PPC_BITMASK(6,11));
1595 	xscom_write_mask(npu->chip_id, 0x5011099, val, PPC_BITMASK(6,11));
1596 	xscom_write_mask(npu->chip_id, 0x5011209, val, PPC_BITMASK(6,11));
1597 	xscom_write_mask(npu->chip_id, 0x5011239, val, PPC_BITMASK(6,11));
1598 	xscom_write_mask(npu->chip_id, 0x5011269, val, PPC_BITMASK(6,11));
1599 	xscom_write_mask(npu->chip_id, 0x5011299, val, PPC_BITMASK(6,11));
1600 	xscom_write_mask(npu->chip_id, 0x5011409, val, PPC_BITMASK(6,11));
1601 	xscom_write_mask(npu->chip_id, 0x5011439, val, PPC_BITMASK(6,11));
1602 	xscom_write_mask(npu->chip_id, 0x5011469, val, PPC_BITMASK(6,11));
1603 	xscom_write_mask(npu->chip_id, 0x5011499, val, PPC_BITMASK(6,11));
1604 
1605 	/* Reassign the BARs */
1606 	assign_mmio_bars(npu->chip_id, npu->xscom_base, reg, mm_win);
1607 	npu->regs = (void *)reg[0];
1608 	npu->mm_base = mm_win[0];
1609 	npu->mm_size = mm_win[1];
1610 
1611 	if (reg[0] && reg[1])
1612 		prlog(PR_INFO, "   Global MMIO BAR:  %016llx (%lldMB)\n",
1613 		      reg[0], reg[1] >> 20);
1614 	else
1615 		prlog(PR_ERR, "    Global MMIO BAR: Disabled\n");
1616 
1617 	/* Populate PCI root device node */
1618 	np = dt_new_addr(dt_root, "pciex", reg[0]);
1619 	assert(np);
1620 	dt_add_property_strings(np,
1621 				"compatible",
1622 				"ibm,power9-npu-pciex",
1623 				"ibm,ioda2-npu2-phb");
1624 	dt_add_property_strings(np, "device_type", "pciex");
1625 	dt_add_property(np, "reg", reg, sizeof(reg));
1626 	dt_add_property_cells(np, "ibm,phb-index", npu->phb_index);
1627 	dt_add_property_cells(np, "ibm,npu-index", npu->index);
1628 	dt_add_property_cells(np, "ibm,chip-id", npu->chip_id);
1629 	dt_add_property_cells(np, "ibm,xscom-base", npu->xscom_base);
1630 	dt_add_property_cells(np, "ibm,npcq", npu->dt_node->phandle);
1631 	dt_add_property_cells(np, "ibm,links", npu->total_devices);
1632 	dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
1633 	dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);
1634 
1635 	/* Disable fast reboot - not currently supported */
1636 	disable_fast_reboot("NVLink device enabled");
1637 
1638 	npu2_nvlink_create_phb(npu, np);
1639 
1640 	return 0;
1641 }
1642 
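/*
 * Populate an emulated PCI Express capability at "start" and link it into
 * the capability list via the pointer byte at "prev_cap". Returns the
 * offset just past the capability so the next one can be chained.
 */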
1643 static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev,
1644 				       uint32_t start,
1645 				       uint32_t prev_cap)
1646 {
1647 	struct pci_virt_device *pvd = dev->nvlink.pvd;
1648 	uint32_t val;
1649 
1650 	/* Add capability list */
1651 	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
1652 	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
1653 
1654 	/* 0x00 - ID/PCIE capability */
1655 	val = PCI_CFG_CAP_ID_EXP;
1656 	val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
1657 	PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
1658 
1659 	/* 0x04 - Device capability
1660 	 *
1661 	 * We advertise FLR support. Without it, passing the device
1662 	 * through to userland via the Linux VFIO infrastructure
1663 	 * may be problematic.
1664 	 */
1665 	val = ((PCIE_MPSS_128) |
1666 	       (PCIE_PHANTOM_NONE << 3) |
1667 	       (PCIE_L0SL_MAX_NO_LIMIT << 6) |
1668 	       (PCIE_L1L_MAX_NO_LIMIT << 9) |
1669 	       (PCICAP_EXP_DEVCAP_FUNC_RESET));
1670 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
1671 
1672 	pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
1673 			    PCI_REG_FLAG_WRITE,
1674 			    npu2_dev_cfg_exp_devcap, NULL);
1675 
1676 	/* 0x08 - Device control and status */
1677 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
1678 			  0xffff0000, 0x000f0000);
1679 
1680 	/* 0x0c - Link capability */
1681 	val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
1682 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
1683 
1684 	/* 0x10 - Link control and status */
1685 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
1686 			 0xfffff000, 0xc0000000);
1687 
1688 	/* 0x14 - Slot capability */
1689 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
1690 
1691 	/* 0x18 - Slot control and status */
1692 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
1693 
1694 	/* 0x1c - Root control and capability */
1695 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
1696 			  0xffffffe0, 0x00000000);
1697 
1698 	/* 0x20 - Root status */
1699 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
1700 			 0xffffffff, 0x00010000);
1701 
1702 	/* 0x24 - Device capability 2 */
1703 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
1704 
1705 	/* 0x28 - Device Control and status 2 */
1706 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
1707 			 0xffff0000, 0x00000000);
1708 
1709 	/* 0x2c - Link capability 2 */
1710 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
1711 
1712 	/* 0x30 - Link control and status 2 */
1713 	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
1714 			 0xffff0000, 0x00200000);
1715 
1716 	/* 0x34 - Slot capability 2 */
1717 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
1718 
1719 	/* 0x38 - Slot control and status 2 */
1720 	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
1721 
1722 	return start + PCICAP_EXP_SCTL2 + 8;
1723 }
1724 
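/*
 * Populate the NPU vendor-specific capability: it carries the trapped
 * NVLink2 PHY procedure registers and the link index for this brick.
 */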
1725 static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev,
1726 					 uint32_t start,
1727 					 uint32_t prev_cap)
1728 {
1729 	struct pci_virt_device *pvd = dev->nvlink.pvd;
1730 
1731 	/* Capability list */
1732 	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
1733 	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
1734 
1735 	/* Length and version */
1736 	PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
1737 	PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
1738 
1739 	/*
1740 	 * Defaults when the trap can't handle the read/write (e.g. due
1741 	 * to reading/writing fewer than 4 bytes).
1742 	 */
1743 	PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
1744 	PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
1745 
1746 	/* Add NVLink2 PHY procedures trap */
1747 	pci_virt_add_filter(pvd, start + 4, 8,
1748 			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1749 			    npu2_dev_procedure,
1750 			    NULL);
1751 
1752 	/* Link index */
1753 	PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->link_index);
1754 
1755 	return start + VENDOR_CAP_LEN;
1756 }
1757 
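/*
 * Build the emulated config space for an NVLink brick: a standard type 0
 * header with the NTL and GENID BARs, a PCI Express capability at 0x40
 * and the vendor-specific capability chained after it.
 */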
1758 static void npu2_populate_cfg(struct npu2_dev *dev)
1759 {
1760 	struct pci_virt_device *pvd = dev->nvlink.pvd;
1761 	struct npu2_pcie_bar *bar;
1762 	uint32_t pos;
1763 
1764 	/* 0x00 - Vendor/Device ID */
1765 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
1766 
1767 	/* 0x04 - Command/Status */
1768 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
1769 			  0xf9000000);
1770 
1771 	pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
1772 			    npu2_cfg_write_cmd, NULL);
1773 
1774 	/* 0x08 - Rev/Class/Cache */
1775 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800101);
1776 
1777 	/* 0x0c - CLS/Latency Timer/Header/BIST */
1778 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
1779 
1780 	/* 0x10/14 - BAR#0, NTL BAR */
1781 	bar = &dev->bars[0];
1782 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
1783 			  (bar->npu2_bar.base & 0xfffffff0) | (bar->flags & 0xF),
1784 			  0x0000000f, 0x00000000);
1785 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->npu2_bar.base >> 32),
1786 			  0x00000000, 0x00000000);
1787 	pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
1788 			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1789 			    npu2_dev_cfg_bar, bar);
1790 
1791 	/* 0x18/1c - BAR#1, GENID BAR */
1792 	bar = &dev->bars[1];
1793 	if (NPU2DEV_BRICK(dev) == 0)
1794 		PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->npu2_bar.base & 0xfffffff0) |
1795 				  (bar->flags & 0xF),
1796 				  0x0000000f, 0x00000000);
1797 	else
1798 		/* Brick 1 gets the upper portion of the generation id register */
1799 		PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, ((bar->npu2_bar.base + 0x10000) & 0xfffffff0) |
1800 				  (bar->flags & 0xF),
1801 				  0x0000000f, 0x00000000);
1802 
1803 	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->npu2_bar.base >> 32), 0x00000000,
1804 			  0x00000000);
1805 	pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
1806 			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
1807 			    npu2_dev_cfg_bar, bar);
1808 
1809 	/* 0x20/0x24 - BARs, disabled */
1810 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
1811 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
1812 
1813 	/* 0x28 - Cardbus CIS pointer */
1814 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
1815 
1816 	/* 0x2c - Subsystem ID */
1817 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
1818 
1819 	/* 0x30 - ROM BAR, zero sized */
1820 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
1821 
1822 	/* 0x34 - PCI Capability */
1823 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
1824 
1825 	/* 0x38 - Reserved */
1826 	PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
1827 
1828 	/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
1829 	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
1830 
1831 	/* PCIE and vendor specific capability */
1832 	pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
1833 	pos = npu2_populate_vendor_cap(dev, pos, 0x41);
1834 	PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
1835 }
1836 
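/*
 * Allocate a BDFN for a new emulated device. The group id selects the PCI
 * device number (bits 7:3); the function number is the count of devices
 * already assigned to that group.
 */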
1837 static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group)
1838 {
1839 	int i;
1840 	int bdfn = (group << 3);
1841 
1842 	for (i = 0; i < p->total_devices; i++) {
1843 		if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8))
1844 			bdfn++;
1845 	}
1846 
1847 	return bdfn;
1848 }
1849 
1850 static void npu2_populate_devices(struct npu2 *p,
1851 				  struct dt_node *dn)
1852 {
1853 	struct npu2_dev *dev;
1854 	struct dt_node *npu2_dn, *link;
1855 	uint32_t npu_phandle, index = 0;
1856 	int stack;
1857 
1858 	/*
1859 	 * Get the npu node that holds the links which we expand here
1860 	 * into PCI-like devices attached to our emulated PHB.
1861 	 */
1862 	npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
1863 	npu2_dn = dt_find_by_phandle(dt_root, npu_phandle);
1864 	assert(npu2_dn);
1865 
1866 	/* Walk the link@x nodes to initialize devices */
1867 	p->total_devices = 0;
1868 	p->phb_nvlink.scan_map = 0;
1869 	dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
1870 		uint32_t group_id;
1871 		struct npu2_bar *npu2_bar;
1872 
1873 		dev = &p->devices[index];
1874 		dev->type = NPU2_DEV_TYPE_NVLINK;
1875 		dev->npu = p;
1876 		dev->dt_node = link;
1877 		dev->link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
1878 		dev->brick_index = dev->link_index;
1879 
1880 		group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
1881 		dev->bdfn = npu_allocate_bdfn(p, group_id);
1882 
1883 		/* This must be done after calling
1884 		 * npu_allocate_bdfn() */
1885 		p->total_devices++;
1886 		p->phb_nvlink.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
1887 
1888 		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
1889 		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
1890 
1891 		/* Populate BARs. BAR0/1 is the NTL bar. */
1892 		stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev);
1893 		npu2_bar = &dev->bars[0].npu2_bar;
1894 		npu2_bar->type = NPU_NTL;
1895 		npu2_bar->index = dev->brick_index;
1896 		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ?
1897 						NPU2_NTL0_BAR : NPU2_NTL1_BAR);
1898 		npu2_get_bar(p->chip_id, npu2_bar);
1899 
1900 		dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
1901 
1902 		/* BAR2/3 is the GENID bar. */
1903 		npu2_bar = &dev->bars[1].npu2_bar;
1904 		npu2_bar->type = NPU_GENID;
1905 		npu2_bar->index = NPU2DEV_STACK(dev);
1906 		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR);
1907 		npu2_get_bar(p->chip_id, npu2_bar);
1908 
1909 		/* The GENID is a single physical BAR that we split
1910 		 * for each emulated device */
1911 		npu2_bar->size = 0x10000;
1912 		if (NPU2DEV_BRICK(dev))
1913 			npu2_bar->base += 0x10000;
1914 		dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
1915 
1916 		/* Initialize PCI virtual device */
1917 		dev->nvlink.pvd = pci_virt_add_device(&p->phb_nvlink, dev->bdfn, 0x100, dev);
1918 		if (dev->nvlink.pvd)
1919 			npu2_populate_cfg(dev);
1920 
1921 		index++;
1922 	}
1923 }
1924 
1925 static void npu2_add_interrupt_map(struct npu2 *p,
1926 				  struct dt_node *dn)
1927 {
1928 	struct dt_node *npu2_dn, *link, *phb_dn;
1929 	uint32_t npu2_phandle, index = 0, i;
1930 	uint32_t icsp = get_ics_phandle();
1931 	uint32_t *map;
1932 	size_t map_size;
1933 	uint32_t mask[] = {0xff00, 0x0, 0x0, 0x7};
1934 
1935 	assert(p->phb_nvlink.dt_node);
1936 	phb_dn = p->phb_nvlink.dt_node;
1937 
1938 	npu2_phandle = dt_prop_get_u32(dn, "ibm,npcq");
1939 	npu2_dn = dt_find_by_phandle(dt_root, npu2_phandle);
1940 	assert(npu2_dn);
1941 	map_size = 7 * sizeof(*map) * p->total_devices;
1942 	map = malloc(map_size);
1943 	index = 0;
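	/*
	 * Each interrupt-map entry is 7 cells: a 3-cell child unit address
	 * (bdfn in the standard phys.hi position), a 1-cell child interrupt
	 * specifier, the interrupt parent phandle, and a 2-cell parent
	 * specifier (LSI source number and trigger type).
	 */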
1944 	dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
1945 		i = index * 7;
1946 		map[i + 0] = (p->devices[index].bdfn << 8);
1947 		map[i + 1] = 0;
1948 		map[i + 2] = 0;
1949 
1950 		map[i + 3] = 1; /* INT A */
1951 		map[i + 4] = icsp; /* interrupt-parent */
1952 		map[i + 5] = p->base_lsi + (index * 2) + 1; /* NDL No-Stall Event */
1953 		map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL. */
1954 		index++;
1955 	}
1956 	dt_add_property(phb_dn, "interrupt-map", map, map_size);
1957 	free(map);
1958 	dt_add_property(phb_dn, "interrupt-map-mask", mask, sizeof(mask));
1959 }
1960 
1961 static void npu2_add_phb_properties(struct npu2 *p)
1962 {
1963 	struct dt_node *np = p->phb_nvlink.dt_node;
1964 	uint32_t icsp = get_ics_phandle();
1965 	uint64_t mm_base, mm_size;
1966 
1967 	/*
1968 	 * Add various properties that HB doesn't have to
1969 	 * add, some of them simply because they result from
1970 	 * policy decisions made in skiboot rather than in HB,
1971 	 * such as the MMIO windows going to PCI, interrupts,
1972 	 * etc.
1973 	 */
1974 	dt_add_property_cells(np, "#address-cells", 3);
1975 	dt_add_property_cells(np, "#size-cells", 2);
1976 	dt_add_property_cells(np, "#interrupt-cells", 1);
1977 	dt_add_property_cells(np, "bus-range", 0, 0xff);
1978 	dt_add_property_cells(np, "clock-frequency", 0x200, 0);
1979 	dt_add_property_cells(np, "interrupt-parent", icsp);
1980 
1981 	/* NPU2 PHB properties */
1982 	dt_add_property_cells(np, "ibm,opal-num-pes",
1983 			      NPU2_MAX_PE_NUM);
1984 	dt_add_property_cells(np, "ibm,opal-reserved-pe",
1985 			      NPU2_RESERVED_PE_NUM);
1986 	dt_add_property_cells(np, "ibm,supported-tce-sizes",
1987 			      12, // 4K
1988 			      16, // 64K
1989 			      24, // 16M
1990 			      28); // 256M
1991 
1992 	dt_add_property_u64s(np, "ibm,mmio-atsd",
1993 			MMIO_ATSD_ADDR(p->regs, 0),
1994 			MMIO_ATSD_ADDR(p->regs, 1),
1995 			MMIO_ATSD_ADDR(p->regs, 2),
1996 			MMIO_ATSD_ADDR(p->regs, 3),
1997 			MMIO_ATSD_ADDR(p->regs, 4),
1998 			MMIO_ATSD_ADDR(p->regs, 5),
1999 			MMIO_ATSD_ADDR(p->regs, 6),
2000 			MMIO_ATSD_ADDR(p->regs, 7));
2001 
2002 	/*
2003 	 * The memory window is exposed as a 64-bit non-prefetchable
2004 	 * window because a 64-bit prefetchable one is treated
2005 	 * specially by the kernel.
2006 	 */
2007 	mm_base = p->mm_base;
2008 	mm_size = p->mm_size;
2009 	dt_add_property_cells(np, "ranges", 0x02000000,
2010 			      hi32(mm_base), lo32(mm_base),
2011 			      hi32(mm_base), lo32(mm_base),
2012 			      hi32(mm_size), lo32(mm_size));
2013 }
2014 
2015 void npu2_nvlink_create_phb(struct npu2 *npu, struct dt_node *dn)
2016 {
2017 	struct pci_slot *slot;
2018 
2019 	/* Generic PHB */
2020 	npu->phb_nvlink.dt_node = dn;
2021 	npu->phb_nvlink.ops = &npu_ops;
2022 	npu->phb_nvlink.phb_type = phb_type_npu_v2;
2023 	init_lock(&npu->lock);
2024 	init_lock(&npu->phb_nvlink.lock);
2025 	list_head_init(&npu->phb_nvlink.devices);
2026 	list_head_init(&npu->phb_nvlink.virt_devices);
2027 
2028 	npu2_populate_devices(npu, dn);
2029 	npu2_add_interrupt_map(npu, dn);
2030 	npu2_add_phb_properties(npu);
2031 
2032 	slot = npu2_slot_create(&npu->phb_nvlink);
2033 	if (!slot)
2034 	{
2035 		/**
2036 		 * @fwts-label NPUCannotCreatePHBSlot
2037 		 * @fwts-advice Firmware probably ran out of memory creating
2038 		 * NPU2 slot. NVLink functionality could be broken.
2039 		 */
2040 		prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
2041 	}
2042 
2043 	pci_register_phb(&npu->phb_nvlink, OPAL_DYNAMIC_PHB_ID);
2044 
2045 	npu2_init_ioda_cache(npu);
2046 	npu2_hw_init(npu);
2047 }
2048 
2049 /*
2050  * Search a table for an entry whose value matches *value under mask.
2051  * Returns the index (or -1 if no match) and the full entry in *value.
2052  */
2053 static int npu_table_search(struct npu2 *p, uint64_t table_addr, int stride,
2054 			    int table_size, uint64_t *value, uint64_t mask)
2055 {
2056 	int i;
2057 	uint64_t val;
2058 
2059 	assert(value);
2060 
2061 	for (i = 0; i < table_size; i++) {
2062 		val = npu2_read(p, table_addr + i*stride);
2063 		if ((val & mask) == *value) {
2064 			*value = val;
2065 			return i;
2066 		}
2067 	}
2068 
2069 	return -1;
2070 }
2071 
2072 /*
2073  * Allocate a context ID and initialise the tables with the relevant
2074  * information. Returns the ID on success, or an error if one
2075  * couldn't be allocated.
2076  */
2077 #define NPU2_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
2078 static int64_t opal_npu_init_context(uint64_t phb_id, int pasid __unused,
2079 				     uint64_t msr, uint64_t bdf)
2080 {
2081 	struct phb *phb = pci_get_phb(phb_id);
2082 	struct npu2 *p;
2083 	uint64_t xts_bdf, old_xts_bdf_pid, xts_bdf_pid;
2084 	int id;
2085 
2086 	if (!phb || phb->phb_type != phb_type_npu_v2)
2087 		return OPAL_PARAMETER;
2088 
2089 	/*
2090 	 * MSR bits should be masked by the caller to allow for future
2091 	 * expansion if required.
2092 	 */
2093 	if (msr & ~NPU2_VALID_ATS_MSR_BITS)
2094 		return OPAL_UNSUPPORTED;
2095 
2096 	/*
2097 	 * Look up the LPARSHORT index for this BDF in the XTS_BDF_MAP table.
2098 	 */
2099 	p = phb_to_npu2_nvlink(phb);
2100 	lock(&p->lock);
2101 	xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
2102 	if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2103 			     &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
2104 		NPU2ERR(p, "LPARID not associated with any GPU\n");
2105 		id = OPAL_PARAMETER;
2106 		goto out;
2107 	}
2108 
2109 	id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
2110 	NPU2DBG(p, "Found LPARSHORT = 0x%x for BDF = 0x%03llx\n", id, bdf);
2111 
2112 	/* Enable this mapping for both real and virtual addresses */
2113 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA0, 0UL, 1);
2114 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATRGPA1, xts_bdf_pid, 1);
2115 
2116 	/* Enables TLBIE/MMIOSD forwarding for this entry */
2117 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_VALID_ATSD, xts_bdf_pid, 1);
2118 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_LPARSHORT, xts_bdf_pid, id);
2119 
2120 	/* Set the relevant MSR bits */
2121 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_DR, xts_bdf_pid,
2122 			       !!(msr & MSR_DR));
2123 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_HV, xts_bdf_pid,
2124 			       !!(msr & MSR_HV));
2125 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_PR, xts_bdf_pid,
2126 			       !!(msr & MSR_PR));
2127 
2128 	/* We don't support anything other than 64-bit so we can safely hardcode
2129 	 * it here */
2130 	xts_bdf_pid = SETFIELD(NPU2_XTS_PID_MAP_MSR_SF, xts_bdf_pid, 1);
2131 
2132 	/*
2133 	 * Throw an error if the wildcard entry for this bdf is already set
2134 	 * with different msr bits.
2135 	 */
2136 	old_xts_bdf_pid = npu2_read(p, NPU2_XTS_PID_MAP + id*0x20);
2137 	if (old_xts_bdf_pid) {
2138 		if (GETFIELD(NPU2_XTS_PID_MAP_MSR, old_xts_bdf_pid) !=
2139 		    GETFIELD(NPU2_XTS_PID_MAP_MSR, xts_bdf_pid)) {
2140 			NPU2ERR(p, "%s: Unexpected MSR value\n", __func__);
2141 			id = OPAL_PARAMETER;
2142 			goto out;
2143 		} else if (!p->ctx_ref[id]) {
2144 			NPU2ERR(p, "%s: Unexpected mapping\n", __func__);
2145 			id = OPAL_INTERNAL_ERROR;
2146 			goto out;
2147 		}
2148 	}
2149 
2150 	/* Write the entry */
2151 	if (!p->ctx_ref[id]) {
2152 		NPU2DBG(p, "XTS_PID_MAP[%03d] = 0x%08llx\n", id, xts_bdf_pid);
2153 		npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, xts_bdf_pid);
2154 
2155 		if (!GETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf)) {
2156 			xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_VALID, xts_bdf, 1);
2157 			npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf);
2158 		}
2159 	}
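	/* Each successful init call takes a reference; destroy drops it */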
2160 	++p->ctx_ref[id];
2161 
2162 out:
2163 	unlock(&p->lock);
2164 	return id;
2165 }
2166 opal_call(OPAL_NPU_INIT_CONTEXT, opal_npu_init_context, 4);
2167 
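/*
 * Drop a reference on the context for the given bdf. The wildcard
 * XTS_PID_MAP entry is cleared once the last reference goes away.
 */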
2168 static int opal_npu_destroy_context(uint64_t phb_id, uint64_t pid __unused,
2169 				    uint64_t bdf)
2170 {
2171 	struct phb *phb = pci_get_phb(phb_id);
2172 	struct npu2 *p;
2173 	uint64_t xts_bdf;
2174 	int rc = OPAL_PARAMETER, id;
2175 
2176 	if (!phb || phb->phb_type != phb_type_npu_v2)
2177 		return OPAL_PARAMETER;
2178 
2179 	p = phb_to_npu2_nvlink(phb);
2180 	lock(&p->lock);
2181 
2182 	/* Need to find lparshort for this bdf */
2183 	xts_bdf = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0ul, bdf);
2184 	if (npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2185 			     &xts_bdf, NPU2_XTS_BDF_MAP_BDF) < 0) {
2186 		NPU2ERR(p, "LPARID not associated with any GPU\n");
2187 	} else {
2188 		/*
2189 		 * The bdf/pid table contains wildcard entries and MSR bits
2190 		 * which we need to clear when switching a device from
2191 		 * a host to a guest or vice versa.
2192 		 */
2193 		id = GETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf);
2194 		if (p->ctx_ref[id]) {
2195 			--p->ctx_ref[id];
2196 			if (!p->ctx_ref[id]) {
2197 				NPU2DBG(p, "XTS_PID_MAP[%03d] = 0 (destroy)\n",
2198 					id);
2199 				npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
2200 			}
2201 			rc = OPAL_SUCCESS;
2202 		}
2203 	}
2204 	unlock(&p->lock);
2205 	return rc;
2206 }
2207 opal_call(OPAL_NPU_DESTROY_CONTEXT, opal_npu_destroy_context, 3);
2208 
2209 /*
2210  * Map the given virtual bdf to lparid with given lpcr.
2211  */
2212 static int opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
2213 			     uint64_t lpcr)
2214 {
2215 	struct phb *phb = pci_get_phb(phb_id);
2216 	struct npu2 *p;
2217 	struct npu2_dev *ndev = NULL;
2218 	uint64_t xts_bdf_lpar, atsd_lpar, rc = OPAL_SUCCESS;
2219 	int i;
2220 	int id;
2221 	static uint64_t atsd_lpar_regs[] = {
2222 		NPU2_XTS_MMIO_ATSD0_LPARID, NPU2_XTS_MMIO_ATSD1_LPARID,
2223 		NPU2_XTS_MMIO_ATSD2_LPARID, NPU2_XTS_MMIO_ATSD3_LPARID,
2224 		NPU2_XTS_MMIO_ATSD4_LPARID, NPU2_XTS_MMIO_ATSD5_LPARID,
2225 		NPU2_XTS_MMIO_ATSD6_LPARID, NPU2_XTS_MMIO_ATSD7_LPARID
2226 	};
2227 
2228 	if (!phb || phb->phb_type != phb_type_npu_v2)
2229 		return OPAL_PARAMETER;
2230 
2231 	if (lpcr)
2232 		/* The LPCR bits are only required for hash based ATS,
2233 		 * which we don't currently support but may need to in
2234 		 * future. */
2235 		return OPAL_UNSUPPORTED;
2236 
2237 	p = phb_to_npu2_nvlink(phb);
2238 	lock(&p->lock);
2239 
2240 	/* Find any existing entries and update them */
2241 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, 0L, bdf);
2242 	id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2243 			      &xts_bdf_lpar, NPU2_XTS_BDF_MAP_BDF);
2244 	if (id < 0) {
2245 		/* No existing mapping found, find space for a new one */
2246 		xts_bdf_lpar = 0;
2247 		id = npu_table_search(p, NPU2_XTS_BDF_MAP, 8, NPU2_XTS_BDF_MAP_SIZE,
2248 				      &xts_bdf_lpar, -1UL);
2249 	}
2250 
2251 	if (id < 0) {
2252 		/* Unable to find a free mapping */
2253 		NPU2ERR(p, "No free XTS_BDF[] entry\n");
2254 		rc = OPAL_RESOURCE;
2255 		goto out;
2256 	}
2257 
2258 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_UNFILT, 0UL, 1);
2259 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BDF, xts_bdf_lpar, bdf);
2260 
2261 	/* We only support radix for the moment */
2262 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_XLAT, xts_bdf_lpar, 0x3);
2263 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARID, xts_bdf_lpar, lparid);
2264 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_LPARSHORT, xts_bdf_lpar, id);
2265 
2266 	/* Need to find an NVLink to send the ATSDs for this device over */
2267 	for (i = 0; i < p->total_devices; i++) {
2268 		if (p->devices[i].nvlink.gpu_bdfn == bdf) {
2269 			ndev = &p->devices[i];
2270 			break;
2271 		}
2272 	}
2273 
2274 	if (!ndev) {
2275 		NPU2ERR(p, "Unable to find nvlink for bdf %llx\n", bdf);
2276 		rc = OPAL_PARAMETER;
2277 		goto out;
2278 	}
2279 
2280 	/*
2281 	 * We need to allocate an ATSD per NVLink bridge if possible;
2282 	 * we use the ibm,npu-link-index property for that.
2283 	 */
2284 	atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_LPARID, 0, lparid);
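	/* An lparid of 0 is assumed to mean the host, which runs with MSR_HV */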
2285 	if (!lparid)
2286 		atsd_lpar = SETFIELD(NPU2_XTS_MMIO_ATSD_MSR_HV, atsd_lpar, 1);
2287 
2288 	if (ndev->link_index < ARRAY_SIZE(atsd_lpar_regs))
2289 		npu2_write(p, atsd_lpar_regs[ndev->link_index], atsd_lpar);
2290 	else
2291 		NPU2ERR(p, "Unable to assign ATSD for link index %u\n",
2292 				ndev->link_index);
2293 
2294 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_STACK, xts_bdf_lpar,
2295 				0x4 >> (ndev->brick_index / 2));
2296 	xts_bdf_lpar = SETFIELD(NPU2_XTS_BDF_MAP_BRICK, xts_bdf_lpar,
2297 				(ndev->brick_index % 2));
2298 
2299 	NPU2DBG(p, "XTS_BDF_MAP[%03d] = 0x%08llx\n", id, xts_bdf_lpar);
2300 	npu2_write(p, NPU2_XTS_BDF_MAP + id*8, xts_bdf_lpar);
2301 
2302 	/* Reset wildcard in the PID map and the refcounter */
2303 	if (npu2_read(p, NPU2_XTS_PID_MAP + id*0x20) || p->ctx_ref[id]) {
2304 		prlog(PR_INFO, "Resetting PID MAP for LPID %lld\n", lparid);
2305 		p->ctx_ref[id] = 0;
2306 		npu2_write(p, NPU2_XTS_PID_MAP + id*0x20, 0);
2307 	}
2308 
2309 out:
2310 	unlock(&p->lock);
2311 	return rc;
2312 }
2313 opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
2314 
2315 static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
2316 {
2317 	if (gcid & ~0x1b)
2318 		return OPAL_PARAMETER;
2319 
2320 	/* Repack 0bGGGGCCC to 0bGGCC */
2321 	return ((gcid & 0x18) >> 1) | (gcid & 0x3);
2322 }
2323 
2324 static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
2325 {
2326 	uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));
2327 
2328 	return npu2_read(ndev->npu, reg);
2329 }
2330 
2331 static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
2332 					    uint64_t val)
2333 {
2334 	uint64_t reg;
2335 	int sm;
2336 
2337 	/* Set every register on our stack */
2338 	for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
2339 		reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
2340 		npu2_write(ndev->npu, reg, val);
2341 	}
2342 }
2343 
2344 /*
2345  * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
2346  * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
2347  * Returns 0 if unset.
2348  */
2349 static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
2350 						  int pec)
2351 {
2352 	uint32_t src, grpchp;
2353 	uint64_t mask;
2354 	int i;
2355 
2356 	for (i = 0; i < 2; i++) {
2357 		mask = NPU2_RELAXED_ORDERING_SOURCE(i);
2358 		src = GETFIELD(mask, val);
2359 
2360 		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
2361 			continue;
2362 
2363 		if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
2364 			continue;
2365 
2366 		grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
2367 		if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
2368 			return mask;
2369 
2370 		if (grpchp == 0xf) /* match all */
2371 			return mask;
2372 	}
2373 
2374 	return 0;
2375 }
2376 
2377 static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
2378 					int pec)
2379 {
2380 	uint64_t val, mask;
2381 	uint32_t src;
2382 	int rc = OPAL_RESOURCE;
2383 	int i;
2384 
2385 	NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
2386 	lock(&ndev->npu->lock);
2387 
2388 	for (i = 0; i < 2; i++) {
2389 		val = npu2_relaxed_ordering_cfg_read(ndev, i);
2390 		if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
2391 			continue;
2392 
2393 		/* Already enabled */
2394 		rc = OPAL_SUCCESS;
2395 		goto out;
2396 	}
2397 
2398 	src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
2399 	      NPU2_RELAXED_ORDERING_SOURCE_RDENA;
2400 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
2401 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
2402 		       npu2_relaxed_ordering_source_grpchp(gcid));
2403 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
2404 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
2405 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
2406 	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);
2407 
2408 	/* Find somewhere to write this config */
2409 	for (i = 0; i < 2; i++) {
2410 		val = npu2_relaxed_ordering_cfg_read(ndev, i);
2411 
2412 		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
2413 			mask = NPU2_RELAXED_ORDERING_SOURCE(0);
2414 		else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
2415 			mask = NPU2_RELAXED_ORDERING_SOURCE(1);
2416 		else
2417 			continue;
2418 
2419 		val = SETFIELD(mask, val, src);
2420 		npu2_relaxed_ordering_cfg_write(ndev, i, val);
2421 
2422 		rc = OPAL_SUCCESS;
2423 		break;
2424 	}
2425 
2426 out:
2427 	unlock(&ndev->npu->lock);
2428 	return rc;
2429 }
2430 
2431 static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
2432 					  int pec)
2433 {
2434 	uint64_t val, mask;
2435 	int i;
2436 
2437 	NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
2438 	lock(&ndev->npu->lock);
2439 
2440 	for (i = 0; i < 2; i++) {
2441 		val = npu2_relaxed_ordering_cfg_read(ndev, i);
2442 
2443 		mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
2444 		if (!mask)
2445 			continue;
2446 
2447 		val = SETFIELD(mask, val, 0);
2448 		npu2_relaxed_ordering_cfg_write(ndev, i, val);
2449 	}
2450 
2451 	unlock(&ndev->npu->lock);
2452 }
2453 
2454 /*
2455  * Enable or disable relaxed ordering on all nvlinks for a given PEC. May leave
2456  * relaxed ordering partially enabled if there are insufficient HW resources to
2457  * enable it on all links.
2458  */
2459 static int npu2_set_relaxed_ordering(uint32_t gcid, int pec, bool enable)
2460 {
2461 	int rc = OPAL_SUCCESS;
2462 	struct phb *phb;
2463 	struct npu2 *npu;
2464 	struct npu2_dev *ndev;
2465 
2466 	for_each_phb(phb) {
2467 		if (phb->phb_type != phb_type_npu_v2)
2468 			continue;
2469 
2470 		npu = phb_to_npu2_nvlink(phb);
2471 		for (int i = 0; i < npu->total_devices; i++) {
2472 			ndev = &npu->devices[i];
2473 			if (enable)
2474 				rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
2475 			else
2476 				npu2_disable_relaxed_ordering(ndev, gcid, pec);
2477 
2478 			if (rc != OPAL_SUCCESS) {
2479 				NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
2480 				return OPAL_RESOURCE;
2481 			}
2482 		}
2483 	}
2484 
2485 	return OPAL_SUCCESS;
2486 }
2487 
2488 static int npu2_check_relaxed_ordering(struct phb *phb __unused,
2489 				       struct pci_device *pd, void *enable)
2490 {
2491 	/*
2492 	 * IBM PCIe bridge devices (i.e. the root ports) can always allow
2493 	 * relaxed ordering.
2494 	 */
2495 	if (pd->vdid == 0x04c11014)
2496 		pd->allow_relaxed_ordering = true;
2497 
2498 	PCIDBG(phb, pd->bdfn, "Checking relaxed ordering config\n");
2499 	if (pd->allow_relaxed_ordering)
2500 		return 0;
2501 
2502 	PCIDBG(phb, pd->bdfn, "Relaxed ordering not allowed\n");
2503 	*(bool *) enable = false;
2504 
2505 	return 1;
2506 }
2507 
2508 static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
2509 					  bool request_enabled)
2510 {
2511 	struct phb *phb = pci_get_phb(phb_id);
2512 	struct phb4 *phb4;
2513 	uint32_t chip_id, pec;
2514 	struct pci_device *pd;
2515 	bool enable = true;
2516 
2517 	if (!phb || phb->phb_type != phb_type_pcie_v4)
2518 		return OPAL_PARAMETER;
2519 
2520 	phb4 = phb_to_phb4(phb);
2521 	pec = phb4->pec;
2522 	chip_id = phb4->chip_id;
2523 
2524 	if (npu2_relaxed_ordering_source_grpchp(chip_id) == OPAL_PARAMETER)
2525 		return OPAL_PARAMETER;
2526 
2527 	pd = pci_find_dev(phb, bdfn);
2528 	if (!pd)
2529 		return OPAL_PARAMETER;
2530 
2531 	/*
2532 	 * Not changing state, so no need to rescan PHB devices to determine if
2533 	 * we need to enable/disable it
2534 	 */
2535 	if (pd->allow_relaxed_ordering == request_enabled)
2536 		return OPAL_SUCCESS;
2537 
2538 	pd->allow_relaxed_ordering = request_enabled;
2539 
2540 	/*
2541 	 * Walk all devices on this PHB to ensure they all support relaxed
2542 	 * ordering
2543 	 */
2544 	pci_walk_dev(phb, NULL, npu2_check_relaxed_ordering, &enable);
2545 
2546 	if (request_enabled && !enable) {
2547 		/*
2548 		 * Not all devices on this PHB support relaxed-ordering
2549 		 * mode so we can't enable it as requested
2550 		 */
2551 		prlog(PR_INFO, "Cannot set relaxed ordering for PEC %d on chip %d\n",
2552 		      pec, chip_id);
2553 		return OPAL_CONSTRAINED;
2554 	}
2555 
2556 	if (npu2_set_relaxed_ordering(chip_id, pec, request_enabled) != OPAL_SUCCESS) {
2557 		npu2_set_relaxed_ordering(chip_id, pec, false);
2558 		return OPAL_RESOURCE;
2559 	}
2560 
2561 	phb4->ro_state = request_enabled;
2562 	return OPAL_SUCCESS;
2563 }
2564 opal_call(OPAL_NPU_SET_RELAXED_ORDER, opal_npu_set_relaxed_order, 3);
2565 
2566 static int64_t opal_npu_get_relaxed_order(uint64_t phb_id,
2567 					  uint16_t bdfn __unused)
2568 {
2569 	struct phb *phb = pci_get_phb(phb_id);
2570 	struct phb4 *phb4;
2571 
2572 	if (!phb || phb->phb_type != phb_type_pcie_v4)
2573 		return OPAL_PARAMETER;
2574 
2575 	phb4 = phb_to_phb4(phb);
2576 	return phb4->ro_state;
2577 }
2578 opal_call(OPAL_NPU_GET_RELAXED_ORDER, opal_npu_get_relaxed_order, 2);
2579