1 /*-
2  * Copyright (c) 2015 Nathan Whitehorn
3  * Copyright (c) 2017-2018 Semihalf
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/bus.h>
33 #include <sys/pcpu.h>
34 #include <sys/proc.h>
35 #include <sys/smp.h>
36 #include <vm/vm.h>
37 #include <vm/pmap.h>
38 
39 #include <machine/bus.h>
40 #include <machine/cpu.h>
41 #include <machine/hid.h>
42 #include <machine/platformvar.h>
43 #include <machine/pmap.h>
44 #include <machine/rtas.h>
45 #include <machine/smp.h>
46 #include <machine/spr.h>
47 #include <machine/trap.h>
48 
49 #include <dev/ofw/openfirm.h>
50 #include <dev/ofw/ofw_bus.h>
51 #include <dev/ofw/ofw_bus_subr.h>
52 #include <machine/ofw_machdep.h>
53 #include <powerpc/aim/mmu_oea64.h>
54 
55 #include "platform_if.h"
56 #include "opal.h"
57 
58 #ifdef SMP
59 extern void *ap_pcpu;
60 #endif
61 
62 void (*powernv_smp_ap_extra_init)(void);
63 
64 static int powernv_probe(platform_t);
65 static int powernv_attach(platform_t);
66 void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz,
67     struct mem_region *avail, int *availsz);
68 static void powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz);
69 static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref);
70 static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref);
71 static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref);
72 static int powernv_smp_get_bsp(platform_t, struct cpuref *cpuref);
73 static void powernv_smp_ap_init(platform_t);
74 #ifdef SMP
75 static int powernv_smp_start_cpu(platform_t, struct pcpu *cpu);
76 static void powernv_smp_probe_threads(platform_t);
77 static struct cpu_group *powernv_smp_topo(platform_t plat);
78 #endif
79 static void powernv_reset(platform_t);
80 static void powernv_cpu_idle(sbintime_t sbt);
81 static int powernv_cpuref_init(void);
82 static int powernv_node_numa_domain(platform_t platform, phandle_t node);
83 
/*
 * Dispatch table binding the platform KPI (see platform_if.m) to the
 * powernv implementations below.  SMP start/probe/topology methods are
 * only provided on SMP kernels.
 */
static platform_method_t powernv_methods[] = {
	PLATFORMMETHOD(platform_probe, 		powernv_probe),
	PLATFORMMETHOD(platform_attach,		powernv_attach),
	PLATFORMMETHOD(platform_mem_regions,	powernv_mem_regions),
	PLATFORMMETHOD(platform_numa_mem_regions,	powernv_numa_mem_regions),
	PLATFORMMETHOD(platform_timebase_freq,	powernv_timebase_freq),

	PLATFORMMETHOD(platform_smp_ap_init,	powernv_smp_ap_init),
	PLATFORMMETHOD(platform_smp_first_cpu,	powernv_smp_first_cpu),
	PLATFORMMETHOD(platform_smp_next_cpu,	powernv_smp_next_cpu),
	PLATFORMMETHOD(platform_smp_get_bsp,	powernv_smp_get_bsp),
#ifdef SMP
	PLATFORMMETHOD(platform_smp_start_cpu,	powernv_smp_start_cpu),
	PLATFORMMETHOD(platform_smp_probe_threads,	powernv_smp_probe_threads),
	PLATFORMMETHOD(platform_smp_topo,	powernv_smp_topo),
#endif
	PLATFORMMETHOD(platform_node_numa_domain,	powernv_node_numa_domain),

	PLATFORMMETHOD(platform_reset,		powernv_reset),
	{ 0, 0 }	/* table terminator */
};
105 
static platform_def_t powernv_platform = {
	"powernv",
	powernv_methods,
	0
};

/* CPU table built by powernv_cpuref_init(): BSP first, then the rest. */
static struct cpuref platform_cpuref[MAXCPU];
static int platform_cpuref_cnt;		/* number of valid entries above */
static int platform_cpuref_valid;	/* nonzero once the table is built */
/* Index into "ibm,associativity" used to distinguish NUMA domains. */
static int platform_associativity;

PLATFORM_DEF(powernv_platform);

/* PIR (Processor ID Register) of the CPU we booted on, latched in attach. */
static uint64_t powernv_boot_pir;
120 
121 static int
122 powernv_probe(platform_t plat)
123 {
124 	if (opal_check() == 0)
125 		return (BUS_PROBE_SPECIFIC);
126 
127 	return (ENXIO);
128 }
129 
/*
 * Platform attach: bring the hypervisor state into a known configuration
 * and extract MMU parameters (SLB size, large page geometry) from the
 * device tree.  Always returns 0; missing device-tree nodes simply leave
 * the MMU defaults untouched.
 */
static int
powernv_attach(platform_t plat)
{
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	int32_t lp_size, lp_encoding;
	char buf[255];
	pcell_t refpoints[3];
	pcell_t prop;
	phandle_t cpu;
	phandle_t opal;
	int res, len, idx;
	register_t msr;
	bool has_lp;

	/* Ping OPAL again just to make sure */
	opal_check();

	/* Ask firmware to re-initialize secondary CPUs for our endianness. */
#if BYTE_ORDER == LITTLE_ENDIAN
	opal_call(OPAL_REINIT_CPUS, 2 /* Little endian */);
#else
	opal_call(OPAL_REINIT_CPUS, 1 /* Big endian */);
#endif
	opal = OF_finddevice("/ibm,opal");

	/*
	 * The first reference point selects which associativity level
	 * distinguishes NUMA domains (used by powernv_node_numa_domain()).
	 */
	platform_associativity = 4; /* Skiboot default. */
	if (OF_getencprop(opal, "ibm,associativity-reference-points", refpoints,
	    sizeof(refpoints)) > 0) {
		platform_associativity = refpoints[0];
	}

       if (cpu_idle_hook == NULL)
                cpu_idle_hook = powernv_cpu_idle;

	/* Remember which hardware thread we booted on (see cpuref_init). */
	powernv_boot_pir = mfspr(SPR_PIR);

	/* LPID must not be altered when PSL_DR or PSL_IR is set */
	msr = mfmsr();
	mtmsr(msr & ~(PSL_DR | PSL_IR));

	/* Direct interrupts to SRR instead of HSRR and reset LPCR otherwise */
	mtspr(SPR_LPID, 0);
	isync();

	/* NOTE: lpcr is a global defined elsewhere; modified in place here. */
	if (cpu_features2 & PPC_FEATURE2_ARCH_3_00)
		lpcr |= LPCR_HVICE;

#if BYTE_ORDER == LITTLE_ENDIAN
	lpcr |= LPCR_ILE;
#endif

	mtspr(SPR_LPCR, lpcr);
	isync();

	/* Restore translation (PSL_DR/PSL_IR) now that LPID/LPCR are set. */
	mtmsr(msr);

	powernv_cpuref_init();

	/* Set SLB count from device tree */
	cpu = OF_peer(0);
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	/* Use the first node with device_type "cpu" as representative. */
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	res = OF_getencprop(cpu, "ibm,slb-size", &prop, sizeof(prop));
	if (res > 0)
		n_slbs = prop;

	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */

	len = OF_getproplen(cpu, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(cpu, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		len /= 4;	/* property length in cells, not bytes */
		idx = 0;
		has_lp = false;
		/*
		 * Each record: base shift, SLB encoding, count of
		 * (page size, PTE encoding) pairs, followed by the pairs.
		 */
		while (len > 0) {
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];
			idx += 3;
			len -= 3;
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];
				/* Standard large page: SLB[L]=1, PTE[LP]=0. */
				if (slb_encoding == SLBV_L && lp_encoding == 0)
					has_lp = true;

				if (slb_encoding == SLB_PGSZ_4K_4K &&
				    lp_encoding == LP_4K_16M)
					moea64_has_lp_4k_16m = true;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			/* Stop early once both features have been seen. */
			if (has_lp && moea64_has_lp_4k_16m)
				break;
		}

		if (!has_lp)
			panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
			    "not supported by this system.");

		/* shift/lp_size come from the record that satisfied has_lp. */
		moea64_large_page_shift = shift;
		moea64_large_page_size = 1ULL << lp_size;
	}

out:
	return (0);
}
266 
267 void
268 powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz,
269     struct mem_region *avail, int *availsz)
270 {
271 
272 	ofw_mem_regions(phys, physsz, avail, availsz);
273 }
274 
275 static void
276 powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz)
277 {
278 
279 	ofw_numa_mem_regions(phys, physsz);
280 }
281 
282 static u_long
283 powernv_timebase_freq(platform_t plat, struct cpuref *cpuref)
284 {
285 	char buf[8];
286 	phandle_t cpu, dev, root;
287 	int res;
288 	int32_t ticks = -1;
289 
290 	root = OF_peer(0);
291 	dev = OF_child(root);
292 	while (dev != 0) {
293 		res = OF_getprop(dev, "name", buf, sizeof(buf));
294 		if (res > 0 && strcmp(buf, "cpus") == 0)
295 			break;
296 		dev = OF_peer(dev);
297 	}
298 
299 	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
300 		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
301 		if (res > 0 && strcmp(buf, "cpu") == 0)
302 			break;
303 	}
304 	if (cpu == 0)
305 		return (512000000);
306 
307 	OF_getencprop(cpu, "timebase-frequency", &ticks, sizeof(ticks));
308 
309 	if (ticks <= 0)
310 		panic("Unable to determine timebase frequency!");
311 
312 	return (ticks);
313 
314 }
315 
316 static int
317 powernv_cpuref_init(void)
318 {
319 	phandle_t cpu, dev;
320 	char buf[32];
321 	int a, res, tmp_cpuref_cnt;
322 	static struct cpuref tmp_cpuref[MAXCPU];
323 	cell_t interrupt_servers[32];
324 	uint64_t bsp;
325 
326 	if (platform_cpuref_valid)
327 		return (0);
328 
329 	dev = OF_peer(0);
330 	dev = OF_child(dev);
331 	while (dev != 0) {
332 		res = OF_getprop(dev, "name", buf, sizeof(buf));
333 		if (res > 0 && strcmp(buf, "cpus") == 0)
334 			break;
335 		dev = OF_peer(dev);
336 	}
337 
338 	bsp = 0;
339 	tmp_cpuref_cnt = 0;
340 	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
341 		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
342 		if (res > 0 && strcmp(buf, "cpu") == 0) {
343 			if (!ofw_bus_node_status_okay(cpu))
344 				continue;
345 			res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
346 			if (res > 0) {
347 				OF_getencprop(cpu, "ibm,ppc-interrupt-server#s",
348 				    interrupt_servers, res);
349 
350 				for (a = 0; a < res/sizeof(cell_t); a++) {
351 					tmp_cpuref[tmp_cpuref_cnt].cr_hwref = interrupt_servers[a];
352 					tmp_cpuref[tmp_cpuref_cnt].cr_cpuid = tmp_cpuref_cnt;
353 					tmp_cpuref[tmp_cpuref_cnt].cr_domain =
354 					    powernv_node_numa_domain(NULL, cpu);
355 					if (interrupt_servers[a] == (uint32_t)powernv_boot_pir)
356 						bsp = tmp_cpuref_cnt;
357 
358 					tmp_cpuref_cnt++;
359 				}
360 			}
361 		}
362 	}
363 
364 	/* Map IDs, so BSP has CPUID 0 regardless of hwref */
365 	for (a = bsp; a < tmp_cpuref_cnt; a++) {
366 		platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
367 		platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
368 		platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
369 		platform_cpuref_cnt++;
370 	}
371 	for (a = 0; a < bsp; a++) {
372 		platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
373 		platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
374 		platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
375 		platform_cpuref_cnt++;
376 	}
377 
378 	platform_cpuref_valid = 1;
379 
380 	return (0);
381 }
382 
383 static int
384 powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref)
385 {
386 	if (platform_cpuref_valid == 0)
387 		return (EINVAL);
388 
389 	cpuref->cr_cpuid = 0;
390 	cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
391 	cpuref->cr_domain = platform_cpuref[0].cr_domain;
392 
393 	return (0);
394 }
395 
396 static int
397 powernv_smp_next_cpu(platform_t plat, struct cpuref *cpuref)
398 {
399 	int id;
400 
401 	if (platform_cpuref_valid == 0)
402 		return (EINVAL);
403 
404 	id = cpuref->cr_cpuid + 1;
405 	if (id >= platform_cpuref_cnt)
406 		return (ENOENT);
407 
408 	cpuref->cr_cpuid = platform_cpuref[id].cr_cpuid;
409 	cpuref->cr_hwref = platform_cpuref[id].cr_hwref;
410 	cpuref->cr_domain = platform_cpuref[id].cr_domain;
411 
412 	return (0);
413 }
414 
415 static int
416 powernv_smp_get_bsp(platform_t plat, struct cpuref *cpuref)
417 {
418 
419 	cpuref->cr_cpuid = platform_cpuref[0].cr_cpuid;
420 	cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
421 	cpuref->cr_domain = platform_cpuref[0].cr_domain;
422 	return (0);
423 }
424 
425 #ifdef SMP
426 static int
427 powernv_smp_start_cpu(platform_t plat, struct pcpu *pc)
428 {
429 	int result;
430 
431 	ap_pcpu = pc;
432 	powerpc_sync();
433 
434 	result = opal_call(OPAL_START_CPU, pc->pc_hwref, EXC_RST);
435 	if (result != OPAL_SUCCESS) {
436 		printf("OPAL error (%d): unable to start AP %d\n",
437 		    result, (int)pc->pc_hwref);
438 		return (ENXIO);
439 	}
440 
441 	return (0);
442 }
443 
444 static void
445 powernv_smp_probe_threads(platform_t plat)
446 {
447 	char buf[8];
448 	phandle_t cpu, dev, root;
449 	int res, nthreads;
450 
451 	root = OF_peer(0);
452 
453 	dev = OF_child(root);
454 	while (dev != 0) {
455 		res = OF_getprop(dev, "name", buf, sizeof(buf));
456 		if (res > 0 && strcmp(buf, "cpus") == 0)
457 			break;
458 		dev = OF_peer(dev);
459 	}
460 
461 	nthreads = 1;
462 	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
463 		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
464 		if (res <= 0 || strcmp(buf, "cpu") != 0)
465 			continue;
466 
467 		res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
468 
469 		if (res >= 0)
470 			nthreads = res / sizeof(cell_t);
471 		else
472 			nthreads = 1;
473 		break;
474 	}
475 
476 	smp_threads_per_core = nthreads;
477 	if (mp_ncpus % nthreads == 0)
478 		mp_ncores = mp_ncpus / nthreads;
479 }
480 
481 static struct cpu_group *
482 cpu_group_init(struct cpu_group *group, struct cpu_group *parent,
483     const cpuset_t *cpus, int children, int level, int flags)
484 {
485 	struct cpu_group *child;
486 
487 	child = children != 0 ? smp_topo_alloc(children) : NULL;
488 
489 	group->cg_parent = parent;
490 	group->cg_child = child;
491 	CPU_COPY(cpus, &group->cg_mask);
492 	group->cg_count = CPU_COUNT(cpus);
493 	group->cg_children = children;
494 	group->cg_level = level;
495 	group->cg_flags = flags;
496 
497 	return (child);
498 }
499 
/*
 * Build the scheduler topology tree: root -> one group per VM (NUMA)
 * domain -> one group per core -> SMT threads.  Falls back to a flat
 * topology when the CPU count is not a multiple of the thread count.
 */
static struct cpu_group *
powernv_smp_topo(platform_t plat)
{
	struct cpu_group *core, *dom, *root;
	cpuset_t corecpus, domcpus;
	int cpuid, i, j, k, ncores;

	if (mp_ncpus % smp_threads_per_core != 0) {
		printf("%s: irregular SMP topology (%d threads, %d per core)\n",
		    __func__, mp_ncpus, smp_threads_per_core);
		return (smp_topo_none());
	}

	/* Root spans all CPUs; its children are the per-domain groups. */
	root = smp_topo_alloc(1);
	dom = cpu_group_init(root, NULL, &all_cpus, vm_ndomains, CG_SHARE_NONE,
	    0);

	/*
	 * Redundant layers will be collapsed by the caller so we don't need a
	 * special case for a single domain.
	 */
	for (i = 0; i < vm_ndomains; i++, dom++) {
		/* Work on a scratch copy; CPUs are consumed below. */
		CPU_COPY(&cpuset_domain[i], &domcpus);
		ncores = CPU_COUNT(&domcpus) / smp_threads_per_core;
		KASSERT(CPU_COUNT(&domcpus) % smp_threads_per_core == 0,
		    ("%s: domain %d core count not divisible by thread count",
		    __func__, i));

		core = cpu_group_init(dom, root, &domcpus, ncores, CG_SHARE_L3,
		    0);
		for (j = 0; j < ncores; j++, core++) {
			/*
			 * Assume that consecutive CPU IDs correspond to sibling
			 * threads.
			 */
			CPU_ZERO(&corecpus);
			/* Pull the next lowest-numbered CPUs off domcpus. */
			for (k = 0; k < smp_threads_per_core; k++) {
				cpuid = CPU_FFS(&domcpus) - 1;
				CPU_CLR(cpuid, &domcpus);
				CPU_SET(cpuid, &corecpus);
			}
			(void)cpu_group_init(core, dom, &corecpus, 0,
			    CG_SHARE_L1, CG_FLAG_SMT);
		}
	}

	return (root);
}
548 
549 #endif
550 
551 static void
552 powernv_reset(platform_t platform)
553 {
554 
555 	opal_call(OPAL_CEC_REBOOT);
556 }
557 
558 static void
559 powernv_smp_ap_init(platform_t platform)
560 {
561 
562 	if (powernv_smp_ap_extra_init != NULL)
563 		powernv_smp_ap_extra_init();
564 }
565 
/*
 * Default cpu_idle hook installed by powernv_attach() when no other hook
 * exists: intentionally a no-op (simply return to the idle loop).
 */
static void
powernv_cpu_idle(sbintime_t sbt)
{
}
570 
/*
 * Map a device-tree node to a small, dense NUMA domain index.
 *
 * The "ibm,associativity" cell selected by platform_associativity
 * identifies the node's domain; distinct values seen so far are interned
 * into numa_domains[] and their array index is returned.  Nodes without
 * a (long enough) associativity property inherit their parent's domain;
 * the root, non-NUMA kernels, and vm.numa.disabled all map to domain 0.
 */
static int
powernv_node_numa_domain(platform_t platform, phandle_t node)
{
	/* XXX: Is locking necessary in here? */
	static int numa_domains[MAXMEMDOM];
	static int numa_max_domain;	/* count of interned domain ids */
	/*
	 * NOTE(review): only 5 cells are read, so if the device tree sets
	 * platform_associativity >= 5 the length test below always fails
	 * and everything falls back to domain 0 — confirm intended.
	 */
	cell_t associativity[5];
	int i, res;

#ifndef NUMA
	return (0);
#endif
	i = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &i);
	if (i)
		return (0);

	res = OF_getencprop(node, "ibm,associativity",
		associativity, sizeof(associativity));

	/*
	 * If this node doesn't have associativity, or if there are not
	 * enough elements in it, check its parent.
	 */
	if (res < (int)(sizeof(cell_t) * (platform_associativity + 1))) {
		node = OF_parent(node);
		/* If already at the root, use default domain. */
		if (node == 0)
			return (0);
		return (powernv_node_numa_domain(platform, node));
	}

	/* Return the index of an already-interned domain id, if any. */
	for (i = 0; i < numa_max_domain; i++) {
		if (numa_domains[i] == associativity[platform_associativity])
			return (i);
	}
	/* New domain id: intern it, or fall back to 0 if the table is full. */
	if (i < MAXMEMDOM)
		numa_domains[numa_max_domain++] =
		    associativity[platform_associativity];
	else
		i = 0;

	return (i);
}
615 
616 /* Set up the Nest MMU on POWER9 relatively early, but after pmap is setup. */
617 static void
618 powernv_setup_nmmu(void *unused)
619 {
620 	if (opal_check() != 0)
621 		return;
622 	opal_call(OPAL_NMMU_SET_PTCR, -1, mfspr(SPR_PTCR));
623 }
624 
625 SYSINIT(powernv_setup_nmmu, SI_SUB_CPU, SI_ORDER_ANY, powernv_setup_nmmu, NULL);
626