xref: /illumos-gate/usr/src/uts/i86pc/os/cpr_impl.c (revision 7417cfde)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Platform specific implementation code
27  * Currently only suspend to RAM is supported (ACPI S3)
28  */
29 
30 #define	SUNDDI_IMPL
31 
32 #include <sys/types.h>
33 #include <sys/promif.h>
34 #include <sys/prom_isa.h>
35 #include <sys/prom_plat.h>
36 #include <sys/cpuvar.h>
37 #include <sys/pte.h>
38 #include <vm/hat.h>
39 #include <vm/page.h>
40 #include <vm/as.h>
41 #include <sys/cpr.h>
42 #include <sys/kmem.h>
43 #include <sys/clock.h>
44 #include <sys/kmem.h>
45 #include <sys/panic.h>
46 #include <vm/seg_kmem.h>
47 #include <sys/cpu_module.h>
48 #include <sys/callb.h>
49 #include <sys/machsystm.h>
50 #include <sys/vmsystm.h>
51 #include <sys/systm.h>
52 #include <sys/archsystm.h>
53 #include <sys/stack.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/memlist.h>
56 #include <sys/bootconf.h>
57 #include <sys/thread.h>
58 #include <sys/x_call.h>
59 #include <sys/smp_impldefs.h>
60 #include <vm/vm_dep.h>
61 #include <sys/psm.h>
62 #include <sys/epm.h>
63 #include <sys/cpr_wakecode.h>
64 #include <sys/x86_archext.h>
65 #include <sys/reboot.h>
66 #include <sys/acpi/acpi.h>
67 #include <sys/acpica.h>
68 
69 #define	AFMT	"%lx"
70 
71 extern int	flushes_require_xcalls;
72 extern cpuset_t	cpu_ready_set;
73 
74 #if defined(__amd64)
75 extern void	*wc_long_mode_64(void);
76 #endif	/* __amd64 */
77 extern int	tsc_gethrtime_enable;
78 extern	void	i_cpr_start_cpu(void);
79 
80 ushort_t	cpr_mach_type = CPR_MACHTYPE_X86;
81 void		(*cpr_start_cpu_func)(void) = i_cpr_start_cpu;
82 
83 static wc_cpu_t	*wc_other_cpus = NULL;
84 static cpuset_t procset;
85 
86 static void
87 init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt);
88 
89 static int i_cpr_platform_alloc(psm_state_request_t *req);
90 static void i_cpr_platform_free(psm_state_request_t *req);
91 static int i_cpr_save_apic(psm_state_request_t *req);
92 static int i_cpr_restore_apic(psm_state_request_t *req);
93 static int wait_for_set(cpuset_t *set, int who);
94 
95 static	void i_cpr_save_stack(kthread_t *t, wc_cpu_t *wc_cpu);
96 void i_cpr_restore_stack(kthread_t *t, greg_t *save_stack);
97 
98 #ifdef STACK_GROWTH_DOWN
99 #define	CPR_GET_STACK_START(t) ((t)->t_stkbase)
100 #define	CPR_GET_STACK_END(t) ((t)->t_stk)
101 #else
102 #define	CPR_GET_STACK_START(t) ((t)->t_stk)
103 #define	CPR_GET_STACK_END(t) ((t)->t_stkbase)
104 #endif	/* STACK_GROWTH_DOWN */
105 
106 /*
107  * restart paused slave cpus
108  */
109 void
110 i_cpr_machdep_setup(void)
111 {
112 	if (ncpus > 1) {
113 		CPR_DEBUG(CPR_DEBUG1, ("MP restarted...\n"));
114 		mutex_enter(&cpu_lock);
115 		start_cpus();
116 		mutex_exit(&cpu_lock);
117 	}
118 }
119 
120 
/*
 * Quiesce interrupt activity in the system by raising the priority
 * level with spl7(); the previous level is deliberately discarded.
 */
void
i_cpr_stop_intr(void)
{
	(void) spl7();
}
129 
/*
 * Make the machine ready to take interrupts again by dropping the
 * priority level with spl0(); the previous level is deliberately
 * discarded.
 */
void
i_cpr_enable_intr(void)
{
	(void) spl0();
}
138 
/*
 * Hook for saving miscellaneous information to the state file (needed
 * to re-initialize kernel/prom handshaking).  Nothing is saved here:
 * the function is not expected to be called at all, and a DEBUG kernel
 * trips the assertion below if it ever is.
 */
void
i_cpr_save_machdep_info(void)
{
	/* a variable rather than a literal 0 is asserted on */
	int never_called = 0;

	ASSERT(never_called);
}
150 
151 
/*
 * Intentionally empty in this implementation.
 */
void
i_cpr_set_tbr(void)
{
}
156 
157 
158 processorid_t
159 i_cpr_bootcpuid(void)
160 {
161 	return (0);
162 }
163 
164 /*
165  * cpu0 should contain bootcpu info
166  */
167 cpu_t *
168 i_cpr_bootcpu(void)
169 {
170 	ASSERT(MUTEX_HELD(&cpu_lock));
171 
172 	return (cpu_get(i_cpr_bootcpuid()));
173 }
174 
175 /*
176  *	Save context for the specified CPU
177  */
178 void *
179 i_cpr_save_context(void *arg)
180 {
181 	long	index = (long)arg;
182 	psm_state_request_t *papic_state;
183 	int resuming;
184 	int	ret;
185 	wc_cpu_t	*wc_cpu = wc_other_cpus + index;
186 
187 	PMD(PMD_SX, ("i_cpr_save_context() index = %ld\n", index))
188 
189 	ASSERT(index < NCPU);
190 
191 	papic_state = &(wc_cpu)->wc_apic_state;
192 
193 	ret = i_cpr_platform_alloc(papic_state);
194 	ASSERT(ret == 0);
195 
196 	ret = i_cpr_save_apic(papic_state);
197 	ASSERT(ret == 0);
198 
199 	i_cpr_save_stack(curthread, wc_cpu);
200 
201 	/*
202 	 * wc_save_context returns twice, once when susending and
203 	 * once when resuming,  wc_save_context() returns 0 when
204 	 * suspending and non-zero upon resume
205 	 */
206 	resuming = (wc_save_context(wc_cpu) == 0);
207 
208 	/*
209 	 * do NOT call any functions after this point, because doing so
210 	 * will modify the stack that we are running on
211 	 */
212 
213 	if (resuming) {
214 
215 		ret = i_cpr_restore_apic(papic_state);
216 		ASSERT(ret == 0);
217 
218 		i_cpr_platform_free(papic_state);
219 
220 		/*
221 		 * Enable interrupts on this cpu.
222 		 * Do not bind interrupts to this CPU's local APIC until
223 		 * the CPU is ready to receive interrupts.
224 		 */
225 		ASSERT(CPU->cpu_id != i_cpr_bootcpuid());
226 		mutex_enter(&cpu_lock);
227 		cpu_enable_intr(CPU);
228 		mutex_exit(&cpu_lock);
229 
230 		/*
231 		 * Setting the bit in cpu_ready_set must be the last operation
232 		 * in processor initialization; the boot CPU will continue to
233 		 * boot once it sees this bit set for all active CPUs.
234 		 */
235 		CPUSET_ATOMIC_ADD(cpu_ready_set, CPU->cpu_id);
236 
237 		PMD(PMD_SX,
238 		    ("i_cpr_save_context() resuming cpu %d in cpu_ready_set\n",
239 		    CPU->cpu_id))
240 	} else {
241 		/*
242 		 * Disable interrupts on this CPU so that PSM knows not to bind
243 		 * interrupts here on resume until the CPU has executed
244 		 * cpu_enable_intr() (above) in the resume path.
245 		 * We explicitly do not grab cpu_lock here because at this point
246 		 * in the suspend process, the boot cpu owns cpu_lock and all
247 		 * other cpus are also executing in the pause thread (only
248 		 * modifying their respective CPU structure).
249 		 */
250 		(void) cpu_disable_intr(CPU);
251 	}
252 
253 	PMD(PMD_SX, ("i_cpr_save_context: wc_save_context returns %d\n",
254 	    resuming))
255 
256 	return (NULL);
257 }
258 
259 static ushort_t *warm_reset_vector = NULL;
260 
261 static ushort_t *
262 map_warm_reset_vector()
263 {
264 	/*LINTED*/
265 	if (!(warm_reset_vector = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
266 	    sizeof (ushort_t *), PROT_READ|PROT_WRITE)))
267 		return (NULL);
268 
269 	/*
270 	 * setup secondary cpu bios boot up vector
271 	 */
272 	*warm_reset_vector = (ushort_t)((caddr_t)
273 	    /*LINTED*/
274 	    ((struct rm_platter *)rm_platter_va)->rm_code - rm_platter_va
275 	    + ((ulong_t)rm_platter_va & 0xf));
276 	warm_reset_vector++;
277 	*warm_reset_vector = (ushort_t)(rm_platter_pa >> 4);
278 
279 	--warm_reset_vector;
280 	return (warm_reset_vector);
281 }
282 
283 void
284 i_cpr_pre_resume_cpus()
285 {
286 	/*
287 	 * this is a cut down version of start_other_cpus()
288 	 * just do the initialization to wake the other cpus
289 	 */
290 	unsigned who;
291 	int boot_cpuid = i_cpr_bootcpuid();
292 	uint32_t		code_length = 0;
293 	caddr_t			wakevirt = rm_platter_va;
294 	/*LINTED*/
295 	wakecode_t		*wp = (wakecode_t *)wakevirt;
296 	char *str = "i_cpr_pre_resume_cpus";
297 	extern int get_tsc_ready();
298 	int err;
299 
300 	/*LINTED*/
301 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
302 
303 	/*
304 	 * If startup wasn't able to find a page under 1M, we cannot
305 	 * proceed.
306 	 */
307 	if (rm_platter_va == 0) {
308 		cmn_err(CE_WARN, "Cannot suspend the system because no "
309 		    "memory below 1M could be found for processor startup");
310 		return;
311 	}
312 
313 	/*
314 	 * Copy the real mode code at "real_mode_start" to the
315 	 * page at rm_platter_va.
316 	 */
317 	warm_reset_vector = map_warm_reset_vector();
318 	if (warm_reset_vector == NULL) {
319 		PMD(PMD_SX, ("i_cpr_pre_resume_cpus() returning #2\n"))
320 		return;
321 	}
322 
323 	flushes_require_xcalls = 1;
324 
325 	/*
326 	 * We lock our affinity to the master CPU to ensure that all slave CPUs
327 	 * do their TSC syncs with the same CPU.
328 	 */
329 
330 	affinity_set(CPU_CURRENT);
331 
332 	/*
333 	 * Mark the boot cpu as being ready and in the procset, since we are
334 	 * running on that cpu.
335 	 */
336 	CPUSET_ONLY(cpu_ready_set, boot_cpuid);
337 	CPUSET_ONLY(procset, boot_cpuid);
338 
339 	for (who = 0; who < max_ncpus; who++) {
340 
341 		wc_cpu_t	*cpup = wc_other_cpus + who;
342 		wc_desctbr_t	gdt;
343 
344 		if (who == boot_cpuid)
345 			continue;
346 
347 		if (!CPU_IN_SET(mp_cpus, who))
348 			continue;
349 
350 		PMD(PMD_SX, ("%s() waking up %d cpu\n", str, who))
351 
352 		bcopy(cpup, &(wp->wc_cpu), sizeof (wc_cpu_t));
353 
354 		gdt.base = cpup->wc_gdt_base;
355 		gdt.limit = cpup->wc_gdt_limit;
356 
357 #if defined(__amd64)
358 		code_length = (uint32_t)wc_long_mode_64 - (uint32_t)wc_rm_start;
359 #else
360 		code_length = 0;
361 #endif
362 
363 		init_real_mode_platter(who, code_length, cpup->wc_cr4, gdt);
364 
365 		mutex_enter(&cpu_lock);
366 		err = mach_cpuid_start(who, rm_platter_va);
367 		mutex_exit(&cpu_lock);
368 		if (err != 0) {
369 			cmn_err(CE_WARN, "cpu%d: failed to start during "
370 			    "suspend/resume error %d", who, err);
371 			continue;
372 		}
373 
374 		PMD(PMD_SX, ("%s() #1 waiting for %d in procset\n", str, who))
375 
376 		if (!wait_for_set(&procset, who))
377 			continue;
378 
379 		PMD(PMD_SX, ("%s() %d cpu started\n", str, who))
380 
381 		PMD(PMD_SX, ("%s() tsc_ready = %d\n", str, get_tsc_ready()))
382 
383 		if (tsc_gethrtime_enable) {
384 			PMD(PMD_SX, ("%s() calling tsc_sync_master\n", str))
385 			tsc_sync_master(who);
386 		}
387 
388 		PMD(PMD_SX, ("%s() waiting for %d in cpu_ready_set\n", str,
389 		    who))
390 		/*
391 		 * Wait for cpu to declare that it is ready, we want the
392 		 * cpus to start serially instead of in parallel, so that
393 		 * they do not contend with each other in wc_rm_start()
394 		 */
395 		if (!wait_for_set(&cpu_ready_set, who))
396 			continue;
397 
398 		/*
399 		 * do not need to re-initialize dtrace using dtrace_cpu_init
400 		 * function
401 		 */
402 		PMD(PMD_SX, ("%s() cpu %d now ready\n", str, who))
403 	}
404 
405 	affinity_clear();
406 
407 	PMD(PMD_SX, ("%s() all cpus now ready\n", str))
408 
409 }
410 
411 static void
412 unmap_warm_reset_vector(ushort_t *warm_reset_vector)
413 {
414 	psm_unmap_phys((caddr_t)warm_reset_vector, sizeof (ushort_t *));
415 }
416 
417 /*
418  * We need to setup a 1:1 (virtual to physical) mapping for the
419  * page containing the wakeup code.
420  */
421 static struct as *save_as;	/* when switching to kas */
422 
423 static void
424 unmap_wakeaddr_1to1(uint64_t wakephys)
425 {
426 	uintptr_t	wp = (uintptr_t)wakephys;
427 	hat_setup(save_as->a_hat, 0);	/* switch back from kernel hat */
428 	hat_unload(kas.a_hat, (caddr_t)wp, PAGESIZE, HAT_UNLOAD);
429 }
430 
431 void
432 i_cpr_post_resume_cpus()
433 {
434 	uint64_t	wakephys = rm_platter_pa;
435 
436 	if (warm_reset_vector != NULL)
437 		unmap_warm_reset_vector(warm_reset_vector);
438 
439 	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
440 	    HAT_UNLOAD);
441 
442 	/*
443 	 * cmi_post_mpstartup() is only required upon boot not upon
444 	 * resume from RAM
445 	 */
446 
447 	PT(PT_UNDO1to1);
448 	/* Tear down 1:1 mapping for wakeup code */
449 	unmap_wakeaddr_1to1(wakephys);
450 }
451 
/*
 * Intentionally empty in this implementation; flag is ignored.
 */
/* ARGSUSED */
void
i_cpr_handle_xc(int flag)
{
}
457 
/*
 * Report whether reusable statefiles are supported: never (always 0).
 */
int
i_cpr_reusable_supported(void)
{
	return (0);
}
463 static void
464 map_wakeaddr_1to1(uint64_t wakephys)
465 {
466 	uintptr_t	wp = (uintptr_t)wakephys;
467 	hat_devload(kas.a_hat, (caddr_t)wp, PAGESIZE, btop(wakephys),
468 	    (PROT_READ|PROT_WRITE|PROT_EXEC|HAT_STORECACHING_OK|HAT_NOSYNC),
469 	    HAT_LOAD);
470 	save_as = curthread->t_procp->p_as;
471 	hat_setup(kas.a_hat, 0);	/* switch to kernel-only hat */
472 }
473 
474 
475 void
476 prt_other_cpus()
477 {
478 	int	who;
479 
480 	if (ncpus == 1) {
481 		PMD(PMD_SX, ("prt_other_cpus() other cpu table empty for "
482 		    "uniprocessor machine\n"))
483 		return;
484 	}
485 
486 	for (who = 0; who < max_ncpus; who++) {
487 
488 		wc_cpu_t	*cpup = wc_other_cpus + who;
489 
490 		if (!CPU_IN_SET(mp_cpus, who))
491 			continue;
492 
493 		PMD(PMD_SX, ("prt_other_cpus() who = %d, gdt=%p:%x, "
494 		    "idt=%p:%x, ldt=%lx, tr=%lx, kgsbase="
495 		    AFMT ", sp=%lx\n", who,
496 		    (void *)cpup->wc_gdt_base, cpup->wc_gdt_limit,
497 		    (void *)cpup->wc_idt_base, cpup->wc_idt_limit,
498 		    (long)cpup->wc_ldt, (long)cpup->wc_tr,
499 		    (long)cpup->wc_kgsbase, (long)cpup->wc_rsp))
500 	}
501 }
502 
503 /*
504  * Power down the system.
505  */
506 int
507 i_cpr_power_down(int sleeptype)
508 {
509 	caddr_t		wakevirt = rm_platter_va;
510 	uint64_t	wakephys = rm_platter_pa;
511 	ulong_t		saved_intr;
512 	uint32_t	code_length = 0;
513 	wc_desctbr_t	gdt;
514 	/*LINTED*/
515 	wakecode_t	*wp = (wakecode_t *)wakevirt;
516 	/*LINTED*/
517 	rm_platter_t	*wcpp = (rm_platter_t *)wakevirt;
518 	wc_cpu_t	*cpup = &(wp->wc_cpu);
519 	dev_info_t	*ppm;
520 	int		ret = 0;
521 	power_req_t	power_req;
522 	char *str =	"i_cpr_power_down";
523 #if defined(__amd64)
524 	/*LINTED*/
525 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
526 #endif
527 	extern int	cpr_suspend_succeeded;
528 	extern void	kernel_wc_code();
529 
530 	ASSERT(sleeptype == CPR_TORAM);
531 	ASSERT(CPU->cpu_id == 0);
532 
533 	if ((ppm = PPM(ddi_root_node())) == NULL) {
534 		PMD(PMD_SX, ("%s: root node not claimed\n", str))
535 		return (ENOTTY);
536 	}
537 
538 	PMD(PMD_SX, ("Entering %s()\n", str))
539 
540 	PT(PT_IC);
541 	saved_intr = intr_clear();
542 
543 	PT(PT_1to1);
544 	/* Setup 1:1 mapping for wakeup code */
545 	map_wakeaddr_1to1(wakephys);
546 
547 	PMD(PMD_SX, ("ncpus=%d\n", ncpus))
548 
549 	PMD(PMD_SX, ("wc_rm_end - wc_rm_start=%lx WC_CODESIZE=%x\n",
550 	    ((size_t)((uint_t)wc_rm_end - (uint_t)wc_rm_start)), WC_CODESIZE))
551 
552 	PMD(PMD_SX, ("wakevirt=%p, wakephys=%x\n",
553 	    (void *)wakevirt, (uint_t)wakephys))
554 
555 	ASSERT(((size_t)((uint_t)wc_rm_end - (uint_t)wc_rm_start)) <
556 	    WC_CODESIZE);
557 
558 	bzero(wakevirt, PAGESIZE);
559 
560 	/* Copy code to rm_platter */
561 	bcopy((caddr_t)wc_rm_start, wakevirt,
562 	    (size_t)((uint_t)wc_rm_end - (uint_t)wc_rm_start));
563 
564 	prt_other_cpus();
565 
566 #if defined(__amd64)
567 
568 	PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
569 	    (ulong_t)real_mode_platter->rm_cr4, (ulong_t)getcr4()))
570 	PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
571 	    (ulong_t)real_mode_platter->rm_pdbr, getcr3()))
572 
573 	real_mode_platter->rm_cr4 = getcr4();
574 	real_mode_platter->rm_pdbr = getcr3();
575 
576 	rmp_gdt_init(real_mode_platter);
577 
578 	/*
579 	 * Since the CPU needs to jump to protected mode using an identity
580 	 * mapped address, we need to calculate it here.
581 	 */
582 	real_mode_platter->rm_longmode64_addr = rm_platter_pa +
583 	    ((uint32_t)wc_long_mode_64 - (uint32_t)wc_rm_start);
584 
585 	PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
586 	    (ulong_t)real_mode_platter->rm_cr4, getcr4()))
587 
588 	PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
589 	    (ulong_t)real_mode_platter->rm_pdbr, getcr3()))
590 
591 	PMD(PMD_SX, ("real_mode_platter->rm_longmode64_addr=%lx\n",
592 	    (ulong_t)real_mode_platter->rm_longmode64_addr))
593 
594 #endif
595 
596 	PT(PT_SC);
597 	if (wc_save_context(cpup)) {
598 
599 		ret = i_cpr_platform_alloc(&(wc_other_cpus->wc_apic_state));
600 		if (ret != 0)
601 			return (ret);
602 
603 		ret = i_cpr_save_apic(&(wc_other_cpus->wc_apic_state));
604 		PMD(PMD_SX, ("%s: i_cpr_save_apic() returned %d\n", str, ret))
605 		if (ret != 0)
606 			return (ret);
607 
608 		PMD(PMD_SX, ("wakephys=%x, kernel_wc_code=%p\n",
609 		    (uint_t)wakephys, (void *)&kernel_wc_code))
610 		PMD(PMD_SX, ("virtaddr=%lx, retaddr=%lx\n",
611 		    (long)cpup->wc_virtaddr, (long)cpup->wc_retaddr))
612 		PMD(PMD_SX, ("ebx=%x, edi=%x, esi=%x, ebp=%x, esp=%x\n",
613 		    cpup->wc_ebx, cpup->wc_edi, cpup->wc_esi, cpup->wc_ebp,
614 		    cpup->wc_esp))
615 		PMD(PMD_SX, ("cr0=%lx, cr3=%lx, cr4=%lx\n",
616 		    (long)cpup->wc_cr0, (long)cpup->wc_cr3,
617 		    (long)cpup->wc_cr4))
618 		PMD(PMD_SX, ("cs=%x, ds=%x, es=%x, ss=%x, fs=%lx, gs=%lx, "
619 		    "flgs=%lx\n", cpup->wc_cs, cpup->wc_ds, cpup->wc_es,
620 		    cpup->wc_ss, (long)cpup->wc_fs, (long)cpup->wc_gs,
621 		    (long)cpup->wc_eflags))
622 
623 		PMD(PMD_SX, ("gdt=%p:%x, idt=%p:%x, ldt=%lx, tr=%lx, "
624 		    "kgbase=%lx\n", (void *)cpup->wc_gdt_base,
625 		    cpup->wc_gdt_limit, (void *)cpup->wc_idt_base,
626 		    cpup->wc_idt_limit, (long)cpup->wc_ldt,
627 		    (long)cpup->wc_tr, (long)cpup->wc_kgsbase))
628 
629 		gdt.base = cpup->wc_gdt_base;
630 		gdt.limit = cpup->wc_gdt_limit;
631 
632 #if defined(__amd64)
633 		code_length = (uint32_t)wc_long_mode_64 -
634 		    (uint32_t)wc_rm_start;
635 #else
636 		code_length = 0;
637 #endif
638 
639 		init_real_mode_platter(0, code_length, cpup->wc_cr4, gdt);
640 
641 #if defined(__amd64)
642 		PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
643 		    (ulong_t)wcpp->rm_cr4, getcr4()))
644 
645 		PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
646 		    (ulong_t)wcpp->rm_pdbr, getcr3()))
647 
648 		PMD(PMD_SX, ("real_mode_platter->rm_longmode64_addr=%lx\n",
649 		    (ulong_t)wcpp->rm_longmode64_addr))
650 
651 		PMD(PMD_SX,
652 		    ("real_mode_platter->rm_temp_gdt[TEMPGDT_KCODE64]=%lx\n",
653 		    (ulong_t)wcpp->rm_temp_gdt[TEMPGDT_KCODE64]))
654 #endif
655 
656 		PMD(PMD_SX, ("gdt=%p:%x, idt=%p:%x, ldt=%lx, tr=%lx, "
657 		    "kgsbase=%lx\n", (void *)wcpp->rm_gdt_base,
658 		    wcpp->rm_gdt_lim, (void *)wcpp->rm_idt_base,
659 		    wcpp->rm_idt_lim, (long)cpup->wc_ldt, (long)cpup->wc_tr,
660 		    (long)cpup->wc_kgsbase))
661 
662 		power_req.request_type = PMR_PPM_ENTER_SX;
663 		power_req.req.ppm_power_enter_sx_req.sx_state = S3;
664 		power_req.req.ppm_power_enter_sx_req.test_point =
665 		    cpr_test_point;
666 		power_req.req.ppm_power_enter_sx_req.wakephys = wakephys;
667 
668 		PMD(PMD_SX, ("%s: pm_ctlops PMR_PPM_ENTER_SX\n", str))
669 		PT(PT_PPMCTLOP);
670 		(void) pm_ctlops(ppm, ddi_root_node(), DDI_CTLOPS_POWER,
671 		    &power_req, &ret);
672 		PMD(PMD_SX, ("%s: returns %d\n", str, ret))
673 
674 		/*
675 		 * If it works, we get control back to the else branch below
676 		 * If we get control back here, it didn't work.
677 		 * XXX return EINVAL here?
678 		 */
679 
680 		unmap_wakeaddr_1to1(wakephys);
681 		intr_restore(saved_intr);
682 
683 		return (ret);
684 	} else {
685 		cpr_suspend_succeeded = 1;
686 
687 		power_req.request_type = PMR_PPM_EXIT_SX;
688 		power_req.req.ppm_power_enter_sx_req.sx_state = S3;
689 
690 		PMD(PMD_SX, ("%s: pm_ctlops PMR_PPM_EXIT_SX\n", str))
691 		PT(PT_PPMCTLOP);
692 		(void) pm_ctlops(ppm, ddi_root_node(), DDI_CTLOPS_POWER,
693 		    &power_req, &ret);
694 		PMD(PMD_SX, ("%s: returns %d\n", str, ret))
695 
696 		ret = i_cpr_restore_apic(&(wc_other_cpus->wc_apic_state));
697 		/*
698 		 * the restore should never fail, if the saved suceeded
699 		 */
700 		ASSERT(ret == 0);
701 
702 		i_cpr_platform_free(&(wc_other_cpus->wc_apic_state));
703 
704 		/*
705 		 * Enable interrupts on boot cpu.
706 		 */
707 		ASSERT(CPU->cpu_id == i_cpr_bootcpuid());
708 		mutex_enter(&cpu_lock);
709 		cpu_enable_intr(CPU);
710 		mutex_exit(&cpu_lock);
711 
712 		PT(PT_INTRRESTORE);
713 		intr_restore(saved_intr);
714 		PT(PT_CPU);
715 
716 		return (ret);
717 	}
718 }
719 
720 /*
721  * Stop all other cpu's before halting or rebooting. We pause the cpu's
722  * instead of sending a cross call.
723  * Stolen from sun4/os/mp_states.c
724  */
725 
726 static int cpu_are_paused;	/* sic */
727 
728 void
729 i_cpr_stop_other_cpus(void)
730 {
731 	mutex_enter(&cpu_lock);
732 	if (cpu_are_paused) {
733 		mutex_exit(&cpu_lock);
734 		return;
735 	}
736 	pause_cpus(NULL);
737 	cpu_are_paused = 1;
738 
739 	mutex_exit(&cpu_lock);
740 }
741 
742 int
743 i_cpr_is_supported(int sleeptype)
744 {
745 	extern int cpr_supported_override;
746 	extern int cpr_platform_enable;
747 	extern int pm_S3_enabled;
748 
749 	if (sleeptype != CPR_TORAM)
750 		return (0);
751 
752 	/*
753 	 * The next statement tests if a specific platform has turned off
754 	 * cpr support.
755 	 */
756 	if (cpr_supported_override)
757 		return (0);
758 
759 	/*
760 	 * If a platform has specifically turned on cpr support ...
761 	 */
762 	if (cpr_platform_enable)
763 		return (1);
764 
765 	return (pm_S3_enabled);
766 }
767 
/*
 * Intentionally empty in this implementation.
 */
void
i_cpr_bitmap_cleanup(void)
{
}
772 
/*
 * Intentionally empty in this implementation.
 */
void
i_cpr_free_memory_resources(void)
{
}
777 
778 /*
779  * Needed only for S3 so far
780  */
781 static int
782 i_cpr_platform_alloc(psm_state_request_t *req)
783 {
784 #ifdef DEBUG
785 	char	*str = "i_cpr_platform_alloc";
786 #endif
787 
788 	PMD(PMD_SX, ("cpu = %d, %s(%p) \n", CPU->cpu_id, str, (void *)req))
789 
790 	if (psm_state == NULL) {
791 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
792 		return (0);
793 	}
794 
795 	req->psr_cmd = PSM_STATE_ALLOC;
796 	return ((*psm_state)(req));
797 }
798 
799 /*
800  * Needed only for S3 so far
801  */
802 static void
803 i_cpr_platform_free(psm_state_request_t *req)
804 {
805 #ifdef DEBUG
806 	char	*str = "i_cpr_platform_free";
807 #endif
808 
809 	PMD(PMD_SX, ("cpu = %d, %s(%p) \n", CPU->cpu_id, str, (void *)req))
810 
811 	if (psm_state == NULL) {
812 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
813 		return;
814 	}
815 
816 	req->psr_cmd = PSM_STATE_FREE;
817 	(void) (*psm_state)(req);
818 }
819 
820 static int
821 i_cpr_save_apic(psm_state_request_t *req)
822 {
823 #ifdef DEBUG
824 	char	*str = "i_cpr_save_apic";
825 #endif
826 
827 	if (psm_state == NULL) {
828 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
829 		return (0);
830 	}
831 
832 	req->psr_cmd = PSM_STATE_SAVE;
833 	return ((*psm_state)(req));
834 }
835 
836 static int
837 i_cpr_restore_apic(psm_state_request_t *req)
838 {
839 #ifdef DEBUG
840 	char	*str = "i_cpr_restore_apic";
841 #endif
842 
843 	if (psm_state == NULL) {
844 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
845 		return (0);
846 	}
847 
848 	req->psr_cmd = PSM_STATE_RESTORE;
849 	return ((*psm_state)(req));
850 }
851 
852 
853 /* stop lint complaining about offset not being used in 32bit mode */
854 #if !defined(__amd64)
855 /*ARGSUSED*/
856 #endif
857 static void
858 init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt)
859 {
860 	/*LINTED*/
861 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
862 
863 	/*
864 	 * Fill up the real mode platter to make it easy for real mode code to
865 	 * kick it off. This area should really be one passed by boot to kernel
866 	 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
867 	 * have identical physical and virtual address in paged mode.
868 	 */
869 
870 	real_mode_platter->rm_pdbr = getcr3();
871 	real_mode_platter->rm_cpu = cpun;
872 	real_mode_platter->rm_cr4 = cr4;
873 
874 	real_mode_platter->rm_gdt_base = gdt.base;
875 	real_mode_platter->rm_gdt_lim = gdt.limit;
876 
877 #if defined(__amd64)
878 	if (getcr3() > 0xffffffffUL)
879 		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
880 		    "located above 4G in physical memory (@ 0x%llx).",
881 		    (unsigned long long)getcr3());
882 
883 	/*
884 	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
885 	 * by code in real_mode_start():
886 	 *
887 	 * GDT[0]:  NULL selector
888 	 * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
889 	 *
890 	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
891 	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
892 	 * a course of action as any other, though it may cause the entire
893 	 * platform to reset in some cases...
894 	 */
895 	real_mode_platter->rm_temp_gdt[0] = 0ULL;
896 	real_mode_platter->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
897 
898 	real_mode_platter->rm_temp_gdt_lim = (ushort_t)
899 	    (sizeof (real_mode_platter->rm_temp_gdt) - 1);
900 	real_mode_platter->rm_temp_gdt_base = rm_platter_pa +
901 	    (uint32_t)(&((rm_platter_t *)0)->rm_temp_gdt);
902 
903 	real_mode_platter->rm_temp_idt_lim = 0;
904 	real_mode_platter->rm_temp_idt_base = 0;
905 
906 	/*
907 	 * Since the CPU needs to jump to protected mode using an identity
908 	 * mapped address, we need to calculate it here.
909 	 */
910 	real_mode_platter->rm_longmode64_addr = rm_platter_pa + offset;
911 #endif	/* __amd64 */
912 
913 	/* return; */
914 }
915 
916 void
917 i_cpr_start_cpu(void)
918 {
919 
920 	struct cpu *cp = CPU;
921 
922 	char *str = "i_cpr_start_cpu";
923 	extern void init_cpu_syscall(struct cpu *cp);
924 
925 	PMD(PMD_SX, ("%s() called\n", str))
926 
927 	PMD(PMD_SX, ("%s() #0 cp->cpu_base_spl %d\n", str,
928 	    cp->cpu_base_spl))
929 
930 	mutex_enter(&cpu_lock);
931 	if (cp == i_cpr_bootcpu()) {
932 		mutex_exit(&cpu_lock);
933 		PMD(PMD_SX,
934 		    ("%s() called on bootcpu nothing to do!\n", str))
935 		return;
936 	}
937 	mutex_exit(&cpu_lock);
938 
939 	/*
940 	 * We need to Sync PAT with cpu0's PAT. We have to do
941 	 * this with interrupts disabled.
942 	 */
943 	if (is_x86_feature(x86_featureset, X86FSET_PAT))
944 		pat_sync();
945 
946 	/*
947 	 * Initialize this CPU's syscall handlers
948 	 */
949 	init_cpu_syscall(cp);
950 
951 	PMD(PMD_SX, ("%s() #1 cp->cpu_base_spl %d\n", str, cp->cpu_base_spl))
952 
953 	/*
954 	 * Do not need to call cpuid_pass2(), cpuid_pass3(), cpuid_pass4() or
955 	 * init_cpu_info(), since the work that they do is only needed to
956 	 * be done once at boot time
957 	 */
958 
959 
960 	mutex_enter(&cpu_lock);
961 	CPUSET_ADD(procset, cp->cpu_id);
962 	mutex_exit(&cpu_lock);
963 
964 	PMD(PMD_SX, ("%s() #2 cp->cpu_base_spl %d\n", str,
965 	    cp->cpu_base_spl))
966 
967 	if (tsc_gethrtime_enable) {
968 		PMD(PMD_SX, ("%s() calling tsc_sync_slave\n", str))
969 		tsc_sync_slave();
970 	}
971 
972 	PMD(PMD_SX, ("%s() cp->cpu_id %d, cp->cpu_intr_actv %d\n", str,
973 	    cp->cpu_id, cp->cpu_intr_actv))
974 	PMD(PMD_SX, ("%s() #3 cp->cpu_base_spl %d\n", str,
975 	    cp->cpu_base_spl))
976 
977 	(void) spl0();		/* enable interrupts */
978 
979 	PMD(PMD_SX, ("%s() #4 cp->cpu_base_spl %d\n", str,
980 	    cp->cpu_base_spl))
981 
982 	/*
983 	 * Set up the CPU module for this CPU.  This can't be done before
984 	 * this CPU is made CPU_READY, because we may (in heterogeneous systems)
985 	 * need to go load another CPU module.  The act of attempting to load
986 	 * a module may trigger a cross-call, which will ASSERT unless this
987 	 * cpu is CPU_READY.
988 	 */
989 
990 	/*
991 	 * cmi already been init'd (during boot), so do not need to do it again
992 	 */
993 #ifdef PM_REINITMCAONRESUME
994 	if (is_x86_feature(x86_featureset, X86FSET_MCA))
995 		cmi_mca_init();
996 #endif
997 
998 	PMD(PMD_SX, ("%s() returning\n", str))
999 
1000 	/* return; */
1001 }
1002 
1003 void
1004 i_cpr_alloc_cpus(void)
1005 {
1006 	char *str = "i_cpr_alloc_cpus";
1007 
1008 	PMD(PMD_SX, ("%s() CPU->cpu_id %d\n", str, CPU->cpu_id))
1009 	/*
1010 	 * we allocate this only when we actually need it to save on
1011 	 * kernel memory
1012 	 */
1013 
1014 	if (wc_other_cpus == NULL) {
1015 		wc_other_cpus = kmem_zalloc(max_ncpus * sizeof (wc_cpu_t),
1016 		    KM_SLEEP);
1017 	}
1018 
1019 }
1020 
1021 void
1022 i_cpr_free_cpus(void)
1023 {
1024 	int index;
1025 	wc_cpu_t *wc_cpu;
1026 
1027 	if (wc_other_cpus != NULL) {
1028 		for (index = 0; index < max_ncpus; index++) {
1029 			wc_cpu = wc_other_cpus + index;
1030 			if (wc_cpu->wc_saved_stack != NULL) {
1031 				kmem_free(wc_cpu->wc_saved_stack,
1032 				    wc_cpu->wc_saved_stack_size);
1033 			}
1034 		}
1035 
1036 		kmem_free((void *) wc_other_cpus,
1037 		    max_ncpus * sizeof (wc_cpu_t));
1038 		wc_other_cpus = NULL;
1039 	}
1040 }
1041 
1042 /*
1043  * wrapper for acpica_ddi_save_resources()
1044  */
1045 void
1046 i_cpr_save_configuration(dev_info_t *dip)
1047 {
1048 	acpica_ddi_save_resources(dip);
1049 }
1050 
1051 /*
1052  * wrapper for acpica_ddi_restore_resources()
1053  */
1054 void
1055 i_cpr_restore_configuration(dev_info_t *dip)
1056 {
1057 	acpica_ddi_restore_resources(dip);
1058 }
1059 
1060 static int
1061 wait_for_set(cpuset_t *set, int who)
1062 {
1063 	int delays;
1064 	char *str = "wait_for_set";
1065 
1066 	for (delays = 0; !CPU_IN_SET(*set, who); delays++) {
1067 		if (delays == 500) {
1068 			/*
1069 			 * After five seconds, things are probably
1070 			 * looking a bit bleak - explain the hang.
1071 			 */
1072 			cmn_err(CE_NOTE, "cpu%d: started, "
1073 			    "but not running in the kernel yet", who);
1074 			PMD(PMD_SX, ("%s() %d cpu started "
1075 			    "but not running in the kernel yet\n",
1076 			    str, who))
1077 		} else if (delays > 2000) {
1078 			/*
1079 			 * We waited at least 20 seconds, bail ..
1080 			 */
1081 			cmn_err(CE_WARN, "cpu%d: timed out", who);
1082 			PMD(PMD_SX, ("%s() %d cpu timed out\n",
1083 			    str, who))
1084 			return (0);
1085 		}
1086 
1087 		/*
1088 		 * wait at least 10ms, then check again..
1089 		 */
1090 		drv_usecwait(10000);
1091 	}
1092 
1093 	return (1);
1094 }
1095 
1096 static	void
1097 i_cpr_save_stack(kthread_t *t, wc_cpu_t *wc_cpu)
1098 {
1099 	size_t	stack_size;	/* size of stack */
1100 	caddr_t	start = CPR_GET_STACK_START(t);	/* stack start */
1101 	caddr_t	end = CPR_GET_STACK_END(t);	/* stack end  */
1102 
1103 	stack_size = (size_t)end - (size_t)start;
1104 
1105 	if (wc_cpu->wc_saved_stack_size < stack_size) {
1106 		if (wc_cpu->wc_saved_stack != NULL) {
1107 			kmem_free(wc_cpu->wc_saved_stack,
1108 			    wc_cpu->wc_saved_stack_size);
1109 		}
1110 		wc_cpu->wc_saved_stack = kmem_zalloc(stack_size, KM_SLEEP);
1111 		wc_cpu->wc_saved_stack_size = stack_size;
1112 	}
1113 
1114 	bcopy(start, wc_cpu->wc_saved_stack, stack_size);
1115 }
1116 
1117 void
1118 i_cpr_restore_stack(kthread_t *t, greg_t *save_stack)
1119 {
1120 	size_t	stack_size;	/* size of stack */
1121 	caddr_t	start = CPR_GET_STACK_START(t);	/* stack start */
1122 	caddr_t	end = CPR_GET_STACK_END(t);	/* stack end  */
1123 
1124 	stack_size = (size_t)end - (size_t)start;
1125 
1126 	bcopy(save_stack, start, stack_size);
1127 }
1128