/*	$OpenBSD: subr_hibernate.c,v 1.138 2022/09/03 18:17:15 mlarkin Exp $	*/

/*
 * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
 * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/hibernate.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/tree.h>
#include <sys/systm.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/stat.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>

#include <machine/hibernate.h>

/* Make sure the signature can fit in one block */
CTASSERT(sizeof(union hibernate_info) <= DEV_BSIZE);

/*
 * Hibernate piglet layout information
 *
 * The piglet is a scratch area of memory allocated by the suspending kernel.
 * Its phys and virt addrs are recorded in the signature block. The piglet is
 * used to guarantee an unused area of memory that can be used by the resuming
 * kernel for various things. The piglet is excluded during unpack operations.
 * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
 *
 * Offset from piglet_base	Purpose
 * ----------------------------------------------------------------------------
 * 0				Private page for suspend I/O write functions
 * 1*PAGE_SIZE			I/O page used during hibernate suspend
 * 2*PAGE_SIZE			I/O page used during hibernate suspend
 * 3*PAGE_SIZE			copy page used during hibernate suspend
 * 4*PAGE_SIZE			final chunk ordering list (24 pages)
 * 28*PAGE_SIZE			RLE utility page
 * 29*PAGE_SIZE			preserved entropy
 * 30*PAGE_SIZE			start of hiballoc area
 * 110*PAGE_SIZE		end of hiballoc area (80 pages)
 * 366*PAGE_SIZE		end of retguard preservation region (256 pages)
 * ...				unused
 * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
 * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
 * 4*HIBERNATE_CHUNK_SIZE	end of piglet
 */
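
/*
 * For illustration only: the fixed offsets above, expressed as the raw
 * expressions the code below uses. These macro names are hypothetical
 * (nothing in this file defines them); they are kept under #if 0 so
 * nothing changes at build time.
 */
#if 0
#define PIGLET_IO_PAGE_OFS	(1 * PAGE_SIZE)		/* suspend I/O page */
#define PIGLET_COPY_PAGE_OFS	(3 * PAGE_SIZE)		/* suspend copy page */
#define PIGLET_FCHUNKS_OFS	(4 * PAGE_SIZE)		/* chunk ordering list */
#define PIGLET_RLE_PAGE_OFS	(28 * PAGE_SIZE)	/* RLE utility page */
#define PIGLET_ENTROPY_OFS	(29 * PAGE_SIZE)	/* preserved entropy */
#define PIGLET_ZLIB_OFS		(30 * PAGE_SIZE)	/* hiballoc/zlib arena */
#define PIGLET_RETGUARD_OFS	(110 * PAGE_SIZE)	/* retguard save area */
#endif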

/* Temporary vaddr ranges used during hibernate */
vaddr_t hibernate_temp_page;
vaddr_t hibernate_copy_page;
vaddr_t hibernate_rle_page;

/* Hibernate info as read from disk during resume */
union hibernate_info disk_hib;

/*
 * Global copy of the pig start address. This needs to be a global as we
 * switch stacks after computing it - it can't be stored on the stack.
 */
paddr_t global_pig_start;

/*
 * Global copies of the piglet start addresses (PA/VA). We store these
 * as globals to avoid having to carry them around as parameters, as the
 * piglet is allocated early and freed late - its lifecycle extends beyond
 * that of the hibernate info union which is calculated on suspend/resume.
 */
vaddr_t global_piglet_va;
paddr_t global_piglet_pa;

/* #define HIB_DEBUG */
#ifdef HIB_DEBUG
int	hib_debug = 99;
#define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
#define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
#else
#define DPRINTF(x...)
#define DNPRINTF(n,x...)
#endif

#ifndef NO_PROPOLICE
extern long __guard_local;
#endif /* ! NO_PROPOLICE */

/* Retguard phys address (need to skip this region during unpack) */
paddr_t retguard_start_phys, retguard_end_phys;
extern char __retguard_start, __retguard_end;

void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
int hibernate_calc_rle(paddr_t, paddr_t);
int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
	size_t *);

#define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)

/*
 * Hib alloc enforced alignment.
 */
#define HIB_ALIGN		8 /* bytes alignment */

122  * sizeof builtin operation, but with alignment constraint.
123  */
124 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
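
/*
 * For example (illustrative numbers): with HIB_ALIGN == 8, a 20-byte
 * type yields HIB_SIZEOF(...) == 24, keeping whatever follows the
 * header 8-byte aligned.
 */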

struct hiballoc_entry {
	size_t			hibe_use;
	size_t			hibe_space;
	RBT_ENTRY(hiballoc_entry) hibe_entry;
};
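
/*
 * Illustrative view (not authoritative) of how an arena lays out entries:
 * each hiballoc_entry header is immediately followed by the space it
 * manages, so an address and its entry differ by exactly
 * HIB_SIZEOF(struct hiballoc_entry):
 *
 *	[hiballoc_entry][hibe_use bytes in use][hibe_space bytes free] ...
 */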

/*
 * Sort hibernate memory ranges by ascending PA
 */
void
hibernate_sort_ranges(union hibernate_info *hib_info)
{
	int i, j;
	struct hibernate_memory_range *ranges;
	paddr_t base, end;

	ranges = hib_info->ranges;

	for (i = 1; i < hib_info->nranges; i++) {
		j = i;
		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
			base = ranges[j].base;
			end = ranges[j].end;
			ranges[j].base = ranges[j - 1].base;
			ranges[j].end = ranges[j - 1].end;
			ranges[j - 1].base = base;
			ranges[j - 1].end = end;
			j--;
		}
	}
}

/*
 * Compare hiballoc entries based on the address they manage.
 *
 * Since the address is fixed, relative to struct hiballoc_entry,
 * we just compare the hiballoc_entry pointers.
 */
static __inline int
hibe_cmp(const struct hiballoc_entry *l, const struct hiballoc_entry *r)
{
	vaddr_t vl = (vaddr_t)l;
	vaddr_t vr = (vaddr_t)r;

	return vl < vr ? -1 : (vl > vr);
}

RBT_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)

/*
 * Given a hiballoc entry, return the address it manages.
 */
static __inline void *
hib_entry_to_addr(struct hiballoc_entry *entry)
{
	caddr_t addr;

	addr = (caddr_t)entry;
	addr += HIB_SIZEOF(struct hiballoc_entry);
	return addr;
}

/*
 * Given an address, find the hiballoc entry that manages it.
 */
static __inline struct hiballoc_entry*
hib_addr_to_entry(void *addr_param)
{
	caddr_t addr;

	addr = (caddr_t)addr_param;
	addr -= HIB_SIZEOF(struct hiballoc_entry);
	return (struct hiballoc_entry*)addr;
}

RBT_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp);

/*
 * Allocate memory from the arena.
 *
 * Returns NULL if no memory is available.
 */
void *
hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
{
	struct hiballoc_entry *entry, *new_entry;
	size_t find_sz;

	/*
	 * Enforce alignment of HIB_ALIGN bytes.
	 *
	 * Note that, because the entry is put in front of the allocation,
	 * 0-byte allocations are guaranteed a unique address.
	 */
	alloc_sz = roundup(alloc_sz, HIB_ALIGN);

	/*
	 * Find an entry with hibe_space >= find_sz.
	 *
	 * If the root node is not large enough, we switch to tree traversal.
	 * Because all entries are made at the bottom of the free space,
	 * traversal from the end has a slightly better chance of yielding
	 * a sufficiently large space.
	 */
	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
	entry = RBT_ROOT(hiballoc_addr, &arena->hib_addrs);
	if (entry != NULL && entry->hibe_space < find_sz) {
		RBT_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
			if (entry->hibe_space >= find_sz)
				break;
		}
	}

	/*
	 * Insufficient or too fragmented memory.
	 */
	if (entry == NULL)
		return NULL;

	/*
	 * Create new entry in allocated space.
	 */
	new_entry = (struct hiballoc_entry*)(
	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
	new_entry->hibe_space = entry->hibe_space - find_sz;
	new_entry->hibe_use = alloc_sz;

	/*
	 * Insert entry.
	 */
	if (RBT_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
		panic("hib_alloc: insert failure");
	entry->hibe_space = 0;

	/* Return address managed by entry. */
	return hib_entry_to_addr(new_entry);
}

void
hib_getentropy(char **bufp, size_t *bufplen)
{
	if (!bufp || !bufplen)
		return;

	*bufp = (char *)(global_piglet_va + (29 * PAGE_SIZE));
	*bufplen = PAGE_SIZE;
}

/*
 * Free a pointer previously allocated from this arena.
 *
 * If addr is NULL, this will be silently accepted.
 */
void
hib_free(struct hiballoc_arena *arena, void *addr)
{
	struct hiballoc_entry *entry, *prev;

	if (addr == NULL)
		return;

	/*
	 * Derive entry from addr and check it is really in this arena.
	 */
	entry = hib_addr_to_entry(addr);
	if (RBT_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
		panic("hib_free: freed item %p not in hib arena", addr);

	/*
	 * Give the space in entry to its predecessor.
	 *
	 * If entry has no predecessor, change its used space into free space
	 * instead.
	 */
	prev = RBT_PREV(hiballoc_addr, entry);
	if (prev != NULL &&
	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
	    prev->hibe_use + prev->hibe_space) == entry) {
		/* Merge entry. */
		RBT_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
		    entry->hibe_use + entry->hibe_space;
	} else {
		/* Flip used memory to free space. */
		entry->hibe_space += entry->hibe_use;
		entry->hibe_use = 0;
	}
}

/*
 * Initialize hiballoc.
 *
 * The allocator will manage memory at ptr, which is len bytes.
 */
int
hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
{
	struct hiballoc_entry *entry;
	caddr_t ptr;
	size_t len;

	RBT_INIT(hiballoc_addr, &arena->hib_addrs);

	/*
	 * Hib allocator enforces HIB_ALIGN alignment.
	 * Fixup ptr and len.
	 */
	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
	len = p_len - ((size_t)ptr - (size_t)p_ptr);
	len &= ~((size_t)HIB_ALIGN - 1);

	/*
	 * Insufficient memory to be able to allocate and also do bookkeeping.
	 */
	if (len <= HIB_SIZEOF(struct hiballoc_entry))
		return ENOMEM;

	/*
	 * Create entry describing space.
	 */
	entry = (struct hiballoc_entry*)ptr;
	entry->hibe_use = 0;
	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
	RBT_INSERT(hiballoc_addr, &arena->hib_addrs, entry);

	return 0;
}
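
/*
 * Minimal usage sketch for the arena (illustrative only; the buffer and
 * sizes here are hypothetical). The zlib glue later in this file drives
 * the arena the same way. Kept under #if 0 so it is never compiled.
 */
#if 0
	static char scratch[16 * PAGE_SIZE];
	struct hiballoc_arena arena;
	void *p;

	if (hiballoc_init(&arena, scratch, sizeof(scratch)) == 0) {
		p = hib_alloc(&arena, 128);	/* rounded up to HIB_ALIGN */
		if (p != NULL)
			hib_free(&arena, p);
	}
#endif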

/*
 * Zero all free memory.
 */
void
uvm_pmr_zero_everything(void)
{
	struct uvm_pmemrange	*pmr;
	struct vm_page		*pg;
	int			 i;

	uvm_lock_fpageq();
	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		/* Zero single pages. */
		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
		    != NULL) {
			uvm_pmr_remove(pmr, pg);
			uvm_pagezero(pg);
			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
			uvmexp.zeropages++;
			uvm_pmr_insert(pmr, pg, 0);
		}

		/* Zero multi page ranges. */
		while ((pg = RBT_ROOT(uvm_pmr_size,
		    &pmr->size[UVM_PMR_MEMTYPE_DIRTY])) != NULL) {
			pg--; /* Size tree always has second page. */
			uvm_pmr_remove(pmr, pg);
			for (i = 0; i < pg->fpgsz; i++) {
				uvm_pagezero(&pg[i]);
				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
				uvmexp.zeropages++;
			}
			uvm_pmr_insert(pmr, pg, 0);
		}
	}
	uvm_unlock_fpageq();
}

/*
 * Mark all memory as dirty.
 *
 * Used to inform the system that the clean memory isn't clean for some
 * reason, for example because we just came back from hibernate.
 */
void
uvm_pmr_dirty_everything(void)
{
	struct uvm_pmemrange	*pmr;
	struct vm_page		*pg;
	int			 i;

	uvm_lock_fpageq();
	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		/* Dirty single pages. */
		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
		    != NULL) {
			uvm_pmr_remove(pmr, pg);
			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
			uvm_pmr_insert(pmr, pg, 0);
		}

		/* Dirty multi page ranges. */
		while ((pg = RBT_ROOT(uvm_pmr_size,
		    &pmr->size[UVM_PMR_MEMTYPE_ZERO])) != NULL) {
			pg--; /* Size tree always has second page. */
			uvm_pmr_remove(pmr, pg);
			for (i = 0; i < pg->fpgsz; i++)
				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
			uvm_pmr_insert(pmr, pg, 0);
		}
	}

	uvmexp.zeropages = 0;
	uvm_unlock_fpageq();
}

/*
 * Allocate an area that can hold sz bytes and doesn't overlap with
 * the piglet at piglet_pa.
 */
int
uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
{
	struct uvm_constraint_range pig_constraint;
	struct kmem_pa_mode kp_pig = {
		.kp_constraint = &pig_constraint,
		.kp_maxseg = 1
	};
	vaddr_t va;

	sz = round_page(sz);

	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
	pig_constraint.ucr_high = -1;

	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
	if (va == 0) {
		pig_constraint.ucr_low = 0;
		pig_constraint.ucr_high = piglet_pa - 1;

		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
		if (va == 0)
			return ENOMEM;
	}

	pmap_extract(pmap_kernel(), va, pa);
	return 0;
}

/*
 * Allocate a piglet area.
 *
 * This needs to be in DMA-safe memory.
 * Piglets are aligned.
 *
 * sz and align in bytes.
 *
 * The call may sleep while the pagedaemon attempts to free memory.
 * The pagedaemon may decide it is not possible to free enough memory,
 * causing the allocation to fail.
 */
int
uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
{
	struct kmem_pa_mode kp_piglet = {
		.kp_constraint = &dma_constraint,
		.kp_align = align,
		.kp_maxseg = 1
	};

	/* Ensure align is a power of 2 */
	KASSERT((align & (align - 1)) == 0);

	/*
	 * Fixup arguments: align must be at least PAGE_SIZE,
	 * sz will be converted to pagecount, since that is what
	 * pmemrange uses internally.
	 */
	if (align < PAGE_SIZE)
		kp_piglet.kp_align = PAGE_SIZE;

	sz = round_page(sz);

	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
	if (*va == 0)
		return ENOMEM;

	pmap_extract(pmap_kernel(), *va, pa);
	return 0;
}

/*
 * Free a piglet area.
 */
void
uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
{
	/*
	 * Fix parameters.
	 */
	sz = round_page(sz);

	/*
	 * Free the physical and virtual memory.
	 */
	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
}

/*
 * Physmem RLE compression support.
 *
 * Given a physical page address, return the number of pages starting at the
 * address that are free.  Clamps to the number of pages in
 * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
 */
int
uvm_page_rle(paddr_t addr)
{
	struct vm_page		*pg, *pg_end;
	struct vm_physseg	*vmp;
	int			 pseg_idx, off_idx;

	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
	if (pseg_idx == -1)
		return 0;

	vmp = &vm_physmem[pseg_idx];
	pg = &vmp->pgs[off_idx];
	if (!(pg->pg_flags & PQ_FREE))
		return 0;

	/*
	 * Search for the first non-free page after pg.
	 * Note that the page may not be the first page in a free pmemrange,
	 * therefore pg->fpgsz cannot be used.
	 */
	for (pg_end = pg; pg_end <= vmp->lastpg &&
	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE &&
	    (pg_end - pg) < HIBERNATE_CHUNK_SIZE/PAGE_SIZE; pg_end++)
		;
	return pg_end - pg;
}
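
/*
 * For example (illustrative): if the page at addr begins a run of three
 * free pages followed by an allocated page, uvm_page_rle(addr) returns 3;
 * if the page at addr is itself allocated, it returns 0.
 */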

/*
 * Fills out the hibernate_info union pointed to by hib
 * with information about this machine (swap signature block
 * offsets, number of memory ranges, kernel in use, etc.)
 */
int
get_hibernate_info(union hibernate_info *hib, int suspend)
{
	struct disklabel dl;
	char err_string[128], *dl_ret;
	int part;
	SHA2_CTX ctx;
	void *fn;

#ifndef NO_PROPOLICE
	/* Save propolice guard */
	hib->guard = __guard_local;
#endif /* ! NO_PROPOLICE */

	/* Determine I/O function to use */
	hib->io_func = get_hibernate_io_function(swdevt[0].sw_dev);
	if (hib->io_func == NULL)
		return (1);

	/* Calculate hibernate device */
	hib->dev = swdevt[0].sw_dev;

	/* Read disklabel (used to calculate signature and image offsets) */
	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));

	if (dl_ret) {
		printf("Hibernate error reading disklabel: %s\n", dl_ret);
		return (1);
	}

	/* Make sure we have a swap partition. */
	part = DISKPART(hib->dev);
	if (dl.d_npartitions <= part ||
	    dl.d_partitions[part].p_fstype != FS_SWAP ||
	    DL_GETPSIZE(&dl.d_partitions[part]) == 0)
		return (1);

	/* Magic number */
	hib->magic = HIBERNATE_MAGIC;

	/* Calculate signature block location */
	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[part]) -
	    sizeof(union hibernate_info)/DEV_BSIZE;

	SHA256Init(&ctx);
	SHA256Update(&ctx, version, strlen(version));
	fn = printf;
	SHA256Update(&ctx, &fn, sizeof(fn));
	fn = malloc;
	SHA256Update(&ctx, &fn, sizeof(fn));
	fn = km_alloc;
	SHA256Update(&ctx, &fn, sizeof(fn));
	fn = strlen;
	SHA256Update(&ctx, &fn, sizeof(fn));
	SHA256Final((u_int8_t *)&hib->kern_hash, &ctx);

	if (suspend) {
		/* Grab the previously-allocated piglet addresses */
		hib->piglet_va = global_piglet_va;
		hib->piglet_pa = global_piglet_pa;
		hib->io_page = (void *)hib->piglet_va;

		/*
		 * Initialization of the hibernate IO function for drivers
		 * that need to do prep work (such as allocating memory or
		 * setting up data structures that cannot safely be done
		 * during suspend without causing side effects). There is
		 * a matching HIB_DONE call performed after the write is
		 * completed.
		 */
		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[part]),
		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[part]),
		    HIB_INIT, hib->io_page))
			goto fail;

	} else {
		/*
		 * Resuming kernels use a regular private page for the driver.
		 * No need to free this I/O page as it will vanish as part of
		 * the resume.
		 */
		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
		if (!hib->io_page)
			goto fail;
	}

	if (get_hibernate_info_md(hib))
		goto fail;

	return (0);

fail:
	return (1);
}

/*
 * Allocate nitems*size bytes from the hiballoc area presently in use
 */
void *
hibernate_zlib_alloc(void *unused, int nitems, int size)
{
	struct hibernate_zlib_state *hibernate_state;

	hibernate_state =
	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;

	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
}

/*
 * Free the memory pointed to by addr in the hiballoc area presently in
 * use
 */
void
hibernate_zlib_free(void *unused, void *addr)
{
	struct hibernate_zlib_state *hibernate_state;

	hibernate_state =
	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;

	hib_free(&hibernate_state->hiballoc_arena, addr);
}

/*
 * Inflate next page of data from the image stream.
 * The rle parameter is modified on exit to contain the number of pages to
 * skip in the output stream (or 0 if a page was inflated into the output).
 *
 * Returns 0 if the stream contains additional data, or 1 if the stream is
 * finished.
 */
int
hibernate_inflate_page(int *rle)
{
	struct hibernate_zlib_state *hibernate_state;
	int i;

	hibernate_state =
	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;

	/* Set up the stream for RLE code inflate */
	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
	hibernate_state->hib_stream.avail_out = sizeof(*rle);

	/* Inflate RLE code */
	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
	if (i != Z_OK && i != Z_STREAM_END) {
		/*
		 * XXX - this will likely reboot/hang most machines
		 *       since the console output buffer will be unmapped,
		 *       but there's not much else we can do here.
		 */
		panic("rle inflate stream error");
	}

	if (hibernate_state->hib_stream.avail_out != 0) {
		/*
		 * XXX - this will likely reboot/hang most machines
		 *       since the console output buffer will be unmapped,
		 *       but there's not much else we can do here.
		 */
		panic("rle short inflate error");
	}

	if (*rle < 0 || *rle > 1024) {
		/*
		 * XXX - this will likely reboot/hang most machines
		 *       since the console output buffer will be unmapped,
		 *       but there's not much else we can do here.
		 */
		panic("invalid rle count");
	}

	if (i == Z_STREAM_END)
		return (1);

	if (*rle != 0)
		return (0);

	/* Set up the stream for page inflate */
	hibernate_state->hib_stream.next_out =
		(unsigned char *)HIBERNATE_INFLATE_PAGE;
	hibernate_state->hib_stream.avail_out = PAGE_SIZE;

	/* Process next block of data */
	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
	if (i != Z_OK && i != Z_STREAM_END) {
		/*
		 * XXX - this will likely reboot/hang most machines
		 *       since the console output buffer will be unmapped,
		 *       but there's not much else we can do here.
		 */
		panic("inflate error");
	}

	/* We should always have extracted a full page ... */
	if (hibernate_state->hib_stream.avail_out != 0) {
		/*
		 * XXX - this will likely reboot/hang most machines
		 *       since the console output buffer will be unmapped,
		 *       but there's not much else we can do here.
		 */
		panic("incomplete page");
	}

	return (i == Z_STREAM_END);
}
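
/*
 * Illustrative sketch (not authoritative) of the compressed stream as
 * consumed above: a sequence of deflated records, each an integer RLE
 * count optionally followed by a page of data:
 *
 *	[rle=0][page data][rle=0][page data][rle=N] ... [end of stream]
 *
 * A zero count means a page of data follows; a nonzero count means skip
 * that many output pages, with no data following.
 */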

/*
 * Inflate size bytes from src into dest, skipping any pages in
 * [dest..dest+size] that are special (see hibernate_inflate_skip).
 *
 * This function executes while using the resume-time stack
 * and pmap, and therefore cannot use ddb/printf/etc. Doing so
 * will likely hang or reset the machine since the console output buffer
 * will be unmapped.
 */
void
hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
    paddr_t src, size_t size)
{
	int end_stream = 0, rle, skip;
	struct hibernate_zlib_state *hibernate_state;

	hibernate_state =
	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;

	hibernate_state->hib_stream.next_in = (unsigned char *)src;
	hibernate_state->hib_stream.avail_in = size;

	do {
		/*
		 * Is this a special page? If yes, redirect the
		 * inflate output to a scratch page (i.e., discard it).
		 */
		skip = hibernate_inflate_skip(hib, dest);
		if (skip == HIB_SKIP) {
			hibernate_enter_resume_mapping(
			    HIBERNATE_INFLATE_PAGE,
			    HIBERNATE_INFLATE_PAGE, 0);
		} else if (skip == HIB_MOVE) {
			/*
			 * Special case: retguard region. This gets moved
			 * temporarily into the piglet region and copied into
			 * place immediately before resume
			 */
			hibernate_enter_resume_mapping(
			    HIBERNATE_INFLATE_PAGE,
			    hib->piglet_pa + (110 * PAGE_SIZE) +
			    hib->retguard_ofs, 0);
			hib->retguard_ofs += PAGE_SIZE;
			if (hib->retguard_ofs > 255 * PAGE_SIZE) {
				/*
				 * XXX - this will likely reboot/hang most
				 *       machines since the console output
				 *       buffer will be unmapped, but there's
				 *       not much else we can do here.
				 */
				panic("retguard move error, out of space");
			}
		} else {
			hibernate_enter_resume_mapping(
			    HIBERNATE_INFLATE_PAGE, dest, 0);
		}

		hibernate_flush();
		end_stream = hibernate_inflate_page(&rle);

		if (rle == 0)
			dest += PAGE_SIZE;
		else
			dest += (rle * PAGE_SIZE);
	} while (!end_stream);
}

/*
 * Deflate from src into the I/O page, up to 'remaining' bytes.
 *
 * Returns the number of input bytes consumed, and updates
 * 'remaining' with the amount of output space left (this information
 * is needed to know how much to write to disk).
 */
size_t
hibernate_deflate(union hibernate_info *hib, paddr_t src,
    size_t *remaining)
{
	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
	struct hibernate_zlib_state *hibernate_state;

	hibernate_state =
	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;

	/* Set up the stream for deflate */
	hibernate_state->hib_stream.next_in = (unsigned char *)src;
	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
	hibernate_state->hib_stream.next_out =
		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
	hibernate_state->hib_stream.avail_out = *remaining;

	/* Process next block of data */
	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
		panic("hibernate zlib deflate error");

	/* Update pointers and return number of bytes consumed */
	*remaining = hibernate_state->hib_stream.avail_out;
	return (PAGE_SIZE - (src & PAGE_MASK)) -
	    hibernate_state->hib_stream.avail_in;
}

/*
 * Write the hibernation information specified in hib
 * to the location in swap previously calculated (last block of
 * swap), called the "signature block".
 */
int
hibernate_write_signature(union hibernate_info *hib)
{
	/* Write hibernate info to disk */
	return (hib->io_func(hib->dev, hib->sig_offset,
	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
	    hib->io_page));
}

/*
 * Write the memory chunk table to the area in swap immediately
 * preceding the signature block. The chunk table is stored
 * in the piglet when this function is called.  Returns errno.
 */
int
hibernate_write_chunktable(union hibernate_info *hib)
{
	vaddr_t hibernate_chunk_table_start;
	size_t hibernate_chunk_table_size;
	int i, err;

	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;

	hibernate_chunk_table_start = hib->piglet_va +
	    HIBERNATE_CHUNK_SIZE;

	/* Write chunk table */
	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
		if ((err = hib->io_func(hib->dev,
		    hib->chunktable_offset + (i/DEV_BSIZE),
		    (vaddr_t)(hibernate_chunk_table_start + i),
		    MAXPHYS, HIB_W, hib->io_page))) {
			DPRINTF("chunktable write error: %d\n", err);
			return (err);
		}
	}

	return (0);
}

/*
 * Write an empty hiber_info to the swap signature block, which is
 * guaranteed to not match any valid hib.
 */
int
hibernate_clear_signature(union hibernate_info *hib)
{
	union hibernate_info blank_hiber_info;

	/* Zero out a blank hiber_info */
	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));

	/* Write (zeroed) hibernate info to disk */
	DPRINTF("clearing hibernate signature block location: %lld\n",
		hib->sig_offset);
	if (hibernate_block_io(hib,
	    hib->sig_offset,
	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
		printf("Warning: could not clear hibernate signature\n");

	return (0);
}

/*
 * Compare two hibernate_infos to determine if they are the same (i.e.,
 * we should be performing a hibernate resume on this machine).
 * Not all fields are checked - just enough to verify that the machine
 * has the same memory configuration and kernel as the one that
 * wrote the signature previously.
 */
int
hibernate_compare_signature(union hibernate_info *mine,
    union hibernate_info *disk)
{
	u_int i;

	if (mine->nranges != disk->nranges) {
		printf("unhibernate failed: memory layout changed\n");
		return (1);
	}

	if (bcmp(mine->kern_hash, disk->kern_hash, SHA256_DIGEST_LENGTH) != 0) {
		printf("unhibernate failed: original kernel changed\n");
		return (1);
	}

	for (i = 0; i < mine->nranges; i++) {
		if ((mine->ranges[i].base != disk->ranges[i].base) ||
		    (mine->ranges[i].end != disk->ranges[i].end)) {
			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
				i,
				(void *)mine->ranges[i].base,
				(void *)mine->ranges[i].end,
				(void *)disk->ranges[i].base,
				(void *)disk->ranges[i].end);
			printf("unhibernate failed: memory size changed\n");
			return (1);
		}
	}

	return (0);
}

/*
 * Transfers xfer_size bytes between the hibernate device specified in
 * hib at offset blkctr and the vaddr specified at dest.
 *
 * Separate offsets and pages are used to handle misaligned reads (reads
 * that span a page boundary).
 *
 * blkctr specifies a relative offset (relative to the start of swap),
 * not an absolute disk offset.
 */
int
hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
    size_t xfer_size, vaddr_t dest, int iswrite)
{
	struct buf *bp;
	struct bdevsw *bdsw;
	int error;

	bp = geteblk(xfer_size);
	bdsw = &bdevsw[major(hib->dev)];

	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
	if (error) {
		printf("hibernate_block_io open failed\n");
		return (1);
	}

	if (iswrite)
		bcopy((caddr_t)dest, bp->b_data, xfer_size);

	bp->b_bcount = xfer_size;
	bp->b_blkno = blkctr;
	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
	bp->b_dev = hib->dev;
	(*bdsw->d_strategy)(bp);

	error = biowait(bp);
	if (error) {
		printf("hib block_io biowait error %d blk %lld size %zu\n",
			error, (long long)blkctr, xfer_size);
		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
		    curproc);
		if (error)
			printf("hibernate_block_io error close failed\n");
		return (1);
	}

	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
	if (error) {
		printf("hibernate_block_io close failed\n");
		return (1);
	}

	if (!iswrite)
		bcopy(bp->b_data, (caddr_t)dest, xfer_size);

	bp->b_flags |= B_INVAL;
	brelse(bp);

	return (0);
}

/*
 * Preserve one page worth of random data, generated from the resuming
 * kernel's arc4random. After resume, this preserved entropy can be used
 * to further improve the un-hibernated machine's entropy pool. This
 * random data is stored in the piglet, which is preserved across the
 * unpack operation, and is restored later in the resume process (see
 * hib_getentropy).
 */
void
hibernate_preserve_entropy(union hibernate_info *hib)
{
	void *entropy;

	entropy = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);

	if (!entropy)
		return;

	pmap_activate(curproc);
	pmap_kenter_pa((vaddr_t)entropy,
	    (paddr_t)(hib->piglet_pa + (29 * PAGE_SIZE)),
	    PROT_READ | PROT_WRITE);

	arc4random_buf((void *)entropy, PAGE_SIZE);
	pmap_kremove((vaddr_t)entropy, PAGE_SIZE);
	km_free(entropy, PAGE_SIZE, &kv_any, &kp_none);
}

#ifndef NO_PROPOLICE
vaddr_t
hibernate_unprotect_ssp(void)
{
	struct kmem_dyn_mode kd_avoidalias;
	vaddr_t va = trunc_page((vaddr_t)&__guard_local);
	paddr_t pa;

	pmap_extract(pmap_kernel(), va, &pa);

	memset(&kd_avoidalias, 0, sizeof kd_avoidalias);
	kd_avoidalias.kd_prefer = pa;
	kd_avoidalias.kd_waitok = 1;
	va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_avoidalias);
	if (!va)
		panic("hibernate_unprotect_ssp");

	pmap_kenter_pa(va, pa, PROT_READ | PROT_WRITE);
	pmap_update(pmap_kernel());

	return va;
}

void
hibernate_reprotect_ssp(vaddr_t va)
{
	pmap_kremove(va, PAGE_SIZE);
	km_free((void *)va, PAGE_SIZE, &kv_any, &kp_none);
}
#endif /* NO_PROPOLICE */

/*
 * Reads the signature block from swap, checks against the current machine's
 * information. If the information matches, perform a resume by reading the
 * saved image into the pig area, and unpacking.
 *
 * Must be called with interrupts enabled.
 */
void
hibernate_resume(void)
{
	union hibernate_info hib;
	int s;
#ifndef NO_PROPOLICE
	vsize_t off = (vaddr_t)&__guard_local -
	    trunc_page((vaddr_t)&__guard_local);
	vaddr_t guard_va;
#endif

	/* Get current running machine's hibernate info */
	memset(&hib, 0, sizeof(hib));
	if (get_hibernate_info(&hib, 0)) {
		DPRINTF("couldn't retrieve machine's hibernate info\n");
		return;
	}

	/* Read hibernate info from disk */
	s = splbio();

	DPRINTF("reading hibernate signature block location: %lld\n",
		hib.sig_offset);

	if (hibernate_block_io(&hib,
	    hib.sig_offset,
	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
		DPRINTF("error in hibernate read");
		splx(s);
		return;
	}

	/* Check magic number */
	if (disk_hib.magic != HIBERNATE_MAGIC) {
		DPRINTF("wrong magic number in hibernate signature: %x\n",
			disk_hib.magic);
		splx(s);
		return;
	}

	/*
	 * We (possibly) found a hibernate signature. Clear signature first,
	 * to prevent accidental resume or endless resume cycles later.
	 */
	if (hibernate_clear_signature(&hib)) {
		DPRINTF("error clearing hibernate signature block\n");
		splx(s);
		return;
	}

	/*
	 * If on-disk and in-memory hibernate signatures match,
	 * this means we should do a resume from hibernate.
	 */
	if (hibernate_compare_signature(&hib, &disk_hib)) {
		DPRINTF("mismatched hibernate signature block\n");
		splx(s);
		return;
	}
	disk_hib.dev = hib.dev;

#ifdef MULTIPROCESSOR
	/* XXX - if we fail later, we may need to rehatch APs on some archs */
	DPRINTF("hibernate: quiescing APs\n");
	hibernate_quiesce_cpus();
#endif /* MULTIPROCESSOR */

	/* Read the image from disk into the image (pig) area */
	if (hibernate_read_image(&disk_hib))
		goto fail;

	DPRINTF("hibernate: quiescing devices\n");
	if (config_suspend_all(DVACT_QUIESCE) != 0)
		goto fail;

#ifndef NO_PROPOLICE
	guard_va = hibernate_unprotect_ssp();
#endif /* NO_PROPOLICE */

	(void) splhigh();
	hibernate_disable_intr_machdep();
	cold = 2;

	DPRINTF("hibernate: suspending devices\n");
	if (config_suspend_all(DVACT_SUSPEND) != 0) {
		cold = 0;
		hibernate_enable_intr_machdep();
#ifndef NO_PROPOLICE
		hibernate_reprotect_ssp(guard_va);
#endif /* ! NO_PROPOLICE */
		goto fail;
	}

	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_start,
	    &retguard_start_phys);
	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_end,
	    &retguard_end_phys);

	hibernate_preserve_entropy(&disk_hib);

	printf("Unpacking image...\n");

	/* Switch stacks */
	DPRINTF("hibernate: switching stacks\n");
	hibernate_switch_stack_machdep();

#ifndef NO_PROPOLICE
	/* Start using suspended kernel's propolice guard */
	*(long *)(guard_va + off) = disk_hib.guard;
	hibernate_reprotect_ssp(guard_va);
#endif /* ! NO_PROPOLICE */

	/* Unpack and resume */
	hibernate_unpack_image(&disk_hib);

fail:
	splx(s);
	printf("\nUnable to resume hibernated image\n");
}

/*
 * Unpack image from pig area to original location by looping through the
 * list of output chunks in the order they should be restored (fchunks).
 *
 * Note that due to the stack smash protector and the fact that we have
 * switched stacks, it is not permitted to return from this function.
 */
void
hibernate_unpack_image(union hibernate_info *hib)
{
	struct hibernate_disk_chunk *chunks;
	union hibernate_info local_hib;
	paddr_t image_cur = global_pig_start;
	short i, *fchunks;
	char *pva;

	/* Piglet will be identity mapped (VA == PA) */
	pva = (char *)hib->piglet_pa;

	fchunks = (short *)(pva + (4 * PAGE_SIZE));

	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);

	/* Can't use the hib that was passed in after this point */
	bcopy(hib, &local_hib, sizeof(union hibernate_info));
	local_hib.retguard_ofs = 0;

	/* VA == PA */
	local_hib.piglet_va = local_hib.piglet_pa;

	/*
	 * Point of no return. Once we pass this point, only kernel code can
	 * be accessed. No global variables or other kernel data structures
	 * are guaranteed to be coherent after unpack starts.
	 *
	 * The image is now in high memory (pig area), we unpack from the pig
	 * to the correct location in memory. We'll eventually end up copying
	 * on top of ourself, but we are assured the kernel code here is the
	 * same between the hibernated and resuming kernel, and we are running
	 * on our own stack, so the overwrite is ok.
	 */
	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
	hibernate_activate_resume_pt_machdep();

	for (i = 0; i < local_hib.chunk_ctr; i++) {
		/* Reset zlib for inflate */
		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
			panic("hibernate failed to reset zlib for inflate");

		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
		    image_cur);

		image_cur += chunks[fchunks[i]].compressed_size;
	}

	/*
	 * Resume the loaded kernel by jumping to the MD resume vector.
	 * We won't be returning from this call. We pass the location of
	 * the retguard save area so the MD code can replace it before
	 * resuming. See the piglet layout at the top of this file for
	 * more information on the layout of the piglet area.
	 *
	 * We use 'global_piglet_va' here since by the time we are at
	 * this point, we have already unpacked the image, and we want
	 * the suspended kernel's view of what the piglet was, before
	 * suspend occurred (since we will need to use that in the retguard
	 * copy code in hibernate_resume_machdep.)
	 */
	hibernate_resume_machdep(global_piglet_va + (110 * PAGE_SIZE));
}

/*
 * Bounce a compressed image chunk to the piglet, entering mappings for the
 * copied pages as needed
 */
void
hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
{
	size_t ct, ofs;
	paddr_t src = img_cur;
	vaddr_t dest = piglet;

	/* Copy first partial page */
	ct = (PAGE_SIZE) - (src & PAGE_MASK);
	ofs = (src & PAGE_MASK);

	if (ct < PAGE_SIZE) {
		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
			(src - ofs), 0);
		hibernate_flush();
		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs),
			(caddr_t)dest, ct);
		src += ct;
		dest += ct;
	}

	/* Copy remaining pages */
	while (src < size + img_cur) {
		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
		hibernate_flush();
		ct = PAGE_SIZE;
		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
		hibernate_flush();
		src += ct;
		dest += ct;
	}
}

/*
 * Process a chunk by bouncing it to the piglet, followed by unpacking
 */
void
hibernate_process_chunk(union hibernate_info *hib,
    struct hibernate_disk_chunk *chunk, paddr_t img_cur)
{
	char *pva = (char *)hib->piglet_va;

	hibernate_copy_chunk_to_piglet(img_cur,
	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
	hibernate_inflate_region(hib, chunk->base,
	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
	    chunk->compressed_size);
}

/*
 * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
 * inaddr and range_end.
 */
int
hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
{
	int rle;

	rle = uvm_page_rle(inaddr);
	KASSERT(rle >= 0 && rle <= MAX_RLE);

	/* Clamp RLE to range end */
	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
		rle = (range_end - inaddr) / PAGE_SIZE;

	return (rle);
}
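
/*
 * For example (illustrative): if uvm_page_rle() reports a run of 10 free
 * pages at inaddr but range_end lies only 4 pages away, the result is
 * clamped to 4 so the RLE never crosses the end of the chunk.
 */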
1371 
1372 /*
1373  * Write the RLE byte for page at 'inaddr' to the output stream.
1374  * Returns the number of pages to be skipped at 'inaddr'.
1375  */
1376 int
1377 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1378 	paddr_t range_end, daddr_t *blkctr,
1379 	size_t *out_remaining)
1380 {
1381 	int rle, err, *rleloc;
1382 	struct hibernate_zlib_state *hibernate_state;
1383 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1384 
1385 	hibernate_state =
1386 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1387 
1388 	rle = hibernate_calc_rle(inaddr, range_end);
1389 
1390 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1391 	*rleloc = rle;
1392 
1393 	/* Deflate the RLE byte into the stream */
1394 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1395 
1396 	/* Did we fill the output page? If so, flush to disk */
1397 	if (*out_remaining == 0) {
1398 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1399 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1400 			hib->io_page))) {
1401 				DPRINTF("hib write error %d\n", err);
1402 				return (err);
1403 		}
1404 
1405 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1406 		*out_remaining = PAGE_SIZE;
1407 
1408 		/* If we didn't deflate the entire RLE byte, finish it now */
1409 		if (hibernate_state->hib_stream.avail_in != 0)
1410 			hibernate_deflate(hib,
1411 				(vaddr_t)hibernate_state->hib_stream.next_in,
1412 				out_remaining);
1413 	}
1414 
1415 	return (rle);
1416 }
1417 
1418 /*
1419  * Write a compressed version of this machine's memory to disk, at the
1420  * precalculated swap offset:
1421  *
1422  * end of swap - signature block size - chunk table size - memory size
1423  *
1424  * The function begins by looping through each phys mem range, cutting each
1425  * one into MD sized chunks. These chunks are then compressed individually
1426  * and written out to disk, in phys mem order. Some chunks might compress
1427  * more than others, and for this reason, each chunk's size is recorded
1428  * in the chunk table, which is written to disk after the image has
1429  * properly been compressed and written (in hibernate_write_chunktable).
1430  *
1431  * When this function is called, the machine is nearly suspended - most
1432  * devices are quiesced/suspended, interrupts are off, and cold has
1433  * been set. This means that there can be no side effects once the
1434  * write has started, and the write function itself can also have no
1435  * side effects. This also means no printfs are permitted (since printf
1436  * has side effects.)
1437  *
1438  * Return values :
1439  *
1440  * 0      - success
1441  * EIO    - I/O error occurred writing the chunks
1442  * EINVAL - Failed to write a complete range
1443  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1444  */
1445 int
1446 hibernate_write_chunks(union hibernate_info *hib)
1447 {
1448 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1449 	size_t nblocks, out_remaining, used;
1450 	struct hibernate_disk_chunk *chunks;
1451 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1452 	daddr_t blkctr = 0;
1453 	int i, rle, err;
1454 	struct hibernate_zlib_state *hibernate_state;
1455 
1456 	hibernate_state =
1457 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1458 
1459 	hib->chunk_ctr = 0;
1460 
1461 	/*
1462 	 * Map the utility VAs to the piglet. See the piglet map at the
1463 	 * top of this file for piglet layout information.
1464 	 */
1465 	hibernate_copy_page = hib->piglet_va + 3 * PAGE_SIZE;
1466 	hibernate_rle_page = hib->piglet_va + 28 * PAGE_SIZE;
1467 
1468 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1469 	    HIBERNATE_CHUNK_SIZE);
1470 
1471 	/* Calculate the chunk regions */
1472 	for (i = 0; i < hib->nranges; i++) {
1473 		range_base = hib->ranges[i].base;
1474 		range_end = hib->ranges[i].end;
1475 
1476 		inaddr = range_base;
1477 
1478 		while (inaddr < range_end) {
1479 			chunks[hib->chunk_ctr].base = inaddr;
1480 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1481 				chunks[hib->chunk_ctr].end = inaddr +
1482 				    HIBERNATE_CHUNK_SIZE;
1483 			else
1484 				chunks[hib->chunk_ctr].end = range_end;
1485 
1486 			inaddr += HIBERNATE_CHUNK_SIZE;
1487 			hib->chunk_ctr ++;
1488 		}
1489 	}
1490 
1491 	uvm_pmr_dirty_everything();
1492 	uvm_pmr_zero_everything();
1493 
1494 	/* Compress and write the chunks in the chunktable */
1495 	for (i = 0; i < hib->chunk_ctr; i++) {
1496 		range_base = chunks[i].base;
1497 		range_end = chunks[i].end;
1498 
1499 		chunks[i].offset = blkctr + hib->image_offset;
1500 
1501 		/* Reset zlib for deflate */
1502 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1503 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1504 			return (ENOMEM);
1505 		}
1506 
1507 		inaddr = range_base;
1508 
1509 		/*
1510 		 * For each range, loop through its phys mem region
1511 		 * and write out the chunks (the last chunk might be
1512 		 * smaller than the chunk size).
1513 		 */
1514 		while (inaddr < range_end) {
1515 			out_remaining = PAGE_SIZE;
1516 			while (out_remaining > 0 && inaddr < range_end) {
1517 				/*
1518 				 * Adjust for regions that are not evenly
1519 				 * divisible by PAGE_SIZE or overflowed
1520 				 * pages from the previous iteration.
1521 				 */
1522 				temp_inaddr = (inaddr & PAGE_MASK) +
1523 				    hibernate_copy_page;
1524 
1525 				/* Deflate from temp_inaddr to IO page */
1526 				if (inaddr != range_end) {
1527 					if (inaddr % PAGE_SIZE == 0) {
1528 						rle = hibernate_write_rle(hib,
1529 							inaddr,
1530 							range_end,
1531 							&blkctr,
1532 							&out_remaining);
1533 					}
1534 
1535 					if (rle == 0) {
1536 						pmap_kenter_pa(hibernate_temp_page,
1537 							inaddr & PMAP_PA_MASK,
1538 							PROT_READ);
1539 
1540 						bcopy((caddr_t)hibernate_temp_page,
1541 							(caddr_t)hibernate_copy_page,
1542 							PAGE_SIZE);
1543 						inaddr += hibernate_deflate(hib,
1544 							temp_inaddr,
1545 							&out_remaining);
1546 					} else {
1547 						inaddr += rle * PAGE_SIZE;
1548 						if (inaddr > range_end)
1549 							inaddr = range_end;
1550 					}
1551 
1552 				}
1553 
1554 				if (out_remaining == 0) {
1555 					/* Filled up the page */
1556 					nblocks = PAGE_SIZE / DEV_BSIZE;
1557 
1558 					if ((err = hib->io_func(hib->dev,
1559 					    blkctr + hib->image_offset,
1560 					    (vaddr_t)hibernate_io_page,
1561 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1562 						DPRINTF("hib write error %d\n",
1563 						    err);
1564 						return (err);
1565 					}
1566 
1567 					blkctr += nblocks;
1568 				}
1569 			}
1570 		}
1571 
1572 		if (inaddr != range_end) {
1573 			DPRINTF("deflate range ended prematurely\n");
1574 			return (EINVAL);
1575 		}
1576 
1577 		/*
1578 		 * End of range. Round up to next secsize bytes
1579 		 * after finishing compress
1580 		 */
1581 		if (out_remaining == 0)
1582 			out_remaining = PAGE_SIZE;
1583 
1584 		/* Finish compress */
1585 		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
1586 		hibernate_state->hib_stream.avail_in = 0;
1587 		hibernate_state->hib_stream.next_out =
1588 		    (unsigned char *)hibernate_io_page +
1589 			(PAGE_SIZE - out_remaining);
1590 
1591 		/* We have an extra output page available for finalize */
1592 		hibernate_state->hib_stream.avail_out =
1593 			out_remaining + PAGE_SIZE;
1594 
1595 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1596 		    Z_STREAM_END) {
1597 			DPRINTF("deflate error in output stream: %d\n", err);
1598 			return (err);
1599 		}
1600 
1601 		out_remaining = hibernate_state->hib_stream.avail_out;
1602 
1603 		used = 2 * PAGE_SIZE - out_remaining;
1604 		nblocks = used / DEV_BSIZE;
1605 
1606 		/* Round up to next block if needed */
1607 		if (used % DEV_BSIZE != 0)
1608 			nblocks ++;
1609 
1610 		/* Write final block(s) for this chunk */
1611 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1612 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1613 		    HIB_W, hib->io_page))) {
1614 			DPRINTF("hib final write error %d\n", err);
1615 			return (err);
1616 		}
1617 
1618 		blkctr += nblocks;
1619 
1620 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1621 		    chunks[i].offset) * DEV_BSIZE;
1622 	}
1623 
1624 	hib->chunktable_offset = hib->image_offset + blkctr;
1625 	return (0);
1626 }
1627 
1628 /*
1629  * Reset the zlib stream state and allocate a new hiballoc area for either
1630  * inflate or deflate. This function is called once for each hibernate chunk.
1631  * Calling hiballoc_init multiple times is acceptable since the memory it is
1632  * provided is unmanaged memory (stolen). We use the memory provided to us
1633  * by the piglet allocated via the supplied hib.
1634  */
1635 int
1636 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1637 {
1638 	vaddr_t hibernate_zlib_start;
1639 	size_t hibernate_zlib_size;
1640 	char *pva = (char *)hib->piglet_va;
1641 	struct hibernate_zlib_state *hibernate_state;
1642 
1643 	hibernate_state =
1644 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1645 
1646 	if (!deflate)
1647 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1648 
1649 	/*
1650 	 * See piglet layout information at the start of this file for
1651 	 * information on the zlib page assignments.
1652 	 */
1653 	hibernate_zlib_start = (vaddr_t)(pva + (30 * PAGE_SIZE));
1654 	hibernate_zlib_size = 80 * PAGE_SIZE;
1655 
1656 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1657 	memset(hibernate_state, 0, PAGE_SIZE);
1658 
1659 	/* Set up stream structure */
1660 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1661 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1662 
1663 	/* Initialize the hiballoc arena for zlib allocs/frees */
1664 	hiballoc_init(&hibernate_state->hiballoc_arena,
1665 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1666 
1667 	if (deflate) {
1668 		return deflateInit(&hibernate_state->hib_stream,
1669 		    Z_BEST_SPEED);
1670 	} else
1671 		return inflateInit(&hibernate_state->hib_stream);
1672 }
1673 
1674 /*
1675  * Reads the hibernated memory image from disk, whose location and
1676  * size are recorded in hib. Begin by reading the persisted
1677  * chunk table, which records the original chunk placement location
1678  * and compressed size for each. Next, allocate a pig region of
1679  * sufficient size to hold the compressed image. Next, read the
1680  * chunks into the pig area (calling hibernate_read_chunks to do this),
1681  * and finally, if all of the above succeeds, clear the hibernate signature.
1682  * The function will then return to hibernate_resume, which will proceed
1683  * to unpack the pig image to the correct place in memory.
1684  */
1685 int
1686 hibernate_read_image(union hibernate_info *hib)
1687 {
1688 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1689 	paddr_t image_start, image_end, pig_start, pig_end;
1690 	struct hibernate_disk_chunk *chunks;
1691 	daddr_t blkctr;
1692 	vaddr_t chunktable = (vaddr_t)NULL;
1693 	paddr_t piglet_chunktable = hib->piglet_pa +
1694 	    HIBERNATE_CHUNK_SIZE;
1695 	int i, status;
1696 
1697 	status = 0;
1698 	pmap_activate(curproc);
1699 
1700 	/* Calculate total chunk table size in disk blocks */
1701 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1702 
1703 	blkctr = hib->chunktable_offset;
1704 
1705 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1706 	    &kp_none, &kd_nowait);
1707 
1708 	if (!chunktable)
1709 		return (1);
1710 
1711 	/* Map chunktable pages */
1712 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1713 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1714 		    PROT_READ | PROT_WRITE);
1715 	pmap_update(pmap_kernel());
1716 
1717 	/* Read the chunktable from disk into the piglet chunktable */
1718 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1719 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1720 		hibernate_block_io(hib, blkctr, MAXPHYS,
1721 		    chunktable + i, 0);
1722 
1723 	blkctr = hib->image_offset;
1724 	compressed_size = 0;
1725 
1726 	chunks = (struct hibernate_disk_chunk *)chunktable;
1727 
1728 	for (i = 0; i < hib->chunk_ctr; i++)
1729 		compressed_size += chunks[i].compressed_size;
1730 
1731 	disk_size = compressed_size;
1732 
1733 	printf("unhibernating @ block %lld length %luMB\n",
1734 	    hib->sig_offset - chunktable_size,
1735 	    compressed_size / (1024 * 1024));
1736 
1737 	/* Allocate the pig area */
1738 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1739 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1740 		status = 1;
1741 		goto unmap;
1742 	}
1743 
1744 	pig_end = pig_start + pig_sz;
1745 
1746 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1747 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1748 	image_start = image_end - disk_size;
1749 
1750 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1751 	    chunks);
1752 
1753 	/* Prepare the resume time pmap/page table */
1754 	hibernate_populate_resume_pt(hib, image_start, image_end);
1755 
1756 unmap:
1757 	/* Unmap chunktable pages */
1758 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1759 	pmap_update(pmap_kernel());
1760 
1761 	return (status);
1762 }
1763 
1764 /*
1765  * Read the hibernated memory chunks from disk (chunk information at this
1766  * point is stored in the piglet) into the pig area specified by
1767  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1768  * only chunk with overlap possibilities.
1769  */
1770 int
1771 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1772     paddr_t pig_end, size_t image_compr_size,
1773     struct hibernate_disk_chunk *chunks)
1774 {
1775 	paddr_t img_cur, piglet_base;
1776 	daddr_t blkctr;
1777 	size_t processed, compressed_size, read_size;
1778 	int nchunks, nfchunks, num_io_pages;
1779 	vaddr_t tempva, hibernate_fchunk_area;
1780 	short *fchunks, i, j;
1781 
1782 	tempva = (vaddr_t)NULL;
1783 	hibernate_fchunk_area = (vaddr_t)NULL;
1784 	nfchunks = 0;
1785 	piglet_base = hib->piglet_pa;
1786 	global_pig_start = pig_start;
1787 
1788 	/*
1789 	 * These mappings go into the resuming kernel's page table, and are
1790 	 * used only during the image read. They disappear once the
1791 	 * suspended kernel is unpacked on top of us.
1792 	 */
1793 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1794 		&kd_nowait);
1795 	if (!tempva)
1796 		return (1);
1797 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1798 	    &kp_none, &kd_nowait);
1799 	if (!hibernate_fchunk_area)
1800 		return (1);
1801 
1802 	/* Final output chunk ordering VA */
1803 	fchunks = (short *)hibernate_fchunk_area;
1804 
1805 	/* Map the chunk ordering region */
1806 	for (i = 0; i < 24; i++)
1807 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1808 			piglet_base + ((4 + i) * PAGE_SIZE),
1809 			PROT_READ | PROT_WRITE);
1810 	pmap_update(pmap_kernel());
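	/*
	 * These 24 pages live at piglet offset 4*PAGE_SIZE, the slot
	 * reserved for the final chunk ordering list.
	 */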
1811 
1812 	nchunks = hib->chunk_ctr;
1813 
1814 	/* Initially start all chunks as unplaced */
1815 	for (i = 0; i < nchunks; i++)
1816 		chunks[i].flags = 0;
1817 
1818 	/*
1819 	 * Search the list for chunks that are outside the pig area. These
1820 	 * can be placed first in the final output list.
1821 	 */
1822 	for (i = 0; i < nchunks; i++) {
1823 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1824 			fchunks[nfchunks] = i;
1825 			nfchunks++;
1826 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1827 		}
1828 	}
1829 
1830 	/*
1831 	 * Append the remaining (not yet placed) chunks in ascending memory order.
1832 	 */
1833 	for (i = 0; i < nchunks; i++) {
1834 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1835 			fchunks[nfchunks] = i;
1836 			nfchunks++;
1837 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1838 		}
1839 	}
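	/*
	 * Example of the resulting order: with four chunks where chunks
	 * 1 and 3 lie entirely outside [pig_start, pig_end), the first
	 * pass yields fchunks = { 1, 3 } and the second appends the
	 * rest, giving fchunks = { 1, 3, 0, 2 }; only the trailing
	 * in-pig chunks can overlap their final destinations.
	 */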
1840 
1841 	img_cur = pig_start;
1842 
1843 	for (i = 0; i < nfchunks; i++) {
1844 		blkctr = chunks[fchunks[i]].offset;
1845 		processed = 0;
1846 		compressed_size = chunks[fchunks[i]].compressed_size;
1847 
1848 		while (processed < compressed_size) {
1849 			if (compressed_size - processed >= MAXPHYS)
1850 				read_size = MAXPHYS;
1851 			else
1852 				read_size = compressed_size - processed;
1853 
1854 			/*
1855 			 * We're reading read_size bytes, offset from the
1856 			 * start of a page by img_cur % PAGE_SIZE, so the
1857 			 * end will be read_size + (img_cur % PAGE_SIZE)
1858 			 * from the start of the first page.  Round that
1859 			 * up to the next page size.
1860 			 */
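			/*
			 * E.g., with PAGE_SIZE = 4096, read_size = MAXPHYS
			 * = 64KB and img_cur 512 bytes into a page:
			 * (65536 + 512 + 4095) / 4096 = 17 pages, exactly
			 * MAXPHYS/PAGE_SIZE + 1, matching the KASSERT below.
			 */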
1861 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1862 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1863 
1864 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1865 
1866 			/* Map pages for this read */
1867 			for (j = 0; j < num_io_pages; j++)
1868 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1869 				    img_cur + j * PAGE_SIZE,
1870 				    PROT_READ | PROT_WRITE);
1871 
1872 			pmap_update(pmap_kernel());
1873 
1874 			hibernate_block_io(hib, blkctr, read_size,
1875 			    tempva + (img_cur & PAGE_MASK), 0);
1876 
1877 			blkctr += (read_size / DEV_BSIZE);
1878 
1879 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1880 			pmap_update(pmap_kernel());
1881 
1882 			processed += read_size;
1883 			img_cur += read_size;
1884 		}
1885 	}
1886 
1887 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1888 	pmap_update(pmap_kernel());
1889 
1890 	return (0);
1891 }
1892 
1893 /*
1894  * Hibernating a machine comprises the following operations:
1895  *  1. Calculating this machine's hibernate_info information
1896  *  2. Allocating a piglet and saving the piglet's physaddr
1897  *  3. Calculating the memory chunks
1898  *  4. Writing the compressed chunks to disk
1899  *  5. Writing the chunk table
1900  *  6. Writing the signature block (hibernate_info)
1901  *
1902  * On most architectures, the function calling hibernate_suspend would
1903  * then power off the machine via an MD-specific hook (sketched below).
1904  */
1905 int
1906 hibernate_suspend(void)
1907 {
1908 	union hibernate_info hib;
1909 	u_long start, end;
1910 
1911 	/*
1912 	 * Calculate memory ranges, swap offsets, etc.
1913 	 * This also allocates a piglet whose physaddr is stored in
1914 	 * hib.piglet_pa and whose vaddr is stored in hib.piglet_va.
1915 	 */
1916 	if (get_hibernate_info(&hib, 1)) {
1917 		DPRINTF("failed to obtain hibernate info\n");
1918 		return (1);
1919 	}
1920 
1921 	/* Find a page-addressed region in swap [start,end] */
1922 	if (uvm_hibswap(hib.dev, &start, &end)) {
1923 		printf("hibernate: cannot find any swap\n");
1924 		return (1);
1925 	}
1926 
1927 	if (end - start < 1000) {	/* sanity: need at least ~1000 pages */
1928 		printf("hibernate: insufficient swap (%lu pages is too small)\n",
1929 			end - start + 1);
1930 		return (1);
1931 	}
1932 
1933 	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_start,
1934 	    &retguard_start_phys);
1935 	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_end,
1936 	    &retguard_end_phys);
1937 
1938 	/* Calculate the image's starting block offset in swap */
1939 	hib.image_offset = ctod(start);
1940 
1941 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
1942 	    hib.image_offset, ctod(end) - ctod(start) + 1);
1943 
1944 	pmap_activate(curproc);
1945 	DPRINTF("hibernate: writing chunks\n");
1946 	if (hibernate_write_chunks(&hib)) {
1947 		DPRINTF("hibernate_write_chunks failed\n");
1948 		return (1);
1949 	}
1950 
1951 	DPRINTF("hibernate: writing chunktable\n");
1952 	if (hibernate_write_chunktable(&hib)) {
1953 		DPRINTF("hibernate_write_chunktable failed\n");
1954 		return (1);
1955 	}
1956 
1957 	DPRINTF("hibernate: writing signature\n");
1958 	if (hibernate_write_signature(&hib)) {
1959 		DPRINTF("hibernate_write_signature failed\n");
1960 		return (1);
1961 	}
1962 
1963 	/* Allow the disk to settle */
1964 	delay(500000);
1965 
1966 	/*
1967 	 * Give the device-specific I/O function a notification that we're
1968 	 * done, and that it can clean up or shut down as needed.
1969 	 */
1970 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
1971 	return (0);
1972 }
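
/*
 * A minimal sketch (not compiled) of how an MD caller might drive the
 * sequence above; md_powerdown() is hypothetical and stands in for the
 * machine-dependent power-off path.
 */
#if 0
void
md_hibernate(void)
{
	if (hibernate_suspend()) {
		printf("hibernate: suspend failed\n");
		return;
	}
	/* Image, chunk table and signature are now on disk; power off. */
	md_powerdown();
}
#endif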
1973 
1974 int
1975 hibernate_alloc(void)
1976 {
1977 	KASSERT(global_piglet_va == 0);
1978 	KASSERT(hibernate_temp_page == 0);
1979 
1980 	pmap_activate(curproc);
1981 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1982 	    PROT_READ | PROT_WRITE);
1983 
1984 	/* Allocate a piglet, store its addresses in the supplied globals */
1985 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
1986 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
1987 		goto unmap;
1988 
1989 	/*
1990 	 * Allocate VA for the temp page.
1991 	 *
1992 	 * This will become part of the suspended kernel and will
1993 	 * be freed in hibernate_free() upon resume (or hibernate
1994 	 * failure).
1995 	 */
1996 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1997 	    &kp_none, &kd_nowait);
1998 	if (!hibernate_temp_page) {
1999 		uvm_pmr_free_piglet(global_piglet_va, 4 * HIBERNATE_CHUNK_SIZE);
2000 		global_piglet_va = 0;
2001 		goto unmap;
2002 	}
2003 	return (0);
2004 unmap:
2005 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
2006 	pmap_update(pmap_kernel());
2007 	return (ENOMEM);
2008 }
2009 
2010 /*
2011  * Free items allocated by hibernate_alloc()
2012  */
2013 void
2014 hibernate_free(void)
2015 {
2016 	pmap_activate(curproc);
2017 
2018 	if (global_piglet_va)
2019 		uvm_pmr_free_piglet(global_piglet_va,
2020 		    4 * HIBERNATE_CHUNK_SIZE);
2021 
2022 	if (hibernate_temp_page) {
2023 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
2024 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
2025 		    &kv_any, &kp_none);
2026 	}
2027 
2028 	global_piglet_va = 0;
2029 	hibernate_temp_page = 0;
2030 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
2031 	pmap_update(pmap_kernel());
2032 }
2033
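
/*
 * Sketch (not compiled) of the expected pairing of hibernate_alloc()
 * and hibernate_free() around a suspend attempt; md_try_hibernate() is
 * a hypothetical caller and error handling is elided.
 */
#if 0
int
md_try_hibernate(void)
{
	if (hibernate_alloc() != 0)
		return (ENOMEM);	/* no piglet or temp page */
	if (hibernate_suspend() != 0) {
		/* Failed attempt: release the piglet and temp page now. */
		hibernate_free();
		return (EIO);
	}
	/* Success powers the machine off; hibernate_free() runs on resume. */
	return (0);
}
#endif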