xref: /original-bsd/sys/vm/swap_pager.c (revision babae2df)
1 /*
2  * Copyright (c) 1990 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * %sccs.include.redist.c%
11  *
12  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
13  *
14  *	@(#)swap_pager.c	8.9 (Berkeley) 03/21/94
15  */
16 
17 /*
18  * Quick hack to page to dedicated partition(s).
19  * TODO:
20  *	Add multiprocessor locks
21  *	Deal with async writes in a better fashion
22  */
23 
24 #include <sys/param.h>
25 #include <sys/systm.h>
26 #include <sys/proc.h>
27 #include <sys/buf.h>
28 #include <sys/map.h>
29 #include <sys/vnode.h>
30 #include <sys/malloc.h>
31 
32 #include <miscfs/specfs/specdev.h>
33 
34 #include <vm/vm.h>
35 #include <vm/vm_page.h>
36 #include <vm/vm_pageout.h>
37 #include <vm/swap_pager.h>
38 
39 #define NSWSIZES	16	/* number of size classes in swtab (swtab has one extra slot for the terminating entry) */
40 #define MAXDADDRS	64	/* swap blocks per object assumed when computing each swtab st_osize */
41 #ifndef NPENDINGIO
42 #define NPENDINGIO	64	/* number of statically allocated cleaning entries, i.e. max async pageouts tracked at once */
43 #endif
44 
45 #ifdef DEBUG
46 int	swpagerdebug = 0x100;	/* bitmask of SDB_* flags below; default is SDB_ANOM only */
47 #define	SDB_FOLLOW	0x001	/* print a trace line on entry to each pager routine */
48 #define SDB_INIT	0x002	/* print the swtab entries as swap_pager_init builds them */
49 #define SDB_ALLOC	0x004	/* trace pager allocation and deallocation */
50 #define SDB_IO		0x008	/* trace IO start and completion in swap_pager_io */
51 #define SDB_WRITE	0x010	/* trace pageout bookkeeping (io, iodone, clean) */
52 #define SDB_FAIL	0x020	/* report failure paths (malloc, rmalloc, KVA, IO error) */
53 #define SDB_ALLOCBLK	0x040	/* trace swap block allocation, release and haspage lookups */
54 #define SDB_FULL	0x080	/* like SDB_ALLOCBLK for block alloc/free messages only */
55 #define SDB_ANOM	0x100	/* report anomalies (swbuf waits, write with no block, page found on clean list) */
56 #define SDB_ANOMPANIC	0x200	/* once panicstr is set, pager routines return early to preserve state */
57 #define SDB_CLUSTER	0x400	/* trace cluster range and mask computations */
58 #define SDB_PARANOIA	0x800	/* run swap_pager_clean_check before each IO */
59 #endif
60 
61 TAILQ_HEAD(swpclean, swpagerclean);	/* queue head type used for the free and inuse cleaning lists */
62 
63 struct swpagerclean {	/* one entry per async pageout that has been started but not yet cleaned up */
64 	TAILQ_ENTRY(swpagerclean)	spc_list;	/* linkage on swap_pager_free or swap_pager_inuse */
65 	int				spc_flags;	/* SPC_* state bits, see below */
66 	struct buf			*spc_bp;	/* swap buffer carrying the write; set to NULL by swap_pager_iodone */
67 	sw_pager_t			spc_swp;	/* swap pager the write belongs to */
68 	vm_offset_t			spc_kva;	/* kernel VA where the pages are mapped for the IO */
69 	vm_page_t			spc_m;		/* first page of the cluster */
70 	int				spc_npages;	/* number of pages in the cluster */
71 } swcleanlist[NPENDINGIO];	/* static pool; swap_pager_init puts every entry on swap_pager_free */
72 typedef struct swpagerclean *swp_clean_t;
73 
74 /* spc_flags values */
75 #define SPC_FREE	0x00	/* entry is unused (on the free list) */
76 #define SPC_BUSY	0x01	/* async write has been set up, completion not yet seen */
77 #define SPC_DONE	0x02	/* set by swap_pager_iodone; entry awaits swap_pager_clean */
78 #define SPC_ERROR	0x04	/* set with SPC_DONE when the buffer completed with B_ERROR */
79 
80 struct swtab {	/* maps an object size class to the swap block size used for it */
81 	vm_size_t st_osize;	/* largest object size (bytes) served by this entry; 0 marks the final catch-all entry */
82 	int	  st_bsize;	/* swap block size for this class (DEV_BSIZE units) */
83 #ifdef DEBUG
84 	u_long	  st_inuse;	/* pagers of this class currently allocated (++ in alloc, -- in dealloc) */
85 	u_long	  st_usecnt;	/* pagers of this class ever allocated */
86 #endif
87 } swtab[NSWSIZES+1];	/* filled in by swap_pager_init; scanned until st_osize == 0 */
88 
89 #ifdef DEBUG
90 int		swap_pager_poip;	/* pageouts in progress (debug count over all swap pagers) */
91 int		swap_pager_piip;	/* pageins in progress (debug count over all swap pagers) */
92 #endif
93 
94 int		swap_pager_maxcluster;	/* maximum cluster size in bytes; set by swap_pager_init, caps swap_pager_cluster */
95 int		swap_pager_npendingio;	/* number of pager clean structs (set to NPENDINGIO at init) */
96 
97 struct swpclean	swap_pager_inuse;	/* list of pending page cleans (async writes started, not yet cleaned) */
98 struct swpclean	swap_pager_free;	/* list of free pager clean structs */
99 struct pagerlst	swap_pager_list;	/* list of "named" anon regions (pagers allocated with a non-NULL handle) */
100 
101 static void 		swap_pager_init __P((void));
102 static vm_pager_t	swap_pager_alloc
103 			    __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t));
104 static void		swap_pager_clean __P((int));
105 #ifdef DEBUG
106 static void		swap_pager_clean_check __P((vm_page_t *, int, int));
107 #endif
108 static void		swap_pager_cluster
109 			    __P((vm_pager_t, vm_offset_t,
110 				 vm_offset_t *, vm_offset_t *));
111 static void		swap_pager_dealloc __P((vm_pager_t));
112 static int		swap_pager_getpage
113 			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
114 static boolean_t	swap_pager_haspage __P((vm_pager_t, vm_offset_t));
115 static int		swap_pager_io __P((sw_pager_t, vm_page_t *, int, int));
116 static void		swap_pager_iodone __P((struct buf *));
117 static int		swap_pager_putpage
118 			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
119 
120 struct pagerops swappagerops = {	/* operations vector; swap_pager_init installs it as dfltpagerops. NOTE(review): entry order must match struct pagerops, which is declared elsewhere */
121 	swap_pager_init,	/* one-time setup of lists and size table */
122 	swap_pager_alloc,	/* create (or look up a named) pager */
123 	swap_pager_dealloc,	/* wait for pageouts, release swap blocks and memory */
124 	swap_pager_getpage,	/* synchronous pagein via swap_pager_io */
125 	swap_pager_putpage,	/* pageout, sync or async, via swap_pager_io */
126 	swap_pager_haspage,	/* does the pager hold a copy of the page at an offset? */
127 	swap_pager_cluster	/* compute the cluster range around an offset */
128 };
129 
130 static void
131 swap_pager_init()
132 {
133 	register swp_clean_t spc;
134 	register int i, bsize;
135 	extern int dmmin, dmmax;
136 	int maxbsize;
137 
138 #ifdef DEBUG
139 	if (swpagerdebug & (SDB_FOLLOW|SDB_INIT))
140 		printf("swpg_init()\n");
141 #endif
142 	dfltpagerops = &swappagerops;
143 	TAILQ_INIT(&swap_pager_list);
144 
145 	/*
146 	 * Allocate async IO structures.
147 	 *
148 	 * XXX it would be nice if we could do this dynamically based on
149 	 * the value of nswbuf (since we are ultimately limited by that)
150 	 * but neither nswbuf or malloc has been initialized yet.  So the
151 	 * structs are statically allocated above.
152 	 */
153 	swap_pager_npendingio = NPENDINGIO;
154 
155 	/*
156 	 * Initialize clean lists
157 	 */
158 	TAILQ_INIT(&swap_pager_inuse);
159 	TAILQ_INIT(&swap_pager_free);
160 	for (i = 0, spc = swcleanlist; i < swap_pager_npendingio; i++, spc++) {
161 		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
162 		spc->spc_flags = SPC_FREE;
163 	}
164 
165 	/*
166 	 * Calculate the swap allocation constants.
167 	 */
168         if (dmmin == 0) {
169                 dmmin = DMMIN;
170 		if (dmmin < CLBYTES/DEV_BSIZE)
171 			dmmin = CLBYTES/DEV_BSIZE;
172 	}
173         if (dmmax == 0)
174                 dmmax = DMMAX;
175 
176 	/*
177 	 * Fill in our table of object size vs. allocation size
178 	 */
179 	bsize = btodb(PAGE_SIZE);
180 	if (bsize < dmmin)
181 		bsize = dmmin;
182 	maxbsize = btodb(sizeof(sw_bm_t) * NBBY * PAGE_SIZE);
183 	if (maxbsize > dmmax)
184 		maxbsize = dmmax;
185 	for (i = 0; i < NSWSIZES; i++) {
186 		swtab[i].st_osize = (vm_size_t) (MAXDADDRS * dbtob(bsize));
187 		swtab[i].st_bsize = bsize;
188 		if (bsize <= btodb(MAXPHYS))
189 			swap_pager_maxcluster = dbtob(bsize);
190 #ifdef DEBUG
191 		if (swpagerdebug & SDB_INIT)
192 			printf("swpg_init: ix %d, size %x, bsize %x\n",
193 			       i, swtab[i].st_osize, swtab[i].st_bsize);
194 #endif
195 		if (bsize >= maxbsize)
196 			break;
197 		bsize *= 2;
198 	}
199 	swtab[i].st_osize = 0;
200 	swtab[i].st_bsize = bsize;
201 }
202 
203 /*
204  * Allocate a pager structure and associated resources.
205  * Note that if we are called from the pageout daemon (handle == NULL)
206  * we should not wait for memory as it could resulting in deadlock.
207  */
208 static vm_pager_t
209 swap_pager_alloc(handle, size, prot, foff)
210 	caddr_t handle;
211 	register vm_size_t size;
212 	vm_prot_t prot;
213 	vm_offset_t foff;
214 {
215 	register vm_pager_t pager;
216 	register sw_pager_t swp;
217 	struct swtab *swt;
218 	int waitok;
219 
220 #ifdef DEBUG
221 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC))
222 		printf("swpg_alloc(%x, %x, %x)\n", handle, size, prot);
223 #endif
224 	/*
225 	 * If this is a "named" anonymous region, look it up and
226 	 * return the appropriate pager if it exists.
227 	 */
228 	if (handle) {
229 		pager = vm_pager_lookup(&swap_pager_list, handle);
230 		if (pager != NULL) {
231 			/*
232 			 * Use vm_object_lookup to gain a reference
233 			 * to the object and also to remove from the
234 			 * object cache.
235 			 */
236 			if (vm_object_lookup(pager) == NULL)
237 				panic("swap_pager_alloc: bad object");
238 			return(pager);
239 		}
240 	}
241 	/*
242 	 * Pager doesn't exist, allocate swap management resources
243 	 * and initialize.
244 	 */
245 	waitok = handle ? M_WAITOK : M_NOWAIT;
246 	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
247 	if (pager == NULL)
248 		return(NULL);
249 	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
250 	if (swp == NULL) {
251 #ifdef DEBUG
252 		if (swpagerdebug & SDB_FAIL)
253 			printf("swpg_alloc: swpager malloc failed\n");
254 #endif
255 		free((caddr_t)pager, M_VMPAGER);
256 		return(NULL);
257 	}
258 	size = round_page(size);
259 	for (swt = swtab; swt->st_osize; swt++)
260 		if (size <= swt->st_osize)
261 			break;
262 #ifdef DEBUG
263 	swt->st_inuse++;
264 	swt->st_usecnt++;
265 #endif
266 	swp->sw_osize = size;
267 	swp->sw_bsize = swt->st_bsize;
268 	swp->sw_nblocks = (btodb(size) + swp->sw_bsize - 1) / swp->sw_bsize;
269 	swp->sw_blocks = (sw_blk_t)
270 		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
271 		       M_VMPGDATA, M_NOWAIT);
272 	if (swp->sw_blocks == NULL) {
273 		free((caddr_t)swp, M_VMPGDATA);
274 		free((caddr_t)pager, M_VMPAGER);
275 #ifdef DEBUG
276 		if (swpagerdebug & SDB_FAIL)
277 			printf("swpg_alloc: sw_blocks malloc failed\n");
278 		swt->st_inuse--;
279 		swt->st_usecnt--;
280 #endif
281 		return(FALSE);
282 	}
283 	bzero((caddr_t)swp->sw_blocks,
284 	      swp->sw_nblocks * sizeof(*swp->sw_blocks));
285 	swp->sw_poip = 0;
286 	if (handle) {
287 		vm_object_t object;
288 
289 		swp->sw_flags = SW_NAMED;
290 		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
291 		/*
292 		 * Consistant with other pagers: return with object
293 		 * referenced.  Can't do this with handle == NULL
294 		 * since it might be the pageout daemon calling.
295 		 */
296 		object = vm_object_allocate(size);
297 		vm_object_enter(object, pager);
298 		vm_object_setpager(object, pager, 0, FALSE);
299 	} else {
300 		swp->sw_flags = 0;
301 		pager->pg_list.tqe_next = NULL;
302 		pager->pg_list.tqe_prev = NULL;
303 	}
304 	pager->pg_handle = handle;
305 	pager->pg_ops = &swappagerops;
306 	pager->pg_type = PG_SWAP;
307 	pager->pg_flags = PG_CLUSTERPUT;
308 	pager->pg_data = swp;
309 
310 #ifdef DEBUG
311 	if (swpagerdebug & SDB_ALLOC)
312 		printf("swpg_alloc: pg_data %x, %x of %x at %x\n",
313 		       swp, swp->sw_nblocks, swp->sw_bsize, swp->sw_blocks);
314 #endif
315 	return(pager);
316 }
317 
318 static void
319 swap_pager_dealloc(pager)
320 	vm_pager_t pager;
321 {
322 	register int i;
323 	register sw_blk_t bp;
324 	register sw_pager_t swp;
325 	struct swtab *swt;
326 	int s;
327 
328 #ifdef DEBUG
329 	/* save panic time state */
330 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
331 		return;
332 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC))
333 		printf("swpg_dealloc(%x)\n", pager);
334 #endif
335 	/*
336 	 * Remove from list right away so lookups will fail if we
337 	 * block for pageout completion.
338 	 */
339 	swp = (sw_pager_t) pager->pg_data;
340 	if (swp->sw_flags & SW_NAMED) {
341 		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
342 		swp->sw_flags &= ~SW_NAMED;
343 	}
344 #ifdef DEBUG
345 	for (swt = swtab; swt->st_osize; swt++)
346 		if (swp->sw_osize <= swt->st_osize)
347 			break;
348 	swt->st_inuse--;
349 #endif
350 
351 	/*
352 	 * Wait for all pageouts to finish and remove
353 	 * all entries from cleaning list.
354 	 */
355 	s = splbio();
356 	while (swp->sw_poip) {
357 		swp->sw_flags |= SW_WANTED;
358 		(void) tsleep(swp, PVM, "swpgdealloc", 0);
359 	}
360 	splx(s);
361 	swap_pager_clean(B_WRITE);
362 
363 	/*
364 	 * Free left over swap blocks
365 	 */
366 	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++)
367 		if (bp->swb_block) {
368 #ifdef DEBUG
369 			if (swpagerdebug & (SDB_ALLOCBLK|SDB_FULL))
370 				printf("swpg_dealloc: blk %x\n",
371 				       bp->swb_block);
372 #endif
373 			rmfree(swapmap, swp->sw_bsize, bp->swb_block);
374 		}
375 	/*
376 	 * Free swap management resources
377 	 */
378 	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
379 	free((caddr_t)swp, M_VMPGDATA);
380 	free((caddr_t)pager, M_VMPAGER);
381 }
382 
383 static int
384 swap_pager_getpage(pager, mlist, npages, sync)
385 	vm_pager_t pager;
386 	vm_page_t *mlist;
387 	int npages;
388 	boolean_t sync;
389 {
390 #ifdef DEBUG
391 	if (swpagerdebug & SDB_FOLLOW)
392 		printf("swpg_getpage(%x, %x, %x, %x)\n",
393 		       pager, mlist, npages, sync);
394 #endif
395 	return(swap_pager_io((sw_pager_t)pager->pg_data,
396 			     mlist, npages, B_READ));
397 }
398 
399 static int
400 swap_pager_putpage(pager, mlist, npages, sync)
401 	vm_pager_t pager;
402 	vm_page_t *mlist;
403 	int npages;
404 	boolean_t sync;
405 {
406 	int flags;
407 
408 #ifdef DEBUG
409 	if (swpagerdebug & SDB_FOLLOW)
410 		printf("swpg_putpage(%x, %x, %x, %x)\n",
411 		       pager, mlist, npages, sync);
412 #endif
413 	if (pager == NULL) {
414 		swap_pager_clean(B_WRITE);
415 		return (VM_PAGER_OK);		/* ??? */
416 	}
417 	flags = B_WRITE;
418 	if (!sync)
419 		flags |= B_ASYNC;
420 	return(swap_pager_io((sw_pager_t)pager->pg_data,
421 			     mlist, npages, flags));
422 }
423 
424 static boolean_t
425 swap_pager_haspage(pager, offset)
426 	vm_pager_t pager;
427 	vm_offset_t offset;
428 {
429 	register sw_pager_t swp;
430 	register sw_blk_t swb;
431 	int ix;
432 
433 #ifdef DEBUG
434 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK))
435 		printf("swpg_haspage(%x, %x) ", pager, offset);
436 #endif
437 	swp = (sw_pager_t) pager->pg_data;
438 	ix = offset / dbtob(swp->sw_bsize);
439 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
440 #ifdef DEBUG
441 		if (swpagerdebug & (SDB_FAIL|SDB_FOLLOW|SDB_ALLOCBLK))
442 			printf("swpg_haspage: %x bad offset %x, ix %x\n",
443 			       swp->sw_blocks, offset, ix);
444 #endif
445 		return(FALSE);
446 	}
447 	swb = &swp->sw_blocks[ix];
448 	if (swb->swb_block)
449 		ix = atop(offset % dbtob(swp->sw_bsize));
450 #ifdef DEBUG
451 	if (swpagerdebug & SDB_ALLOCBLK)
452 		printf("%x blk %x+%x ", swp->sw_blocks, swb->swb_block, ix);
453 	if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK))
454 		printf("-> %c\n",
455 		       "FT"[swb->swb_block && (swb->swb_mask & (1 << ix))]);
456 #endif
457 	if (swb->swb_block && (swb->swb_mask & (1 << ix)))
458 		return(TRUE);
459 	return(FALSE);
460 }
461 
462 static void
463 swap_pager_cluster(pager, offset, loffset, hoffset)
464 	vm_pager_t	pager;
465 	vm_offset_t	offset;
466 	vm_offset_t	*loffset;
467 	vm_offset_t	*hoffset;
468 {
469 	sw_pager_t swp;
470 	register int bsize;
471 	vm_offset_t loff, hoff;
472 
473 #ifdef DEBUG
474 	if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER))
475 		printf("swpg_cluster(%x, %x) ", pager, offset);
476 #endif
477 	swp = (sw_pager_t) pager->pg_data;
478 	bsize = dbtob(swp->sw_bsize);
479 	if (bsize > swap_pager_maxcluster)
480 		bsize = swap_pager_maxcluster;
481 
482 	loff = offset - (offset % bsize);
483 	if (loff >= swp->sw_osize)
484 		panic("swap_pager_cluster: bad offset");
485 
486 	hoff = loff + bsize;
487 	if (hoff > swp->sw_osize)
488 		hoff = swp->sw_osize;
489 
490 	*loffset = loff;
491 	*hoffset = hoff;
492 #ifdef DEBUG
493 	if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER))
494 		printf("returns [%x-%x]\n", loff, hoff);
495 #endif
496 }
497 
498 /*
499  * Scaled down version of swap().
500  * Assumes that PAGE_SIZE < MAXPHYS; i.e. only one operation needed.
501  * BOGUS:  lower level IO routines expect a KVA so we have to map our
502  * provided physical page into the KVA to keep them happy.
503  */
504 static int
505 swap_pager_io(swp, mlist, npages, flags)
506 	register sw_pager_t swp;
507 	vm_page_t *mlist;
508 	int npages;
509 	int flags;
510 {
511 	register struct buf *bp;
512 	register sw_blk_t swb;
513 	register int s;
514 	int ix, mask;
515 	boolean_t rv;
516 	vm_offset_t kva, off;
517 	swp_clean_t spc;
518 	vm_page_t m;
519 
520 #ifdef DEBUG
521 	/* save panic time state */
522 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
523 		return (VM_PAGER_FAIL);		/* XXX: correct return? */
524 	if (swpagerdebug & (SDB_FOLLOW|SDB_IO))
525 		printf("swpg_io(%x, %x, %x, %x)\n", swp, mlist, npages, flags);
526 	if (flags & B_READ) {
527 		if (flags & B_ASYNC)
528 			panic("swap_pager_io: cannot do ASYNC reads");
529 		if (npages != 1)
530 			panic("swap_pager_io: cannot do clustered reads");
531 	}
532 #endif
533 
534 	/*
535 	 * First determine if the page exists in the pager if this is
536 	 * a sync read.  This quickly handles cases where we are
537 	 * following shadow chains looking for the top level object
538 	 * with the page.
539 	 */
540 	m = *mlist;
541 	off = m->offset + m->object->paging_offset;
542 	ix = off / dbtob(swp->sw_bsize);
543 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
544 #ifdef DEBUG
545 		if ((flags & B_READ) == 0 && (swpagerdebug & SDB_ANOM)) {
546 			printf("swap_pager_io: no swap block on write\n");
547 			return(VM_PAGER_BAD);
548 		}
549 #endif
550 		return(VM_PAGER_FAIL);
551 	}
552 	swb = &swp->sw_blocks[ix];
553 	off = off % dbtob(swp->sw_bsize);
554 	if ((flags & B_READ) &&
555 	    (swb->swb_block == 0 || (swb->swb_mask & (1 << atop(off))) == 0))
556 		return(VM_PAGER_FAIL);
557 
558 	/*
559 	 * For reads (pageins) and synchronous writes, we clean up
560 	 * all completed async pageouts.
561 	 */
562 	if ((flags & B_ASYNC) == 0) {
563 		s = splbio();
564 		swap_pager_clean(flags&B_READ);
565 #ifdef DEBUG
566 		if (swpagerdebug & SDB_PARANOIA)
567 			swap_pager_clean_check(mlist, npages, flags&B_READ);
568 #endif
569 		splx(s);
570 	}
571 	/*
572 	 * For async writes (pageouts), we cleanup completed pageouts so
573 	 * that all available resources are freed.  Also tells us if this
574 	 * page is already being cleaned.  If it is, or no resources
575 	 * are available, we try again later.
576 	 */
577 	else {
578 		swap_pager_clean(B_WRITE);
579 #ifdef DEBUG
580 		if (swpagerdebug & SDB_PARANOIA)
581 			swap_pager_clean_check(mlist, npages, B_WRITE);
582 #endif
583 		if (swap_pager_free.tqh_first == NULL) {
584 #ifdef DEBUG
585 			if (swpagerdebug & SDB_FAIL)
586 				printf("%s: no available io headers\n",
587 				       "swap_pager_io");
588 #endif
589 			return(VM_PAGER_AGAIN);
590 		}
591 	}
592 
593 	/*
594 	 * Allocate a swap block if necessary.
595 	 */
596 	if (swb->swb_block == 0) {
597 		swb->swb_block = rmalloc(swapmap, swp->sw_bsize);
598 		if (swb->swb_block == 0) {
599 #ifdef DEBUG
600 			if (swpagerdebug & SDB_FAIL)
601 				printf("swpg_io: rmalloc of %x failed\n",
602 				       swp->sw_bsize);
603 #endif
604 			/*
605 			 * XXX this is technically a resource shortage that
606 			 * should return AGAIN, but the situation isn't likely
607 			 * to be remedied just by delaying a little while and
608 			 * trying again (the pageout daemon's current response
609 			 * to AGAIN) so we just return FAIL.
610 			 */
611 			return(VM_PAGER_FAIL);
612 		}
613 #ifdef DEBUG
614 		if (swpagerdebug & (SDB_FULL|SDB_ALLOCBLK))
615 			printf("swpg_io: %x alloc blk %x at ix %x\n",
616 			       swp->sw_blocks, swb->swb_block, ix);
617 #endif
618 	}
619 
620 	/*
621 	 * Allocate a kernel virtual address and initialize so that PTE
622 	 * is available for lower level IO drivers.
623 	 */
624 	kva = vm_pager_map_pages(mlist, npages, !(flags & B_ASYNC));
625 	if (kva == NULL) {
626 #ifdef DEBUG
627 		if (swpagerdebug & SDB_FAIL)
628 			printf("%s: no KVA space to map pages\n",
629 			       "swap_pager_io");
630 #endif
631 		return(VM_PAGER_AGAIN);
632 	}
633 
634 	/*
635 	 * Get a swap buffer header and initialize it.
636 	 */
637 	s = splbio();
638 	while (bswlist.b_actf == NULL) {
639 #ifdef DEBUG
640 		if (swpagerdebug & SDB_ANOM)
641 			printf("swap_pager_io: wait on swbuf for %x (%d)\n",
642 			       m, flags);
643 #endif
644 		bswlist.b_flags |= B_WANTED;
645 		tsleep((caddr_t)&bswlist, PSWP+1, "swpgiobuf", 0);
646 	}
647 	bp = bswlist.b_actf;
648 	bswlist.b_actf = bp->b_actf;
649 	splx(s);
650 	bp->b_flags = B_BUSY | (flags & B_READ);
651 	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
652 	bp->b_data = (caddr_t)kva;
653 	bp->b_blkno = swb->swb_block + btodb(off);
654 	VHOLD(swapdev_vp);
655 	bp->b_vp = swapdev_vp;
656 	if (swapdev_vp->v_type == VBLK)
657 		bp->b_dev = swapdev_vp->v_rdev;
658 	bp->b_bcount = npages * PAGE_SIZE;
659 
660 	/*
661 	 * For writes we set up additional buffer fields, record a pageout
662 	 * in progress and mark that these swap blocks are now allocated.
663 	 */
664 	if ((bp->b_flags & B_READ) == 0) {
665 		bp->b_dirtyoff = 0;
666 		bp->b_dirtyend = npages * PAGE_SIZE;
667 		swapdev_vp->v_numoutput++;
668 		s = splbio();
669 		swp->sw_poip++;
670 		splx(s);
671 		mask = (~(~0 << npages)) << atop(off);
672 #ifdef DEBUG
673 		swap_pager_poip++;
674 		if (swpagerdebug & SDB_WRITE)
675 			printf("swpg_io: write: bp=%x swp=%x poip=%d\n",
676 			       bp, swp, swp->sw_poip);
677 		if ((swpagerdebug & SDB_ALLOCBLK) &&
678 		    (swb->swb_mask & mask) != mask)
679 			printf("swpg_io: %x write %d pages at %x+%x\n",
680 			       swp->sw_blocks, npages, swb->swb_block,
681 			       atop(off));
682 		if (swpagerdebug & SDB_CLUSTER)
683 			printf("swpg_io: off=%x, npg=%x, mask=%x, bmask=%x\n",
684 			       off, npages, mask, swb->swb_mask);
685 #endif
686 		swb->swb_mask |= mask;
687 	}
688 	/*
689 	 * If this is an async write we set up still more buffer fields
690 	 * and place a "cleaning" entry on the inuse queue.
691 	 */
692 	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
693 #ifdef DEBUG
694 		if (swap_pager_free.tqh_first == NULL)
695 			panic("swpg_io: lost spc");
696 #endif
697 		spc = swap_pager_free.tqh_first;
698 		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
699 #ifdef DEBUG
700 		if (spc->spc_flags != SPC_FREE)
701 			panic("swpg_io: bad free spc");
702 #endif
703 		spc->spc_flags = SPC_BUSY;
704 		spc->spc_bp = bp;
705 		spc->spc_swp = swp;
706 		spc->spc_kva = kva;
707 		/*
708 		 * Record the first page.  This allows swap_pager_clean
709 		 * to efficiently handle the common case of a single page.
710 		 * For clusters, it allows us to locate the object easily
711 		 * and we then reconstruct the rest of the mlist from spc_kva.
712 		 */
713 		spc->spc_m = m;
714 		spc->spc_npages = npages;
715 		bp->b_flags |= B_CALL;
716 		bp->b_iodone = swap_pager_iodone;
717 		s = splbio();
718 		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
719 		splx(s);
720 	}
721 
722 	/*
723 	 * Finally, start the IO operation.
724 	 * If it is async we are all done, otherwise we must wait for
725 	 * completion and cleanup afterwards.
726 	 */
727 #ifdef DEBUG
728 	if (swpagerdebug & SDB_IO)
729 		printf("swpg_io: IO start: bp %x, db %x, va %x, pa %x\n",
730 		       bp, swb->swb_block+btodb(off), kva, VM_PAGE_TO_PHYS(m));
731 #endif
732 	VOP_STRATEGY(bp);
733 	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
734 #ifdef DEBUG
735 		if (swpagerdebug & SDB_IO)
736 			printf("swpg_io:  IO started: bp %x\n", bp);
737 #endif
738 		return(VM_PAGER_PEND);
739 	}
740 	s = splbio();
741 #ifdef DEBUG
742 	if (flags & B_READ)
743 		swap_pager_piip++;
744 	else
745 		swap_pager_poip++;
746 #endif
747 	while ((bp->b_flags & B_DONE) == 0)
748 		(void) tsleep(bp, PVM, "swpgio", 0);
749 	if ((flags & B_READ) == 0)
750 		--swp->sw_poip;
751 #ifdef DEBUG
752 	if (flags & B_READ)
753 		--swap_pager_piip;
754 	else
755 		--swap_pager_poip;
756 #endif
757 	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
758 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
759 	bp->b_actf = bswlist.b_actf;
760 	bswlist.b_actf = bp;
761 	if (bp->b_vp)
762 		brelvp(bp);
763 	if (bswlist.b_flags & B_WANTED) {
764 		bswlist.b_flags &= ~B_WANTED;
765 		wakeup(&bswlist);
766 	}
767 	if ((flags & B_READ) == 0 && rv == VM_PAGER_OK) {
768 		m->flags |= PG_CLEAN;
769 		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
770 	}
771 	splx(s);
772 #ifdef DEBUG
773 	if (swpagerdebug & SDB_IO)
774 		printf("swpg_io:  IO done: bp %x, rv %d\n", bp, rv);
775 	if ((swpagerdebug & SDB_FAIL) && rv == VM_PAGER_ERROR)
776 		printf("swpg_io: IO error\n");
777 #endif
778 	vm_pager_unmap_pages(kva, npages);
779 	return(rv);
780 }
781 
782 static void
783 swap_pager_clean(rw)
784 	int rw;
785 {
786 	register swp_clean_t spc;
787 	register int s, i;
788 	vm_object_t object;
789 	vm_page_t m;
790 
791 #ifdef DEBUG
792 	/* save panic time state */
793 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
794 		return;
795 	if (swpagerdebug & SDB_FOLLOW)
796 		printf("swpg_clean(%x)\n", rw);
797 #endif
798 
799 	for (;;) {
800 		/*
801 		 * Look up and removal from inuse list must be done
802 		 * at splbio() to avoid conflicts with swap_pager_iodone.
803 		 */
804 		s = splbio();
805 		for (spc = swap_pager_inuse.tqh_first;
806 		     spc != NULL;
807 		     spc = spc->spc_list.tqe_next) {
808 			/*
809 			 * If the operation is done, remove it from the
810 			 * list and process it.
811 			 *
812 			 * XXX if we can't get the object lock we also
813 			 * leave it on the list and try again later.
814 			 * Is there something better we could do?
815 			 */
816 			if ((spc->spc_flags & SPC_DONE) &&
817 			    vm_object_lock_try(spc->spc_m->object)) {
818 				TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
819 				break;
820 			}
821 		}
822 		splx(s);
823 
824 		/*
825 		 * No operations done, thats all we can do for now.
826 		 */
827 		if (spc == NULL)
828 			break;
829 
830 		/*
831 		 * Found a completed operation so finish it off.
832 		 * Note: no longer at splbio since entry is off the list.
833 		 */
834 		m = spc->spc_m;
835 		object = m->object;
836 
837 		/*
838 		 * Process each page in the cluster.
839 		 * The first page is explicitly kept in the cleaning
840 		 * entry, others must be reconstructed from the KVA.
841 		 */
842 		for (i = 0; i < spc->spc_npages; i++) {
843 			if (i)
844 				m = vm_pager_atop(spc->spc_kva + ptoa(i));
845 			/*
846 			 * If no error mark as clean and inform the pmap
847 			 * system.  If there was an error, mark as dirty
848 			 * so we will try again.
849 			 *
850 			 * XXX could get stuck doing this, should give up
851 			 * after awhile.
852 			 */
853 			if (spc->spc_flags & SPC_ERROR) {
854 				printf("%s: clean of page %x failed\n",
855 				       "swap_pager_clean",
856 				       VM_PAGE_TO_PHYS(m));
857 				m->flags |= PG_LAUNDRY;
858 			} else {
859 				m->flags |= PG_CLEAN;
860 				pmap_clear_modify(VM_PAGE_TO_PHYS(m));
861 			}
862 			m->flags &= ~PG_BUSY;
863 			PAGE_WAKEUP(m);
864 		}
865 
866 		/*
867 		 * Done with the object, decrement the paging count
868 		 * and unlock it.
869 		 */
870 		if (--object->paging_in_progress == 0)
871 			wakeup(object);
872 		vm_object_unlock(object);
873 
874 		/*
875 		 * Free up KVM used and put the entry back on the list.
876 		 */
877 		vm_pager_unmap_pages(spc->spc_kva, spc->spc_npages);
878 		spc->spc_flags = SPC_FREE;
879 		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
880 #ifdef DEBUG
881 		if (swpagerdebug & SDB_WRITE)
882 			printf("swpg_clean: free spc %x\n", spc);
883 #endif
884 	}
885 }
886 
#ifdef DEBUG
/*
 * Paranoia check: panic if any page of mlist is still part of an
 * entry on the inuse (cleaning) list.  Does nothing once the system
 * is already panicking.
 */
static void
swap_pager_clean_check(mlist, npages, rw)
	vm_page_t *mlist;
	int npages;
	int rw;
{
	register swp_clean_t ent;
	boolean_t found;
	int want, pg, ipl;
	vm_page_t page;

	if (panicstr)
		return;

	found = FALSE;
	ipl = splbio();
	ent = swap_pager_inuse.tqh_first;
	while (ent != NULL) {
		/* compare every page of this entry with every page asked about */
		for (pg = 0; pg < ent->spc_npages; pg++) {
			page = vm_pager_atop(ent->spc_kva + ptoa(pg));
			for (want = 0; want < npages; want++) {
				if (mlist[want] != page)
					continue;
				if (swpagerdebug & SDB_ANOM)
					printf(
		"swpg_clean_check: %s: page %x on list, flags %x\n",
		rw == B_WRITE ? "write" : "read", mlist[want], ent->spc_flags);
				found = TRUE;
			}
		}
		ent = ent->spc_list.tqe_next;
	}
	splx(ipl);
	if (found)
		panic("swpg_clean_check");
}
#endif
924 
925 static void
926 swap_pager_iodone(bp)
927 	register struct buf *bp;
928 {
929 	register swp_clean_t spc;
930 	daddr_t blk;
931 	int s;
932 
933 #ifdef DEBUG
934 	/* save panic time state */
935 	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
936 		return;
937 	if (swpagerdebug & SDB_FOLLOW)
938 		printf("swpg_iodone(%x)\n", bp);
939 #endif
940 	s = splbio();
941 	for (spc = swap_pager_inuse.tqh_first;
942 	     spc != NULL;
943 	     spc = spc->spc_list.tqe_next)
944 		if (spc->spc_bp == bp)
945 			break;
946 #ifdef DEBUG
947 	if (spc == NULL)
948 		panic("swap_pager_iodone: bp not found");
949 #endif
950 
951 	spc->spc_flags &= ~SPC_BUSY;
952 	spc->spc_flags |= SPC_DONE;
953 	if (bp->b_flags & B_ERROR)
954 		spc->spc_flags |= SPC_ERROR;
955 	spc->spc_bp = NULL;
956 	blk = bp->b_blkno;
957 
958 #ifdef DEBUG
959 	--swap_pager_poip;
960 	if (swpagerdebug & SDB_WRITE)
961 		printf("swpg_iodone: bp=%x swp=%x flags=%x spc=%x poip=%x\n",
962 		       bp, spc->spc_swp, spc->spc_swp->sw_flags,
963 		       spc, spc->spc_swp->sw_poip);
964 #endif
965 
966 	spc->spc_swp->sw_poip--;
967 	if (spc->spc_swp->sw_flags & SW_WANTED) {
968 		spc->spc_swp->sw_flags &= ~SW_WANTED;
969 		wakeup(spc->spc_swp);
970 	}
971 
972 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
973 	bp->b_actf = bswlist.b_actf;
974 	bswlist.b_actf = bp;
975 	if (bp->b_vp)
976 		brelvp(bp);
977 	if (bswlist.b_flags & B_WANTED) {
978 		bswlist.b_flags &= ~B_WANTED;
979 		wakeup(&bswlist);
980 	}
981 	wakeup(&vm_pages_needed);
982 	splx(s);
983 }
984