xref: /illumos-gate/usr/src/uts/common/vm/vm_seg.c (revision 03831d35)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 /*
43  * VM - segment management.
44  */
45 
46 #include <sys/types.h>
47 #include <sys/inttypes.h>
48 #include <sys/t_lock.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kmem.h>
52 #include <sys/vmsystm.h>
53 #include <sys/debug.h>
54 #include <sys/cmn_err.h>
55 #include <sys/callb.h>
56 #include <sys/mem_config.h>
57 
58 #include <vm/hat.h>
59 #include <vm/as.h>
60 #include <vm/seg.h>
61 #include <vm/seg_kmem.h>
62 
/*
 * kstats for segment advise.
 *
 * Two named counters, "MADV_FREE_hit" and "MADV_FREE_miss", both of type
 * KSTAT_DATA_ULONG.  The initializers presumably mirror the members of
 * segadvstat_t, which is declared outside this file.
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

/*
 * The same storage viewed as a flat kstat_named_t array, plus its element
 * count.  seg_init() passes the count to kstat_create() and installs the
 * pointer as ks_data.
 */
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
73 
/*
 * Debug tracing for the pagelock cache.  Uncommenting the define below (or
 * building under lint) makes pdebug a real int that can be set non-zero;
 * otherwise pdebug is the constant 0 and every PPRINT*() call reduces to
 * "if (0) printf(...)".
 */
/* #define	PDEBUG */
#if defined(PDEBUG) || defined(lint) || defined(__lint)
int pdebug = 0;
#else
#define	pdebug		0
#endif	/* PDEBUG */

/*
 * NOTE: PPRINTF expands to a bare "if (pdebug) printf", so a PPRINT*() call
 * must not be used as the unbraced body of an "if" that has an "else".
 * The PPRINTn() wrappers only fix the number of format arguments.
 */
#define	PPRINTF				if (pdebug) printf
#define	PPRINT(x)			PPRINTF(x)
#define	PPRINT1(x, a)			PPRINTF(x, a)
#define	PPRINT2(x, a, b)		PPRINTF(x, a, b)
#define	PPRINT3(x, a, b, c)		PPRINTF(x, a, b, c)
#define	PPRINT4(x, a, b, c, d)		PPRINTF(x, a, b, c, d)
#define	PPRINT5(x, a, b, c, d, e)	PPRINTF(x, a, b, c, d, e)

/*
 * Hash parameters used by p_hash().  P_HASHMASK is applied with '&', so
 * all buckets are reachable only when p_hashsize is a power of two.
 * P_BASESHIFT is the number of low-order bits of the segment pointer that
 * are shifted out before masking.
 */
#define	P_HASHMASK		(p_hashsize - 1)
#define	P_BASESHIFT		6
91 
/*
 * Entry in the segment page cache: one cached (seg, addr, len, rw) range
 * together with its shadow page list and the callback used to reclaim it.
 *
 * p_hnext/p_hprev must stay the first two members, in this order: the code
 * casts a struct seg_phash bucket header to struct seg_pcache * and uses it
 * as the sentinel of the bucket's circular doubly-linked list.
 */
struct seg_pcache {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;	/* also reused to chain purge lists */
	int		p_active;	/* active count */
	int		p_ref;		/* ref bit */
	size_t		p_len;		/* segment length */
	caddr_t		p_addr;		/* base address */
	struct seg 	*p_seg;		/* segment */
	struct page	**p_pp;		/* pp shadow list */
	enum seg_rw	p_rw;		/* rw */
	uint_t		p_flags;	/* bit flags */
	/* reclaim callback supplied to seg_pinsert() */
	int		(*p_callback)(struct seg *, caddr_t, size_t,
			    struct page **, enum seg_rw);
};
109 
/*
 * Hash bucket header.  The first two members deliberately match the first
 * two members of struct seg_pcache so that a bucket pointer, cast to
 * struct seg_pcache *, can serve as the list sentinel (an empty bucket
 * points at itself; see seg_pinit()).
 */
struct seg_phash {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	int p_qlen;			/* Q length */
	kmutex_t p_hmutex;		/* protects hash bucket */
};
116 
/*
 * Tunables and global state of the pagelock cache.
 */
static int seg_preap_time = 20;	/* reclaim every 20 secs */
static int seg_pmaxqlen = 5;	/* max Q length in hash list */
static int seg_ppcount = 5;	/* max # of purges per reclaim interval */
static int seg_plazy = 1;	/* if 1, pages are cached after pageunlock */
static pgcnt_t seg_pwindow;	/* max # of pages that can be cached */
static pgcnt_t seg_plocked;	/* # of pages which are cached by pagelock */
static pgcnt_t seg_plocked_window; /* # pages from window */
int seg_preapahead;		/* not referenced elsewhere in this file */

static uint_t seg_pdisable = 0;	/* if not 0, caching temporarily disabled */

static int seg_pupdate_active = 1;	/* background reclaim thread */
/*
 * Reap interval in ticks.  If preset to a non-zero value before
 * seg_pasync_thread() starts, that value is multiplied by hz there.
 */
static clock_t seg_preap_interval;	/* reap interval in ticks */

/*
 * seg_pcache is taken in this file around updates of seg_pdisable and
 * around hash table setup; seg_pmem guards seg_plocked and
 * seg_plocked_window.
 */
static kmutex_t seg_pcache;	/* protects the whole pagelock cache */
static kmutex_t seg_pmem;	/* protects window counter */
static ksema_t seg_psaync_sem;	/* sema for reclaim thread */
static struct seg_phash *p_hashtab;	/* bucket array, set in seg_pinit() */
static int p_hashsize = 0;	/* # of buckets; 0 = size by physmem */
136 
/*
 * Bucket index for a segment: the segment pointer with its low
 * P_BASESHIFT bits dropped, masked to the table size.
 */
#define	p_hash(seg) \
	(P_HASHMASK & \
	((uintptr_t)(seg) >> P_BASESHIFT))

/*
 * Evaluates to 1 if cache entry pcp describes exactly the range
 * (seg, addr, len) with access type rw, else 0.
 */
#define	p_match(pcp, seg, addr, len, rw) \
	(((pcp)->p_seg == (seg) && \
	(pcp)->p_addr == (addr) && \
	(pcp)->p_rw == (rw) && \
	(pcp)->p_len == (len)) ? 1 : 0)

/*
 * Same as p_match() but additionally requires that the entry holds the
 * given shadow list pointer pp.
 */
#define	p_match_pp(pcp, seg, addr, len, pp, rw) \
	(((pcp)->p_seg == (seg) && \
	(pcp)->p_addr == (addr) && \
	(pcp)->p_pp == (pp) && \
	(pcp)->p_rw == (rw) && \
	(pcp)->p_len == (len)) ? 1 : 0)
153 
154 
155 /*
156  * lookup an address range in pagelock cache. Return shadow list
157  * and bump up active count.
158  */
159 struct page **
160 seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
161 {
162 	struct seg_pcache *pcp;
163 	struct seg_phash *hp;
164 
165 	/*
166 	 * Skip pagelock cache, while DR is in progress or
167 	 * seg_pcache is off.
168 	 */
169 	if (seg_pdisable || seg_plazy == 0) {
170 		return (NULL);
171 	}
172 
173 	hp = &p_hashtab[p_hash(seg)];
174 	mutex_enter(&hp->p_hmutex);
175 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
176 	    pcp = pcp->p_hnext) {
177 		if (p_match(pcp, seg, addr, len, rw)) {
178 			pcp->p_active++;
179 			mutex_exit(&hp->p_hmutex);
180 
181 			PPRINT5("seg_plookup hit: seg %p, addr %p, "
182 			    "len %lx, count %d, pplist %p \n",
183 			    (void *)seg, (void *)addr, len, pcp->p_active,
184 			    (void *)pcp->p_pp);
185 
186 			return (pcp->p_pp);
187 		}
188 	}
189 	mutex_exit(&hp->p_hmutex);
190 
191 	PPRINT("seg_plookup miss:\n");
192 
193 	return (NULL);
194 }
195 
/*
 * Mark an address range inactive.
 *
 * If the cache is off (seg_plazy == 0), or no entry matching
 * (seg, addr, len, pp, rw) is found, the segment driver's callback is
 * invoked directly to reclaim the pages.  Otherwise the entry's active
 * count is decremented and its ref bit is set, so the pages stay cached.
 *
 * Exception: if the count drops to zero while caching is temporarily
 * disabled (seg_pdisable != 0), the entry is unlinked and freed, the page
 * counters are adjusted, and the callback is invoked as well.
 *
 * The callback is always called without the bucket mutex held.
 */
void
seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
    struct page **, enum seg_rw))
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;

	if (seg_plazy == 0) {
		(void) (*callback)(seg, addr, len, pp, rw);
		return;
	}
	hp = &p_hashtab[p_hash(seg)];
	mutex_enter(&hp->p_hmutex);
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
			pcp->p_active--;
			ASSERT(pcp->p_active >= 0);
			if (pcp->p_active == 0 && seg_pdisable) {
				int npages;

				ASSERT(callback == pcp->p_callback);
				/* free the entry */
				hp->p_qlen--;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				/*
				 * The entry is unlinked, so it is private to
				 * this thread once the bucket is released.
				 */
				mutex_exit(&hp->p_hmutex);
				npages = pcp->p_len >> PAGESHIFT;
				mutex_enter(&seg_pmem);
				seg_plocked -= npages;
				/* wired entries were never charged to window */
				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
					seg_plocked_window -= npages;
				}
				mutex_exit(&seg_pmem);
				kmem_free(pcp, sizeof (struct seg_pcache));
				goto out;
			}
			/* keep it cached; mark it as recently used */
			pcp->p_ref = 1;
			mutex_exit(&hp->p_hmutex);
			return;
		}
	}
	mutex_exit(&hp->p_hmutex);
out:
	/* not cached (any more): let the segment driver reclaim the pages */
	(void) (*callback)(seg, addr, len, pp, rw);
}
248 
249 /*
250  * The seg_pinsert_check() is used by segment drivers to predict whether
251  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
252  */
253 
254 int
255 seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
256 {
257 	struct seg_phash *hp;
258 
259 	if (seg_plazy == 0) {
260 		return (SEGP_FAIL);
261 	}
262 	if (seg_pdisable != 0) {
263 		return (SEGP_FAIL);
264 	}
265 	ASSERT((len & PAGEOFFSET) == 0);
266 	hp = &p_hashtab[p_hash(seg)];
267 	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
268 		return (SEGP_FAIL);
269 	}
270 	/*
271 	 * If the SEGP_FORCE_WIRED flag is set,
272 	 * we skip the check for seg_pwindow.
273 	 */
274 	if ((flags & SEGP_FORCE_WIRED) == 0) {
275 		pgcnt_t npages;
276 
277 		npages = len >> PAGESHIFT;
278 		if ((seg_plocked_window + npages) > seg_pwindow) {
279 			return (SEGP_FAIL);
280 		}
281 	}
282 	return (SEGP_SUCCESS);
283 }
284 
285 
286 /*
287  * insert address range with shadow list into pagelock cache. If
288  * the cache is off or caching is temporarily disabled or the allowed
289  * 'window' is exceeded - return SEGP_FAIL. Otherwise return
290  * SEGP_SUCCESS.
291  */
292 int
293 seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
294     enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
295     size_t, struct page **, enum seg_rw))
296 {
297 	struct seg_pcache *pcp;
298 	struct seg_phash *hp;
299 	pgcnt_t npages;
300 
301 	if (seg_plazy == 0) {
302 		return (SEGP_FAIL);
303 	}
304 	if (seg_pdisable != 0) {
305 		return (SEGP_FAIL);
306 	}
307 	ASSERT((len & PAGEOFFSET) == 0);
308 	hp = &p_hashtab[p_hash(seg)];
309 	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
310 		return (SEGP_FAIL);
311 	}
312 	npages = len >> PAGESHIFT;
313 	mutex_enter(&seg_pmem);
314 	/*
315 	 * If the SEGP_FORCE_WIRED flag is set,
316 	 * we skip the check for seg_pwindow.
317 	 */
318 	if ((flags & SEGP_FORCE_WIRED) == 0) {
319 		seg_plocked_window += npages;
320 		if (seg_plocked_window > seg_pwindow) {
321 			seg_plocked_window -= npages;
322 			mutex_exit(&seg_pmem);
323 			return (SEGP_FAIL);
324 		}
325 	}
326 	seg_plocked += npages;
327 	mutex_exit(&seg_pmem);
328 
329 	pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
330 	pcp->p_seg = seg;
331 	pcp->p_addr = addr;
332 	pcp->p_len = len;
333 	pcp->p_pp = pp;
334 	pcp->p_rw = rw;
335 	pcp->p_callback = callback;
336 	pcp->p_active = 1;
337 	pcp->p_flags = flags;
338 
339 	PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
340 	    (void *)seg, (void *)addr, len, (void *)pp);
341 
342 	hp = &p_hashtab[p_hash(seg)];
343 	mutex_enter(&hp->p_hmutex);
344 	hp->p_qlen++;
345 	pcp->p_hnext = hp->p_hnext;
346 	pcp->p_hprev = (struct seg_pcache *)hp;
347 	hp->p_hnext->p_hprev = pcp;
348 	hp->p_hnext = pcp;
349 	mutex_exit(&hp->p_hmutex);
350 	return (SEGP_SUCCESS);
351 }
352 
/*
 * Purge all entries from the pagelock cache that are not active
 * and not recently used. Drop all locks and call through
 * the address space into the segment driver to reclaim
 * the pages. This makes sure we get the address space
 * and segment driver locking right.
 *
 * force: if non-zero, the ref bit and the SEGP_ASYNC_FLUSH flag are
 * ignored and the scan is not throttled (used in preparation for DR).
 *
 * The work is done in two phases: entries are first unlinked onto a
 * private list under the bucket mutex, then reclaimed with no cache
 * lock held.
 */
static void
seg_ppurge_all(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	int purge_count = 0;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plazy == 0 || seg_plocked == 0) {
		return;
	}
	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;

		/*
		 * While 'force' is set, seg_pasync_thread is not
		 * throttled.  This is to speedup flushing of seg_pcache
		 * in preparation for DR.
		 *
		 * In normal case, when 'force' is not set, we throttle
		 * seg_pasync_thread so that we don't spend all the
		 * time in purging the cache.
		 *
		 * Note that the test is "<=", so up to seg_ppcount + 1
		 * entries can be unlinked in one unforced call.
		 */
		while ((pcp != (struct seg_pcache *)hp) &&
				(force || (purge_count <= seg_ppcount))) {

			/*
			 * purge entries which are not active and
			 * have not been used recently and
			 * have the SEGP_ASYNC_FLUSH flag.
			 *
			 * In the 'force' case, we ignore the
			 * SEGP_ASYNC_FLUSH flag.
			 */
			if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
				pcp->p_ref = 1;
			if (force)
				pcp->p_ref = 0;
			if (!pcp->p_ref && !pcp->p_active) {
				struct as *as = pcp->p_seg->s_as;

				/*
				 * try to get the readers lock on the address
				 * space before taking out the cache element.
				 * This ensures as_pagereclaim() can actually
				 * call through the address space and free
				 * the pages. If we don't get the lock, just
				 * skip this entry. The pages will be reclaimed
				 * by the segment driver at unmap time.
				 */
				if (AS_LOCK_TRYENTER(as, &as->a_lock,
				    RW_READER)) {
					hp->p_qlen--;
					pcp->p_hprev->p_hnext = pcp->p_hnext;
					pcp->p_hnext->p_hprev = pcp->p_hprev;
					/*
					 * Chain onto the private list through
					 * p_hprev only; p_hnext is left intact
					 * so the scan below can still advance.
					 */
					pcp->p_hprev = delcallb_list;
					delcallb_list = pcp;
					purge_count++;
				}
			} else {
				/* age the entry: clear its ref bit */
				pcp->p_ref = 0;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		if (!force && purge_count > seg_ppcount)
			break;
	}

	/*
	 * run the delayed callback list. We don't want to hold the
	 * cache lock during a call through the address space.
	 */
	while (delcallb_list != NULL) {
		struct as *as;

		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		as = pcp->p_seg->s_as;

		PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
		    "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
		    pcp->p_len, (void *)pcp->p_pp);

		as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
		    pcp->p_len, pcp->p_rw);
		/* releases the reader lock taken by AS_LOCK_TRYENTER above */
		AS_LOCK_EXIT(as, &as->a_lock);
		npages += pcp->p_len >> PAGESHIFT;
		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
			npages_window += pcp->p_len >> PAGESHIFT;
		}
		kmem_free(pcp, sizeof (struct seg_pcache));
	}
	/* account for everything reclaimed in a single update */
	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}
464 
/*
 * Remove cached pages for segment(s) entries from hashtable.
 * The segments are identified by a given client's callback
 * function.
 * This is useful for multiple segs cached on behalf of a
 * dummy segment (ISM/DISM) with a common callback function.
 * The client's callback function may return a non-zero status
 * indicating that the last seg's entry has been purged. In such
 * a case seg_ppurge_seg() stops searching the hashtable and exits.
 * Otherwise all hashtable entries are scanned.
 *
 * Caching is temporarily disabled (seg_pdisable is bumped) for the
 * duration of the scan.  Note that the callback is invoked with the
 * bucket mutex held.
 */
void
seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
    struct page **, enum seg_rw))
{
	struct seg_pcache *pcp, *npcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	int	done = 0;

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plazy == 0 || seg_plocked == 0) {
		return;
	}
	mutex_enter(&seg_pcache);
	seg_pdisable++;
	mutex_exit(&seg_pcache);

	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {

		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;
		while (pcp != (struct seg_pcache *)hp) {

			/*
			 * purge entries which are not active
			 */
			/* remember the successor; pcp may be freed below */
			npcp = pcp->p_hnext;
			if (!pcp->p_active && pcp->p_callback == callback) {
				hp->p_qlen--;
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;

				/* non-zero return: nothing more to look for */
				if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
				    pcp->p_len, pcp->p_pp, pcp->p_rw)) {
					done = 1;
				}

				npages += pcp->p_len >> PAGESHIFT;
				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
					npages_window +=
					    pcp->p_len >> PAGESHIFT;
				}
				kmem_free(pcp, sizeof (struct seg_pcache));
			}
			pcp = npcp;
			if (done)
				break;
		}
		mutex_exit(&hp->p_hmutex);
		if (done)
			break;
	}

	/* re-enable caching */
	mutex_enter(&seg_pcache);
	seg_pdisable--;
	mutex_exit(&seg_pcache);

	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}
541 
542 /*
543  * purge all entries for a given segment. Since we
544  * callback into the segment driver directly for page
545  * reclaim the caller needs to hold the right locks.
546  */
547 void
548 seg_ppurge(struct seg *seg)
549 {
550 	struct seg_pcache *delcallb_list = NULL;
551 	struct seg_pcache *pcp;
552 	struct seg_phash *hp;
553 	pgcnt_t npages = 0;
554 	pgcnt_t npages_window = 0;
555 
556 	if (seg_plazy == 0) {
557 		return;
558 	}
559 	hp = &p_hashtab[p_hash(seg)];
560 	mutex_enter(&hp->p_hmutex);
561 	pcp = hp->p_hnext;
562 	while (pcp != (struct seg_pcache *)hp) {
563 		if (pcp->p_seg == seg) {
564 			if (pcp->p_active) {
565 				break;
566 			}
567 			hp->p_qlen--;
568 			pcp->p_hprev->p_hnext = pcp->p_hnext;
569 			pcp->p_hnext->p_hprev = pcp->p_hprev;
570 			pcp->p_hprev = delcallb_list;
571 			delcallb_list = pcp;
572 		}
573 		pcp = pcp->p_hnext;
574 	}
575 	mutex_exit(&hp->p_hmutex);
576 	while (delcallb_list != NULL) {
577 		pcp = delcallb_list;
578 		delcallb_list = pcp->p_hprev;
579 
580 		PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
581 		    "pplist %p\n", (void *)seg, (void *)pcp->p_addr,
582 		    pcp->p_len, (void *)pcp->p_pp);
583 
584 		ASSERT(seg == pcp->p_seg);
585 		(void) (*pcp->p_callback)(seg, pcp->p_addr,
586 		    pcp->p_len, pcp->p_pp, pcp->p_rw);
587 		npages += pcp->p_len >> PAGESHIFT;
588 		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
589 			npages_window += pcp->p_len >> PAGESHIFT;
590 		}
591 		kmem_free(pcp, sizeof (struct seg_pcache));
592 	}
593 	mutex_enter(&seg_pmem);
594 	seg_plocked -= npages;
595 	seg_plocked_window -= npages_window;
596 	mutex_exit(&seg_pmem);
597 }
598 
599 static void seg_pinit_mem_config(void);
600 
601 /*
602  * setup the pagelock cache
603  */
604 static void
605 seg_pinit(void)
606 {
607 	struct seg_phash *hp;
608 	int i;
609 	uint_t physmegs;
610 
611 	sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL);
612 
613 	mutex_enter(&seg_pcache);
614 	if (p_hashtab == NULL) {
615 		physmegs = physmem >> (20 - PAGESHIFT);
616 
617 		/* If p_hashsize was not set in /etc/system ... */
618 		if (p_hashsize == 0) {
619 			/*
620 			 * Choose p_hashsize based on physmem.
621 			 */
622 			if (physmegs < 64) {
623 				p_hashsize = 64;
624 			} else if (physmegs < 1024) {
625 				p_hashsize = 1024;
626 			} else if (physmegs < 10 * 1024) {
627 				p_hashsize = 8192;
628 			} else if (physmegs < 20 * 1024) {
629 				p_hashsize = 2 * 8192;
630 				seg_pmaxqlen = 16;
631 			} else {
632 				p_hashsize = 128 * 1024;
633 				seg_pmaxqlen = 128;
634 			}
635 		}
636 
637 		p_hashtab = kmem_zalloc(
638 			p_hashsize * sizeof (struct seg_phash), KM_SLEEP);
639 		for (i = 0; i < p_hashsize; i++) {
640 			hp = (struct seg_phash *)&p_hashtab[i];
641 			hp->p_hnext = (struct seg_pcache *)hp;
642 			hp->p_hprev = (struct seg_pcache *)hp;
643 			mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
644 		}
645 		if (seg_pwindow == 0) {
646 			if (physmegs < 24) {
647 				/* don't use cache */
648 				seg_plazy = 0;
649 			} else if (physmegs < 64) {
650 				seg_pwindow = physmem >> 5; /* 3% of memory */
651 			} else if (physmegs < 10 * 1024) {
652 				seg_pwindow = physmem >> 3; /* 12% of memory */
653 			} else {
654 				seg_pwindow = physmem >> 1;
655 			}
656 		}
657 	}
658 	mutex_exit(&seg_pcache);
659 
660 	seg_pinit_mem_config();
661 }
662 
663 /*
664  * called by pageout if memory is low
665  */
666 void
667 seg_preap(void)
668 {
669 	/*
670 	 * if the cache if off or empty, return
671 	 */
672 	if (seg_plocked == 0 || seg_plazy == 0) {
673 		return;
674 	}
675 	sema_v(&seg_psaync_sem);
676 }
677 
static void seg_pupdate(void *);

/*
 * Run as a background thread and reclaim pagelock
 * pages which have not been used recently.
 *
 * The thread never returns: it blocks on seg_psaync_sem, which is posted
 * by seg_preap() and by the periodic seg_pupdate() timeout, and runs an
 * unforced seg_ppurge_all() each time it is woken.
 */
void
seg_pasync_thread(void)
{
	callb_cpr_t cpr_info;
	kmutex_t pasync_lock;	/* just for CPR stuff */

	mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &pasync_lock,
		callb_generic_cpr, "seg_pasync");

	/*
	 * Convert the reap interval to ticks: use the seg_preap_time
	 * default if no interval was preset, otherwise scale the preset
	 * value by hz.
	 */
	if (seg_preap_interval == 0) {
		seg_preap_interval = seg_preap_time * hz;
	} else {
		seg_preap_interval *= hz;
	}
	/* arm the first periodic wakeup; seg_pupdate() re-arms itself */
	if (seg_plazy && seg_pupdate_active) {
		(void) timeout(seg_pupdate, NULL, seg_preap_interval);
	}

	for (;;) {
		/* mark the thread CPR-safe only while it sleeps on the sema */
		mutex_enter(&pasync_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&pasync_lock);
		sema_p(&seg_psaync_sem);
		mutex_enter(&pasync_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
		mutex_exit(&pasync_lock);

		seg_ppurge_all(0);
	}
}
716 
717 static void
718 seg_pupdate(void *dummy)
719 {
720 	sema_v(&seg_psaync_sem);
721 
722 	if (seg_plazy && seg_pupdate_active) {
723 		(void) timeout(seg_pupdate, dummy, seg_preap_interval);
724 	}
725 }
726 
727 static struct kmem_cache *seg_cache;
728 
729 /*
730  * Initialize segment management data structures.
731  */
732 void
733 seg_init(void)
734 {
735 	kstat_t *ksp;
736 
737 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
738 		0, NULL, NULL, NULL, NULL, NULL, 0);
739 
740 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
741 		segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
742 	if (ksp) {
743 		ksp->ks_data = (void *)segadvstat_ptr;
744 		kstat_install(ksp);
745 	}
746 
747 	seg_pinit();
748 }
749 
750 /*
751  * Allocate a segment to cover [base, base+size]
752  * and attach it to the specified address space.
753  */
754 struct seg *
755 seg_alloc(struct as *as, caddr_t base, size_t size)
756 {
757 	struct seg *new;
758 	caddr_t segbase;
759 	size_t segsize;
760 
761 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
762 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
763 	    (uintptr_t)segbase;
764 
765 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
766 		return ((struct seg *)NULL);	/* bad virtual addr range */
767 
768 	if (as != &kas &&
769 	    valid_usr_range(segbase, segsize, 0, as,
770 	    as->a_userlimit) != RANGE_OKAY)
771 		return ((struct seg *)NULL);	/* bad virtual addr range */
772 
773 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
774 	new->s_ops = NULL;
775 	new->s_data = NULL;
776 	new->s_szc = 0;
777 	new->s_flags = 0;
778 	if (seg_attach(as, segbase, segsize, new) < 0) {
779 		kmem_cache_free(seg_cache, new);
780 		return ((struct seg *)NULL);
781 	}
782 	/* caller must fill in ops, data */
783 	return (new);
784 }
785 
786 /*
787  * Attach a segment to the address space.  Used by seg_alloc()
788  * and for kernel startup to attach to static segments.
789  */
790 int
791 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
792 {
793 	seg->s_as = as;
794 	seg->s_base = base;
795 	seg->s_size = size;
796 
797 	/*
798 	 * as_addseg() will add the segment at the appropraite point
799 	 * in the list. It will return -1 if there is overlap with
800 	 * an already existing segment.
801 	 */
802 	return (as_addseg(as, seg));
803 }
804 
/*
 * Unmap a segment and free it from its associated address space.
 * This should be called by anybody who's finished with a whole segment's
 * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
 * responsibility of the segment driver to unlink the segment
 * from the address space, and to free public and private data structures
 * associated with the segment.  (This is typically done by a call to
 * seg_free()).
 *
 * The caller must hold the address space lock as writer (asserted below).
 */
void
seg_unmap(struct seg *seg)
{
	/* the return value is only examined (asserted) in DEBUG kernels */
#ifdef DEBUG
	int ret;
#endif /* DEBUG */

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/* Shouldn't have called seg_unmap if mapping isn't yet established */
	ASSERT(seg->s_data != NULL);

	/* Unmap the whole mapping */
#ifdef DEBUG
	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
	ASSERT(ret == 0);
#else
	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
#endif /* DEBUG */
}
834 
835 /*
836  * Free the segment from its associated as. This should only be called
837  * if a mapping to the segment has not yet been established (e.g., if
838  * an error occurs in the middle of doing an as_map when the segment
839  * has already been partially set up) or if it has already been deleted
840  * (e.g., from a segment driver unmap routine if the unmap applies to the
841  * entire segment). If the mapping is currently set up then seg_unmap() should
842  * be called instead.
843  */
844 void
845 seg_free(struct seg *seg)
846 {
847 	register struct as *as = seg->s_as;
848 	struct seg *tseg = as_removeseg(as, seg);
849 
850 	ASSERT(tseg == seg);
851 
852 	/*
853 	 * If the segment private data field is NULL,
854 	 * then segment driver is not attached yet.
855 	 */
856 	if (seg->s_data != NULL)
857 		SEGOP_FREE(seg);
858 
859 	kmem_cache_free(seg_cache, seg);
860 }
861 
/*
 * Memory-config callback run after memory has been added.  The pagelock
 * cache needs no adjustment, so both arguments are ignored.
 */
/*ARGSUSED*/
static void
seg_p_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	/* Nothing to do. */
}
870 
871 /*
872  * Attempt to purge seg_pcache.  May need to return before this has
873  * completed to allow other pre_del callbacks to unlock pages. This is
874  * ok because:
875  *	1) The seg_pdisable flag has been set so at least we won't
876  *	cache anymore locks and the locks we couldn't purge
877  *	will not be held if they do get released by a subsequent
878  *	pre-delete callback.
879  *
880  *	2) The rest of the memory delete thread processing does not
881  *	depend on the changes made in this pre-delete callback. No
882  *	panics will result, the worst that will happen is that the
883  *	DR code will timeout and cancel the delete.
884  */
885 /*ARGSUSED*/
886 static int
887 seg_p_mem_config_pre_del(
888 	void *arg,
889 	pgcnt_t delta_pages)
890 {
891 	pgcnt_t	old_plocked;
892 	int stall_count = 0;
893 
894 	mutex_enter(&seg_pcache);
895 	seg_pdisable++;
896 	ASSERT(seg_pdisable != 0);
897 	mutex_exit(&seg_pcache);
898 
899 	/*
900 	 * Attempt to empty the cache. Terminate if seg_plocked does not
901 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
902 	 */
903 	while (seg_plocked != 0) {
904 		old_plocked = seg_plocked;
905 		seg_ppurge_all(1);
906 		if (seg_plocked == old_plocked) {
907 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
908 				cmn_err(CE_NOTE, "!Pre-delete couldn't purge"
909 					" pagelock cache - continuing");
910 				break;
911 			}
912 		} else
913 			stall_count = 0;
914 		if (seg_plocked != 0)
915 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
916 	}
917 	return (0);
918 }
919 
920 /*ARGSUSED*/
921 static void
922 seg_p_mem_config_post_del(
923 	void *arg,
924 	pgcnt_t delta_pages,
925 	int cancelled)
926 {
927 	mutex_enter(&seg_pcache);
928 	ASSERT(seg_pdisable != 0);
929 	seg_pdisable--;
930 	mutex_exit(&seg_pcache);
931 }
932 
/*
 * Callback vector handed to kphysm_setup_func_register() by
 * seg_pinit_mem_config(): interface version followed by the post-add,
 * pre-delete and post-delete handlers defined above.
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};
939 
940 static void
941 seg_pinit_mem_config(void)
942 {
943 	int ret;
944 
945 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
946 	/*
947 	 * Want to catch this in the debug kernel. At run time, if the
948 	 * callbacks don't get run all will be OK as the disable just makes
949 	 * it more likely that the pages can be collected.
950 	 */
951 	ASSERT(ret == 0);
952 }
953