xref: /illumos-gate/usr/src/uts/common/vm/anon.h (revision f3041bfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2015, Joyent, Inc. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	 All Rights Reserved   */
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #ifndef	_VM_ANON_H
40 #define	_VM_ANON_H
41 
42 #include <sys/cred.h>
43 #include <sys/zone.h>
44 #include <vm/seg.h>
45 #include <vm/vpage.h>
46 
47 #ifdef	__cplusplus
48 extern "C" {
49 #endif
50 
51 /*
52  * VM - Anonymous pages.
53  */
54 
typedef	unsigned long anoff_t;		/* offset of an anonymous page */

/*
 * An anon structure (a "slot") exists for every anonymous page, whether the
 * page is in memory or in swap.  The slot is the level of indirection that
 * sits between an anonymous page and its backing store.
 *
 * an_vp, an_off
 *	Name the vnode of the anonymous page for this slot.
 *
 * an_pvp, an_poff
 *	Name the location of the physical backing store for the page this
 *	slot represents.  A null name means there is no associated physical
 *	store.  The location can change while the slot is in use.
 *
 * an_hash
 *	A hash list of anon slots.  The list is hashed by the
 *	(an_vp, an_off) of the associated anonymous page, which provides a
 *	way of going from the name of an anonymous page to its anon slot.
 *
 * an_refcnt
 *	Reference count: the number of separate copies that will need to be
 *	created in case of copy-on-write.  A count > 0 protects the
 *	existence of the slot; it is initialized to 1 when the slot is
 *	created in anon_alloc().  A client that obtains a slot and lets
 *	multiple threads share it is responsible for ensuring that no
 *	thread tries to reference the slot while another is decrementing
 *	the last count and destroying the slot.  The seg_vn segment type,
 *	for example, protects against this with higher level locks.
 */
struct anon {
	struct vnode	*an_vp;		/* vnode naming the anon page */
	struct vnode	*an_pvp;	/* vnode of physical backing store */
	anoff_t		an_off;		/* offset naming the anon page */
	anoff_t		an_poff;	/* offset in backing store vnode */
	struct anon	*an_hash;	/* hash list of anon slots */
	int		an_refcnt;	/* number of sharers of this slot */
};
94 
/*
 * Anon addresses are aligned to AN_CACHE_ALIGN (16) bytes;
 * AN_CACHE_ALIGN_LOG2 is the base-2 logarithm of that alignment.
 */
#define	AN_CACHE_ALIGN_LOG2	4
#define	AN_CACHE_ALIGN		(1U << AN_CACHE_ALIGN_LOG2)
98 
99 
100 #ifdef _KERNEL
101 /*
102  * The swapinfo_lock protects:
103  *		swapinfo list
104  *		individual swapinfo structures
105  *
106  * The anoninfo_lock protects:
107  *		anoninfo counters
108  *
109  * The anonhash_lock protects:
110  *		anon hash lists
111  *		anon slot fields
112  *
113  * Fields in the anon slot which are read-only for the life of the slot
114  * (an_vp, an_off) do not require the anonhash_lock be held to access them.
115  * If you access a field without the anonhash_lock held you must be holding
116  * the slot with an_refcnt to make sure it isn't destroyed.
117  * To write (an_pvp, an_poff) in a given slot you must also hold the
118  * p_iolock of the anonymous page for slot.
119  */
120 extern kmutex_t anoninfo_lock;
121 extern kmutex_t swapinfo_lock;
122 extern pad_mutex_t *anonhash_lock;
123 extern pad_mutex_t anon_array_lock[];
124 extern kcondvar_t anon_array_cv[];
125 
/*
 * Global hash table providing the mapping (vp, off) -> anon slot.
 */
extern struct anon **anon_hash;
extern size_t anon_hash_size;
extern unsigned int anon_hash_shift;

#define	ANON_HASHAVELEN	4
#define	ANON_HASH_SIZE	anon_hash_size

/*
 * ANON_HASH(vp, off) yields the anon_hash bucket index for a page name.
 *
 * It tries to use as many bits of randomness from both vp and off as it
 * can, which should help spread entries evenly for a variety of workloads.
 * See the comments for PAGE_HASH_FUNC for more explanation.
 *
 * The page number of off (off >> PAGESHIFT) and vp with its low 3 bits
 * dropped are each XORed with copies of themselves shifted further right by
 * multiples of anon_hash_shift, plus one copy of vp shifted left.  The
 * combined value is then masked with (anon_hash_size - 1).
 *
 * NOTE(review): the left-shift count is computed in unsigned arithmetic, so
 * this relies on anon_hash_shift >= AN_VPSHIFT + VNODE_ALIGN_LOG2; confirm
 * where anon_hash_shift is set.
 */
#define	ANON_HASH(vp, off)						\
	((((uintptr_t)(off) >> PAGESHIFT) ^				\
	((uintptr_t)(off) >> (PAGESHIFT + anon_hash_shift)) ^		\
	((uintptr_t)(vp) >> 3) ^					\
	((uintptr_t)(vp) >> (3 + anon_hash_shift)) ^			\
	((uintptr_t)(vp) >> (3 + 2 * anon_hash_shift)) ^		\
	((uintptr_t)(vp) <<						\
	    (anon_hash_shift - AN_VPSHIFT - VNODE_ALIGN_LOG2))) &	\
	(anon_hash_size - 1))
148 
/*
 * AH_LOCK_SIZE bounds the index used to pick an anonhash_lock[] entry.
 */
#define	AH_LOCK_SIZE	(2 << NCPU_LOG2)

/*
 * AH_MUTEX(vp, off) evaluates to the address of the pad_mutex member of the
 * anonhash_lock[] entry selected by ANON_HASH(vp, off) masked with
 * (AH_LOCK_SIZE - 1).
 */
#define	AH_MUTEX(vp, off)						\
	(&(anonhash_lock +						\
	    (ANON_HASH((vp), (off)) & (AH_LOCK_SIZE - 1)))->pad_mutex)
154 
155 #endif	/* _KERNEL */
156 
157 /*
158  * Declaration for the Global counters to accurately
159  * track the kernel foot print in memory.
160  */
161 extern  pgcnt_t pages_locked;
162 extern  pgcnt_t pages_claimed;
163 extern  pgcnt_t pages_useclaim;
164 extern  pgcnt_t obp_pages;
165 
166 /*
167  * Anonymous backing store accounting structure for swapctl.
168  *
169  * ani_max = maximum amount of swap space
170  *	(including potentially available physical memory)
171  * ani_free = amount of unallocated anonymous memory
172  *	(some of which might be reserved and including
173  *	potentially available physical memory)
174  * ani_resv = amount of claimed (reserved) anonymous memory
175  *
176  * The swap data can be acquired more efficiently through the
177  * kstats interface.
178  * Total slots currently available for reservation =
179  *	MAX(ani_max - ani_resv, 0) + (availrmem - swapfs_minfree)
180  */
181 struct anoninfo {
182 	pgcnt_t	ani_max;
183 	pgcnt_t	ani_free;
184 	pgcnt_t	ani_resv;
185 };
186 
187 #ifdef _SYSCALL32
188 struct anoninfo32 {
189 	size32_t ani_max;
190 	size32_t ani_free;
191 	size32_t ani_resv;
192 };
193 #endif /* _SYSCALL32 */
194 
195 /*
196  * Define the NCPU pool of the ani_free counters. Update the counter
197  * of the cpu on which the thread is running and in every clock intr
198  * sync anoninfo.ani_free with the current total of all the NCPU entries.
199  */
200 
201 typedef	struct	ani_free {
202 	pgcnt_t		ani_count;
203 	uchar_t		pad[64 - sizeof (pgcnt_t)];
204 			/* XXX 64 = cacheline size */
205 } ani_free_t;
206 
207 #define	ANI_MAX_POOL	(NCPU_P2)
208 extern	ani_free_t	*ani_free_pool;
209 
/*
 * ANI_ADD(inc) atomically adds inc to the ani_free_pool bucket selected by
 * the current CPU's cpu_seqid masked with (ANI_MAX_POOL - 1).
 *
 * Since each CPU has its own bucket in ani_free_pool, there should be no
 * contention here.
 *
 * The body is wrapped in do { } while (0) so that "ANI_ADD(n);" is a single
 * statement and can be used as the unbraced arm of an if/else.  The locals
 * carry an ani_add_ prefix so that they cannot capture an identifier (for
 * example one named "index") appearing in the caller's inc expression.
 */
#define	ANI_ADD(inc)	do {						\
	pgcnt_t	*ani_add_countp_;					\
	int	ani_add_index_;						\
	ani_add_index_ = (CPU->cpu_seqid & (ANI_MAX_POOL - 1));		\
	ani_add_countp_ = &ani_free_pool[ani_add_index_].ani_count;	\
	atomic_add_long(ani_add_countp_, (inc));			\
} while (0)
221 
222 extern void	set_anoninfo(void);
223 
224 /*
225  * Anon array pointers are allocated in chunks. Each chunk
226  * has PAGESIZE/sizeof(u_long *) of anon pointers.
227  * There are two levels of arrays for anon array pointers larger
228  * than a chunk. The first level points to anon array chunks.
229  * The second level consists of chunks of anon pointers.
230  *
231  * If anon array is smaller than a chunk then the whole anon array
232  * is created (memory is allocated for whole anon array).
233  * If anon array is larger than a chunk only first level array is
234  * allocated. Then other arrays (chunks) are allocated only when
235  * they are initialized with anon pointers.
236  */
237 struct anon_hdr {
238 	kmutex_t serial_lock;	/* serialize array chunk allocation */
239 	pgcnt_t	size;		/* number of pointers to (anon) pages */
240 	void	**array_chunk;	/* pointers to anon pointers or chunks of */
241 				/* anon pointers */
242 	int	flags;		/* ANON_ALLOC_FORCE force preallocation of */
243 				/* whole anon array	*/
244 };
245 
/*
 * ANON_PTRSHIFT is the shift that converts between a count of anon pointers
 * and a byte count: 3 under _LP64, otherwise 2.  ANON_PTRMASK clears the
 * low ANON_PTRSHIFT bits of a value.
 */
#ifdef	_LP64
#define	ANON_PTRSHIFT	3
#else
#define	ANON_PTRSHIFT	2
#endif
#define	ANON_PTRMASK	(~((1 << ANON_PTRSHIFT) - 1))

/*
 * Chunk geometry: a chunk is one page worth of anon pointers.
 */
#define	ANON_CHUNK_SHIFT	(PAGESHIFT - ANON_PTRSHIFT)
#define	ANON_CHUNK_SIZE		(PAGESIZE >> ANON_PTRSHIFT)
#define	ANON_CHUNK_OFF		(ANON_CHUNK_SIZE - 1)
257 
258 /*
259  * Anon flags.
260  */
#define	ANON_SLEEP		0x0	/* caller may block */
#define	ANON_NOSLEEP		0x1	/* call must not block */
#define	ANON_ALLOC_FORCE	0x2	/* force a single level anon array */
#define	ANON_GROWDOWN		0x4	/* anon array should grow downward */
265 
266 struct kshmid;
267 
268 /*
269  * The anon_map structure is used by various clients of the anon layer to
270  * manage anonymous memory.   When anonymous memory is shared,
271  * then the different clients sharing it will point to the
272  * same anon_map structure.  Also, if a segment is unmapped
273  * in the middle where an anon_map structure exists, the
274  * newly created segment will also share the anon_map structure,
275  * although the two segments will use different ranges of the
276  * anon array.  When mappings are private (or shared with
277  * a reference count of 1), an unmap operation will free up
278  * a range of anon slots in the array given by the anon_map
279  * structure.  Because of fragmentation due to this unmapping,
280  * we have to store the size of the anon array in the anon_map
281  * structure so that we can free everything when the reference
282  * count goes to zero.
283  *
284  * A new rangelock scheme is introduced to make the anon layer scale.
285  * A reader/writer lock per anon_map and an array of system-wide hash
286  * locks, anon_array_lock[], are introduced to replace serial_lock and
287  * anonmap lock.  The writer lock is held when we want to single-thread
288  * the references to the anon array pointers or the references to
289  * anon_map's members, whereas the reader lock and anon_array_lock are
290  * held to allow multiple threads to reference different parts of the
291  * anon array.  A global set of condition variables, anon_array_cv,
292  * is used with anon_array_lock[] to make the hold time of the locks
293  * short.
294  *
295  * szc is used to calculate the index of hash locks and cv's.  We
296  * could've just used seg->s_szc if not for the possible sharing of
297  * anon_map between SYSV shared memory and ISM, so now we introduce
298  * szc in the anon_map structure.  For MAP_SHARED, the amp->szc is either
299  * 0 (base page size) or page_num_pagesizes() - 1, while for MAP_PRIVATE
300  * the amp->szc could be anything in [0, page_num_pagesizes() - 1].
301  */
302 typedef struct anon_map {
303 	krwlock_t a_rwlock;	/* protect anon_map and anon array */
304 	size_t	size;		/* size in bytes mapped by the anon array */
305 	struct	anon_hdr *ahp; 	/* anon array header pointer, containing */
306 				/* anon pointer array(s) */
307 	size_t	swresv;		/* swap space reserved for this anon_map */
308 	ulong_t	refcnt;		/* reference count on this structure */
309 	ushort_t a_szc;		/* max szc among shared processes */
310 	void	*locality;	/* lgroup locality info */
311 	struct kshmid *a_sp;	/* kshmid if amp backs sysV, or NULL */
312 	int	a_purgewait;	/* somebody waits for slocks to go away */
313 	kcondvar_t a_purgecv;	/* cv for waiting for slocks to go away */
314 	kmutex_t a_purgemtx;	/* mutex for anonmap_purge() */
315 	spgcnt_t a_softlockcnt; /* number of pages locked in pcache */
316 	kmutex_t a_pmtx;	/* protects amp's pcache list */
317 	pcache_link_t a_phead;	/* head of amp's pcache list */
318 } amp_t;
319 
320 #ifdef _KERNEL
321 
/*
 * ANON_BUSY is bit 0 of the word that slot points to; the macros below
 * test, set and clear that bit in place.
 */
#define	ANON_BUSY		0x1
#define	ANON_SETBUSY(slot)	(*(slot) |= ANON_BUSY)
#define	ANON_CLRBUSY(slot)	(*(slot) &= ~ANON_BUSY)
#define	ANON_ISBUSY(slot)	(*(slot) & ANON_BUSY)
326 
/*
 * Constants used by ANON_ARRAY_HASH().
 *
 * ANON_MAP_SHIFT is the number of low-order anon_map address bits that the
 * hash discards; it is documented as log2(sizeof (struct anon_map)).
 * NOTE(review): struct anon_map looks larger than 64 bytes, so that
 * description may be stale; confirm before relying on it.
 *
 * ANON_LOCKSIZE (128) is derived from ANON_ARRAY_SHIFT, its base-2
 * logarithm, so the two cannot drift apart.
 */
#define	ANON_MAP_SHIFT		6
#define	ANON_ARRAY_SHIFT	7
#define	ANON_LOCKSIZE		(1 << ANON_ARRAY_SHIFT)
330 
/*
 * Thin wrappers that forward an anon_map lock to the rwlock primitives.
 */
#define	ANON_LOCK_ENTER(lock, type)	rw_enter((lock), (type))
#define	ANON_LOCK_EXIT(lock)		rw_exit(lock)
#define	ANON_LOCK_HELD(lock)		RW_LOCK_HELD(lock)
#define	ANON_READ_HELD(lock)		RW_READ_HELD(lock)
#define	ANON_WRITE_HELD(lock)		RW_WRITE_HELD(lock)
336 
/*
 * ANON_ARRAY_HASH(amp, idx) folds idx by adding to it copies of itself
 * shifted right by one, two and three times ANON_ARRAY_SHIFT, XORs in the
 * amp address shifted right by ANON_MAP_SHIFT, and masks the result with
 * (ANON_LOCKSIZE - 1).
 */
#define	ANON_ARRAY_HASH(amp, idx)					\
	((((idx) +							\
	((idx) >> ANON_ARRAY_SHIFT) +					\
	((idx) >> (2 * ANON_ARRAY_SHIFT)) +				\
	((idx) >> (3 * ANON_ARRAY_SHIFT))) ^				\
	((uintptr_t)(amp) >> ANON_MAP_SHIFT)) & (ANON_LOCKSIZE - 1))
342 
343 typedef struct anon_sync_obj {
344 	kmutex_t	*sync_mutex;
345 	kcondvar_t	*sync_cv;
346 	ulong_t		*sync_data;
347 } anon_sync_obj_t;
348 
349 /*
350  * Anonymous backing store accounting structure for kernel.
351  * ani_max = total reservable slots on physical (disk-backed) swap
352  * ani_phys_resv = total phys slots reserved for use by clients
353  * ani_mem_resv = total mem slots reserved for use by clients
354  * ani_free = # unallocated physical slots + # of reserved unallocated
355  * memory slots
356  */
357 
/*
 * Swap slot totals.  Both macros add the memory contribution
 * (availrmem - swapfs_minfree), computed as a signed page count and
 * clamped at zero, to a physical swap term.
 *
 * TOTAL_AVAILABLE_SWAP
 *	Initial total swap slots available for reservation: the physical
 *	term is k_anoninfo.ani_max.
 *
 * CURRENT_TOTAL_AVAILABLE_SWAP
 *	Swap slots currently available for reservation: the physical term
 *	is k_anoninfo.ani_max less k_anoninfo.ani_phys_resv.
 */
#define	TOTAL_AVAILABLE_SWAP						\
	(k_anoninfo.ani_max +						\
	    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))

#define	CURRENT_TOTAL_AVAILABLE_SWAP					\
	((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +		\
	    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
370 
371 struct k_anoninfo {
372 	pgcnt_t	ani_max;	/* total reservable slots on phys */
373 					/* (disk) swap */
374 	pgcnt_t	ani_free;	/* # of unallocated phys and mem slots */
375 	pgcnt_t	ani_phys_resv;	/* # of reserved phys (disk) slots */
376 	pgcnt_t	ani_mem_resv;	/* # of reserved mem slots */
377 	pgcnt_t	ani_locked_swap; /* # of swap slots locked in reserved */
378 				/* mem swap */
379 };
380 
381 extern	struct k_anoninfo k_anoninfo;
382 
383 extern void	anon_init(void);
384 extern struct	anon *anon_alloc(struct vnode *, anoff_t);
385 extern void	anon_dup(struct anon_hdr *, ulong_t,
386 		    struct anon_hdr *, ulong_t, size_t);
387 extern void	anon_dup_fill_holes(struct anon_hdr *, ulong_t,
388 		    struct anon_hdr *, ulong_t, size_t, uint_t, int);
389 extern int	anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *,
390 		    ulong_t, struct vnode *, u_offset_t, size_t, uint_t,
391 		    uint_t, struct vpage [], struct cred *);
392 extern void	anon_free(struct anon_hdr *, ulong_t, size_t);
393 extern void	anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t);
394 extern int	anon_disclaim(struct anon_map *,
395 		    ulong_t, size_t, uint_t, pgcnt_t *);
396 extern int	anon_getpage(struct anon **, uint_t *, struct page **,
397 		    size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
398 extern int	swap_getconpage(struct vnode *, u_offset_t, size_t,
399 		    uint_t *, page_t *[], size_t, page_t *, uint_t *,
400 		    spgcnt_t *, struct seg *, caddr_t,
401 		    enum seg_rw, struct cred *);
402 extern int	anon_map_getpages(struct anon_map *, ulong_t,
403 		    uint_t, struct seg *, caddr_t, uint_t,
404 		    uint_t *, page_t *[], uint_t *,
405 		    struct vpage [], enum seg_rw, int, int, int, struct cred *);
406 extern int	anon_map_privatepages(struct anon_map *, ulong_t,
407 		    uint_t, struct seg *, caddr_t, uint_t,
408 		    page_t *[], struct vpage [], int, int, struct cred *);
409 extern struct	page *anon_private(struct anon **, struct seg *,
410 		    caddr_t, uint_t, struct page *,
411 		    int, struct cred *);
412 extern struct	page *anon_zero(struct seg *, caddr_t,
413 		    struct anon **, struct cred *);
414 extern int	anon_map_createpages(struct anon_map *, ulong_t,
415 		    size_t, struct page **,
416 		    struct seg *, caddr_t,
417 		    enum seg_rw, struct cred *);
418 extern int	anon_map_demotepages(struct anon_map *, ulong_t,
419 		    struct seg *, caddr_t, uint_t,
420 		    struct vpage [], struct cred *);
421 extern void	anon_shmap_free_pages(struct anon_map *, ulong_t, size_t);
422 extern int	anon_resvmem(size_t, boolean_t, zone_t *, int);
423 extern void	anon_unresvmem(size_t, zone_t *);
424 extern struct	anon_map *anonmap_alloc(size_t, size_t, int);
425 extern void	anonmap_free(struct anon_map *);
426 extern void	anonmap_purge(struct anon_map *);
427 extern void	anon_swap_free(struct anon *, struct page *);
428 extern void	anon_decref(struct anon *);
429 extern int	non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *);
430 extern pgcnt_t	anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
431 extern int	anon_swap_adjust(pgcnt_t);
432 extern void	anon_swap_restore(pgcnt_t);
433 extern struct	anon_hdr *anon_create(pgcnt_t, int);
434 extern void	anon_release(struct anon_hdr *, pgcnt_t);
435 extern struct	anon *anon_get_ptr(struct anon_hdr *, ulong_t);
436 extern ulong_t	*anon_get_slot(struct anon_hdr *, ulong_t);
437 extern struct	anon *anon_get_next_ptr(struct anon_hdr *, ulong_t *);
438 extern int	anon_set_ptr(struct anon_hdr *, ulong_t, struct anon *, int);
439 extern int 	anon_copy_ptr(struct anon_hdr *, ulong_t,
440 		    struct anon_hdr *, ulong_t, pgcnt_t, int);
441 extern pgcnt_t	anon_grow(struct anon_hdr *, ulong_t *, pgcnt_t, pgcnt_t, int);
442 extern void	anon_array_enter(struct anon_map *, ulong_t,
443 			anon_sync_obj_t *);
444 extern int	anon_array_try_enter(struct anon_map *, ulong_t,
445 			anon_sync_obj_t *);
446 extern void	anon_array_exit(anon_sync_obj_t *);
447 
448 /*
449  * anon_resv checks to see if there is enough swap space to fulfill a
450  * request and if so, reserves the appropriate anonymous memory resources.
451  * anon_checkspace just checks to see if there is space to fulfill the request,
452  * without taking any resources.  Both return 1 if successful and 0 if not.
453  *
454  * Macros are provided as anon reservation is usually charged to the zone of
455  * the current process.  In some cases (such as anon reserved by tmpfs), a
456  * zone pointer is needed to charge the appropriate zone.
457  */
/*
 * Reservation wrappers around anon_resvmem().  The second argument is 1 to
 * reserve and 0 to only check for space; the variants without a zone
 * parameter use curproc->p_zone.
 * NOTE(review): the meaning of the final anon_resvmem() argument is not
 * visible here; anon_resv and anon_resv_zone pass 1, the others pass 0.
 */
#define	anon_resv(size)			\
	anon_resvmem((size), 1, curproc->p_zone, 1)
#define	anon_resv_zone(size, zone)	anon_resvmem((size), 1, (zone), 1)
#define	anon_try_resv_zone(size, zone)	anon_resvmem((size), 1, (zone), 0)
#define	anon_checkspace(size, zone)	anon_resvmem((size), 0, (zone), 0)

/*
 * Release wrappers around anon_unresvmem().
 */
#define	anon_unresv(size)		\
	anon_unresvmem((size), curproc->p_zone)
#define	anon_unresv_zone(size, zone)	anon_unresvmem((size), (zone))
465 
/*
 * Flag bits accepted by anon_private().
 */
#define	STEAL_PAGE	0x1	/* the page can be stolen */
#define	LOCK_PAGE	0x2	/* the page must be ``logically'' locked */
471 
472 /*
473  * SEGKP ANON pages that are locked are assumed to be LWP stack pages
474  * and thus count towards the user pages locked count.
475  * This value is protected by the same lock as availrmem.
476  */
477 extern pgcnt_t anon_segkp_pages_locked;
478 
479 extern int anon_debug;
480 
481 #ifdef ANON_DEBUG
482 
/*
 * Debug classes; ANON_PRINT() tests these bits against anon_debug.
 */
#define	A_ANON	0x01
#define	A_RESV	0x02
#define	A_MRESV	0x04

/*
 * vararg-like debugging macro.
 *
 * f is a mask of the A_* classes above and printf_args is a fully
 * parenthesized printf() argument list, e.g.
 *	ANON_PRINT(A_RESV, ("resv %ld\n", n));
 * The message is printed only when anon_debug has at least one bit of f
 * set.
 *
 * f is parenthesized so that a mask built with '|' is tested as a whole
 * ('&' binds tighter than '|').  The body is wrapped in do { } while (0)
 * so that the macro is a single statement and a following "else" binds to
 * the caller's "if" rather than to the test inside the macro.
 */
#define	ANON_PRINT(f, printf_args)					\
	do {								\
		if (anon_debug & (f))					\
			printf printf_args;				\
	} while (0)
491 
492 #else	/* ANON_DEBUG */
493 
494 #define	ANON_PRINT(f, printf_args)
495 
496 #endif	/* ANON_DEBUG */
497 
498 #endif	/* _KERNEL */
499 
500 #ifdef	__cplusplus
501 }
502 #endif
503 
504 #endif	/* _VM_ANON_H */
505