/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2017 Joyent, Inc.
 */

#ifndef	_SYS_POLL_IMPL_H
#define	_SYS_POLL_IMPL_H

/*
 * Caching Poll Subsystem:
 *
 * Each kernel thread (1), if engaged in poll system call, has a reference to
 * a pollstate_t (2), which contains relevant flags and locks. The pollstate_t
 * contains a pointer to a pollcache_t (3), which caches the state of previous
 * calls to poll. A bitmap (4) is stored inside the poll cache, where each
 * bit represents a file descriptor. The bits are set if the corresponding
 * device has a polled event pending. Only fds with their bit set will be
 * examined on the next poll invocation. The pollstate_t also contains a list
 * of fd sets (5), which are represented by the pollcacheset_t type. These
 * structures keep track of the pollfd_t arrays (6) passed in from userland.
 * Each polled file descriptor has a corresponding polldat_t which can be
 * chained onto a device's pollhead, and these are kept in a hash table (7)
 * inside the pollcache_t. The hash table allows efficient conversion of a
 * given fd to its corresponding polldat_t.
 *
 *      (1)              (2)
 * +-----------+    +-------------+
 * | kthread_t |--->| pollstate_t |-->+-------------+  (6)
 * +-----------+    +-------------+(5)| pcacheset_t |->[_][_][_][_] pollfd_t
 *                         |          +-------------+
 *                         |          | pcacheset_t |->[_][_][_][_] pollfd_t
 *      (1a)               |          +-------------+
 * +---------------+       |
 * | /dev/poll tbl |       |
 * +-v-------------+       |
 *   |                     |
 *   +------------------+  |
 *   (7)           (3)  V  v
 * polldat hash     +-------------+    (4) bitmap representing fd space
 * [_][_][_][_]<----|             |--->000010010010001010101010101010110
 *  |  |  |  |      | pollcache_t |
 *  .  v  .  .      |             |
 *   [polldat_t]    +-------------+
 *     |
 *   [polldat_t]
 *     |
 *     v
 *     NULL
 *
 *
 * Both poll system call and /dev/poll use the pollcache_t structure
 * definition and the routines managing the structure. But poll(2) and
 * /dev/poll have their own copy of the structures. The /dev/poll driver
 * table (1a) contains an array of pointers, each pointing at a pollcache_t
 * struct (3). A device minor number is used as a device table index.
 *
 */
#include <sys/poll.h>

#if defined(_KERNEL) || defined(_KMEMUSER)

#include <sys/thread.h>
#include <sys/file.h>
#include <sys/port_kernel.h>

#ifdef	__cplusplus
extern "C" {
#endif

/*
 * Typedefs
 */
struct pollcache;
struct pollstate;
struct pcachelink;
struct polldat;

typedef struct pollcache pollcache_t;
typedef struct pollstate pollstate_t;
typedef struct pcachelink pcachelink_t;
typedef struct polldat polldat_t;

/*
 * description of pollcacheset structure
 */
typedef struct pollcacheset {
	uintptr_t	pcs_usradr;	/* usr pollfd array address */
	pollfd_t	*pcs_pollfd;	/* cached poll lists */
	size_t		pcs_nfds;	/* number of poll fd in cached list */
	ulong_t		pcs_count;	/* for LU replacement policy */
} pollcacheset_t;

#define	POLLFDSETS	2

/*
 * Maximum depth for recursive poll operations.
 */
#define	POLLMAXDEPTH	5

/*
 * State information kept by each polling thread
 */
struct pollstate {
	pollfd_t	*ps_pollfd;	/* hold the current poll list */
	size_t		ps_nfds;	/* size of ps_pollfd */
	kmutex_t	ps_lock;	/* mutex for sleep/wakeup */
	pollcache_t	*ps_pcache;	/* cached poll fd set */
	pollcacheset_t	*ps_pcacheset;	/* cached poll lists */
	int		ps_nsets;	/* no. of cached poll sets */
	pollfd_t	*ps_dpbuf;	/* return pollfd buf used by devpoll */
	size_t		ps_dpbufsize;	/* size of ps_dpbuf */
	int		ps_depth;	/* epoll recursion depth */
	pollcache_t	*ps_pc_stack[POLLMAXDEPTH]; /* epoll recursion state */
	pollcache_t	*ps_contend_pc;	/* pollcache waited on */
	pollstate_t	*ps_contend_nextp; /* next in contender list */
	pollstate_t	**ps_contend_pnextp; /* pointer-to-previous-next */
	int		ps_flags;	/* state flags */
};

/* pollstate flags */
#define	POLLSTATE_STALEMATE	0x1
#define	POLLSTATE_ULFAIL	0x2

/* pollstate_enter results */
#define	PSE_SUCCESS		0
#define	PSE_FAIL_DEPTH		1
#define	PSE_FAIL_LOOP		2
#define	PSE_FAIL_DEADLOCK	3
#define	PSE_FAIL_POLLSTATE	4

/*
 * poll cache size defines
 */
#define	POLLCHUNKSHIFT		8	/* hash table increment size is 256 */
#define	POLLHASHCHUNKSZ		(1 << POLLCHUNKSHIFT)
#define	POLLHASHINC		2	/* poll hash table growth factor */
#define	POLLHASHTHRESHOLD	2	/* poll hash list length threshold */
#define	POLLHASH(x, y)	((y) % (x))	/* poll hash function */

/*
 * poll.c assumes the POLLMAPCHUNK is power of 2
 */
#define	POLLMAPCHUNK	2048	/* bitmap inc -- each for 2K of polled fd's */

/*
 * used to reference from watched fd back to the fd position in cached
 * poll list for quick revents update.
 */
typedef struct xref {
	ssize_t	xf_position;	/* xref fd position in poll fd list */
	short	xf_refcnt;	/* ref cnt of same fd in poll list */
} xref_t;

#define	POLLPOSINVAL	(-1L)	/* xf_position is invalid */
#define	POLLPOSTRANS	(-2L)	/* xf_position is transient state */


typedef enum pclstate {
	PCL_INIT = 0,	/* just allocated/zeroed, prior */
	PCL_VALID,	/* linked with both parent and child pollcaches */
	PCL_STALE,	/* still linked but marked stale, pending refresh */
	PCL_INVALID,	/* dissociated from one pollcache, awaiting cleanup */
	PCL_FREE	/* only meant to indicate use-after-free */
} pclstate_t;

/*
 * The pcachelink struct creates an association between parent and child
 * pollcaches in a recursive /dev/poll operation. Fields are protected by
 * pcl_lock although manipulation of pcl_child_next or pcl_parent_next also
 * requires holding pc_lock in the respective pcl_parent_pc or pcl_child_pc
 * pollcache.
 */
struct pcachelink {
	kmutex_t	pcl_lock;		/* protects contents */
	pclstate_t	pcl_state;		/* status of link entry */
	int		pcl_refcnt;		/* ref cnt of linked pcaches */
	pollcache_t	*pcl_child_pc;		/* child pollcache */
	pollcache_t	*pcl_parent_pc;		/* parent pollcache */
	pcachelink_t	*pcl_child_next;	/* next in child list */
	pcachelink_t	*pcl_parent_next;	/* next in parents list */
};


/*
 * polldat is an entry for a cached poll fd. A polldat struct can be in
 * poll cache table as well as on pollhead ph_list, which is used by
 * pollwakeup to wake up a sleeping poller. There should be one polldat
 * per polled fd hanging off pollstate struct.
 */
struct polldat {
	int		pd_fd;		/* cached poll fd */
	int		pd_events;	/* union of all polled events */
	file_t		*pd_fp;		/* used to detect fd reuse */
	pollhead_t	*pd_php;	/* used to undo poll registration */
	kthread_t	*pd_thread;	/* used for waking up a sleep thrd */
	pollcache_t	*pd_pcache;	/* a ptr to the pollcache of this fd */
	polldat_t	*pd_next;	/* next on pollhead's ph_list */
	polldat_t	*pd_hashnext;	/* next in pollcache hash chain */
	int		pd_count;	/* total count from all ref'ed sets */
	int		pd_nsets;	/* num of xref sets, used by poll(2) */
	xref_t		*pd_ref;	/* ptr to xref info, 1 for each set */
	port_kevent_t	*pd_portev;	/* associated port event struct */
	uf_entry_gen_t	pd_gen;		/* fd generation at cache time */
	uint64_t	pd_epolldata;	/* epoll data, if any */
};

/*
 * One cache for each thread that polls. Points to a bitmap (used by pollwakeup)
 * and a hash table of polldats.
 * The offset of pc_lock field must be kept in sync with the pc_lock offset
 * of port_fdcache_t, both structs implement pc_lock with offset 0 (see also
 * pollrelock()).
 */
struct pollcache {
	kmutex_t	pc_lock;	/* lock to protect pollcache */
	ulong_t		*pc_bitmap;	/* point to poll fd bitmap */
	polldat_t	**pc_hash;	/* points to a hash table of ptrs */
	int		pc_mapend;	/* the largest fd encountered so far */
	int		pc_mapsize;	/* the size of current map */
	int		pc_hashsize;	/* the size of current hash table */
	int		pc_fdcount;	/* track how many fd's are hashed */
	int		pc_flag;	/* see pc_flag define below */
	int		pc_busy;	/* can only exit when its 0 */
	kmutex_t	pc_no_exit;	/* protects pc_busy*, can't be nested */
	kcondvar_t	pc_busy_cv;	/* cv to wait on if pc_busy != 0 */
	kcondvar_t	pc_cv;		/* cv to wait on if needed */
	pid_t		pc_pid;		/* for check acc rights, devpoll only */
	int		pc_mapstart;	/* where search start, devpoll only */
	pcachelink_t	*pc_parents;	/* linked list of epoll parents */
	pcachelink_t	*pc_children;	/* linked list of epoll children */
};

/* pc_flag */
#define	PC_POLLWAKE	0x02	/* pollwakeup() occurred */
#define	PC_EPOLL	0x04	/* pollcache is epoll-enabled */

#if defined(_KERNEL)
/*
 * Internal routines.
 */
extern void pollnotify(pollcache_t *, int);

/*
 * public poll head interfaces (see poll.h):
 *
 *  pollhead_clean	clean up all polldats on a pollhead list
 */
extern void pollhead_clean(pollhead_t *);

/*
 * private poll head interfaces:
 *
 *  pollhead_insert	adds a polldat to a pollhead list
 *  pollhead_delete	removes a polldat from a pollhead list
 */
extern void pollhead_insert(pollhead_t *, polldat_t *);
extern void pollhead_delete(pollhead_t *, polldat_t *);

/*
 * poll state interfaces:
 *
 *  pollstate_create	initializes per-thread pollstate
 *  pollstate_destroy	cleans up per-thread pollstate
 *  pollstate_enter	safely lock pollcache for pollstate
 *  pollstate_exit	unlock pollcache from pollstate
 */
extern pollstate_t *pollstate_create(void);
extern void pollstate_destroy(pollstate_t *);
extern int pollstate_enter(pollcache_t *);
extern void pollstate_exit(pollcache_t *);

/*
 * public pcache interfaces:
 *
 *	pcache_alloc	allocate a poll cache skeleton
 *	pcache_create	creates all poll cache supporting data struct
 *	pcache_insert	cache a poll fd, calls pcache_insert_fd
 *	pcache_lookup	given an fd list, returns a cookie
 *	pcache_poll	polls the cache for fd's having events on them
 *	pcache_clean	clean up all the pollhead and fpollinfo reference
 *	pcache_destroy	destroys the pcache
 */
/* (void) makes this a real prototype, consistent with pollstate_create. */
extern pollcache_t *pcache_alloc(void);
extern void pcache_create(pollcache_t *, nfds_t);
extern int pcache_insert(pollstate_t *, file_t *, pollfd_t *, int *, ssize_t,
    int);
extern int pcache_poll(pollfd_t *, pollstate_t *, nfds_t, int *, int);
extern void pcache_clean(pollcache_t *);
extern void pcache_destroy(pollcache_t *);

/*
 * private pcache interfaces:
 *
 *	pcache_lookup_fd	lookup an fd, returns a polldat
 *	pcache_alloc_fd		allocates and returns a polldat
 *	pcache_insert_fd	insert an fd into pcache (called by
 *				pcache_insert)
 *	pcache_delete_fd	delete an fd from pcache (called by
 *				pcacheset_delete_fd)
 *	pcache_grow_hashtbl	grows the pollcache hash table and rehash
 *	pcache_grow_map		grows the pollcache bitmap
 *	pcache_update_xref	update cross ref (from polldat back to
 *				cacheset) info
 *	pcache_clean_entry	cleanup an entry in pcache and more...
 *	pcache_wake_parents	wake linked parent pollcaches
 */
extern polldat_t *pcache_lookup_fd(pollcache_t *, int);
extern polldat_t *pcache_alloc_fd(int);
extern void pcache_insert_fd(pollcache_t *, polldat_t *, nfds_t);
extern int pcache_delete_fd(pollstate_t *, int, size_t, int, uint_t);
extern void pcache_grow_hashtbl(pollcache_t *, nfds_t);
extern void pcache_grow_map(pollcache_t *, int);
extern void pcache_update_xref(pollcache_t *, int, ssize_t, int);
extern void pcache_clean_entry(pollstate_t *, int);
extern void pcache_wake_parents(pollcache_t *);

/*
 * pcacheset interfaces:
 *
 *	pcacheset_create	creates new pcachesets (easier for dynamic
 *				pcachesets)
 *	pcacheset_destroy	destroys a pcacheset
 *	pcacheset_cache_list	caches and polls a new poll list
 *	pcacheset_remove_list	removes (usually a partial) cached poll list
 *	pcacheset_resolve	resolves extant pcacheset and fd list
 *	pcacheset_cmp		compares a pcacheset with an fd list
 *	pcacheset_invalidate	invalidate entries in pcachesets
 *	pcacheset_reset_count	resets the usage counter of pcachesets
 *	pcacheset_replace	selects a poll cacheset for replacement
 */
extern pollcacheset_t *pcacheset_create(int);
extern void pcacheset_destroy(pollcacheset_t *, int);
extern int pcacheset_cache_list(pollstate_t *, pollfd_t *, int *, int);
extern void pcacheset_remove_list(pollstate_t *, pollfd_t *, int, int, int,
    int);
extern int pcacheset_resolve(pollstate_t *, nfds_t, int *, int);
extern int pcacheset_cmp(pollfd_t *, pollfd_t *, pollfd_t *, int);
extern void pcacheset_invalidate(pollstate_t *, polldat_t *);
extern void pcacheset_reset_count(pollstate_t *, int);
extern int pcacheset_replace(pollstate_t *);

#endif /* defined(_KERNEL) */

#ifdef	__cplusplus
}
#endif

#endif /* defined(_KERNEL) || defined(_KMEMUSER) */

#endif	/* _SYS_POLL_IMPL_H */