1f841f6adSraf /*
2f841f6adSraf * CDDL HEADER START
3f841f6adSraf *
4f841f6adSraf * The contents of this file are subject to the terms of the
5f841f6adSraf * Common Development and Distribution License (the "License").
6f841f6adSraf * You may not use this file except in compliance with the License.
7f841f6adSraf *
8f841f6adSraf * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9f841f6adSraf * or http://www.opensolaris.org/os/licensing.
10f841f6adSraf * See the License for the specific language governing permissions
11f841f6adSraf * and limitations under the License.
12f841f6adSraf *
13f841f6adSraf * When distributing Covered Code, include this CDDL HEADER in each
14f841f6adSraf * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15f841f6adSraf * If applicable, add the following below this CDDL HEADER, with the
16f841f6adSraf * fields enclosed by brackets "[]" replaced with your own identifying
17f841f6adSraf * information: Portions Copyright [yyyy] [name of copyright owner]
18f841f6adSraf *
19f841f6adSraf * CDDL HEADER END
20f841f6adSraf */
21f841f6adSraf
22f841f6adSraf /*
23a574db85Sraf * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24f841f6adSraf * Use is subject to license terms.
25f841f6adSraf */
26f841f6adSraf
277257d1b4Sraf #include "lint.h"
28f841f6adSraf #include "thr_uberdata.h"
29f841f6adSraf #include "asyncio.h"
30f841f6adSraf #include <atomic.h>
31f841f6adSraf #include <sys/param.h>
32f841f6adSraf #include <sys/file.h>
33f841f6adSraf #include <sys/port.h>
34f841f6adSraf
35f841f6adSraf static int _aio_hash_insert(aio_result_t *, aio_req_t *);
36f841f6adSraf static aio_req_t *_aio_req_get(aio_worker_t *);
37f841f6adSraf static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
38f841f6adSraf static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
39f841f6adSraf static void _aio_work_done(aio_worker_t *);
40f841f6adSraf static void _aio_enq_doneq(aio_req_t *);
41f841f6adSraf
42f841f6adSraf extern void _aio_lio_free(aio_lio_t *);
43f841f6adSraf
44f841f6adSraf extern int __fdsync(int, int);
454d86dd30Sraf extern int __fcntl(int, int, ...);
46f841f6adSraf extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
47f841f6adSraf
48f841f6adSraf static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
49f841f6adSraf static void _aiodone(aio_req_t *, ssize_t, int);
50f841f6adSraf static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
51f841f6adSraf static void _aio_finish_request(aio_worker_t *, ssize_t, int);
52f841f6adSraf
53f841f6adSraf /*
54f841f6adSraf * switch for kernel async I/O
55f841f6adSraf */
56f841f6adSraf int _kaio_ok = 0; /* 0 = disabled, 1 = on, -1 = error */
57f841f6adSraf
58f841f6adSraf /*
59f841f6adSraf * Key for thread-specific data
60f841f6adSraf */
61f841f6adSraf pthread_key_t _aio_key;
62f841f6adSraf
63f841f6adSraf /*
64f841f6adSraf * Array for determining whether or not a file supports kaio.
65f841f6adSraf * Initialized in _kaio_init().
66f841f6adSraf */
67f841f6adSraf uint32_t *_kaio_supported = NULL;
68f841f6adSraf
69f841f6adSraf /*
70f841f6adSraf * workers for read/write requests
71f841f6adSraf * (__aio_mutex lock protects circular linked list of workers)
72f841f6adSraf */
73f841f6adSraf aio_worker_t *__workers_rw; /* circular list of AIO workers */
74f841f6adSraf aio_worker_t *__nextworker_rw; /* next worker in list of workers */
75f841f6adSraf int __rw_workerscnt; /* number of read/write workers */
76f841f6adSraf
77f841f6adSraf /*
78f841f6adSraf * worker for notification requests.
79f841f6adSraf */
80f841f6adSraf aio_worker_t *__workers_no; /* circular list of AIO workers */
81f841f6adSraf aio_worker_t *__nextworker_no; /* next worker in list of workers */
82f841f6adSraf int __no_workerscnt; /* number of write workers */
83f841f6adSraf
84f841f6adSraf aio_req_t *_aio_done_tail; /* list of done requests */
85f841f6adSraf aio_req_t *_aio_done_head;
86f841f6adSraf
87f841f6adSraf mutex_t __aio_initlock = DEFAULTMUTEX; /* makes aio initialization atomic */
88f841f6adSraf cond_t __aio_initcv = DEFAULTCV;
89f841f6adSraf int __aio_initbusy = 0;
90f841f6adSraf
91f841f6adSraf mutex_t __aio_mutex = DEFAULTMUTEX; /* protects counts, and linked lists */
92f841f6adSraf cond_t _aio_iowait_cv = DEFAULTCV; /* wait for userland I/Os */
93f841f6adSraf
94f841f6adSraf pid_t __pid = (pid_t)-1; /* initialize as invalid pid */
95f841f6adSraf int _sigio_enabled = 0; /* when set, send SIGIO signal */
96f841f6adSraf
97f841f6adSraf aio_hash_t *_aio_hash;
98f841f6adSraf
99f841f6adSraf aio_req_t *_aio_doneq; /* double linked done queue list */
100f841f6adSraf
101f841f6adSraf int _aio_donecnt = 0;
102f841f6adSraf int _aio_waitncnt = 0; /* # of requests for aio_waitn */
103f841f6adSraf int _aio_doneq_cnt = 0;
104f841f6adSraf int _aio_outstand_cnt = 0; /* # of outstanding requests */
105f841f6adSraf int _kaio_outstand_cnt = 0; /* # of outstanding kaio requests */
106f841f6adSraf int _aio_req_done_cnt = 0; /* req. done but not in "done queue" */
107f841f6adSraf int _aio_kernel_suspend = 0; /* active kernel kaio calls */
108f841f6adSraf int _aio_suscv_cnt = 0; /* aio_suspend calls waiting on cv's */
109f841f6adSraf
110f841f6adSraf int _max_workers = 256; /* max number of workers permitted */
111f841f6adSraf int _min_workers = 4; /* min number of workers */
112f841f6adSraf int _minworkload = 2; /* min number of request in q */
113f841f6adSraf int _aio_worker_cnt = 0; /* number of workers to do requests */
114f841f6adSraf int __uaio_ok = 0; /* AIO has been enabled */
115f841f6adSraf sigset_t _worker_set; /* worker's signal mask */
116f841f6adSraf
117f841f6adSraf int _aiowait_flag = 0; /* when set, aiowait() is inprogress */
118f841f6adSraf int _aio_flags = 0; /* see asyncio.h defines for */
119f841f6adSraf
120f841f6adSraf aio_worker_t *_kaiowp = NULL; /* points to kaio cleanup thread */
121f841f6adSraf
122f841f6adSraf int hz; /* clock ticks per second */
123f841f6adSraf
124f841f6adSraf static int
_kaio_supported_init(void)125f841f6adSraf _kaio_supported_init(void)
126f841f6adSraf {
127f841f6adSraf void *ptr;
128f841f6adSraf size_t size;
129f841f6adSraf
130f841f6adSraf if (_kaio_supported != NULL) /* already initialized */
131f841f6adSraf return (0);
132f841f6adSraf
133f841f6adSraf size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
134f841f6adSraf ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
135f841f6adSraf MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
136f841f6adSraf if (ptr == MAP_FAILED)
137f841f6adSraf return (-1);
138f841f6adSraf _kaio_supported = ptr;
139f841f6adSraf return (0);
140f841f6adSraf }
141f841f6adSraf
142f841f6adSraf /*
143f841f6adSraf * The aio subsystem is initialized when an AIO request is made.
144f841f6adSraf * Constants are initialized like the max number of workers that
145f841f6adSraf * the subsystem can create, and the minimum number of workers
146f841f6adSraf * permitted before imposing some restrictions. Also, some
147f841f6adSraf * workers are created.
148f841f6adSraf */
149f841f6adSraf int
__uaio_init(void)150f841f6adSraf __uaio_init(void)
151f841f6adSraf {
152f841f6adSraf int ret = -1;
153f841f6adSraf int i;
154a574db85Sraf int cancel_state;
155f841f6adSraf
156f841f6adSraf lmutex_lock(&__aio_initlock);
157a574db85Sraf (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
158f841f6adSraf while (__aio_initbusy)
159a574db85Sraf (void) cond_wait(&__aio_initcv, &__aio_initlock);
160a574db85Sraf (void) pthread_setcancelstate(cancel_state, NULL);
161f841f6adSraf if (__uaio_ok) { /* already initialized */
162f841f6adSraf lmutex_unlock(&__aio_initlock);
163f841f6adSraf return (0);
164f841f6adSraf }
165f841f6adSraf __aio_initbusy = 1;
166f841f6adSraf lmutex_unlock(&__aio_initlock);
167f841f6adSraf
168f841f6adSraf hz = (int)sysconf(_SC_CLK_TCK);
169f841f6adSraf __pid = getpid();
170f841f6adSraf
171f841f6adSraf setup_cancelsig(SIGAIOCANCEL);
172f841f6adSraf
173f841f6adSraf if (_kaio_supported_init() != 0)
174f841f6adSraf goto out;
175f841f6adSraf
176f841f6adSraf /*
177f841f6adSraf * Allocate and initialize the hash table.
178f7499066Ssp92102 * Do this only once, even if __uaio_init() is called twice.
179f841f6adSraf */
180f7499066Ssp92102 if (_aio_hash == NULL) {
181f841f6adSraf /* LINTED pointer cast */
182f841f6adSraf _aio_hash = (aio_hash_t *)mmap(NULL,
183f841f6adSraf HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
184f841f6adSraf MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
185f841f6adSraf if ((void *)_aio_hash == MAP_FAILED) {
186f841f6adSraf _aio_hash = NULL;
187f841f6adSraf goto out;
188f841f6adSraf }
189f841f6adSraf for (i = 0; i < HASHSZ; i++)
190f7499066Ssp92102 (void) mutex_init(&_aio_hash[i].hash_lock,
191f7499066Ssp92102 USYNC_THREAD, NULL);
192f7499066Ssp92102 }
193f841f6adSraf
194f841f6adSraf /*
195f841f6adSraf * Initialize worker's signal mask to only catch SIGAIOCANCEL.
196f841f6adSraf */
197f841f6adSraf (void) sigfillset(&_worker_set);
198f841f6adSraf (void) sigdelset(&_worker_set, SIGAIOCANCEL);
199f841f6adSraf
200f841f6adSraf /*
201f7499066Ssp92102 * Create one worker to send asynchronous notifications.
202f7499066Ssp92102 * Do this only once, even if __uaio_init() is called twice.
203f7499066Ssp92102 */
204f7499066Ssp92102 if (__no_workerscnt == 0 &&
205f7499066Ssp92102 (_aio_create_worker(NULL, AIONOTIFY) != 0)) {
206f7499066Ssp92102 errno = EAGAIN;
207f7499066Ssp92102 goto out;
208f7499066Ssp92102 }
209f7499066Ssp92102
210f7499066Ssp92102 /*
211f841f6adSraf * Create the minimum number of read/write workers.
212f7499066Ssp92102 * And later check whether atleast one worker is created;
213f7499066Ssp92102 * lwp_create() calls could fail because of segkp exhaustion.
214f841f6adSraf */
215f841f6adSraf for (i = 0; i < _min_workers; i++)
216f841f6adSraf (void) _aio_create_worker(NULL, AIOREAD);
217f7499066Ssp92102 if (__rw_workerscnt == 0) {
218f7499066Ssp92102 errno = EAGAIN;
219f7499066Ssp92102 goto out;
220f7499066Ssp92102 }
221f841f6adSraf
222f841f6adSraf ret = 0;
223f841f6adSraf out:
224f841f6adSraf lmutex_lock(&__aio_initlock);
225f841f6adSraf if (ret == 0)
226f841f6adSraf __uaio_ok = 1;
227f841f6adSraf __aio_initbusy = 0;
228f841f6adSraf (void) cond_broadcast(&__aio_initcv);
229f841f6adSraf lmutex_unlock(&__aio_initlock);
230f841f6adSraf return (ret);
231f841f6adSraf }
232f841f6adSraf
233f841f6adSraf /*
234f841f6adSraf * Called from close() before actually performing the real _close().
235f841f6adSraf */
236f841f6adSraf void
_aio_close(int fd)237f841f6adSraf _aio_close(int fd)
238f841f6adSraf {
239f841f6adSraf if (fd < 0) /* avoid cancelling everything */
240f841f6adSraf return;
241f841f6adSraf /*
242f841f6adSraf * Cancel all outstanding aio requests for this file descriptor.
243f841f6adSraf */
244f841f6adSraf if (__uaio_ok)
245f841f6adSraf (void) aiocancel_all(fd);
246f841f6adSraf /*
247f841f6adSraf * If we have allocated the bit array, clear the bit for this file.
248f841f6adSraf * The next open may re-use this file descriptor and the new file
249f841f6adSraf * may have different kaio() behaviour.
250f841f6adSraf */
251f841f6adSraf if (_kaio_supported != NULL)
252f841f6adSraf CLEAR_KAIO_SUPPORTED(fd);
253f841f6adSraf }
254f841f6adSraf
255f841f6adSraf /*
256f841f6adSraf * special kaio cleanup thread sits in a loop in the
257f841f6adSraf * kernel waiting for pending kaio requests to complete.
258f841f6adSraf */
259f841f6adSraf void *
_kaio_cleanup_thread(void * arg)260f841f6adSraf _kaio_cleanup_thread(void *arg)
261f841f6adSraf {
262f841f6adSraf if (pthread_setspecific(_aio_key, arg) != 0)
263f841f6adSraf aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
264f841f6adSraf (void) _kaio(AIOSTART);
265f841f6adSraf return (arg);
266f841f6adSraf }
267f841f6adSraf
268f841f6adSraf /*
269f841f6adSraf * initialize kaio.
270f841f6adSraf */
271f841f6adSraf void
_kaio_init()272f841f6adSraf _kaio_init()
273f841f6adSraf {
274f841f6adSraf int error;
275f841f6adSraf sigset_t oset;
276a574db85Sraf int cancel_state;
277f841f6adSraf
278f841f6adSraf lmutex_lock(&__aio_initlock);
279a574db85Sraf (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
280f841f6adSraf while (__aio_initbusy)
281a574db85Sraf (void) cond_wait(&__aio_initcv, &__aio_initlock);
282a574db85Sraf (void) pthread_setcancelstate(cancel_state, NULL);
283f841f6adSraf if (_kaio_ok) { /* already initialized */
284f841f6adSraf lmutex_unlock(&__aio_initlock);
285f841f6adSraf return;
286f841f6adSraf }
287f841f6adSraf __aio_initbusy = 1;
288f841f6adSraf lmutex_unlock(&__aio_initlock);
289f841f6adSraf
290f841f6adSraf if (_kaio_supported_init() != 0)
291f841f6adSraf error = ENOMEM;
292f841f6adSraf else if ((_kaiowp = _aio_worker_alloc()) == NULL)
293f841f6adSraf error = ENOMEM;
294f841f6adSraf else if ((error = (int)_kaio(AIOINIT)) == 0) {
295f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
296f841f6adSraf error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
297f841f6adSraf _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
298f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
299f841f6adSraf }
300f841f6adSraf if (error && _kaiowp != NULL) {
301f841f6adSraf _aio_worker_free(_kaiowp);
302f841f6adSraf _kaiowp = NULL;
303f841f6adSraf }
304f841f6adSraf
305f841f6adSraf lmutex_lock(&__aio_initlock);
306f841f6adSraf if (error)
307f841f6adSraf _kaio_ok = -1;
308f841f6adSraf else
309f841f6adSraf _kaio_ok = 1;
310f841f6adSraf __aio_initbusy = 0;
311f841f6adSraf (void) cond_broadcast(&__aio_initcv);
312f841f6adSraf lmutex_unlock(&__aio_initlock);
313f841f6adSraf }
314f841f6adSraf
315f841f6adSraf int
aioread(int fd,caddr_t buf,int bufsz,off_t offset,int whence,aio_result_t * resultp)316f841f6adSraf aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
317f841f6adSraf aio_result_t *resultp)
318f841f6adSraf {
319f841f6adSraf return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
320f841f6adSraf }
321f841f6adSraf
322f841f6adSraf int
aiowrite(int fd,caddr_t buf,int bufsz,off_t offset,int whence,aio_result_t * resultp)323f841f6adSraf aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
324f841f6adSraf aio_result_t *resultp)
325f841f6adSraf {
326f841f6adSraf return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
327f841f6adSraf }
328f841f6adSraf
329f841f6adSraf #if !defined(_LP64)
330f841f6adSraf int
aioread64(int fd,caddr_t buf,int bufsz,off64_t offset,int whence,aio_result_t * resultp)331f841f6adSraf aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
332f841f6adSraf aio_result_t *resultp)
333f841f6adSraf {
334f841f6adSraf return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));
335f841f6adSraf }
336f841f6adSraf
337f841f6adSraf int
aiowrite64(int fd,caddr_t buf,int bufsz,off64_t offset,int whence,aio_result_t * resultp)338f841f6adSraf aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
339f841f6adSraf aio_result_t *resultp)
340f841f6adSraf {
341f841f6adSraf return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));
342f841f6adSraf }
343f841f6adSraf #endif /* !defined(_LP64) */
344f841f6adSraf
345f841f6adSraf int
_aiorw(int fd,caddr_t buf,int bufsz,offset_t offset,int whence,aio_result_t * resultp,int mode)346f841f6adSraf _aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
347f841f6adSraf aio_result_t *resultp, int mode)
348f841f6adSraf {
349f841f6adSraf aio_req_t *reqp;
350f841f6adSraf aio_args_t *ap;
351f841f6adSraf offset_t loffset;
352967072a1Spraks struct stat64 stat64;
353f841f6adSraf int error = 0;
354f841f6adSraf int kerr;
355f841f6adSraf int umode;
356f841f6adSraf
357f841f6adSraf switch (whence) {
358f841f6adSraf
359f841f6adSraf case SEEK_SET:
360f841f6adSraf loffset = offset;
361f841f6adSraf break;
362f841f6adSraf case SEEK_CUR:
363f841f6adSraf if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
364f841f6adSraf error = -1;
365f841f6adSraf else
366f841f6adSraf loffset += offset;
367f841f6adSraf break;
368f841f6adSraf case SEEK_END:
369967072a1Spraks if (fstat64(fd, &stat64) == -1)
370f841f6adSraf error = -1;
371f841f6adSraf else
372967072a1Spraks loffset = offset + stat64.st_size;
373f841f6adSraf break;
374f841f6adSraf default:
375f841f6adSraf errno = EINVAL;
376f841f6adSraf error = -1;
377f841f6adSraf }
378f841f6adSraf
379f841f6adSraf if (error)
380f841f6adSraf return (error);
381f841f6adSraf
382f841f6adSraf /* initialize kaio */
383f841f6adSraf if (!_kaio_ok)
384f841f6adSraf _kaio_init();
385f841f6adSraf
386f841f6adSraf /*
387f841f6adSraf * _aio_do_request() needs the original request code (mode) to be able
388f841f6adSraf * to choose the appropiate 32/64 bit function. All other functions
389f841f6adSraf * only require the difference between READ and WRITE (umode).
390f841f6adSraf */
391f841f6adSraf if (mode == AIOAREAD64 || mode == AIOAWRITE64)
392f841f6adSraf umode = mode - AIOAREAD64;
393f841f6adSraf else
394f841f6adSraf umode = mode;
395f841f6adSraf
396f841f6adSraf /*
397f841f6adSraf * Try kernel aio first.
398f841f6adSraf * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
399f841f6adSraf */
400f841f6adSraf if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
401f841f6adSraf resultp->aio_errno = 0;
402f841f6adSraf sig_mutex_lock(&__aio_mutex);
403f841f6adSraf _kaio_outstand_cnt++;
404967072a1Spraks sig_mutex_unlock(&__aio_mutex);
405f841f6adSraf kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
406f841f6adSraf (umode | AIO_POLL_BIT) : umode),
407f841f6adSraf fd, buf, bufsz, loffset, resultp);
408f841f6adSraf if (kerr == 0) {
409f841f6adSraf return (0);
410f841f6adSraf }
411967072a1Spraks sig_mutex_lock(&__aio_mutex);
412f841f6adSraf _kaio_outstand_cnt--;
413f841f6adSraf sig_mutex_unlock(&__aio_mutex);
414f841f6adSraf if (errno != ENOTSUP && errno != EBADFD)
415f841f6adSraf return (-1);
416f841f6adSraf if (errno == EBADFD)
417f841f6adSraf SET_KAIO_NOT_SUPPORTED(fd);
418f841f6adSraf }
419f841f6adSraf
420f841f6adSraf if (!__uaio_ok && __uaio_init() == -1)
421f841f6adSraf return (-1);
422f841f6adSraf
423f841f6adSraf if ((reqp = _aio_req_alloc()) == NULL) {
424f841f6adSraf errno = EAGAIN;
425f841f6adSraf return (-1);
426f841f6adSraf }
427f841f6adSraf
428f841f6adSraf /*
429f841f6adSraf * _aio_do_request() checks reqp->req_op to differentiate
430f841f6adSraf * between 32 and 64 bit access.
431f841f6adSraf */
432f841f6adSraf reqp->req_op = mode;
433f841f6adSraf reqp->req_resultp = resultp;
434f841f6adSraf ap = &reqp->req_args;
435f841f6adSraf ap->fd = fd;
436f841f6adSraf ap->buf = buf;
437f841f6adSraf ap->bufsz = bufsz;
438f841f6adSraf ap->offset = loffset;
439f841f6adSraf
440f841f6adSraf if (_aio_hash_insert(resultp, reqp) != 0) {
441f841f6adSraf _aio_req_free(reqp);
442f841f6adSraf errno = EINVAL;
443f841f6adSraf return (-1);
444f841f6adSraf }
445f841f6adSraf /*
446f841f6adSraf * _aio_req_add() only needs the difference between READ and
447f841f6adSraf * WRITE to choose the right worker queue.
448f841f6adSraf */
449f841f6adSraf _aio_req_add(reqp, &__nextworker_rw, umode);
450f841f6adSraf return (0);
451f841f6adSraf }
452f841f6adSraf
453f841f6adSraf int
aiocancel(aio_result_t * resultp)454f841f6adSraf aiocancel(aio_result_t *resultp)
455f841f6adSraf {
456f841f6adSraf aio_req_t *reqp;
457f841f6adSraf aio_worker_t *aiowp;
458f841f6adSraf int ret;
459f841f6adSraf int done = 0;
460f841f6adSraf int canceled = 0;
461f841f6adSraf
462f841f6adSraf if (!__uaio_ok) {
463f841f6adSraf errno = EINVAL;
464f841f6adSraf return (-1);
465f841f6adSraf }
466f841f6adSraf
467f841f6adSraf sig_mutex_lock(&__aio_mutex);
468f841f6adSraf reqp = _aio_hash_find(resultp);
469f841f6adSraf if (reqp == NULL) {
470f841f6adSraf if (_aio_outstand_cnt == _aio_req_done_cnt)
471f841f6adSraf errno = EINVAL;
472f841f6adSraf else
473f841f6adSraf errno = EACCES;
474f841f6adSraf ret = -1;
475f841f6adSraf } else {
476f841f6adSraf aiowp = reqp->req_worker;
477f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
478f841f6adSraf (void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
479f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
480f841f6adSraf
481f841f6adSraf if (canceled) {
482f841f6adSraf ret = 0;
483f841f6adSraf } else {
484f841f6adSraf if (_aio_outstand_cnt == 0 ||
485f841f6adSraf _aio_outstand_cnt == _aio_req_done_cnt)
486f841f6adSraf errno = EINVAL;
487f841f6adSraf else
488f841f6adSraf errno = EACCES;
489f841f6adSraf ret = -1;
490f841f6adSraf }
491f841f6adSraf }
492f841f6adSraf sig_mutex_unlock(&__aio_mutex);
493f841f6adSraf return (ret);
494f841f6adSraf }
495f841f6adSraf
496a574db85Sraf static void
_aiowait_cleanup(void * arg __unused)497*4a38094cSToomas Soome _aiowait_cleanup(void *arg __unused)
498a574db85Sraf {
499a574db85Sraf sig_mutex_lock(&__aio_mutex);
500a574db85Sraf _aiowait_flag--;
501a574db85Sraf sig_mutex_unlock(&__aio_mutex);
502a574db85Sraf }
503a574db85Sraf
504f841f6adSraf /*
505a574db85Sraf * This must be asynch safe and cancel safe
506f841f6adSraf */
507f841f6adSraf aio_result_t *
aiowait(struct timeval * uwait)508f841f6adSraf aiowait(struct timeval *uwait)
509f841f6adSraf {
510f841f6adSraf aio_result_t *uresultp;
511f841f6adSraf aio_result_t *kresultp;
512f841f6adSraf aio_result_t *resultp;
513f841f6adSraf int dontblock;
514f841f6adSraf int timedwait = 0;
515f841f6adSraf int kaio_errno = 0;
516f841f6adSraf struct timeval twait;
517f841f6adSraf struct timeval *wait = NULL;
518f841f6adSraf hrtime_t hrtend;
519f841f6adSraf hrtime_t hres;
520f841f6adSraf
521f841f6adSraf if (uwait) {
522f841f6adSraf /*
523f841f6adSraf * Check for a valid specified wait time.
524f841f6adSraf * If it is invalid, fail the call right away.
525f841f6adSraf */
526f841f6adSraf if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
527f841f6adSraf uwait->tv_usec >= MICROSEC) {
528f841f6adSraf errno = EINVAL;
529f841f6adSraf return ((aio_result_t *)-1);
530f841f6adSraf }
531f841f6adSraf
532f841f6adSraf if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
533f841f6adSraf hrtend = gethrtime() +
534f841f6adSraf (hrtime_t)uwait->tv_sec * NANOSEC +
535f841f6adSraf (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
536f841f6adSraf twait = *uwait;
537f841f6adSraf wait = &twait;
538f841f6adSraf timedwait++;
539f841f6adSraf } else {
540f841f6adSraf /* polling */
541f841f6adSraf sig_mutex_lock(&__aio_mutex);
542f841f6adSraf if (_kaio_outstand_cnt == 0) {
543f841f6adSraf kresultp = (aio_result_t *)-1;
544f841f6adSraf } else {
545f841f6adSraf kresultp = (aio_result_t *)_kaio(AIOWAIT,
546f841f6adSraf (struct timeval *)-1, 1);
547f841f6adSraf if (kresultp != (aio_result_t *)-1 &&
548f841f6adSraf kresultp != NULL &&
549f841f6adSraf kresultp != (aio_result_t *)1) {
550f841f6adSraf _kaio_outstand_cnt--;
551f841f6adSraf sig_mutex_unlock(&__aio_mutex);
552f841f6adSraf return (kresultp);
553f841f6adSraf }
554f841f6adSraf }
555f841f6adSraf uresultp = _aio_req_done();
556f841f6adSraf sig_mutex_unlock(&__aio_mutex);
557f841f6adSraf if (uresultp != NULL &&
558f841f6adSraf uresultp != (aio_result_t *)-1) {
559f841f6adSraf return (uresultp);
560f841f6adSraf }
561f841f6adSraf if (uresultp == (aio_result_t *)-1 &&
562f841f6adSraf kresultp == (aio_result_t *)-1) {
563f841f6adSraf errno = EINVAL;
564f841f6adSraf return ((aio_result_t *)-1);
565f841f6adSraf } else {
566f841f6adSraf return (NULL);
567f841f6adSraf }
568f841f6adSraf }
569f841f6adSraf }
570f841f6adSraf
571f841f6adSraf for (;;) {
572f841f6adSraf sig_mutex_lock(&__aio_mutex);
573f841f6adSraf uresultp = _aio_req_done();
574f841f6adSraf if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
575f841f6adSraf sig_mutex_unlock(&__aio_mutex);
576f841f6adSraf resultp = uresultp;
577f841f6adSraf break;
578f841f6adSraf }
579f841f6adSraf _aiowait_flag++;
580f841f6adSraf dontblock = (uresultp == (aio_result_t *)-1);
581f841f6adSraf if (dontblock && _kaio_outstand_cnt == 0) {
582f841f6adSraf kresultp = (aio_result_t *)-1;
583f841f6adSraf kaio_errno = EINVAL;
584f841f6adSraf } else {
585f841f6adSraf sig_mutex_unlock(&__aio_mutex);
586a574db85Sraf pthread_cleanup_push(_aiowait_cleanup, NULL);
587a574db85Sraf _cancel_prologue();
588f841f6adSraf kresultp = (aio_result_t *)_kaio(AIOWAIT,
589f841f6adSraf wait, dontblock);
590a574db85Sraf _cancel_epilogue();
591a574db85Sraf pthread_cleanup_pop(0);
592f841f6adSraf sig_mutex_lock(&__aio_mutex);
593f841f6adSraf kaio_errno = errno;
594f841f6adSraf }
595f841f6adSraf _aiowait_flag--;
596f841f6adSraf sig_mutex_unlock(&__aio_mutex);
597f841f6adSraf if (kresultp == (aio_result_t *)1) {
598f841f6adSraf /* aiowait() awakened by an aionotify() */
599f841f6adSraf continue;
600f841f6adSraf } else if (kresultp != NULL &&
601f841f6adSraf kresultp != (aio_result_t *)-1) {
602f841f6adSraf resultp = kresultp;
603f841f6adSraf sig_mutex_lock(&__aio_mutex);
604f841f6adSraf _kaio_outstand_cnt--;
605f841f6adSraf sig_mutex_unlock(&__aio_mutex);
606f841f6adSraf break;
607f841f6adSraf } else if (kresultp == (aio_result_t *)-1 &&
608f841f6adSraf kaio_errno == EINVAL &&
609f841f6adSraf uresultp == (aio_result_t *)-1) {
610f841f6adSraf errno = kaio_errno;
611f841f6adSraf resultp = (aio_result_t *)-1;
612f841f6adSraf break;
613f841f6adSraf } else if (kresultp == (aio_result_t *)-1 &&
614f841f6adSraf kaio_errno == EINTR) {
615f841f6adSraf errno = kaio_errno;
616f841f6adSraf resultp = (aio_result_t *)-1;
617f841f6adSraf break;
618f841f6adSraf } else if (timedwait) {
619f841f6adSraf hres = hrtend - gethrtime();
620f841f6adSraf if (hres <= 0) {
621f841f6adSraf /* time is up; return */
622f841f6adSraf resultp = NULL;
623f841f6adSraf break;
624f841f6adSraf } else {
625f841f6adSraf /*
626f841f6adSraf * Some time left. Round up the remaining time
627f841f6adSraf * in nanoseconds to microsec. Retry the call.
628f841f6adSraf */
629f841f6adSraf hres += (NANOSEC / MICROSEC) - 1;
630f841f6adSraf wait->tv_sec = hres / NANOSEC;
631f841f6adSraf wait->tv_usec =
632f841f6adSraf (hres % NANOSEC) / (NANOSEC / MICROSEC);
633f841f6adSraf }
634f841f6adSraf } else {
635f841f6adSraf ASSERT(kresultp == NULL && uresultp == NULL);
636f841f6adSraf resultp = NULL;
637f841f6adSraf continue;
638f841f6adSraf }
639f841f6adSraf }
640f841f6adSraf return (resultp);
641f841f6adSraf }
642f841f6adSraf
643f841f6adSraf /*
644f841f6adSraf * _aio_get_timedelta calculates the remaining time and stores the result
645f841f6adSraf * into timespec_t *wait.
646f841f6adSraf */
647f841f6adSraf
648f841f6adSraf int
_aio_get_timedelta(timespec_t * end,timespec_t * wait)649f841f6adSraf _aio_get_timedelta(timespec_t *end, timespec_t *wait)
650f841f6adSraf {
651f841f6adSraf int ret = 0;
652f841f6adSraf struct timeval cur;
653f841f6adSraf timespec_t curtime;
654f841f6adSraf
655f841f6adSraf (void) gettimeofday(&cur, NULL);
656f841f6adSraf curtime.tv_sec = cur.tv_sec;
657f841f6adSraf curtime.tv_nsec = cur.tv_usec * 1000; /* convert us to ns */
658f841f6adSraf
659f841f6adSraf if (end->tv_sec >= curtime.tv_sec) {
660f841f6adSraf wait->tv_sec = end->tv_sec - curtime.tv_sec;
661f841f6adSraf if (end->tv_nsec >= curtime.tv_nsec) {
662f841f6adSraf wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
663f841f6adSraf if (wait->tv_sec == 0 && wait->tv_nsec == 0)
664f841f6adSraf ret = -1; /* timer expired */
665f841f6adSraf } else {
666f841f6adSraf if (end->tv_sec > curtime.tv_sec) {
667f841f6adSraf wait->tv_sec -= 1;
668f841f6adSraf wait->tv_nsec = NANOSEC -
669f841f6adSraf (curtime.tv_nsec - end->tv_nsec);
670f841f6adSraf } else {
671f841f6adSraf ret = -1; /* timer expired */
672f841f6adSraf }
673f841f6adSraf }
674f841f6adSraf } else {
675f841f6adSraf ret = -1;
676f841f6adSraf }
677f841f6adSraf return (ret);
678f841f6adSraf }
679f841f6adSraf
680f841f6adSraf /*
681f841f6adSraf * If closing by file descriptor: we will simply cancel all the outstanding
682f841f6adSraf * aio`s and return. Those aio's in question will have either noticed the
683f841f6adSraf * cancellation notice before, during, or after initiating io.
684f841f6adSraf */
685f841f6adSraf int
aiocancel_all(int fd)686f841f6adSraf aiocancel_all(int fd)
687f841f6adSraf {
688f841f6adSraf aio_req_t *reqp;
689bced1f33Spraks aio_req_t **reqpp, *last;
690f841f6adSraf aio_worker_t *first;
691f841f6adSraf aio_worker_t *next;
692f841f6adSraf int canceled = 0;
693f841f6adSraf int done = 0;
694f841f6adSraf int cancelall = 0;
695f841f6adSraf
696f841f6adSraf sig_mutex_lock(&__aio_mutex);
697f841f6adSraf
698f841f6adSraf if (_aio_outstand_cnt == 0) {
699f841f6adSraf sig_mutex_unlock(&__aio_mutex);
700f841f6adSraf return (AIO_ALLDONE);
701f841f6adSraf }
702f841f6adSraf
703f841f6adSraf /*
704f841f6adSraf * Cancel requests from the read/write workers' queues.
705f841f6adSraf */
706f841f6adSraf first = __nextworker_rw;
707f841f6adSraf next = first;
708f841f6adSraf do {
709f841f6adSraf _aio_cancel_work(next, fd, &canceled, &done);
710f841f6adSraf } while ((next = next->work_forw) != first);
711f841f6adSraf
712f841f6adSraf /*
713f841f6adSraf * finally, check if there are requests on the done queue that
714f841f6adSraf * should be canceled.
715f841f6adSraf */
716f841f6adSraf if (fd < 0)
717f841f6adSraf cancelall = 1;
718f841f6adSraf reqpp = &_aio_done_tail;
719bced1f33Spraks last = _aio_done_tail;
720f841f6adSraf while ((reqp = *reqpp) != NULL) {
721f841f6adSraf if (cancelall || reqp->req_args.fd == fd) {
722f841f6adSraf *reqpp = reqp->req_next;
723bced1f33Spraks if (last == reqp) {
724bced1f33Spraks last = reqp->req_next;
725bced1f33Spraks }
726bced1f33Spraks if (_aio_done_head == reqp) {
727bced1f33Spraks /* this should be the last req in list */
728bced1f33Spraks _aio_done_head = last;
729bced1f33Spraks }
730f841f6adSraf _aio_donecnt--;
731bced1f33Spraks _aio_set_result(reqp, -1, ECANCELED);
732f841f6adSraf (void) _aio_hash_del(reqp->req_resultp);
733f841f6adSraf _aio_req_free(reqp);
734bced1f33Spraks } else {
735f841f6adSraf reqpp = &reqp->req_next;
736bced1f33Spraks last = reqp;
737f841f6adSraf }
738bced1f33Spraks }
739bced1f33Spraks
740f841f6adSraf if (cancelall) {
741f841f6adSraf ASSERT(_aio_donecnt == 0);
742f841f6adSraf _aio_done_head = NULL;
743f841f6adSraf }
744f841f6adSraf sig_mutex_unlock(&__aio_mutex);
745f841f6adSraf
746f841f6adSraf if (canceled && done == 0)
747f841f6adSraf return (AIO_CANCELED);
748f841f6adSraf else if (done && canceled == 0)
749f841f6adSraf return (AIO_ALLDONE);
750f841f6adSraf else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
751f841f6adSraf return ((int)_kaio(AIOCANCEL, fd, NULL));
752f841f6adSraf return (AIO_NOTCANCELED);
753f841f6adSraf }
754f841f6adSraf
755f841f6adSraf /*
756f841f6adSraf * Cancel requests from a given work queue. If the file descriptor
757f841f6adSraf * parameter, fd, is non-negative, then only cancel those requests
758f841f6adSraf * in this queue that are to this file descriptor. If the fd
759f841f6adSraf * parameter is -1, then cancel all requests.
760f841f6adSraf */
761f841f6adSraf static void
_aio_cancel_work(aio_worker_t * aiowp,int fd,int * canceled,int * done)762f841f6adSraf _aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
763f841f6adSraf {
764f841f6adSraf aio_req_t *reqp;
765f841f6adSraf
766f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
767f841f6adSraf /*
768f841f6adSraf * cancel queued requests first.
769f841f6adSraf */
770f841f6adSraf reqp = aiowp->work_tail1;
771f841f6adSraf while (reqp != NULL) {
772f841f6adSraf if (fd < 0 || reqp->req_args.fd == fd) {
773f841f6adSraf if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
774f841f6adSraf /*
775f841f6adSraf * Callers locks were dropped.
776f841f6adSraf * reqp is invalid; start traversing
777f841f6adSraf * the list from the beginning again.
778f841f6adSraf */
779f841f6adSraf reqp = aiowp->work_tail1;
780f841f6adSraf continue;
781f841f6adSraf }
782f841f6adSraf }
783f841f6adSraf reqp = reqp->req_next;
784f841f6adSraf }
785f841f6adSraf /*
786f841f6adSraf * Since the queued requests have been canceled, there can
787f841f6adSraf * only be one inprogress request that should be canceled.
788f841f6adSraf */
789f841f6adSraf if ((reqp = aiowp->work_req) != NULL &&
790f841f6adSraf (fd < 0 || reqp->req_args.fd == fd))
791f841f6adSraf (void) _aio_cancel_req(aiowp, reqp, canceled, done);
792f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
793f841f6adSraf }
794f841f6adSraf
795f841f6adSraf /*
796f841f6adSraf * Cancel a request. Return 1 if the callers locks were temporarily
797f841f6adSraf * dropped, otherwise return 0.
798f841f6adSraf */
799f841f6adSraf int
_aio_cancel_req(aio_worker_t * aiowp,aio_req_t * reqp,int * canceled,int * done)800f841f6adSraf _aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
801f841f6adSraf {
802f841f6adSraf int ostate = reqp->req_state;
803f841f6adSraf
804f841f6adSraf ASSERT(MUTEX_HELD(&__aio_mutex));
805f841f6adSraf ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
806f841f6adSraf if (ostate == AIO_REQ_CANCELED)
807f841f6adSraf return (0);
808bced1f33Spraks if (ostate == AIO_REQ_DONE && !POSIX_AIO(reqp) &&
809bced1f33Spraks aiowp->work_prev1 == reqp) {
810bced1f33Spraks ASSERT(aiowp->work_done1 != 0);
811bced1f33Spraks /*
812bced1f33Spraks * If not on the done queue yet, just mark it CANCELED,
813bced1f33Spraks * _aio_work_done() will do the necessary clean up.
814bced1f33Spraks * This is required to ensure that aiocancel_all() cancels
815bced1f33Spraks * all the outstanding requests, including this one which
816bced1f33Spraks * is not yet on done queue but has been marked done.
817bced1f33Spraks */
818bced1f33Spraks _aio_set_result(reqp, -1, ECANCELED);
819bced1f33Spraks (void) _aio_hash_del(reqp->req_resultp);
820bced1f33Spraks reqp->req_state = AIO_REQ_CANCELED;
821bced1f33Spraks (*canceled)++;
822bced1f33Spraks return (0);
823bced1f33Spraks }
824bced1f33Spraks
825f841f6adSraf if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
826f841f6adSraf (*done)++;
827f841f6adSraf return (0);
828f841f6adSraf }
829f841f6adSraf if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
830f841f6adSraf ASSERT(POSIX_AIO(reqp));
831f841f6adSraf /* Cancel the queued aio_fsync() request */
832f841f6adSraf if (!reqp->req_head->lio_canned) {
833f841f6adSraf reqp->req_head->lio_canned = 1;
834f841f6adSraf _aio_outstand_cnt--;
835f841f6adSraf (*canceled)++;
836f841f6adSraf }
837f841f6adSraf return (0);
838f841f6adSraf }
839f841f6adSraf reqp->req_state = AIO_REQ_CANCELED;
840f841f6adSraf _aio_req_del(aiowp, reqp, ostate);
841f841f6adSraf (void) _aio_hash_del(reqp->req_resultp);
842f841f6adSraf (*canceled)++;
843f841f6adSraf if (reqp == aiowp->work_req) {
844f841f6adSraf ASSERT(ostate == AIO_REQ_INPROGRESS);
845f841f6adSraf /*
846f841f6adSraf * Set the result values now, before _aiodone() is called.
847f841f6adSraf * We do this because the application can expect aio_return
848f841f6adSraf * and aio_errno to be set to -1 and ECANCELED, respectively,
849f841f6adSraf * immediately after a successful return from aiocancel()
850f841f6adSraf * or aio_cancel().
851f841f6adSraf */
852f841f6adSraf _aio_set_result(reqp, -1, ECANCELED);
853f841f6adSraf (void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
854f841f6adSraf return (0);
855f841f6adSraf }
856f841f6adSraf if (!POSIX_AIO(reqp)) {
857f841f6adSraf _aio_outstand_cnt--;
858f841f6adSraf _aio_set_result(reqp, -1, ECANCELED);
859bced1f33Spraks _aio_req_free(reqp);
860f841f6adSraf return (0);
861f841f6adSraf }
862f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
863f841f6adSraf sig_mutex_unlock(&__aio_mutex);
864f841f6adSraf _aiodone(reqp, -1, ECANCELED);
865f841f6adSraf sig_mutex_lock(&__aio_mutex);
866f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
867f841f6adSraf return (1);
868f841f6adSraf }
869f841f6adSraf
870f841f6adSraf int
_aio_create_worker(aio_req_t * reqp,int mode)871f841f6adSraf _aio_create_worker(aio_req_t *reqp, int mode)
872f841f6adSraf {
873f841f6adSraf aio_worker_t *aiowp, **workers, **nextworker;
874f841f6adSraf int *aio_workerscnt;
875f841f6adSraf void *(*func)(void *);
876f841f6adSraf sigset_t oset;
877f841f6adSraf int error;
878f841f6adSraf
879f841f6adSraf /*
880f841f6adSraf * Put the new worker thread in the right queue.
881f841f6adSraf */
882f841f6adSraf switch (mode) {
883f841f6adSraf case AIOREAD:
884f841f6adSraf case AIOWRITE:
885f841f6adSraf case AIOAREAD:
886f841f6adSraf case AIOAWRITE:
887f841f6adSraf #if !defined(_LP64)
888f841f6adSraf case AIOAREAD64:
889f841f6adSraf case AIOAWRITE64:
890f841f6adSraf #endif
891f841f6adSraf workers = &__workers_rw;
892f841f6adSraf nextworker = &__nextworker_rw;
893f841f6adSraf aio_workerscnt = &__rw_workerscnt;
894f841f6adSraf func = _aio_do_request;
895f841f6adSraf break;
896f841f6adSraf case AIONOTIFY:
897f841f6adSraf workers = &__workers_no;
898f841f6adSraf nextworker = &__nextworker_no;
899f841f6adSraf func = _aio_do_notify;
900f841f6adSraf aio_workerscnt = &__no_workerscnt;
901f841f6adSraf break;
902f841f6adSraf default:
903f841f6adSraf aio_panic("_aio_create_worker: invalid mode");
904f841f6adSraf break;
905f841f6adSraf }
906f841f6adSraf
907f841f6adSraf if ((aiowp = _aio_worker_alloc()) == NULL)
908f841f6adSraf return (-1);
909f841f6adSraf
910f841f6adSraf if (reqp) {
911f841f6adSraf reqp->req_state = AIO_REQ_QUEUED;
912f841f6adSraf reqp->req_worker = aiowp;
913f841f6adSraf aiowp->work_head1 = reqp;
914f841f6adSraf aiowp->work_tail1 = reqp;
915f841f6adSraf aiowp->work_next1 = reqp;
916f841f6adSraf aiowp->work_count1 = 1;
917f841f6adSraf aiowp->work_minload1 = 1;
918f841f6adSraf }
919f841f6adSraf
920f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
921f841f6adSraf error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
922f841f6adSraf THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
923f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
924f841f6adSraf if (error) {
925f841f6adSraf if (reqp) {
926f841f6adSraf reqp->req_state = 0;
927f841f6adSraf reqp->req_worker = NULL;
928f841f6adSraf }
929f841f6adSraf _aio_worker_free(aiowp);
930f841f6adSraf return (-1);
931f841f6adSraf }
932f841f6adSraf
933f841f6adSraf lmutex_lock(&__aio_mutex);
934f841f6adSraf (*aio_workerscnt)++;
935f841f6adSraf if (*workers == NULL) {
936f841f6adSraf aiowp->work_forw = aiowp;
937f841f6adSraf aiowp->work_backw = aiowp;
938f841f6adSraf *nextworker = aiowp;
939f841f6adSraf *workers = aiowp;
940f841f6adSraf } else {
941f841f6adSraf aiowp->work_backw = (*workers)->work_backw;
942f841f6adSraf aiowp->work_forw = (*workers);
943f841f6adSraf (*workers)->work_backw->work_forw = aiowp;
944f841f6adSraf (*workers)->work_backw = aiowp;
945f841f6adSraf }
946f841f6adSraf _aio_worker_cnt++;
947f841f6adSraf lmutex_unlock(&__aio_mutex);
948f841f6adSraf
949f841f6adSraf (void) thr_continue(aiowp->work_tid);
950f841f6adSraf
951f841f6adSraf return (0);
952f841f6adSraf }
953f841f6adSraf
954f841f6adSraf /*
955f841f6adSraf * This is the worker's main routine.
956f841f6adSraf * The task of this function is to execute all queued requests;
957f841f6adSraf * once the last pending request is executed this function will block
958f841f6adSraf * in _aio_idle(). A new incoming request must wakeup this thread to
959f841f6adSraf * restart the work.
 * Every worker has its own work queue. The queue lock is required
961f841f6adSraf * to synchronize the addition of new requests for this worker or
962f841f6adSraf * cancellation of pending/running requests.
963f841f6adSraf *
964f841f6adSraf * Cancellation scenarios:
965f841f6adSraf * The cancellation of a request is being done asynchronously using
966f841f6adSraf * _aio_cancel_req() from another thread context.
967f841f6adSraf * A queued request can be cancelled in different manners :
968f841f6adSraf * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
969f841f6adSraf * - lock the queue -> remove the request -> unlock the queue
970f841f6adSraf * - this function/thread does not detect this cancellation process
971f841f6adSraf * b) request is in progress (AIO_REQ_INPROGRESS) :
 * - this function first allows the cancellation of the running
973f841f6adSraf * request with the flag "work_cancel_flg=1"
974f841f6adSraf * see _aio_req_get() -> _aio_cancel_on()
975f841f6adSraf * During this phase, it is allowed to interrupt the worker
976f841f6adSraf * thread running the request (this thread) using the SIGAIOCANCEL
977f841f6adSraf * signal.
978f841f6adSraf * Once this thread returns from the kernel (because the request
979f841f6adSraf * is just done), then it must disable a possible cancellation
980f841f6adSraf * and proceed to finish the request. To disable the cancellation
981f841f6adSraf * this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
982f841f6adSraf * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
983f841f6adSraf * same procedure as in a)
984f841f6adSraf *
985f841f6adSraf * To b)
986f841f6adSraf * This thread uses sigsetjmp() to define the position in the code, where
 * it wishes to continue working in the case that a SIGAIOCANCEL signal
988f841f6adSraf * is detected.
989f841f6adSraf * Normally this thread should get the cancellation signal during the
990f841f6adSraf * kernel phase (reading or writing). In that case the signal handler
991f841f6adSraf * aiosigcancelhndlr() is activated using the worker thread context,
992f841f6adSraf * which again will use the siglongjmp() function to break the standard
993f841f6adSraf * code flow and jump to the "sigsetjmp" position, provided that
994f841f6adSraf * "work_cancel_flg" is set to "1".
995f841f6adSraf * Because the "work_cancel_flg" is only manipulated by this worker
996f841f6adSraf * thread and it can only run on one CPU at a given time, it is not
997f841f6adSraf * necessary to protect that flag with the queue lock.
998f841f6adSraf * Returning from the kernel (read or write system call) we must
999f841f6adSraf * first disable the use of the SIGAIOCANCEL signal and accordingly
1000f841f6adSraf * the use of the siglongjmp() function to prevent a possible deadlock:
 * - It can happen that this worker thread returns from the kernel and
1002f841f6adSraf * blocks in "work_qlock1",
1003f841f6adSraf * - then a second thread cancels the apparently "in progress" request
1004f841f6adSraf * and sends the SIGAIOCANCEL signal to the worker thread,
 * - the worker thread gets assigned the "work_qlock1" and will return
1006f841f6adSraf * from the kernel,
1007f841f6adSraf * - the kernel detects the pending signal and activates the signal
1008f841f6adSraf * handler instead,
1009f841f6adSraf * - if the "work_cancel_flg" is still set then the signal handler
1010f841f6adSraf * should use siglongjmp() to cancel the "in progress" request and
1011f841f6adSraf * it would try to acquire the same work_qlock1 in _aio_req_get()
1012f841f6adSraf * for a second time => deadlock.
1013f841f6adSraf * To avoid that situation we disable the cancellation of the request
1014f841f6adSraf * in progress BEFORE we try to acquire the work_qlock1.
1015f841f6adSraf * In that case the signal handler will not call siglongjmp() and the
1016f841f6adSraf * worker thread will continue running the standard code flow.
1017f841f6adSraf * Then this thread must check the AIO_REQ_CANCELED flag to emulate
1018f841f6adSraf * an eventually required siglongjmp() freeing the work_qlock1 and
1019f841f6adSraf * avoiding a deadlock.
1020f841f6adSraf */
1021f841f6adSraf void *
_aio_do_request(void * arglist)1022f841f6adSraf _aio_do_request(void *arglist)
1023f841f6adSraf {
1024f841f6adSraf aio_worker_t *aiowp = (aio_worker_t *)arglist;
1025f841f6adSraf ulwp_t *self = curthread;
1026f841f6adSraf struct aio_args *arg;
1027f841f6adSraf aio_req_t *reqp; /* current AIO request */
1028f841f6adSraf ssize_t retval;
10294d86dd30Sraf int append;
1030f841f6adSraf int error;
1031f841f6adSraf
1032f841f6adSraf if (pthread_setspecific(_aio_key, aiowp) != 0)
1033f841f6adSraf aio_panic("_aio_do_request, pthread_setspecific()");
1034f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
1035f841f6adSraf ASSERT(aiowp->work_req == NULL);
1036f841f6adSraf
1037f841f6adSraf /*
1038f841f6adSraf * We resume here when an operation is cancelled.
1039f841f6adSraf * On first entry, aiowp->work_req == NULL, so all
1040f841f6adSraf * we do is block SIGAIOCANCEL.
1041f841f6adSraf */
1042f841f6adSraf (void) sigsetjmp(aiowp->work_jmp_buf, 0);
1043f841f6adSraf ASSERT(self->ul_sigdefer == 0);
1044f841f6adSraf
1045f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */
1046f841f6adSraf if (aiowp->work_req != NULL)
1047f841f6adSraf _aio_finish_request(aiowp, -1, ECANCELED);
1048f841f6adSraf
1049f841f6adSraf for (;;) {
1050f841f6adSraf /*
1051f841f6adSraf * Put completed requests on aio_done_list. This has
1052f841f6adSraf * to be done as part of the main loop to ensure that
1053f841f6adSraf * we don't artificially starve any aiowait'ers.
1054f841f6adSraf */
1055f841f6adSraf if (aiowp->work_done1)
1056f841f6adSraf _aio_work_done(aiowp);
1057f841f6adSraf
1058f841f6adSraf top:
1059f841f6adSraf /* consume any deferred SIGAIOCANCEL signal here */
1060f841f6adSraf sigon(self);
1061f841f6adSraf sigoff(self);
1062f841f6adSraf
1063f841f6adSraf while ((reqp = _aio_req_get(aiowp)) == NULL) {
1064f841f6adSraf if (_aio_idle(aiowp) != 0)
1065f841f6adSraf goto top;
1066f841f6adSraf }
1067f841f6adSraf arg = &reqp->req_args;
1068f841f6adSraf ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
1069f841f6adSraf reqp->req_state == AIO_REQ_CANCELED);
1070f841f6adSraf error = 0;
1071f841f6adSraf
1072f841f6adSraf switch (reqp->req_op) {
1073f841f6adSraf case AIOREAD:
1074f841f6adSraf case AIOAREAD:
1075f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */
1076f841f6adSraf retval = pread(arg->fd, arg->buf,
1077f841f6adSraf arg->bufsz, arg->offset);
1078f841f6adSraf if (retval == -1) {
1079f841f6adSraf if (errno == ESPIPE) {
1080f841f6adSraf retval = read(arg->fd,
1081f841f6adSraf arg->buf, arg->bufsz);
1082f841f6adSraf if (retval == -1)
1083f841f6adSraf error = errno;
1084f841f6adSraf } else {
1085f841f6adSraf error = errno;
1086f841f6adSraf }
1087f841f6adSraf }
1088f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */
1089f841f6adSraf break;
1090f841f6adSraf case AIOWRITE:
1091f841f6adSraf case AIOAWRITE:
10924d86dd30Sraf /*
10934d86dd30Sraf * The SUSv3 POSIX spec for aio_write() states:
10944d86dd30Sraf * If O_APPEND is set for the file descriptor,
10954d86dd30Sraf * write operations append to the file in the
10964d86dd30Sraf * same order as the calls were made.
10974d86dd30Sraf * but, somewhat inconsistently, it requires pwrite()
10984d86dd30Sraf * to ignore the O_APPEND setting. So we have to use
10994d86dd30Sraf * fcntl() to get the open modes and call write() for
11004d86dd30Sraf * the O_APPEND case.
11014d86dd30Sraf */
11024d86dd30Sraf append = (__fcntl(arg->fd, F_GETFL) & O_APPEND);
1103f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */
11044d86dd30Sraf retval = append?
11054d86dd30Sraf write(arg->fd, arg->buf, arg->bufsz) :
11064d86dd30Sraf pwrite(arg->fd, arg->buf, arg->bufsz,
11074d86dd30Sraf arg->offset);
1108f841f6adSraf if (retval == -1) {
1109f841f6adSraf if (errno == ESPIPE) {
1110f841f6adSraf retval = write(arg->fd,
1111f841f6adSraf arg->buf, arg->bufsz);
1112f841f6adSraf if (retval == -1)
1113f841f6adSraf error = errno;
1114f841f6adSraf } else {
1115f841f6adSraf error = errno;
1116f841f6adSraf }
1117f841f6adSraf }
1118f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */
1119f841f6adSraf break;
1120f841f6adSraf #if !defined(_LP64)
1121f841f6adSraf case AIOAREAD64:
1122f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */
1123f841f6adSraf retval = pread64(arg->fd, arg->buf,
1124f841f6adSraf arg->bufsz, arg->offset);
1125f841f6adSraf if (retval == -1) {
1126f841f6adSraf if (errno == ESPIPE) {
1127f841f6adSraf retval = read(arg->fd,
1128f841f6adSraf arg->buf, arg->bufsz);
1129f841f6adSraf if (retval == -1)
1130f841f6adSraf error = errno;
1131f841f6adSraf } else {
1132f841f6adSraf error = errno;
1133f841f6adSraf }
1134f841f6adSraf }
1135f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */
1136f841f6adSraf break;
1137f841f6adSraf case AIOAWRITE64:
11384d86dd30Sraf /*
11394d86dd30Sraf * The SUSv3 POSIX spec for aio_write() states:
11404d86dd30Sraf * If O_APPEND is set for the file descriptor,
11414d86dd30Sraf * write operations append to the file in the
11424d86dd30Sraf * same order as the calls were made.
11434d86dd30Sraf * but, somewhat inconsistently, it requires pwrite()
11444d86dd30Sraf * to ignore the O_APPEND setting. So we have to use
11454d86dd30Sraf * fcntl() to get the open modes and call write() for
11464d86dd30Sraf * the O_APPEND case.
11474d86dd30Sraf */
11484d86dd30Sraf append = (__fcntl(arg->fd, F_GETFL) & O_APPEND);
1149f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */
11504d86dd30Sraf retval = append?
11514d86dd30Sraf write(arg->fd, arg->buf, arg->bufsz) :
11524d86dd30Sraf pwrite64(arg->fd, arg->buf, arg->bufsz,
11534d86dd30Sraf arg->offset);
1154f841f6adSraf if (retval == -1) {
1155f841f6adSraf if (errno == ESPIPE) {
1156f841f6adSraf retval = write(arg->fd,
1157f841f6adSraf arg->buf, arg->bufsz);
1158f841f6adSraf if (retval == -1)
1159f841f6adSraf error = errno;
1160f841f6adSraf } else {
1161f841f6adSraf error = errno;
1162f841f6adSraf }
1163f841f6adSraf }
1164f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */
1165f841f6adSraf break;
1166f841f6adSraf #endif /* !defined(_LP64) */
1167f841f6adSraf case AIOFSYNC:
1168f841f6adSraf if (_aio_fsync_del(aiowp, reqp))
1169f841f6adSraf goto top;
1170f841f6adSraf ASSERT(reqp->req_head == NULL);
1171f841f6adSraf /*
1172f841f6adSraf * All writes for this fsync request are now
1173f841f6adSraf * acknowledged. Now make these writes visible
1174f841f6adSraf * and put the final request into the hash table.
1175f841f6adSraf */
1176f841f6adSraf if (reqp->req_state == AIO_REQ_CANCELED) {
1177f841f6adSraf /* EMPTY */;
1178f841f6adSraf } else if (arg->offset == O_SYNC) {
1179f841f6adSraf if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
1180f841f6adSraf error = errno;
1181f841f6adSraf } else {
1182f841f6adSraf if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
1183f841f6adSraf error = errno;
1184f841f6adSraf }
1185f841f6adSraf if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
1186f841f6adSraf aio_panic("_aio_do_request(): AIOFSYNC: "
1187f841f6adSraf "request already in hash table");
1188f841f6adSraf break;
1189f841f6adSraf default:
1190f841f6adSraf aio_panic("_aio_do_request, bad op");
1191f841f6adSraf }
1192f841f6adSraf
1193f841f6adSraf _aio_finish_request(aiowp, retval, error);
1194f841f6adSraf }
1195f841f6adSraf /* NOTREACHED */
1196f841f6adSraf return (NULL);
1197f841f6adSraf }
1198f841f6adSraf
1199f841f6adSraf /*
1200f841f6adSraf * Perform the tail processing for _aio_do_request().
1201f841f6adSraf * The in-progress request may or may not have been cancelled.
1202f841f6adSraf */
1203f841f6adSraf static void
_aio_finish_request(aio_worker_t * aiowp,ssize_t retval,int error)1204f841f6adSraf _aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
1205f841f6adSraf {
1206f841f6adSraf aio_req_t *reqp;
1207f841f6adSraf
1208f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
1209f841f6adSraf if ((reqp = aiowp->work_req) == NULL)
1210f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1211f841f6adSraf else {
1212f841f6adSraf aiowp->work_req = NULL;
1213f841f6adSraf if (reqp->req_state == AIO_REQ_CANCELED) {
1214f841f6adSraf retval = -1;
1215f841f6adSraf error = ECANCELED;
1216f841f6adSraf }
1217f841f6adSraf if (!POSIX_AIO(reqp)) {
121834b3058fSpraks int notify;
1219bced1f33Spraks if (reqp->req_state == AIO_REQ_INPROGRESS) {
1220bced1f33Spraks reqp->req_state = AIO_REQ_DONE;
1221bced1f33Spraks _aio_set_result(reqp, retval, error);
1222bced1f33Spraks }
1223f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1224f841f6adSraf sig_mutex_lock(&__aio_mutex);
122534b3058fSpraks /*
122634b3058fSpraks * If it was canceled, this request will not be
122734b3058fSpraks * added to done list. Just free it.
122834b3058fSpraks */
122934b3058fSpraks if (error == ECANCELED) {
1230f841f6adSraf _aio_outstand_cnt--;
123134b3058fSpraks _aio_req_free(reqp);
123234b3058fSpraks } else {
123334b3058fSpraks _aio_req_done_cnt++;
123434b3058fSpraks }
123534b3058fSpraks /*
123634b3058fSpraks * Notify any thread that may have blocked
123734b3058fSpraks * because it saw an outstanding request.
123834b3058fSpraks */
123934b3058fSpraks notify = 0;
124034b3058fSpraks if (_aio_outstand_cnt == 0 && _aiowait_flag) {
124134b3058fSpraks notify = 1;
124234b3058fSpraks }
1243f841f6adSraf sig_mutex_unlock(&__aio_mutex);
124434b3058fSpraks if (notify) {
124534b3058fSpraks (void) _kaio(AIONOTIFY);
124634b3058fSpraks }
1247f841f6adSraf } else {
1248f841f6adSraf if (reqp->req_state == AIO_REQ_INPROGRESS)
1249f841f6adSraf reqp->req_state = AIO_REQ_DONE;
1250f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1251f841f6adSraf _aiodone(reqp, retval, error);
1252f841f6adSraf }
1253f841f6adSraf }
1254f841f6adSraf }
1255f841f6adSraf
1256f841f6adSraf void
_aio_req_mark_done(aio_req_t * reqp)1257f841f6adSraf _aio_req_mark_done(aio_req_t *reqp)
1258f841f6adSraf {
1259f841f6adSraf #if !defined(_LP64)
1260f841f6adSraf if (reqp->req_largefile)
1261f841f6adSraf ((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
1262f841f6adSraf else
1263f841f6adSraf #endif
1264f841f6adSraf ((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
1265f841f6adSraf }
1266f841f6adSraf
1267f841f6adSraf /*
1268f841f6adSraf * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
1269f841f6adSraf * hopefully to consume one of our queued signals.
1270f841f6adSraf */
1271f841f6adSraf static void
_aio_delay(int ticks)1272f841f6adSraf _aio_delay(int ticks)
1273f841f6adSraf {
1274f841f6adSraf (void) usleep(ticks * (MICROSEC / hz));
1275f841f6adSraf }
1276f841f6adSraf
1277f841f6adSraf /*
1278f841f6adSraf * Actually send the notifications.
1279f841f6adSraf * We could block indefinitely here if the application
1280f841f6adSraf * is not listening for the signal or port notifications.
1281f841f6adSraf */
1282f841f6adSraf static void
send_notification(notif_param_t * npp)1283f841f6adSraf send_notification(notif_param_t *npp)
1284f841f6adSraf {
1285f841f6adSraf extern int __sigqueue(pid_t pid, int signo,
1286f841f6adSraf /* const union sigval */ void *value, int si_code, int block);
1287f841f6adSraf
1288f841f6adSraf if (npp->np_signo)
1289f841f6adSraf (void) __sigqueue(__pid, npp->np_signo, npp->np_user,
1290f841f6adSraf SI_ASYNCIO, 1);
1291f841f6adSraf else if (npp->np_port >= 0)
1292f841f6adSraf (void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
1293f841f6adSraf npp->np_event, npp->np_object, npp->np_user);
1294f841f6adSraf
1295f841f6adSraf if (npp->np_lio_signo)
1296f841f6adSraf (void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,
1297f841f6adSraf SI_ASYNCIO, 1);
1298f841f6adSraf else if (npp->np_lio_port >= 0)
1299f841f6adSraf (void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
1300f841f6adSraf npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
1301f841f6adSraf }
1302f841f6adSraf
1303f841f6adSraf /*
1304f841f6adSraf * Asynchronous notification worker.
1305f841f6adSraf */
1306f841f6adSraf void *
_aio_do_notify(void * arg)1307f841f6adSraf _aio_do_notify(void *arg)
1308f841f6adSraf {
1309f841f6adSraf aio_worker_t *aiowp = (aio_worker_t *)arg;
1310f841f6adSraf aio_req_t *reqp;
1311f841f6adSraf
1312f841f6adSraf /*
1313f841f6adSraf * This isn't really necessary. All signals are blocked.
1314f841f6adSraf */
1315f841f6adSraf if (pthread_setspecific(_aio_key, aiowp) != 0)
1316f841f6adSraf aio_panic("_aio_do_notify, pthread_setspecific()");
1317f841f6adSraf
1318f841f6adSraf /*
1319f841f6adSraf * Notifications are never cancelled.
1320f841f6adSraf * All signals remain blocked, forever.
1321f841f6adSraf */
1322f841f6adSraf for (;;) {
1323f841f6adSraf while ((reqp = _aio_req_get(aiowp)) == NULL) {
1324f841f6adSraf if (_aio_idle(aiowp) != 0)
1325f841f6adSraf aio_panic("_aio_do_notify: _aio_idle() failed");
1326f841f6adSraf }
1327f841f6adSraf send_notification(&reqp->req_notify);
1328f841f6adSraf _aio_req_free(reqp);
1329f841f6adSraf }
1330f841f6adSraf
1331f841f6adSraf /* NOTREACHED */
1332f841f6adSraf return (NULL);
1333f841f6adSraf }
1334f841f6adSraf
1335f841f6adSraf /*
1336f841f6adSraf * Do the completion semantics for a request that was either canceled
1337f841f6adSraf * by _aio_cancel_req() or was completed by _aio_do_request().
1338f841f6adSraf */
1339f841f6adSraf static void
_aiodone(aio_req_t * reqp,ssize_t retval,int error)1340f841f6adSraf _aiodone(aio_req_t *reqp, ssize_t retval, int error)
1341f841f6adSraf {
1342f841f6adSraf aio_result_t *resultp = reqp->req_resultp;
1343f841f6adSraf int notify = 0;
1344f841f6adSraf aio_lio_t *head;
1345f841f6adSraf int sigev_none;
1346f841f6adSraf int sigev_signal;
1347f841f6adSraf int sigev_thread;
1348f841f6adSraf int sigev_port;
1349f841f6adSraf notif_param_t np;
1350f841f6adSraf
1351f841f6adSraf /*
1352f841f6adSraf * We call _aiodone() only for Posix I/O.
1353f841f6adSraf */
1354f841f6adSraf ASSERT(POSIX_AIO(reqp));
1355f841f6adSraf
1356f841f6adSraf sigev_none = 0;
1357f841f6adSraf sigev_signal = 0;
1358f841f6adSraf sigev_thread = 0;
1359f841f6adSraf sigev_port = 0;
1360f841f6adSraf np.np_signo = 0;
1361f841f6adSraf np.np_port = -1;
1362f841f6adSraf np.np_lio_signo = 0;
1363f841f6adSraf np.np_lio_port = -1;
1364f841f6adSraf
1365f841f6adSraf switch (reqp->req_sigevent.sigev_notify) {
1366f841f6adSraf case SIGEV_NONE:
1367f841f6adSraf sigev_none = 1;
1368f841f6adSraf break;
1369f841f6adSraf case SIGEV_SIGNAL:
1370f841f6adSraf sigev_signal = 1;
1371f841f6adSraf break;
1372f841f6adSraf case SIGEV_THREAD:
1373f841f6adSraf sigev_thread = 1;
1374f841f6adSraf break;
1375f841f6adSraf case SIGEV_PORT:
1376f841f6adSraf sigev_port = 1;
1377f841f6adSraf break;
1378f841f6adSraf default:
1379f841f6adSraf aio_panic("_aiodone: improper sigev_notify");
1380f841f6adSraf break;
1381f841f6adSraf }
1382f841f6adSraf
1383f841f6adSraf /*
1384f841f6adSraf * Figure out the notification parameters while holding __aio_mutex.
1385f841f6adSraf * Actually perform the notifications after dropping __aio_mutex.
1386f841f6adSraf * This allows us to sleep for a long time (if the notifications
1387f841f6adSraf * incur delays) without impeding other async I/O operations.
1388f841f6adSraf */
1389f841f6adSraf
1390f841f6adSraf sig_mutex_lock(&__aio_mutex);
1391f841f6adSraf
1392f841f6adSraf if (sigev_signal) {
1393f841f6adSraf if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
1394f841f6adSraf notify = 1;
1395f841f6adSraf np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
1396f841f6adSraf } else if (sigev_thread | sigev_port) {
1397f841f6adSraf if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
1398f841f6adSraf notify = 1;
1399f841f6adSraf np.np_event = reqp->req_op;
1400f841f6adSraf if (np.np_event == AIOFSYNC && reqp->req_largefile)
1401f841f6adSraf np.np_event = AIOFSYNC64;
1402f841f6adSraf np.np_object = (uintptr_t)reqp->req_aiocbp;
1403f841f6adSraf np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
1404f841f6adSraf }
1405f841f6adSraf
1406f841f6adSraf if (resultp->aio_errno == EINPROGRESS)
1407f841f6adSraf _aio_set_result(reqp, retval, error);
1408f841f6adSraf
1409f841f6adSraf _aio_outstand_cnt--;
1410f841f6adSraf
1411f841f6adSraf head = reqp->req_head;
1412f841f6adSraf reqp->req_head = NULL;
1413f841f6adSraf
1414f841f6adSraf if (sigev_none) {
1415f841f6adSraf _aio_enq_doneq(reqp);
1416f841f6adSraf reqp = NULL;
1417f841f6adSraf } else {
1418f841f6adSraf (void) _aio_hash_del(resultp);
1419f841f6adSraf _aio_req_mark_done(reqp);
1420f841f6adSraf }
1421f841f6adSraf
1422f841f6adSraf _aio_waitn_wakeup();
1423f841f6adSraf
1424f841f6adSraf /*
1425f841f6adSraf * __aio_waitn() sets AIO_WAIT_INPROGRESS and
1426f841f6adSraf * __aio_suspend() increments "_aio_kernel_suspend"
1427f841f6adSraf * when they are waiting in the kernel for completed I/Os.
1428f841f6adSraf *
1429f841f6adSraf * _kaio(AIONOTIFY) awakes the corresponding function
1430f841f6adSraf * in the kernel; then the corresponding __aio_waitn() or
1431f841f6adSraf * __aio_suspend() function could reap the recently
1432f841f6adSraf * completed I/Os (_aiodone()).
1433f841f6adSraf */
1434f841f6adSraf if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
1435f841f6adSraf (void) _kaio(AIONOTIFY);
1436f841f6adSraf
1437f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1438f841f6adSraf
1439f841f6adSraf if (head != NULL) {
1440f841f6adSraf /*
1441f841f6adSraf * If all the lio requests have completed,
1442f841f6adSraf * prepare to notify the waiting thread.
1443f841f6adSraf */
1444f841f6adSraf sig_mutex_lock(&head->lio_mutex);
1445f841f6adSraf ASSERT(head->lio_refcnt == head->lio_nent);
1446f841f6adSraf if (head->lio_refcnt == 1) {
1447f841f6adSraf int waiting = 0;
1448f841f6adSraf if (head->lio_mode == LIO_WAIT) {
1449f841f6adSraf if ((waiting = head->lio_waiting) != 0)
1450f841f6adSraf (void) cond_signal(&head->lio_cond_cv);
1451f841f6adSraf } else if (head->lio_port < 0) { /* none or signal */
1452f841f6adSraf if ((np.np_lio_signo = head->lio_signo) != 0)
1453f841f6adSraf notify = 1;
1454f841f6adSraf np.np_lio_user = head->lio_sigval.sival_ptr;
1455f841f6adSraf } else { /* thread or port */
1456f841f6adSraf notify = 1;
1457f841f6adSraf np.np_lio_port = head->lio_port;
1458f841f6adSraf np.np_lio_event = head->lio_event;
1459f841f6adSraf np.np_lio_object =
1460f841f6adSraf (uintptr_t)head->lio_sigevent;
1461f841f6adSraf np.np_lio_user = head->lio_sigval.sival_ptr;
1462f841f6adSraf }
1463f841f6adSraf head->lio_nent = head->lio_refcnt = 0;
1464f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1465f841f6adSraf if (waiting == 0)
1466f841f6adSraf _aio_lio_free(head);
1467f841f6adSraf } else {
1468f841f6adSraf head->lio_nent--;
1469f841f6adSraf head->lio_refcnt--;
1470f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1471f841f6adSraf }
1472f841f6adSraf }
1473f841f6adSraf
1474f841f6adSraf /*
1475f841f6adSraf * The request is completed; now perform the notifications.
1476f841f6adSraf */
1477f841f6adSraf if (notify) {
1478f841f6adSraf if (reqp != NULL) {
1479f841f6adSraf /*
1480f841f6adSraf * We usually put the request on the notification
1481f841f6adSraf * queue because we don't want to block and delay
1482f841f6adSraf * other operations behind us in the work queue.
1483f841f6adSraf * Also we must never block on a cancel notification
1484f841f6adSraf * because we are being called from an application
1485f841f6adSraf * thread in this case and that could lead to deadlock
1486f841f6adSraf * if no other thread is receiving notificatins.
1487f841f6adSraf */
1488f841f6adSraf reqp->req_notify = np;
1489f841f6adSraf reqp->req_op = AIONOTIFY;
1490f841f6adSraf _aio_req_add(reqp, &__workers_no, AIONOTIFY);
1491f841f6adSraf reqp = NULL;
1492f841f6adSraf } else {
1493f841f6adSraf /*
1494f841f6adSraf * We already put the request on the done queue,
1495f841f6adSraf * so we can't queue it to the notification queue.
1496f841f6adSraf * Just do the notification directly.
1497f841f6adSraf */
1498f841f6adSraf send_notification(&np);
1499f841f6adSraf }
1500f841f6adSraf }
1501f841f6adSraf
1502f841f6adSraf if (reqp != NULL)
1503f841f6adSraf _aio_req_free(reqp);
1504f841f6adSraf }
1505f841f6adSraf
1506f841f6adSraf /*
1507f841f6adSraf * Delete fsync requests from list head until there is
1508f841f6adSraf * only one left. Return 0 when there is only one,
1509f841f6adSraf * otherwise return a non-zero value.
1510f841f6adSraf */
1511f841f6adSraf static int
_aio_fsync_del(aio_worker_t * aiowp,aio_req_t * reqp)1512f841f6adSraf _aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
1513f841f6adSraf {
1514f841f6adSraf aio_lio_t *head = reqp->req_head;
1515f841f6adSraf int rval = 0;
1516f841f6adSraf
1517f841f6adSraf ASSERT(reqp == aiowp->work_req);
1518f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
1519f841f6adSraf sig_mutex_lock(&head->lio_mutex);
1520f841f6adSraf if (head->lio_refcnt > 1) {
1521f841f6adSraf head->lio_refcnt--;
1522f841f6adSraf head->lio_nent--;
1523f841f6adSraf aiowp->work_req = NULL;
1524f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1525f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1526f841f6adSraf sig_mutex_lock(&__aio_mutex);
1527f841f6adSraf _aio_outstand_cnt--;
1528f841f6adSraf _aio_waitn_wakeup();
1529f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1530f841f6adSraf _aio_req_free(reqp);
1531f841f6adSraf return (1);
1532f841f6adSraf }
1533f841f6adSraf ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
1534f841f6adSraf reqp->req_head = NULL;
1535f841f6adSraf if (head->lio_canned)
1536f841f6adSraf reqp->req_state = AIO_REQ_CANCELED;
1537f841f6adSraf if (head->lio_mode == LIO_DESTROY) {
1538f841f6adSraf aiowp->work_req = NULL;
1539f841f6adSraf rval = 1;
1540f841f6adSraf }
1541f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1542f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1543f841f6adSraf head->lio_refcnt--;
1544f841f6adSraf head->lio_nent--;
1545f841f6adSraf _aio_lio_free(head);
1546f841f6adSraf if (rval != 0)
1547f841f6adSraf _aio_req_free(reqp);
1548f841f6adSraf return (rval);
1549f841f6adSraf }
1550f841f6adSraf
1551f841f6adSraf /*
1552f841f6adSraf * A worker is set idle when its work queue is empty.
1553f841f6adSraf * The worker checks again that it has no more work
1554f841f6adSraf * and then goes to sleep waiting for more work.
1555f841f6adSraf */
1556f841f6adSraf int
_aio_idle(aio_worker_t * aiowp)1557f841f6adSraf _aio_idle(aio_worker_t *aiowp)
1558f841f6adSraf {
1559f841f6adSraf int error = 0;
1560f841f6adSraf
1561f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
1562f841f6adSraf if (aiowp->work_count1 == 0) {
1563f841f6adSraf ASSERT(aiowp->work_minload1 == 0);
1564f841f6adSraf aiowp->work_idleflg = 1;
1565f841f6adSraf /*
1566f841f6adSraf * A cancellation handler is not needed here.
1567f841f6adSraf * aio worker threads are never cancelled via pthread_cancel().
1568f841f6adSraf */
1569f841f6adSraf error = sig_cond_wait(&aiowp->work_idle_cv,
1570f841f6adSraf &aiowp->work_qlock1);
1571f841f6adSraf /*
1572f841f6adSraf * The idle flag is normally cleared before worker is awakened
1573f841f6adSraf * by aio_req_add(). On error (EINTR), we clear it ourself.
1574f841f6adSraf */
1575f841f6adSraf if (error)
1576f841f6adSraf aiowp->work_idleflg = 0;
1577f841f6adSraf }
1578f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1579f841f6adSraf return (error);
1580f841f6adSraf }
1581f841f6adSraf
1582f841f6adSraf /*
1583f841f6adSraf * A worker's completed AIO requests are placed onto a global
1584f841f6adSraf * done queue. The application is only sent a SIGIO signal if
1585f841f6adSraf * the process has a handler enabled and it is not waiting via
1586f841f6adSraf * aiowait().
1587f841f6adSraf */
1588f841f6adSraf static void
_aio_work_done(aio_worker_t * aiowp)1589f841f6adSraf _aio_work_done(aio_worker_t *aiowp)
1590f841f6adSraf {
1591f841f6adSraf aio_req_t *reqp;
1592f841f6adSraf
1593bced1f33Spraks sig_mutex_lock(&__aio_mutex);
1594f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
1595f841f6adSraf reqp = aiowp->work_prev1;
1596f841f6adSraf reqp->req_next = NULL;
1597f841f6adSraf aiowp->work_done1 = 0;
1598f841f6adSraf aiowp->work_tail1 = aiowp->work_next1;
1599f841f6adSraf if (aiowp->work_tail1 == NULL)
1600f841f6adSraf aiowp->work_head1 = NULL;
1601f841f6adSraf aiowp->work_prev1 = NULL;
1602f841f6adSraf _aio_outstand_cnt--;
1603f841f6adSraf _aio_req_done_cnt--;
1604bced1f33Spraks if (reqp->req_state == AIO_REQ_CANCELED) {
1605bced1f33Spraks /*
1606bced1f33Spraks * Request got cancelled after it was marked done. This can
1607bced1f33Spraks * happen because _aio_finish_request() marks it AIO_REQ_DONE
1608bced1f33Spraks * and drops all locks. Don't add the request to the done
1609bced1f33Spraks * queue and just discard it.
1610bced1f33Spraks */
1611bced1f33Spraks sig_mutex_unlock(&aiowp->work_qlock1);
1612bced1f33Spraks _aio_req_free(reqp);
1613bced1f33Spraks if (_aio_outstand_cnt == 0 && _aiowait_flag) {
1614bced1f33Spraks sig_mutex_unlock(&__aio_mutex);
1615bced1f33Spraks (void) _kaio(AIONOTIFY);
1616bced1f33Spraks } else {
1617bced1f33Spraks sig_mutex_unlock(&__aio_mutex);
1618bced1f33Spraks }
1619bced1f33Spraks return;
1620bced1f33Spraks }
1621bced1f33Spraks sig_mutex_unlock(&aiowp->work_qlock1);
1622bced1f33Spraks _aio_donecnt++;
1623f841f6adSraf ASSERT(_aio_donecnt > 0 &&
1624f841f6adSraf _aio_outstand_cnt >= 0 &&
1625f841f6adSraf _aio_req_done_cnt >= 0);
1626f841f6adSraf ASSERT(reqp != NULL);
1627f841f6adSraf
1628f841f6adSraf if (_aio_done_tail == NULL) {
1629f841f6adSraf _aio_done_head = _aio_done_tail = reqp;
1630f841f6adSraf } else {
1631f841f6adSraf _aio_done_head->req_next = reqp;
1632f841f6adSraf _aio_done_head = reqp;
1633f841f6adSraf }
1634f841f6adSraf
1635f841f6adSraf if (_aiowait_flag) {
1636f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1637f841f6adSraf (void) _kaio(AIONOTIFY);
1638f841f6adSraf } else {
1639f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1640f841f6adSraf if (_sigio_enabled)
1641f841f6adSraf (void) kill(__pid, SIGIO);
1642f841f6adSraf }
1643f841f6adSraf }
1644f841f6adSraf
1645f841f6adSraf /*
1646f841f6adSraf * The done queue consists of AIO requests that are in either the
1647f841f6adSraf * AIO_REQ_DONE or AIO_REQ_CANCELED state. Requests that were cancelled
1648f841f6adSraf * are discarded. If the done queue is empty then NULL is returned.
1649f841f6adSraf * Otherwise the address of a done aio_result_t is returned.
1650f841f6adSraf */
1651f841f6adSraf aio_result_t *
_aio_req_done(void)1652f841f6adSraf _aio_req_done(void)
1653f841f6adSraf {
1654f841f6adSraf aio_req_t *reqp;
1655f841f6adSraf aio_result_t *resultp;
1656f841f6adSraf
1657f841f6adSraf ASSERT(MUTEX_HELD(&__aio_mutex));
1658f841f6adSraf
1659f841f6adSraf if ((reqp = _aio_done_tail) != NULL) {
1660f841f6adSraf if ((_aio_done_tail = reqp->req_next) == NULL)
1661f841f6adSraf _aio_done_head = NULL;
1662f841f6adSraf ASSERT(_aio_donecnt > 0);
1663f841f6adSraf _aio_donecnt--;
1664f841f6adSraf (void) _aio_hash_del(reqp->req_resultp);
1665f841f6adSraf resultp = reqp->req_resultp;
1666f841f6adSraf ASSERT(reqp->req_state == AIO_REQ_DONE);
1667f841f6adSraf _aio_req_free(reqp);
1668f841f6adSraf return (resultp);
1669f841f6adSraf }
1670f841f6adSraf /* is queue empty? */
1671f841f6adSraf if (reqp == NULL && _aio_outstand_cnt == 0) {
1672f841f6adSraf return ((aio_result_t *)-1);
1673f841f6adSraf }
1674f841f6adSraf return (NULL);
1675f841f6adSraf }
1676f841f6adSraf
1677f841f6adSraf /*
1678f841f6adSraf * Set the return and errno values for the application's use.
1679f841f6adSraf *
1680f841f6adSraf * For the Posix interfaces, we must set the return value first followed
1681f841f6adSraf * by the errno value because the Posix interfaces allow for a change
1682f841f6adSraf * in the errno value from EINPROGRESS to something else to signal
1683f841f6adSraf * the completion of the asynchronous request.
1684f841f6adSraf *
1685f841f6adSraf * The opposite is true for the Solaris interfaces. These allow for
1686f841f6adSraf * a change in the return value from AIO_INPROGRESS to something else
1687f841f6adSraf * to signal the completion of the asynchronous request.
1688f841f6adSraf */
1689f841f6adSraf void
_aio_set_result(aio_req_t * reqp,ssize_t retval,int error)1690f841f6adSraf _aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
1691f841f6adSraf {
1692f841f6adSraf aio_result_t *resultp = reqp->req_resultp;
1693f841f6adSraf
1694f841f6adSraf if (POSIX_AIO(reqp)) {
1695f841f6adSraf resultp->aio_return = retval;
1696f841f6adSraf membar_producer();
1697f841f6adSraf resultp->aio_errno = error;
1698f841f6adSraf } else {
1699f841f6adSraf resultp->aio_errno = error;
1700f841f6adSraf membar_producer();
1701f841f6adSraf resultp->aio_return = retval;
1702f841f6adSraf }
1703f841f6adSraf }
1704f841f6adSraf
1705f841f6adSraf /*
1706f841f6adSraf * Add an AIO request onto the next work queue.
1707f841f6adSraf * A circular list of workers is used to choose the next worker.
1708f841f6adSraf */
1709f841f6adSraf void
_aio_req_add(aio_req_t * reqp,aio_worker_t ** nextworker,int mode)1710f841f6adSraf _aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
1711f841f6adSraf {
1712f841f6adSraf ulwp_t *self = curthread;
1713f841f6adSraf aio_worker_t *aiowp;
1714f841f6adSraf aio_worker_t *first;
1715f841f6adSraf int load_bal_flg = 1;
1716f841f6adSraf int found;
1717f841f6adSraf
1718f841f6adSraf ASSERT(reqp->req_state != AIO_REQ_DONEQ);
1719f841f6adSraf reqp->req_next = NULL;
1720f841f6adSraf /*
1721f841f6adSraf * Try to acquire the next worker's work queue. If it is locked,
1722f841f6adSraf * then search the list of workers until a queue is found unlocked,
1723f841f6adSraf * or until the list is completely traversed at which point another
1724f841f6adSraf * worker will be created.
1725f841f6adSraf */
1726f841f6adSraf sigoff(self); /* defer SIGIO */
1727f841f6adSraf sig_mutex_lock(&__aio_mutex);
1728f841f6adSraf first = aiowp = *nextworker;
1729f841f6adSraf if (mode != AIONOTIFY)
1730f841f6adSraf _aio_outstand_cnt++;
1731f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1732f841f6adSraf
1733f841f6adSraf switch (mode) {
1734f841f6adSraf case AIOREAD:
1735f841f6adSraf case AIOWRITE:
1736f841f6adSraf case AIOAREAD:
1737f841f6adSraf case AIOAWRITE:
1738f841f6adSraf #if !defined(_LP64)
1739f841f6adSraf case AIOAREAD64:
1740f841f6adSraf case AIOAWRITE64:
1741f841f6adSraf #endif
1742f841f6adSraf /* try to find an idle worker */
1743f841f6adSraf found = 0;
1744f841f6adSraf do {
1745f841f6adSraf if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
1746f841f6adSraf if (aiowp->work_idleflg) {
1747f841f6adSraf found = 1;
1748f841f6adSraf break;
1749f841f6adSraf }
1750f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1751f841f6adSraf }
1752f841f6adSraf } while ((aiowp = aiowp->work_forw) != first);
1753f841f6adSraf
1754f841f6adSraf if (found) {
1755f841f6adSraf aiowp->work_minload1++;
1756f841f6adSraf break;
1757f841f6adSraf }
1758f841f6adSraf
1759f841f6adSraf /* try to acquire some worker's queue lock */
1760f841f6adSraf do {
1761f841f6adSraf if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
1762f841f6adSraf found = 1;
1763f841f6adSraf break;
1764f841f6adSraf }
1765f841f6adSraf } while ((aiowp = aiowp->work_forw) != first);
1766f841f6adSraf
1767f841f6adSraf /*
1768f841f6adSraf * Create more workers when the workers appear overloaded.
1769f841f6adSraf * Either all the workers are busy draining their queues
1770f841f6adSraf * or no worker's queue lock could be acquired.
1771f841f6adSraf */
1772f841f6adSraf if (!found) {
1773f841f6adSraf if (_aio_worker_cnt < _max_workers) {
1774f841f6adSraf if (_aio_create_worker(reqp, mode))
1775f841f6adSraf aio_panic("_aio_req_add: add worker");
1776f841f6adSraf sigon(self); /* reenable SIGIO */
1777f841f6adSraf return;
1778f841f6adSraf }
1779f841f6adSraf
1780f841f6adSraf /*
1781f841f6adSraf * No worker available and we have created
1782f841f6adSraf * _max_workers, keep going through the
1783f841f6adSraf * list slowly until we get a lock
1784f841f6adSraf */
1785f841f6adSraf while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
1786f841f6adSraf /*
1787f841f6adSraf * give someone else a chance
1788f841f6adSraf */
1789f841f6adSraf _aio_delay(1);
1790f841f6adSraf aiowp = aiowp->work_forw;
1791f841f6adSraf }
1792f841f6adSraf }
1793f841f6adSraf
1794f841f6adSraf ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
1795f841f6adSraf if (_aio_worker_cnt < _max_workers &&
1796f841f6adSraf aiowp->work_minload1 >= _minworkload) {
1797f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1798f841f6adSraf sig_mutex_lock(&__aio_mutex);
1799f841f6adSraf *nextworker = aiowp->work_forw;
1800f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1801f841f6adSraf if (_aio_create_worker(reqp, mode))
1802f841f6adSraf aio_panic("aio_req_add: add worker");
1803f841f6adSraf sigon(self); /* reenable SIGIO */
1804f841f6adSraf return;
1805f841f6adSraf }
1806f841f6adSraf aiowp->work_minload1++;
1807f841f6adSraf break;
1808f841f6adSraf case AIOFSYNC:
1809f841f6adSraf case AIONOTIFY:
1810f841f6adSraf load_bal_flg = 0;
1811f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
1812f841f6adSraf break;
1813f841f6adSraf default:
1814f841f6adSraf aio_panic("_aio_req_add: invalid mode");
1815f841f6adSraf break;
1816f841f6adSraf }
1817f841f6adSraf /*
1818f841f6adSraf * Put request onto worker's work queue.
1819f841f6adSraf */
1820f841f6adSraf if (aiowp->work_tail1 == NULL) {
1821f841f6adSraf ASSERT(aiowp->work_count1 == 0);
1822f841f6adSraf aiowp->work_tail1 = reqp;
1823f841f6adSraf aiowp->work_next1 = reqp;
1824f841f6adSraf } else {
1825f841f6adSraf aiowp->work_head1->req_next = reqp;
1826f841f6adSraf if (aiowp->work_next1 == NULL)
1827f841f6adSraf aiowp->work_next1 = reqp;
1828f841f6adSraf }
1829f841f6adSraf reqp->req_state = AIO_REQ_QUEUED;
1830f841f6adSraf reqp->req_worker = aiowp;
1831f841f6adSraf aiowp->work_head1 = reqp;
1832f841f6adSraf /*
1833f841f6adSraf * Awaken worker if it is not currently active.
1834f841f6adSraf */
1835f841f6adSraf if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
1836f841f6adSraf aiowp->work_idleflg = 0;
1837f841f6adSraf (void) cond_signal(&aiowp->work_idle_cv);
1838f841f6adSraf }
1839f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1840f841f6adSraf
1841f841f6adSraf if (load_bal_flg) {
1842f841f6adSraf sig_mutex_lock(&__aio_mutex);
1843f841f6adSraf *nextworker = aiowp->work_forw;
1844f841f6adSraf sig_mutex_unlock(&__aio_mutex);
1845f841f6adSraf }
1846f841f6adSraf sigon(self); /* reenable SIGIO */
1847f841f6adSraf }
1848f841f6adSraf
1849f841f6adSraf /*
1850f841f6adSraf * Get an AIO request for a specified worker.
1851f841f6adSraf * If the work queue is empty, return NULL.
1852f841f6adSraf */
1853f841f6adSraf aio_req_t *
_aio_req_get(aio_worker_t * aiowp)1854f841f6adSraf _aio_req_get(aio_worker_t *aiowp)
1855f841f6adSraf {
1856f841f6adSraf aio_req_t *reqp;
1857f841f6adSraf
1858f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
1859f841f6adSraf if ((reqp = aiowp->work_next1) != NULL) {
1860f841f6adSraf /*
1861f841f6adSraf * Remove a POSIX request from the queue; the
1862f841f6adSraf * request queue is a singularly linked list
1863f841f6adSraf * with a previous pointer. The request is
1864f841f6adSraf * removed by updating the previous pointer.
1865f841f6adSraf *
1866f841f6adSraf * Non-posix requests are left on the queue
1867f841f6adSraf * to eventually be placed on the done queue.
1868f841f6adSraf */
1869f841f6adSraf
1870f841f6adSraf if (POSIX_AIO(reqp)) {
1871f841f6adSraf if (aiowp->work_prev1 == NULL) {
1872f841f6adSraf aiowp->work_tail1 = reqp->req_next;
1873f841f6adSraf if (aiowp->work_tail1 == NULL)
1874f841f6adSraf aiowp->work_head1 = NULL;
1875f841f6adSraf } else {
1876f841f6adSraf aiowp->work_prev1->req_next = reqp->req_next;
1877f841f6adSraf if (aiowp->work_head1 == reqp)
1878f841f6adSraf aiowp->work_head1 = reqp->req_next;
1879f841f6adSraf }
1880f841f6adSraf
1881f841f6adSraf } else {
1882f841f6adSraf aiowp->work_prev1 = reqp;
1883f841f6adSraf ASSERT(aiowp->work_done1 >= 0);
1884f841f6adSraf aiowp->work_done1++;
1885f841f6adSraf }
1886f841f6adSraf ASSERT(reqp != reqp->req_next);
1887f841f6adSraf aiowp->work_next1 = reqp->req_next;
1888f841f6adSraf ASSERT(aiowp->work_count1 >= 1);
1889f841f6adSraf aiowp->work_count1--;
1890f841f6adSraf switch (reqp->req_op) {
1891f841f6adSraf case AIOREAD:
1892f841f6adSraf case AIOWRITE:
1893f841f6adSraf case AIOAREAD:
1894f841f6adSraf case AIOAWRITE:
1895f841f6adSraf #if !defined(_LP64)
1896f841f6adSraf case AIOAREAD64:
1897f841f6adSraf case AIOAWRITE64:
1898f841f6adSraf #endif
1899f841f6adSraf ASSERT(aiowp->work_minload1 > 0);
1900f841f6adSraf aiowp->work_minload1--;
1901f841f6adSraf break;
1902f841f6adSraf }
1903f841f6adSraf reqp->req_state = AIO_REQ_INPROGRESS;
1904f841f6adSraf }
1905f841f6adSraf aiowp->work_req = reqp;
1906f841f6adSraf ASSERT(reqp != NULL || aiowp->work_count1 == 0);
1907f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
1908f841f6adSraf return (reqp);
1909f841f6adSraf }
1910f841f6adSraf
1911f841f6adSraf static void
_aio_req_del(aio_worker_t * aiowp,aio_req_t * reqp,int ostate)1912f841f6adSraf _aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
1913f841f6adSraf {
1914f841f6adSraf aio_req_t **last;
1915f841f6adSraf aio_req_t *lastrp;
1916f841f6adSraf aio_req_t *next;
1917f841f6adSraf
1918f841f6adSraf ASSERT(aiowp != NULL);
1919f841f6adSraf ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
1920f841f6adSraf if (POSIX_AIO(reqp)) {
1921f841f6adSraf if (ostate != AIO_REQ_QUEUED)
1922f841f6adSraf return;
1923f841f6adSraf }
1924f841f6adSraf last = &aiowp->work_tail1;
1925f841f6adSraf lastrp = aiowp->work_tail1;
1926f841f6adSraf ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
1927f841f6adSraf while ((next = *last) != NULL) {
1928f841f6adSraf if (next == reqp) {
1929f841f6adSraf *last = next->req_next;
1930f841f6adSraf if (aiowp->work_next1 == next)
1931f841f6adSraf aiowp->work_next1 = next->req_next;
1932f841f6adSraf
1933bced1f33Spraks /*
1934bced1f33Spraks * if this is the first request on the queue, move
1935bced1f33Spraks * the lastrp pointer forward.
1936bced1f33Spraks */
1937bced1f33Spraks if (lastrp == next)
1938bced1f33Spraks lastrp = next->req_next;
1939bced1f33Spraks
1940bced1f33Spraks /*
1941bced1f33Spraks * if this request is pointed by work_head1, then
1942bced1f33Spraks * make work_head1 point to the last request that is
1943bced1f33Spraks * present on the queue.
1944bced1f33Spraks */
1945f841f6adSraf if (aiowp->work_head1 == next)
1946f841f6adSraf aiowp->work_head1 = lastrp;
1947bced1f33Spraks
1948bced1f33Spraks /*
1949bced1f33Spraks * work_prev1 is used only in non posix case and it
1950bced1f33Spraks * points to the current AIO_REQ_INPROGRESS request.
1951bced1f33Spraks * If work_prev1 points to this request which is being
1952bced1f33Spraks * deleted, make work_prev1 NULL and set work_done1
1953bced1f33Spraks * to 0.
1954bced1f33Spraks *
1955bced1f33Spraks * A worker thread can be processing only one request
1956bced1f33Spraks * at a time.
1957bced1f33Spraks */
1958bced1f33Spraks if (aiowp->work_prev1 == next) {
1959bced1f33Spraks ASSERT(ostate == AIO_REQ_INPROGRESS &&
1960bced1f33Spraks !POSIX_AIO(reqp) && aiowp->work_done1 > 0);
1961bced1f33Spraks aiowp->work_prev1 = NULL;
1962bced1f33Spraks aiowp->work_done1--;
1963f841f6adSraf }
1964f841f6adSraf
1965f841f6adSraf if (ostate == AIO_REQ_QUEUED) {
1966f841f6adSraf ASSERT(aiowp->work_count1 >= 1);
1967f841f6adSraf aiowp->work_count1--;
1968f841f6adSraf ASSERT(aiowp->work_minload1 >= 1);
1969f841f6adSraf aiowp->work_minload1--;
1970f841f6adSraf }
1971f841f6adSraf return;
1972f841f6adSraf }
1973f841f6adSraf last = &next->req_next;
1974f841f6adSraf lastrp = next;
1975f841f6adSraf }
1976f841f6adSraf /* NOTREACHED */
1977f841f6adSraf }
1978f841f6adSraf
1979f841f6adSraf static void
_aio_enq_doneq(aio_req_t * reqp)1980f841f6adSraf _aio_enq_doneq(aio_req_t *reqp)
1981f841f6adSraf {
1982f841f6adSraf if (_aio_doneq == NULL) {
1983f841f6adSraf _aio_doneq = reqp;
1984f841f6adSraf reqp->req_next = reqp->req_prev = reqp;
1985f841f6adSraf } else {
1986f841f6adSraf reqp->req_next = _aio_doneq;
1987f841f6adSraf reqp->req_prev = _aio_doneq->req_prev;
1988f841f6adSraf _aio_doneq->req_prev->req_next = reqp;
1989f841f6adSraf _aio_doneq->req_prev = reqp;
1990f841f6adSraf }
1991f841f6adSraf reqp->req_state = AIO_REQ_DONEQ;
1992f841f6adSraf _aio_doneq_cnt++;
1993f841f6adSraf }
1994f841f6adSraf
1995f841f6adSraf /*
1996f841f6adSraf * caller owns the _aio_mutex
1997f841f6adSraf */
1998f841f6adSraf aio_req_t *
_aio_req_remove(aio_req_t * reqp)1999f841f6adSraf _aio_req_remove(aio_req_t *reqp)
2000f841f6adSraf {
2001f841f6adSraf if (reqp && reqp->req_state != AIO_REQ_DONEQ)
2002f841f6adSraf return (NULL);
2003f841f6adSraf
2004f841f6adSraf if (reqp) {
2005f841f6adSraf /* request in done queue */
2006f841f6adSraf if (_aio_doneq == reqp)
2007f841f6adSraf _aio_doneq = reqp->req_next;
2008f841f6adSraf if (_aio_doneq == reqp) {
2009f841f6adSraf /* only one request on queue */
2010f841f6adSraf _aio_doneq = NULL;
2011f841f6adSraf } else {
2012f841f6adSraf aio_req_t *tmp = reqp->req_next;
2013f841f6adSraf reqp->req_prev->req_next = tmp;
2014f841f6adSraf tmp->req_prev = reqp->req_prev;
2015f841f6adSraf }
2016f841f6adSraf } else if ((reqp = _aio_doneq) != NULL) {
2017f841f6adSraf if (reqp == reqp->req_next) {
2018f841f6adSraf /* only one request on queue */
2019f841f6adSraf _aio_doneq = NULL;
2020f841f6adSraf } else {
2021f841f6adSraf reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
2022f841f6adSraf _aio_doneq->req_prev = reqp->req_prev;
2023f841f6adSraf }
2024f841f6adSraf }
2025f841f6adSraf if (reqp) {
2026f841f6adSraf _aio_doneq_cnt--;
2027f841f6adSraf reqp->req_next = reqp->req_prev = reqp;
2028f841f6adSraf reqp->req_state = AIO_REQ_DONE;
2029f841f6adSraf }
2030f841f6adSraf return (reqp);
2031f841f6adSraf }
2032f841f6adSraf
2033f841f6adSraf /*
2034f841f6adSraf * An AIO request is identified by an aio_result_t pointer. The library
2035f841f6adSraf * maps this aio_result_t pointer to its internal representation using a
2036f841f6adSraf * hash table. This function adds an aio_result_t pointer to the hash table.
2037f841f6adSraf */
2038f841f6adSraf static int
_aio_hash_insert(aio_result_t * resultp,aio_req_t * reqp)2039f841f6adSraf _aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
2040f841f6adSraf {
2041f841f6adSraf aio_hash_t *hashp;
2042f841f6adSraf aio_req_t **prev;
2043f841f6adSraf aio_req_t *next;
2044f841f6adSraf
2045f841f6adSraf hashp = _aio_hash + AIOHASH(resultp);
2046f841f6adSraf lmutex_lock(&hashp->hash_lock);
2047f841f6adSraf prev = &hashp->hash_ptr;
2048f841f6adSraf while ((next = *prev) != NULL) {
2049f841f6adSraf if (resultp == next->req_resultp) {
2050f841f6adSraf lmutex_unlock(&hashp->hash_lock);
2051f841f6adSraf return (-1);
2052f841f6adSraf }
2053f841f6adSraf prev = &next->req_link;
2054f841f6adSraf }
2055f841f6adSraf *prev = reqp;
2056f841f6adSraf ASSERT(reqp->req_link == NULL);
2057f841f6adSraf lmutex_unlock(&hashp->hash_lock);
2058f841f6adSraf return (0);
2059f841f6adSraf }
2060f841f6adSraf
2061f841f6adSraf /*
2062f841f6adSraf * Remove an entry from the hash table.
2063f841f6adSraf */
2064f841f6adSraf aio_req_t *
_aio_hash_del(aio_result_t * resultp)2065f841f6adSraf _aio_hash_del(aio_result_t *resultp)
2066f841f6adSraf {
2067f841f6adSraf aio_hash_t *hashp;
2068f841f6adSraf aio_req_t **prev;
2069f841f6adSraf aio_req_t *next = NULL;
2070f841f6adSraf
2071f841f6adSraf if (_aio_hash != NULL) {
2072f841f6adSraf hashp = _aio_hash + AIOHASH(resultp);
2073f841f6adSraf lmutex_lock(&hashp->hash_lock);
2074f841f6adSraf prev = &hashp->hash_ptr;
2075f841f6adSraf while ((next = *prev) != NULL) {
2076f841f6adSraf if (resultp == next->req_resultp) {
2077f841f6adSraf *prev = next->req_link;
2078f841f6adSraf next->req_link = NULL;
2079f841f6adSraf break;
2080f841f6adSraf }
2081f841f6adSraf prev = &next->req_link;
2082f841f6adSraf }
2083f841f6adSraf lmutex_unlock(&hashp->hash_lock);
2084f841f6adSraf }
2085f841f6adSraf return (next);
2086f841f6adSraf }
2087f841f6adSraf
2088f841f6adSraf /*
2089f841f6adSraf * find an entry in the hash table
2090f841f6adSraf */
2091f841f6adSraf aio_req_t *
_aio_hash_find(aio_result_t * resultp)2092f841f6adSraf _aio_hash_find(aio_result_t *resultp)
2093f841f6adSraf {
2094f841f6adSraf aio_hash_t *hashp;
2095f841f6adSraf aio_req_t **prev;
2096f841f6adSraf aio_req_t *next = NULL;
2097f841f6adSraf
2098f841f6adSraf if (_aio_hash != NULL) {
2099f841f6adSraf hashp = _aio_hash + AIOHASH(resultp);
2100f841f6adSraf lmutex_lock(&hashp->hash_lock);
2101f841f6adSraf prev = &hashp->hash_ptr;
2102f841f6adSraf while ((next = *prev) != NULL) {
2103f841f6adSraf if (resultp == next->req_resultp)
2104f841f6adSraf break;
2105f841f6adSraf prev = &next->req_link;
2106f841f6adSraf }
2107f841f6adSraf lmutex_unlock(&hashp->hash_lock);
2108f841f6adSraf }
2109f841f6adSraf return (next);
2110f841f6adSraf }
2111f841f6adSraf
2112f841f6adSraf /*
2113f841f6adSraf * AIO interface for POSIX
2114f841f6adSraf */
2115f841f6adSraf int
_aio_rw(aiocb_t * aiocbp,aio_lio_t * lio_head,aio_worker_t ** nextworker,int mode,int flg)2116f841f6adSraf _aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
2117f841f6adSraf int mode, int flg)
2118f841f6adSraf {
2119f841f6adSraf aio_req_t *reqp;
2120f841f6adSraf aio_args_t *ap;
2121f841f6adSraf int kerr;
2122f841f6adSraf
2123f841f6adSraf if (aiocbp == NULL) {
2124f841f6adSraf errno = EINVAL;
2125f841f6adSraf return (-1);
2126f841f6adSraf }
2127f841f6adSraf
2128f841f6adSraf /* initialize kaio */
2129f841f6adSraf if (!_kaio_ok)
2130f841f6adSraf _kaio_init();
2131f841f6adSraf
2132f841f6adSraf aiocbp->aio_state = NOCHECK;
2133f841f6adSraf
2134f841f6adSraf /*
2135f841f6adSraf * If we have been called because a list I/O
2136f841f6adSraf * kaio() failed, we dont want to repeat the
2137f841f6adSraf * system call
2138f841f6adSraf */
2139f841f6adSraf
2140f841f6adSraf if (flg & AIO_KAIO) {
2141f841f6adSraf /*
2142f841f6adSraf * Try kernel aio first.
2143f841f6adSraf * If errno is ENOTSUP/EBADFD,
2144f841f6adSraf * fall back to the thread implementation.
2145f841f6adSraf */
2146f841f6adSraf if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
2147f841f6adSraf aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2148f841f6adSraf aiocbp->aio_state = CHECK;
2149f841f6adSraf kerr = (int)_kaio(mode, aiocbp);
2150f841f6adSraf if (kerr == 0)
2151f841f6adSraf return (0);
2152f841f6adSraf if (errno != ENOTSUP && errno != EBADFD) {
2153f841f6adSraf aiocbp->aio_resultp.aio_errno = errno;
2154f841f6adSraf aiocbp->aio_resultp.aio_return = -1;
2155f841f6adSraf aiocbp->aio_state = NOCHECK;
2156f841f6adSraf return (-1);
2157f841f6adSraf }
2158f841f6adSraf if (errno == EBADFD)
2159f841f6adSraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
2160f841f6adSraf }
2161f841f6adSraf }
2162f841f6adSraf
2163f841f6adSraf aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2164f841f6adSraf aiocbp->aio_state = USERAIO;
2165f841f6adSraf
2166f841f6adSraf if (!__uaio_ok && __uaio_init() == -1)
2167f841f6adSraf return (-1);
2168f841f6adSraf
2169f841f6adSraf if ((reqp = _aio_req_alloc()) == NULL) {
2170f841f6adSraf errno = EAGAIN;
2171f841f6adSraf return (-1);
2172f841f6adSraf }
2173f841f6adSraf
2174f841f6adSraf /*
2175f841f6adSraf * If an LIO request, add the list head to the aio request
2176f841f6adSraf */
2177f841f6adSraf reqp->req_head = lio_head;
2178f841f6adSraf reqp->req_type = AIO_POSIX_REQ;
2179f841f6adSraf reqp->req_op = mode;
2180f841f6adSraf reqp->req_largefile = 0;
2181f841f6adSraf
2182f841f6adSraf if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
2183f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_NONE;
2184f841f6adSraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2185f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
2186f841f6adSraf reqp->req_sigevent.sigev_signo =
2187f841f6adSraf aiocbp->aio_sigevent.sigev_signo;
2188f841f6adSraf reqp->req_sigevent.sigev_value.sival_ptr =
2189f841f6adSraf aiocbp->aio_sigevent.sigev_value.sival_ptr;
2190f841f6adSraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
2191f841f6adSraf port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
2192f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_PORT;
2193f841f6adSraf /*
2194f841f6adSraf * Reuse the sigevent structure to contain the port number
2195f841f6adSraf * and the user value. Same for SIGEV_THREAD, below.
2196f841f6adSraf */
2197f841f6adSraf reqp->req_sigevent.sigev_signo =
2198f841f6adSraf pn->portnfy_port;
2199f841f6adSraf reqp->req_sigevent.sigev_value.sival_ptr =
2200f841f6adSraf pn->portnfy_user;
2201f841f6adSraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
2202f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
2203f841f6adSraf /*
2204f841f6adSraf * The sigevent structure contains the port number
2205f841f6adSraf * and the user value. Same for SIGEV_PORT, above.
2206f841f6adSraf */
2207f841f6adSraf reqp->req_sigevent.sigev_signo =
2208f841f6adSraf aiocbp->aio_sigevent.sigev_signo;
2209f841f6adSraf reqp->req_sigevent.sigev_value.sival_ptr =
2210f841f6adSraf aiocbp->aio_sigevent.sigev_value.sival_ptr;
2211f841f6adSraf }
2212f841f6adSraf
2213f841f6adSraf reqp->req_resultp = &aiocbp->aio_resultp;
2214f841f6adSraf reqp->req_aiocbp = aiocbp;
2215f841f6adSraf ap = &reqp->req_args;
2216f841f6adSraf ap->fd = aiocbp->aio_fildes;
2217f841f6adSraf ap->buf = (caddr_t)aiocbp->aio_buf;
2218f841f6adSraf ap->bufsz = aiocbp->aio_nbytes;
2219f841f6adSraf ap->offset = aiocbp->aio_offset;
2220f841f6adSraf
2221f841f6adSraf if ((flg & AIO_NO_DUPS) &&
2222f841f6adSraf _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2223f841f6adSraf aio_panic("_aio_rw(): request already in hash table");
2224f841f6adSraf _aio_req_free(reqp);
2225f841f6adSraf errno = EINVAL;
2226f841f6adSraf return (-1);
2227f841f6adSraf }
2228f841f6adSraf _aio_req_add(reqp, nextworker, mode);
2229f841f6adSraf return (0);
2230f841f6adSraf }
2231f841f6adSraf
2232f841f6adSraf #if !defined(_LP64)
2233f841f6adSraf /*
2234f841f6adSraf * 64-bit AIO interface for POSIX
2235f841f6adSraf */
2236f841f6adSraf int
_aio_rw64(aiocb64_t * aiocbp,aio_lio_t * lio_head,aio_worker_t ** nextworker,int mode,int flg)2237f841f6adSraf _aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
2238f841f6adSraf int mode, int flg)
2239f841f6adSraf {
2240f841f6adSraf aio_req_t *reqp;
2241f841f6adSraf aio_args_t *ap;
2242f841f6adSraf int kerr;
2243f841f6adSraf
2244f841f6adSraf if (aiocbp == NULL) {
2245f841f6adSraf errno = EINVAL;
2246f841f6adSraf return (-1);
2247f841f6adSraf }
2248f841f6adSraf
2249f841f6adSraf /* initialize kaio */
2250f841f6adSraf if (!_kaio_ok)
2251f841f6adSraf _kaio_init();
2252f841f6adSraf
2253f841f6adSraf aiocbp->aio_state = NOCHECK;
2254f841f6adSraf
2255f841f6adSraf /*
2256f841f6adSraf * If we have been called because a list I/O
2257f841f6adSraf * kaio() failed, we dont want to repeat the
2258f841f6adSraf * system call
2259f841f6adSraf */
2260f841f6adSraf
2261f841f6adSraf if (flg & AIO_KAIO) {
2262f841f6adSraf /*
2263f841f6adSraf * Try kernel aio first.
2264f841f6adSraf * If errno is ENOTSUP/EBADFD,
2265f841f6adSraf * fall back to the thread implementation.
2266f841f6adSraf */
2267f841f6adSraf if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
2268f841f6adSraf aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2269f841f6adSraf aiocbp->aio_state = CHECK;
2270f841f6adSraf kerr = (int)_kaio(mode, aiocbp);
2271f841f6adSraf if (kerr == 0)
2272f841f6adSraf return (0);
2273f841f6adSraf if (errno != ENOTSUP && errno != EBADFD) {
2274f841f6adSraf aiocbp->aio_resultp.aio_errno = errno;
2275f841f6adSraf aiocbp->aio_resultp.aio_return = -1;
2276f841f6adSraf aiocbp->aio_state = NOCHECK;
2277f841f6adSraf return (-1);
2278f841f6adSraf }
2279f841f6adSraf if (errno == EBADFD)
2280f841f6adSraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
2281f841f6adSraf }
2282f841f6adSraf }
2283f841f6adSraf
2284f841f6adSraf aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2285f841f6adSraf aiocbp->aio_state = USERAIO;
2286f841f6adSraf
2287f841f6adSraf if (!__uaio_ok && __uaio_init() == -1)
2288f841f6adSraf return (-1);
2289f841f6adSraf
2290f841f6adSraf if ((reqp = _aio_req_alloc()) == NULL) {
2291f841f6adSraf errno = EAGAIN;
2292f841f6adSraf return (-1);
2293f841f6adSraf }
2294f841f6adSraf
2295f841f6adSraf /*
2296f841f6adSraf * If an LIO request, add the list head to the aio request
2297f841f6adSraf */
2298f841f6adSraf reqp->req_head = lio_head;
2299f841f6adSraf reqp->req_type = AIO_POSIX_REQ;
2300f841f6adSraf reqp->req_op = mode;
2301f841f6adSraf reqp->req_largefile = 1;
2302f841f6adSraf
2303f841f6adSraf if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
2304f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_NONE;
2305f841f6adSraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2306f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
2307f841f6adSraf reqp->req_sigevent.sigev_signo =
2308f841f6adSraf aiocbp->aio_sigevent.sigev_signo;
2309f841f6adSraf reqp->req_sigevent.sigev_value.sival_ptr =
2310f841f6adSraf aiocbp->aio_sigevent.sigev_value.sival_ptr;
2311f841f6adSraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
2312f841f6adSraf port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
2313f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_PORT;
2314f841f6adSraf reqp->req_sigevent.sigev_signo =
2315f841f6adSraf pn->portnfy_port;
2316f841f6adSraf reqp->req_sigevent.sigev_value.sival_ptr =
2317f841f6adSraf pn->portnfy_user;
2318f841f6adSraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
2319f841f6adSraf reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
2320f841f6adSraf reqp->req_sigevent.sigev_signo =
2321f841f6adSraf aiocbp->aio_sigevent.sigev_signo;
2322f841f6adSraf reqp->req_sigevent.sigev_value.sival_ptr =
2323f841f6adSraf aiocbp->aio_sigevent.sigev_value.sival_ptr;
2324f841f6adSraf }
2325f841f6adSraf
2326f841f6adSraf reqp->req_resultp = &aiocbp->aio_resultp;
2327f841f6adSraf reqp->req_aiocbp = aiocbp;
2328f841f6adSraf ap = &reqp->req_args;
2329f841f6adSraf ap->fd = aiocbp->aio_fildes;
2330f841f6adSraf ap->buf = (caddr_t)aiocbp->aio_buf;
2331f841f6adSraf ap->bufsz = aiocbp->aio_nbytes;
2332f841f6adSraf ap->offset = aiocbp->aio_offset;
2333f841f6adSraf
2334f841f6adSraf if ((flg & AIO_NO_DUPS) &&
2335f841f6adSraf _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2336f841f6adSraf aio_panic("_aio_rw64(): request already in hash table");
2337f841f6adSraf _aio_req_free(reqp);
2338f841f6adSraf errno = EINVAL;
2339f841f6adSraf return (-1);
2340f841f6adSraf }
2341f841f6adSraf _aio_req_add(reqp, nextworker, mode);
2342f841f6adSraf return (0);
2343f841f6adSraf }
2344f841f6adSraf #endif /* !defined(_LP64) */
2345