xref: /illumos-gate/usr/src/lib/libc/port/aio/aio.c (revision a574db85)
1f841f6adSraf /*
2f841f6adSraf  * CDDL HEADER START
3f841f6adSraf  *
4f841f6adSraf  * The contents of this file are subject to the terms of the
5f841f6adSraf  * Common Development and Distribution License (the "License").
6f841f6adSraf  * You may not use this file except in compliance with the License.
7f841f6adSraf  *
8f841f6adSraf  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9f841f6adSraf  * or http://www.opensolaris.org/os/licensing.
10f841f6adSraf  * See the License for the specific language governing permissions
11f841f6adSraf  * and limitations under the License.
12f841f6adSraf  *
13f841f6adSraf  * When distributing Covered Code, include this CDDL HEADER in each
14f841f6adSraf  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15f841f6adSraf  * If applicable, add the following below this CDDL HEADER, with the
16f841f6adSraf  * fields enclosed by brackets "[]" replaced with your own identifying
17f841f6adSraf  * information: Portions Copyright [yyyy] [name of copyright owner]
18f841f6adSraf  *
19f841f6adSraf  * CDDL HEADER END
20f841f6adSraf  */
21f841f6adSraf 
22f841f6adSraf /*
23*a574db85Sraf  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24f841f6adSraf  * Use is subject to license terms.
25f841f6adSraf  */
26f841f6adSraf 
27f841f6adSraf #pragma ident	"%Z%%M%	%I%	%E% SMI"
28f841f6adSraf 
29f841f6adSraf #include "synonyms.h"
30f841f6adSraf #include "thr_uberdata.h"
31f841f6adSraf #include "asyncio.h"
32f841f6adSraf #include <atomic.h>
33f841f6adSraf #include <sys/param.h>
34f841f6adSraf #include <sys/file.h>
35f841f6adSraf #include <sys/port.h>
36f841f6adSraf 
37f841f6adSraf static int _aio_hash_insert(aio_result_t *, aio_req_t *);
38f841f6adSraf static aio_req_t *_aio_req_get(aio_worker_t *);
39f841f6adSraf static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
40f841f6adSraf static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
41f841f6adSraf static void _aio_work_done(aio_worker_t *);
42f841f6adSraf static void _aio_enq_doneq(aio_req_t *);
43f841f6adSraf 
44f841f6adSraf extern void _aio_lio_free(aio_lio_t *);
45f841f6adSraf 
46f841f6adSraf extern int __fdsync(int, int);
47f841f6adSraf extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
48f841f6adSraf 
49f841f6adSraf static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
50f841f6adSraf static void _aiodone(aio_req_t *, ssize_t, int);
51f841f6adSraf static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
52f841f6adSraf static void _aio_finish_request(aio_worker_t *, ssize_t, int);
53f841f6adSraf 
54f841f6adSraf /*
55f841f6adSraf  * switch for kernel async I/O
56f841f6adSraf  */
57f841f6adSraf int _kaio_ok = 0;		/* 0 = disabled, 1 = on, -1 = error */
58f841f6adSraf 
59f841f6adSraf /*
60f841f6adSraf  * Key for thread-specific data
61f841f6adSraf  */
62f841f6adSraf pthread_key_t _aio_key;
63f841f6adSraf 
64f841f6adSraf /*
65f841f6adSraf  * Array for determining whether or not a file supports kaio.
66f841f6adSraf  * Initialized in _kaio_init().
67f841f6adSraf  */
68f841f6adSraf uint32_t *_kaio_supported = NULL;
69f841f6adSraf 
70f841f6adSraf /*
71f841f6adSraf  *  workers for read/write requests
72f841f6adSraf  * (__aio_mutex lock protects circular linked list of workers)
73f841f6adSraf  */
74f841f6adSraf aio_worker_t *__workers_rw;	/* circular list of AIO workers */
75f841f6adSraf aio_worker_t *__nextworker_rw;	/* next worker in list of workers */
76f841f6adSraf int __rw_workerscnt;		/* number of read/write workers */
77f841f6adSraf 
78f841f6adSraf /*
79f841f6adSraf  * worker for notification requests.
80f841f6adSraf  */
81f841f6adSraf aio_worker_t *__workers_no;	/* circular list of AIO workers */
82f841f6adSraf aio_worker_t *__nextworker_no;	/* next worker in list of workers */
83f841f6adSraf int __no_workerscnt;		/* number of write workers */
84f841f6adSraf 
85f841f6adSraf aio_req_t *_aio_done_tail;		/* list of done requests */
86f841f6adSraf aio_req_t *_aio_done_head;
87f841f6adSraf 
88f841f6adSraf mutex_t __aio_initlock = DEFAULTMUTEX;	/* makes aio initialization atomic */
89f841f6adSraf cond_t __aio_initcv = DEFAULTCV;
90f841f6adSraf int __aio_initbusy = 0;
91f841f6adSraf 
92f841f6adSraf mutex_t __aio_mutex = DEFAULTMUTEX;	/* protects counts, and linked lists */
93f841f6adSraf cond_t _aio_iowait_cv = DEFAULTCV;	/* wait for userland I/Os */
94f841f6adSraf 
95f841f6adSraf pid_t __pid = (pid_t)-1;		/* initialize as invalid pid */
96f841f6adSraf int _sigio_enabled = 0;			/* when set, send SIGIO signal */
97f841f6adSraf 
98f841f6adSraf aio_hash_t *_aio_hash;
99f841f6adSraf 
100f841f6adSraf aio_req_t *_aio_doneq;			/* double linked done queue list */
101f841f6adSraf 
102f841f6adSraf int _aio_donecnt = 0;
103f841f6adSraf int _aio_waitncnt = 0;			/* # of requests for aio_waitn */
104f841f6adSraf int _aio_doneq_cnt = 0;
105f841f6adSraf int _aio_outstand_cnt = 0;		/* # of outstanding requests */
106f841f6adSraf int _kaio_outstand_cnt = 0;		/* # of outstanding kaio requests */
107f841f6adSraf int _aio_req_done_cnt = 0;		/* req. done but not in "done queue" */
108f841f6adSraf int _aio_kernel_suspend = 0;		/* active kernel kaio calls */
109f841f6adSraf int _aio_suscv_cnt = 0;			/* aio_suspend calls waiting on cv's */
110f841f6adSraf 
111f841f6adSraf int _max_workers = 256;			/* max number of workers permitted */
112f841f6adSraf int _min_workers = 4;			/* min number of workers */
113f841f6adSraf int _minworkload = 2;			/* min number of request in q */
114f841f6adSraf int _aio_worker_cnt = 0;		/* number of workers to do requests */
115f841f6adSraf int __uaio_ok = 0;			/* AIO has been enabled */
116f841f6adSraf sigset_t _worker_set;			/* worker's signal mask */
117f841f6adSraf 
118f841f6adSraf int _aiowait_flag = 0;			/* when set, aiowait() is inprogress */
119f841f6adSraf int _aio_flags = 0;			/* see asyncio.h defines for */
120f841f6adSraf 
121f841f6adSraf aio_worker_t *_kaiowp = NULL;		/* points to kaio cleanup thread */
122f841f6adSraf 
123f841f6adSraf int hz;					/* clock ticks per second */
124f841f6adSraf 
125f841f6adSraf static int
126f841f6adSraf _kaio_supported_init(void)
127f841f6adSraf {
128f841f6adSraf 	void *ptr;
129f841f6adSraf 	size_t size;
130f841f6adSraf 
131f841f6adSraf 	if (_kaio_supported != NULL)	/* already initialized */
132f841f6adSraf 		return (0);
133f841f6adSraf 
134f841f6adSraf 	size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
135f841f6adSraf 	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
136f841f6adSraf 	    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
137f841f6adSraf 	if (ptr == MAP_FAILED)
138f841f6adSraf 		return (-1);
139f841f6adSraf 	_kaio_supported = ptr;
140f841f6adSraf 	return (0);
141f841f6adSraf }
142f841f6adSraf 
143f841f6adSraf /*
144f841f6adSraf  * The aio subsystem is initialized when an AIO request is made.
145f841f6adSraf  * Constants are initialized like the max number of workers that
146f841f6adSraf  * the subsystem can create, and the minimum number of workers
147f841f6adSraf  * permitted before imposing some restrictions.  Also, some
148f841f6adSraf  * workers are created.
149f841f6adSraf  */
150f841f6adSraf int
151f841f6adSraf __uaio_init(void)
152f841f6adSraf {
153f841f6adSraf 	int ret = -1;
154f841f6adSraf 	int i;
155*a574db85Sraf 	int cancel_state;
156f841f6adSraf 
157f841f6adSraf 	lmutex_lock(&__aio_initlock);
158*a574db85Sraf 	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
159f841f6adSraf 	while (__aio_initbusy)
160*a574db85Sraf 		(void) cond_wait(&__aio_initcv, &__aio_initlock);
161*a574db85Sraf 	(void) pthread_setcancelstate(cancel_state, NULL);
162f841f6adSraf 	if (__uaio_ok) {	/* already initialized */
163f841f6adSraf 		lmutex_unlock(&__aio_initlock);
164f841f6adSraf 		return (0);
165f841f6adSraf 	}
166f841f6adSraf 	__aio_initbusy = 1;
167f841f6adSraf 	lmutex_unlock(&__aio_initlock);
168f841f6adSraf 
169f841f6adSraf 	hz = (int)sysconf(_SC_CLK_TCK);
170f841f6adSraf 	__pid = getpid();
171f841f6adSraf 
172f841f6adSraf 	setup_cancelsig(SIGAIOCANCEL);
173f841f6adSraf 
174f841f6adSraf 	if (_kaio_supported_init() != 0)
175f841f6adSraf 		goto out;
176f841f6adSraf 
177f841f6adSraf 	/*
178f841f6adSraf 	 * Allocate and initialize the hash table.
179f7499066Ssp92102 	 * Do this only once, even if __uaio_init() is called twice.
180f841f6adSraf 	 */
181f7499066Ssp92102 	if (_aio_hash == NULL) {
182f841f6adSraf 		/* LINTED pointer cast */
183f841f6adSraf 		_aio_hash = (aio_hash_t *)mmap(NULL,
184f841f6adSraf 		    HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
185f841f6adSraf 		    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
186f841f6adSraf 		if ((void *)_aio_hash == MAP_FAILED) {
187f841f6adSraf 			_aio_hash = NULL;
188f841f6adSraf 			goto out;
189f841f6adSraf 		}
190f841f6adSraf 		for (i = 0; i < HASHSZ; i++)
191f7499066Ssp92102 			(void) mutex_init(&_aio_hash[i].hash_lock,
192f7499066Ssp92102 			    USYNC_THREAD, NULL);
193f7499066Ssp92102 	}
194f841f6adSraf 
195f841f6adSraf 	/*
196f841f6adSraf 	 * Initialize worker's signal mask to only catch SIGAIOCANCEL.
197f841f6adSraf 	 */
198f841f6adSraf 	(void) sigfillset(&_worker_set);
199f841f6adSraf 	(void) sigdelset(&_worker_set, SIGAIOCANCEL);
200f841f6adSraf 
201f841f6adSraf 	/*
202f7499066Ssp92102 	 * Create one worker to send asynchronous notifications.
203f7499066Ssp92102 	 * Do this only once, even if __uaio_init() is called twice.
204f7499066Ssp92102 	 */
205f7499066Ssp92102 	if (__no_workerscnt == 0 &&
206f7499066Ssp92102 	    (_aio_create_worker(NULL, AIONOTIFY) != 0)) {
207f7499066Ssp92102 		errno = EAGAIN;
208f7499066Ssp92102 		goto out;
209f7499066Ssp92102 	}
210f7499066Ssp92102 
211f7499066Ssp92102 	/*
212f841f6adSraf 	 * Create the minimum number of read/write workers.
213f7499066Ssp92102 	 * And later check whether atleast one worker is created;
214f7499066Ssp92102 	 * lwp_create() calls could fail because of segkp exhaustion.
215f841f6adSraf 	 */
216f841f6adSraf 	for (i = 0; i < _min_workers; i++)
217f841f6adSraf 		(void) _aio_create_worker(NULL, AIOREAD);
218f7499066Ssp92102 	if (__rw_workerscnt == 0) {
219f7499066Ssp92102 		errno = EAGAIN;
220f7499066Ssp92102 		goto out;
221f7499066Ssp92102 	}
222f841f6adSraf 
223f841f6adSraf 	ret = 0;
224f841f6adSraf out:
225f841f6adSraf 	lmutex_lock(&__aio_initlock);
226f841f6adSraf 	if (ret == 0)
227f841f6adSraf 		__uaio_ok = 1;
228f841f6adSraf 	__aio_initbusy = 0;
229f841f6adSraf 	(void) cond_broadcast(&__aio_initcv);
230f841f6adSraf 	lmutex_unlock(&__aio_initlock);
231f841f6adSraf 	return (ret);
232f841f6adSraf }
233f841f6adSraf 
234f841f6adSraf /*
235f841f6adSraf  * Called from close() before actually performing the real _close().
236f841f6adSraf  */
237f841f6adSraf void
238f841f6adSraf _aio_close(int fd)
239f841f6adSraf {
240f841f6adSraf 	if (fd < 0)	/* avoid cancelling everything */
241f841f6adSraf 		return;
242f841f6adSraf 	/*
243f841f6adSraf 	 * Cancel all outstanding aio requests for this file descriptor.
244f841f6adSraf 	 */
245f841f6adSraf 	if (__uaio_ok)
246f841f6adSraf 		(void) aiocancel_all(fd);
247f841f6adSraf 	/*
248f841f6adSraf 	 * If we have allocated the bit array, clear the bit for this file.
249f841f6adSraf 	 * The next open may re-use this file descriptor and the new file
250f841f6adSraf 	 * may have different kaio() behaviour.
251f841f6adSraf 	 */
252f841f6adSraf 	if (_kaio_supported != NULL)
253f841f6adSraf 		CLEAR_KAIO_SUPPORTED(fd);
254f841f6adSraf }
255f841f6adSraf 
256f841f6adSraf /*
257f841f6adSraf  * special kaio cleanup thread sits in a loop in the
258f841f6adSraf  * kernel waiting for pending kaio requests to complete.
259f841f6adSraf  */
260f841f6adSraf void *
261f841f6adSraf _kaio_cleanup_thread(void *arg)
262f841f6adSraf {
263f841f6adSraf 	if (pthread_setspecific(_aio_key, arg) != 0)
264f841f6adSraf 		aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
265f841f6adSraf 	(void) _kaio(AIOSTART);
266f841f6adSraf 	return (arg);
267f841f6adSraf }
268f841f6adSraf 
269f841f6adSraf /*
270f841f6adSraf  * initialize kaio.
271f841f6adSraf  */
272f841f6adSraf void
273f841f6adSraf _kaio_init()
274f841f6adSraf {
275f841f6adSraf 	int error;
276f841f6adSraf 	sigset_t oset;
277*a574db85Sraf 	int cancel_state;
278f841f6adSraf 
279f841f6adSraf 	lmutex_lock(&__aio_initlock);
280*a574db85Sraf 	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
281f841f6adSraf 	while (__aio_initbusy)
282*a574db85Sraf 		(void) cond_wait(&__aio_initcv, &__aio_initlock);
283*a574db85Sraf 	(void) pthread_setcancelstate(cancel_state, NULL);
284f841f6adSraf 	if (_kaio_ok) {		/* already initialized */
285f841f6adSraf 		lmutex_unlock(&__aio_initlock);
286f841f6adSraf 		return;
287f841f6adSraf 	}
288f841f6adSraf 	__aio_initbusy = 1;
289f841f6adSraf 	lmutex_unlock(&__aio_initlock);
290f841f6adSraf 
291f841f6adSraf 	if (_kaio_supported_init() != 0)
292f841f6adSraf 		error = ENOMEM;
293f841f6adSraf 	else if ((_kaiowp = _aio_worker_alloc()) == NULL)
294f841f6adSraf 		error = ENOMEM;
295f841f6adSraf 	else if ((error = (int)_kaio(AIOINIT)) == 0) {
296f841f6adSraf 		(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
297f841f6adSraf 		error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
298f841f6adSraf 		    _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
299f841f6adSraf 		(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
300f841f6adSraf 	}
301f841f6adSraf 	if (error && _kaiowp != NULL) {
302f841f6adSraf 		_aio_worker_free(_kaiowp);
303f841f6adSraf 		_kaiowp = NULL;
304f841f6adSraf 	}
305f841f6adSraf 
306f841f6adSraf 	lmutex_lock(&__aio_initlock);
307f841f6adSraf 	if (error)
308f841f6adSraf 		_kaio_ok = -1;
309f841f6adSraf 	else
310f841f6adSraf 		_kaio_ok = 1;
311f841f6adSraf 	__aio_initbusy = 0;
312f841f6adSraf 	(void) cond_broadcast(&__aio_initcv);
313f841f6adSraf 	lmutex_unlock(&__aio_initlock);
314f841f6adSraf }
315f841f6adSraf 
316f841f6adSraf int
317f841f6adSraf aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
318f841f6adSraf     aio_result_t *resultp)
319f841f6adSraf {
320f841f6adSraf 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
321f841f6adSraf }
322f841f6adSraf 
323f841f6adSraf int
324f841f6adSraf aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
325f841f6adSraf     aio_result_t *resultp)
326f841f6adSraf {
327f841f6adSraf 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
328f841f6adSraf }
329f841f6adSraf 
#if !defined(_LP64)
/*
 * Large-file (64-bit offset) variants for 32-bit processes.
 */
int
aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));
}

int
aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));
}
#endif	/* !defined(_LP64) */
345f841f6adSraf 
346f841f6adSraf int
347f841f6adSraf _aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
348f841f6adSraf     aio_result_t *resultp, int mode)
349f841f6adSraf {
350f841f6adSraf 	aio_req_t *reqp;
351f841f6adSraf 	aio_args_t *ap;
352f841f6adSraf 	offset_t loffset;
353967072a1Spraks 	struct stat64 stat64;
354f841f6adSraf 	int error = 0;
355f841f6adSraf 	int kerr;
356f841f6adSraf 	int umode;
357f841f6adSraf 
358f841f6adSraf 	switch (whence) {
359f841f6adSraf 
360f841f6adSraf 	case SEEK_SET:
361f841f6adSraf 		loffset = offset;
362f841f6adSraf 		break;
363f841f6adSraf 	case SEEK_CUR:
364f841f6adSraf 		if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
365f841f6adSraf 			error = -1;
366f841f6adSraf 		else
367f841f6adSraf 			loffset += offset;
368f841f6adSraf 		break;
369f841f6adSraf 	case SEEK_END:
370967072a1Spraks 		if (fstat64(fd, &stat64) == -1)
371f841f6adSraf 			error = -1;
372f841f6adSraf 		else
373967072a1Spraks 			loffset = offset + stat64.st_size;
374f841f6adSraf 		break;
375f841f6adSraf 	default:
376f841f6adSraf 		errno = EINVAL;
377f841f6adSraf 		error = -1;
378f841f6adSraf 	}
379f841f6adSraf 
380f841f6adSraf 	if (error)
381f841f6adSraf 		return (error);
382f841f6adSraf 
383f841f6adSraf 	/* initialize kaio */
384f841f6adSraf 	if (!_kaio_ok)
385f841f6adSraf 		_kaio_init();
386f841f6adSraf 
387f841f6adSraf 	/*
388f841f6adSraf 	 * _aio_do_request() needs the original request code (mode) to be able
389f841f6adSraf 	 * to choose the appropiate 32/64 bit function.  All other functions
390f841f6adSraf 	 * only require the difference between READ and WRITE (umode).
391f841f6adSraf 	 */
392f841f6adSraf 	if (mode == AIOAREAD64 || mode == AIOAWRITE64)
393f841f6adSraf 		umode = mode - AIOAREAD64;
394f841f6adSraf 	else
395f841f6adSraf 		umode = mode;
396f841f6adSraf 
397f841f6adSraf 	/*
398f841f6adSraf 	 * Try kernel aio first.
399f841f6adSraf 	 * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
400f841f6adSraf 	 */
401f841f6adSraf 	if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
402f841f6adSraf 		resultp->aio_errno = 0;
403f841f6adSraf 		sig_mutex_lock(&__aio_mutex);
404f841f6adSraf 		_kaio_outstand_cnt++;
405967072a1Spraks 		sig_mutex_unlock(&__aio_mutex);
406f841f6adSraf 		kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
407f841f6adSraf 		    (umode | AIO_POLL_BIT) : umode),
408f841f6adSraf 		    fd, buf, bufsz, loffset, resultp);
409f841f6adSraf 		if (kerr == 0) {
410f841f6adSraf 			return (0);
411f841f6adSraf 		}
412967072a1Spraks 		sig_mutex_lock(&__aio_mutex);
413f841f6adSraf 		_kaio_outstand_cnt--;
414f841f6adSraf 		sig_mutex_unlock(&__aio_mutex);
415f841f6adSraf 		if (errno != ENOTSUP && errno != EBADFD)
416f841f6adSraf 			return (-1);
417f841f6adSraf 		if (errno == EBADFD)
418f841f6adSraf 			SET_KAIO_NOT_SUPPORTED(fd);
419f841f6adSraf 	}
420f841f6adSraf 
421f841f6adSraf 	if (!__uaio_ok && __uaio_init() == -1)
422f841f6adSraf 		return (-1);
423f841f6adSraf 
424f841f6adSraf 	if ((reqp = _aio_req_alloc()) == NULL) {
425f841f6adSraf 		errno = EAGAIN;
426f841f6adSraf 		return (-1);
427f841f6adSraf 	}
428f841f6adSraf 
429f841f6adSraf 	/*
430f841f6adSraf 	 * _aio_do_request() checks reqp->req_op to differentiate
431f841f6adSraf 	 * between 32 and 64 bit access.
432f841f6adSraf 	 */
433f841f6adSraf 	reqp->req_op = mode;
434f841f6adSraf 	reqp->req_resultp = resultp;
435f841f6adSraf 	ap = &reqp->req_args;
436f841f6adSraf 	ap->fd = fd;
437f841f6adSraf 	ap->buf = buf;
438f841f6adSraf 	ap->bufsz = bufsz;
439f841f6adSraf 	ap->offset = loffset;
440f841f6adSraf 
441f841f6adSraf 	if (_aio_hash_insert(resultp, reqp) != 0) {
442f841f6adSraf 		_aio_req_free(reqp);
443f841f6adSraf 		errno = EINVAL;
444f841f6adSraf 		return (-1);
445f841f6adSraf 	}
446f841f6adSraf 	/*
447f841f6adSraf 	 * _aio_req_add() only needs the difference between READ and
448f841f6adSraf 	 * WRITE to choose the right worker queue.
449f841f6adSraf 	 */
450f841f6adSraf 	_aio_req_add(reqp, &__nextworker_rw, umode);
451f841f6adSraf 	return (0);
452f841f6adSraf }
453f841f6adSraf 
454f841f6adSraf int
455f841f6adSraf aiocancel(aio_result_t *resultp)
456f841f6adSraf {
457f841f6adSraf 	aio_req_t *reqp;
458f841f6adSraf 	aio_worker_t *aiowp;
459f841f6adSraf 	int ret;
460f841f6adSraf 	int done = 0;
461f841f6adSraf 	int canceled = 0;
462f841f6adSraf 
463f841f6adSraf 	if (!__uaio_ok) {
464f841f6adSraf 		errno = EINVAL;
465f841f6adSraf 		return (-1);
466f841f6adSraf 	}
467f841f6adSraf 
468f841f6adSraf 	sig_mutex_lock(&__aio_mutex);
469f841f6adSraf 	reqp = _aio_hash_find(resultp);
470f841f6adSraf 	if (reqp == NULL) {
471f841f6adSraf 		if (_aio_outstand_cnt == _aio_req_done_cnt)
472f841f6adSraf 			errno = EINVAL;
473f841f6adSraf 		else
474f841f6adSraf 			errno = EACCES;
475f841f6adSraf 		ret = -1;
476f841f6adSraf 	} else {
477f841f6adSraf 		aiowp = reqp->req_worker;
478f841f6adSraf 		sig_mutex_lock(&aiowp->work_qlock1);
479f841f6adSraf 		(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
480f841f6adSraf 		sig_mutex_unlock(&aiowp->work_qlock1);
481f841f6adSraf 
482f841f6adSraf 		if (canceled) {
483f841f6adSraf 			ret = 0;
484f841f6adSraf 		} else {
485f841f6adSraf 			if (_aio_outstand_cnt == 0 ||
486f841f6adSraf 			    _aio_outstand_cnt == _aio_req_done_cnt)
487f841f6adSraf 				errno = EINVAL;
488f841f6adSraf 			else
489f841f6adSraf 				errno = EACCES;
490f841f6adSraf 			ret = -1;
491f841f6adSraf 		}
492f841f6adSraf 	}
493f841f6adSraf 	sig_mutex_unlock(&__aio_mutex);
494f841f6adSraf 	return (ret);
495f841f6adSraf }
496f841f6adSraf 
497*a574db85Sraf /* ARGSUSED */
498*a574db85Sraf static void
499*a574db85Sraf _aiowait_cleanup(void *arg)
500*a574db85Sraf {
501*a574db85Sraf 	sig_mutex_lock(&__aio_mutex);
502*a574db85Sraf 	_aiowait_flag--;
503*a574db85Sraf 	sig_mutex_unlock(&__aio_mutex);
504*a574db85Sraf }
505*a574db85Sraf 
506f841f6adSraf /*
507*a574db85Sraf  * This must be asynch safe and cancel safe
508f841f6adSraf  */
509f841f6adSraf aio_result_t *
510f841f6adSraf aiowait(struct timeval *uwait)
511f841f6adSraf {
512f841f6adSraf 	aio_result_t *uresultp;
513f841f6adSraf 	aio_result_t *kresultp;
514f841f6adSraf 	aio_result_t *resultp;
515f841f6adSraf 	int dontblock;
516f841f6adSraf 	int timedwait = 0;
517f841f6adSraf 	int kaio_errno = 0;
518f841f6adSraf 	struct timeval twait;
519f841f6adSraf 	struct timeval *wait = NULL;
520f841f6adSraf 	hrtime_t hrtend;
521f841f6adSraf 	hrtime_t hres;
522f841f6adSraf 
523f841f6adSraf 	if (uwait) {
524f841f6adSraf 		/*
525f841f6adSraf 		 * Check for a valid specified wait time.
526f841f6adSraf 		 * If it is invalid, fail the call right away.
527f841f6adSraf 		 */
528f841f6adSraf 		if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
529f841f6adSraf 		    uwait->tv_usec >= MICROSEC) {
530f841f6adSraf 			errno = EINVAL;
531f841f6adSraf 			return ((aio_result_t *)-1);
532f841f6adSraf 		}
533f841f6adSraf 
534f841f6adSraf 		if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
535f841f6adSraf 			hrtend = gethrtime() +
536f841f6adSraf 			    (hrtime_t)uwait->tv_sec * NANOSEC +
537f841f6adSraf 			    (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
538f841f6adSraf 			twait = *uwait;
539f841f6adSraf 			wait = &twait;
540f841f6adSraf 			timedwait++;
541f841f6adSraf 		} else {
542f841f6adSraf 			/* polling */
543f841f6adSraf 			sig_mutex_lock(&__aio_mutex);
544f841f6adSraf 			if (_kaio_outstand_cnt == 0) {
545f841f6adSraf 				kresultp = (aio_result_t *)-1;
546f841f6adSraf 			} else {
547f841f6adSraf 				kresultp = (aio_result_t *)_kaio(AIOWAIT,
548f841f6adSraf 				    (struct timeval *)-1, 1);
549f841f6adSraf 				if (kresultp != (aio_result_t *)-1 &&
550f841f6adSraf 				    kresultp != NULL &&
551f841f6adSraf 				    kresultp != (aio_result_t *)1) {
552f841f6adSraf 					_kaio_outstand_cnt--;
553f841f6adSraf 					sig_mutex_unlock(&__aio_mutex);
554f841f6adSraf 					return (kresultp);
555f841f6adSraf 				}
556f841f6adSraf 			}
557f841f6adSraf 			uresultp = _aio_req_done();
558f841f6adSraf 			sig_mutex_unlock(&__aio_mutex);
559f841f6adSraf 			if (uresultp != NULL &&
560f841f6adSraf 			    uresultp != (aio_result_t *)-1) {
561f841f6adSraf 				return (uresultp);
562f841f6adSraf 			}
563f841f6adSraf 			if (uresultp == (aio_result_t *)-1 &&
564f841f6adSraf 			    kresultp == (aio_result_t *)-1) {
565f841f6adSraf 				errno = EINVAL;
566f841f6adSraf 				return ((aio_result_t *)-1);
567f841f6adSraf 			} else {
568f841f6adSraf 				return (NULL);
569f841f6adSraf 			}
570f841f6adSraf 		}
571f841f6adSraf 	}
572f841f6adSraf 
573f841f6adSraf 	for (;;) {
574f841f6adSraf 		sig_mutex_lock(&__aio_mutex);
575f841f6adSraf 		uresultp = _aio_req_done();
576f841f6adSraf 		if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
577f841f6adSraf 			sig_mutex_unlock(&__aio_mutex);
578f841f6adSraf 			resultp = uresultp;
579f841f6adSraf 			break;
580f841f6adSraf 		}
581f841f6adSraf 		_aiowait_flag++;
582f841f6adSraf 		dontblock = (uresultp == (aio_result_t *)-1);
583f841f6adSraf 		if (dontblock && _kaio_outstand_cnt == 0) {
584f841f6adSraf 			kresultp = (aio_result_t *)-1;
585f841f6adSraf 			kaio_errno = EINVAL;
586f841f6adSraf 		} else {
587f841f6adSraf 			sig_mutex_unlock(&__aio_mutex);
588*a574db85Sraf 			pthread_cleanup_push(_aiowait_cleanup, NULL);
589*a574db85Sraf 			_cancel_prologue();
590f841f6adSraf 			kresultp = (aio_result_t *)_kaio(AIOWAIT,
591f841f6adSraf 			    wait, dontblock);
592*a574db85Sraf 			_cancel_epilogue();
593*a574db85Sraf 			pthread_cleanup_pop(0);
594f841f6adSraf 			sig_mutex_lock(&__aio_mutex);
595f841f6adSraf 			kaio_errno = errno;
596f841f6adSraf 		}
597f841f6adSraf 		_aiowait_flag--;
598f841f6adSraf 		sig_mutex_unlock(&__aio_mutex);
599f841f6adSraf 		if (kresultp == (aio_result_t *)1) {
600f841f6adSraf 			/* aiowait() awakened by an aionotify() */
601f841f6adSraf 			continue;
602f841f6adSraf 		} else if (kresultp != NULL &&
603f841f6adSraf 		    kresultp != (aio_result_t *)-1) {
604f841f6adSraf 			resultp = kresultp;
605f841f6adSraf 			sig_mutex_lock(&__aio_mutex);
606f841f6adSraf 			_kaio_outstand_cnt--;
607f841f6adSraf 			sig_mutex_unlock(&__aio_mutex);
608f841f6adSraf 			break;
609f841f6adSraf 		} else if (kresultp == (aio_result_t *)-1 &&
610f841f6adSraf 		    kaio_errno == EINVAL &&
611f841f6adSraf 		    uresultp == (aio_result_t *)-1) {
612f841f6adSraf 			errno = kaio_errno;
613f841f6adSraf 			resultp = (aio_result_t *)-1;
614f841f6adSraf 			break;
615f841f6adSraf 		} else if (kresultp == (aio_result_t *)-1 &&
616f841f6adSraf 		    kaio_errno == EINTR) {
617f841f6adSraf 			errno = kaio_errno;
618f841f6adSraf 			resultp = (aio_result_t *)-1;
619f841f6adSraf 			break;
620f841f6adSraf 		} else if (timedwait) {
621f841f6adSraf 			hres = hrtend - gethrtime();
622f841f6adSraf 			if (hres <= 0) {
623f841f6adSraf 				/* time is up; return */
624f841f6adSraf 				resultp = NULL;
625f841f6adSraf 				break;
626f841f6adSraf 			} else {
627f841f6adSraf 				/*
628f841f6adSraf 				 * Some time left.  Round up the remaining time
629f841f6adSraf 				 * in nanoseconds to microsec.  Retry the call.
630f841f6adSraf 				 */
631f841f6adSraf 				hres += (NANOSEC / MICROSEC) - 1;
632f841f6adSraf 				wait->tv_sec = hres / NANOSEC;
633f841f6adSraf 				wait->tv_usec =
634f841f6adSraf 				    (hres % NANOSEC) / (NANOSEC / MICROSEC);
635f841f6adSraf 			}
636f841f6adSraf 		} else {
637f841f6adSraf 			ASSERT(kresultp == NULL && uresultp == NULL);
638f841f6adSraf 			resultp = NULL;
639f841f6adSraf 			continue;
640f841f6adSraf 		}
641f841f6adSraf 	}
642f841f6adSraf 	return (resultp);
643f841f6adSraf }
644f841f6adSraf 
645f841f6adSraf /*
646f841f6adSraf  * _aio_get_timedelta calculates the remaining time and stores the result
647f841f6adSraf  * into timespec_t *wait.
648f841f6adSraf  */
649f841f6adSraf 
650f841f6adSraf int
651f841f6adSraf _aio_get_timedelta(timespec_t *end, timespec_t *wait)
652f841f6adSraf {
653f841f6adSraf 	int	ret = 0;
654f841f6adSraf 	struct	timeval cur;
655f841f6adSraf 	timespec_t curtime;
656f841f6adSraf 
657f841f6adSraf 	(void) gettimeofday(&cur, NULL);
658f841f6adSraf 	curtime.tv_sec = cur.tv_sec;
659f841f6adSraf 	curtime.tv_nsec = cur.tv_usec * 1000;   /* convert us to ns */
660f841f6adSraf 
661f841f6adSraf 	if (end->tv_sec >= curtime.tv_sec) {
662f841f6adSraf 		wait->tv_sec = end->tv_sec - curtime.tv_sec;
663f841f6adSraf 		if (end->tv_nsec >= curtime.tv_nsec) {
664f841f6adSraf 			wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
665f841f6adSraf 			if (wait->tv_sec == 0 && wait->tv_nsec == 0)
666f841f6adSraf 				ret = -1;	/* timer expired */
667f841f6adSraf 		} else {
668f841f6adSraf 			if (end->tv_sec > curtime.tv_sec) {
669f841f6adSraf 				wait->tv_sec -= 1;
670f841f6adSraf 				wait->tv_nsec = NANOSEC -
671f841f6adSraf 				    (curtime.tv_nsec - end->tv_nsec);
672f841f6adSraf 			} else {
673f841f6adSraf 				ret = -1;	/* timer expired */
674f841f6adSraf 			}
675f841f6adSraf 		}
676f841f6adSraf 	} else {
677f841f6adSraf 		ret = -1;
678f841f6adSraf 	}
679f841f6adSraf 	return (ret);
680f841f6adSraf }
681f841f6adSraf 
682f841f6adSraf /*
683f841f6adSraf  * If closing by file descriptor: we will simply cancel all the outstanding
684f841f6adSraf  * aio`s and return.  Those aio's in question will have either noticed the
685f841f6adSraf  * cancellation notice before, during, or after initiating io.
686f841f6adSraf  */
687f841f6adSraf int
688f841f6adSraf aiocancel_all(int fd)
689f841f6adSraf {
690f841f6adSraf 	aio_req_t *reqp;
691f841f6adSraf 	aio_req_t **reqpp;
692f841f6adSraf 	aio_worker_t *first;
693f841f6adSraf 	aio_worker_t *next;
694f841f6adSraf 	int canceled = 0;
695f841f6adSraf 	int done = 0;
696f841f6adSraf 	int cancelall = 0;
697f841f6adSraf 
698f841f6adSraf 	sig_mutex_lock(&__aio_mutex);
699f841f6adSraf 
700f841f6adSraf 	if (_aio_outstand_cnt == 0) {
701f841f6adSraf 		sig_mutex_unlock(&__aio_mutex);
702f841f6adSraf 		return (AIO_ALLDONE);
703f841f6adSraf 	}
704f841f6adSraf 
705f841f6adSraf 	/*
706f841f6adSraf 	 * Cancel requests from the read/write workers' queues.
707f841f6adSraf 	 */
708f841f6adSraf 	first = __nextworker_rw;
709f841f6adSraf 	next = first;
710f841f6adSraf 	do {
711f841f6adSraf 		_aio_cancel_work(next, fd, &canceled, &done);
712f841f6adSraf 	} while ((next = next->work_forw) != first);
713f841f6adSraf 
714f841f6adSraf 	/*
715f841f6adSraf 	 * finally, check if there are requests on the done queue that
716f841f6adSraf 	 * should be canceled.
717f841f6adSraf 	 */
718f841f6adSraf 	if (fd < 0)
719f841f6adSraf 		cancelall = 1;
720f841f6adSraf 	reqpp = &_aio_done_tail;
721f841f6adSraf 	while ((reqp = *reqpp) != NULL) {
722f841f6adSraf 		if (cancelall || reqp->req_args.fd == fd) {
723f841f6adSraf 			*reqpp = reqp->req_next;
724f841f6adSraf 			_aio_donecnt--;
725f841f6adSraf 			(void) _aio_hash_del(reqp->req_resultp);
726f841f6adSraf 			_aio_req_free(reqp);
727f841f6adSraf 		} else
728f841f6adSraf 			reqpp = &reqp->req_next;
729f841f6adSraf 	}
730f841f6adSraf 	if (cancelall) {
731f841f6adSraf 		ASSERT(_aio_donecnt == 0);
732f841f6adSraf 		_aio_done_head = NULL;
733f841f6adSraf 	}
734f841f6adSraf 	sig_mutex_unlock(&__aio_mutex);
735f841f6adSraf 
736f841f6adSraf 	if (canceled && done == 0)
737f841f6adSraf 		return (AIO_CANCELED);
738f841f6adSraf 	else if (done && canceled == 0)
739f841f6adSraf 		return (AIO_ALLDONE);
740f841f6adSraf 	else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
741f841f6adSraf 		return ((int)_kaio(AIOCANCEL, fd, NULL));
742f841f6adSraf 	return (AIO_NOTCANCELED);
743f841f6adSraf }
744f841f6adSraf 
745f841f6adSraf /*
746f841f6adSraf  * Cancel requests from a given work queue.  If the file descriptor
747f841f6adSraf  * parameter, fd, is non-negative, then only cancel those requests
748f841f6adSraf  * in this queue that are to this file descriptor.  If the fd
749f841f6adSraf  * parameter is -1, then cancel all requests.
750f841f6adSraf  */
static void
_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	/*
	 * Cancel queued requests first.
	 * *canceled and *done are running counters that
	 * _aio_cancel_req() increments as requests are processed.
	 */
	reqp = aiowp->work_tail1;
	while (reqp != NULL) {
		if (fd < 0 || reqp->req_args.fd == fd) {
			if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
				/*
				 * The caller's locks were dropped.
				 * reqp is invalid; start traversing
				 * the list from the beginning again.
				 */
				reqp = aiowp->work_tail1;
				continue;
			}
		}
		reqp = reqp->req_next;
	}
	/*
	 * Since the queued requests have been canceled, there can
	 * only be one inprogress request that should be canceled.
	 */
	if ((reqp = aiowp->work_req) != NULL &&
	    (fd < 0 || reqp->req_args.fd == fd))
		(void) _aio_cancel_req(aiowp, reqp, canceled, done);
	sig_mutex_unlock(&aiowp->work_qlock1);
}
784f841f6adSraf 
785f841f6adSraf /*
786f841f6adSraf  * Cancel a request.  Return 1 if the callers locks were temporarily
787f841f6adSraf  * dropped, otherwise return 0.
788f841f6adSraf  */
int
_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
{
	int ostate = reqp->req_state;

	ASSERT(MUTEX_HELD(&__aio_mutex));
	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
	/* Already canceled: nothing to do and no counter to bump. */
	if (ostate == AIO_REQ_CANCELED)
		return (0);
	/* Already completed: count it as done rather than canceled. */
	if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
		(*done)++;
		return (0);
	}
	if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
		ASSERT(POSIX_AIO(reqp));
		/* Cancel the queued aio_fsync() request */
		if (!reqp->req_head->lio_canned) {
			reqp->req_head->lio_canned = 1;
			_aio_outstand_cnt--;
			(*canceled)++;
		}
		return (0);
	}
	reqp->req_state = AIO_REQ_CANCELED;
	_aio_req_del(aiowp, reqp, ostate);
	(void) _aio_hash_del(reqp->req_resultp);
	(*canceled)++;
	if (reqp == aiowp->work_req) {
		ASSERT(ostate == AIO_REQ_INPROGRESS);
		/*
		 * Set the result values now, before _aiodone() is called.
		 * We do this because the application can expect aio_return
		 * and aio_errno to be set to -1 and ECANCELED, respectively,
		 * immediately after a successful return from aiocancel()
		 * or aio_cancel().
		 */
		_aio_set_result(reqp, -1, ECANCELED);
		/* Interrupt the worker so it abandons the in-progress I/O. */
		(void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
		return (0);
	}
	if (!POSIX_AIO(reqp)) {
		_aio_outstand_cnt--;
		_aio_set_result(reqp, -1, ECANCELED);
		return (0);
	}
	/*
	 * Posix case: _aiodone() must run without the locks held,
	 * so drop them, complete the request, then reacquire them.
	 * Returning 1 tells the caller that its locks were dropped.
	 */
	sig_mutex_unlock(&aiowp->work_qlock1);
	sig_mutex_unlock(&__aio_mutex);
	_aiodone(reqp, -1, ECANCELED);
	sig_mutex_lock(&__aio_mutex);
	sig_mutex_lock(&aiowp->work_qlock1);
	return (1);
}
841f841f6adSraf 
/*
 * Create a new worker thread and add it to the appropriate worker pool.
 * If reqp is non-NULL, it is pre-queued as the new worker's first request.
 * mode selects the pool: read/write workers or notification workers.
 * Returns 0 on success, -1 if worker allocation or thread creation fails.
 */
int
_aio_create_worker(aio_req_t *reqp, int mode)
{
	aio_worker_t *aiowp, **workers, **nextworker;
	int *aio_workerscnt;
	void *(*func)(void *);
	sigset_t oset;
	int error;

	/*
	 * Put the new worker thread in the right queue.
	 */
	switch (mode) {
	case AIOREAD:
	case AIOWRITE:
	case AIOAREAD:
	case AIOAWRITE:
#if !defined(_LP64)
	case AIOAREAD64:
	case AIOAWRITE64:
#endif
		workers = &__workers_rw;
		nextworker = &__nextworker_rw;
		aio_workerscnt = &__rw_workerscnt;
		func = _aio_do_request;
		break;
	case AIONOTIFY:
		workers = &__workers_no;
		nextworker = &__nextworker_no;
		func = _aio_do_notify;
		aio_workerscnt = &__no_workerscnt;
		break;
	default:
		/* aio_panic() does not return */
		aio_panic("_aio_create_worker: invalid mode");
		break;
	}

	if ((aiowp = _aio_worker_alloc()) == NULL)
		return (-1);

	if (reqp) {
		/* Seed the new worker's queue with this single request. */
		reqp->req_state = AIO_REQ_QUEUED;
		reqp->req_worker = aiowp;
		aiowp->work_head1 = reqp;
		aiowp->work_tail1 = reqp;
		aiowp->work_next1 = reqp;
		aiowp->work_count1 = 1;
		aiowp->work_minload1 = 1;
	}

	/*
	 * Create the thread with all signals in maskset blocked so it
	 * inherits that mask; it starts suspended and is released only
	 * after it has been linked into the worker list below.
	 */
	(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
	error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
	    THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
	if (error) {
		/* Undo the request seeding and release the worker. */
		if (reqp) {
			reqp->req_state = 0;
			reqp->req_worker = NULL;
		}
		_aio_worker_free(aiowp);
		return (-1);
	}

	/* Link the worker into the pool's circular doubly-linked list. */
	lmutex_lock(&__aio_mutex);
	(*aio_workerscnt)++;
	if (*workers == NULL) {
		aiowp->work_forw = aiowp;
		aiowp->work_backw = aiowp;
		*nextworker = aiowp;
		*workers = aiowp;
	} else {
		aiowp->work_backw = (*workers)->work_backw;
		aiowp->work_forw = (*workers);
		(*workers)->work_backw->work_forw = aiowp;
		(*workers)->work_backw = aiowp;
	}
	_aio_worker_cnt++;
	lmutex_unlock(&__aio_mutex);

	/* Now that it is fully linked, let the worker run. */
	(void) thr_continue(aiowp->work_tid);

	return (0);
}
925f841f6adSraf 
926f841f6adSraf /*
927f841f6adSraf  * This is the worker's main routine.
928f841f6adSraf  * The task of this function is to execute all queued requests;
929f841f6adSraf  * once the last pending request is executed this function will block
930f841f6adSraf  * in _aio_idle().  A new incoming request must wakeup this thread to
931f841f6adSraf  * restart the work.
 * Every worker has its own work queue.  The queue lock is required
933f841f6adSraf  * to synchronize the addition of new requests for this worker or
934f841f6adSraf  * cancellation of pending/running requests.
935f841f6adSraf  *
936f841f6adSraf  * Cancellation scenarios:
937f841f6adSraf  * The cancellation of a request is being done asynchronously using
938f841f6adSraf  * _aio_cancel_req() from another thread context.
 * A queued request can be cancelled in different ways:
940f841f6adSraf  * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
941f841f6adSraf  *	- lock the queue -> remove the request -> unlock the queue
942f841f6adSraf  *	- this function/thread does not detect this cancellation process
 * b) request is in progress (AIO_REQ_INPROGRESS):
 *	- this function first allows the cancellation of the running
945f841f6adSraf  *	  request with the flag "work_cancel_flg=1"
946f841f6adSraf  * 		see _aio_req_get() -> _aio_cancel_on()
947f841f6adSraf  *	  During this phase, it is allowed to interrupt the worker
948f841f6adSraf  *	  thread running the request (this thread) using the SIGAIOCANCEL
949f841f6adSraf  *	  signal.
950f841f6adSraf  *	  Once this thread returns from the kernel (because the request
951f841f6adSraf  *	  is just done), then it must disable a possible cancellation
952f841f6adSraf  *	  and proceed to finish the request.  To disable the cancellation
953f841f6adSraf  *	  this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
954f841f6adSraf  * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
955f841f6adSraf  *	  same procedure as in a)
956f841f6adSraf  *
957f841f6adSraf  * To b)
958f841f6adSraf  *	This thread uses sigsetjmp() to define the position in the code, where
959f841f6adSraf  *	it wish to continue working in the case that a SIGAIOCANCEL signal
960f841f6adSraf  *	is detected.
961f841f6adSraf  *	Normally this thread should get the cancellation signal during the
962f841f6adSraf  *	kernel phase (reading or writing).  In that case the signal handler
963f841f6adSraf  *	aiosigcancelhndlr() is activated using the worker thread context,
964f841f6adSraf  *	which again will use the siglongjmp() function to break the standard
965f841f6adSraf  *	code flow and jump to the "sigsetjmp" position, provided that
966f841f6adSraf  *	"work_cancel_flg" is set to "1".
967f841f6adSraf  *	Because the "work_cancel_flg" is only manipulated by this worker
968f841f6adSraf  *	thread and it can only run on one CPU at a given time, it is not
969f841f6adSraf  *	necessary to protect that flag with the queue lock.
970f841f6adSraf  *	Returning from the kernel (read or write system call) we must
971f841f6adSraf  *	first disable the use of the SIGAIOCANCEL signal and accordingly
972f841f6adSraf  *	the use of the siglongjmp() function to prevent a possible deadlock:
 *	- It can happen that this worker thread returns from the kernel and
974f841f6adSraf  *	  blocks in "work_qlock1",
975f841f6adSraf  *	- then a second thread cancels the apparently "in progress" request
976f841f6adSraf  *	  and sends the SIGAIOCANCEL signal to the worker thread,
 *	- the worker thread gets assigned the "work_qlock1" and returns
978f841f6adSraf  *	  from the kernel,
979f841f6adSraf  *	- the kernel detects the pending signal and activates the signal
980f841f6adSraf  *	  handler instead,
981f841f6adSraf  *	- if the "work_cancel_flg" is still set then the signal handler
982f841f6adSraf  *	  should use siglongjmp() to cancel the "in progress" request and
983f841f6adSraf  *	  it would try to acquire the same work_qlock1 in _aio_req_get()
984f841f6adSraf  *	  for a second time => deadlock.
985f841f6adSraf  *	To avoid that situation we disable the cancellation of the request
986f841f6adSraf  *	in progress BEFORE we try to acquire the work_qlock1.
987f841f6adSraf  *	In that case the signal handler will not call siglongjmp() and the
988f841f6adSraf  *	worker thread will continue running the standard code flow.
989f841f6adSraf  *	Then this thread must check the AIO_REQ_CANCELED flag to emulate
 *	a possibly required siglongjmp(), freeing the work_qlock1 and
991f841f6adSraf  *	avoiding a deadlock.
992f841f6adSraf  */
void *
_aio_do_request(void *arglist)
{
	aio_worker_t *aiowp = (aio_worker_t *)arglist;
	ulwp_t *self = curthread;
	struct aio_args *arg;
	aio_req_t *reqp;		/* current AIO request */
	ssize_t retval;
	int error;

	if (pthread_setspecific(_aio_key, aiowp) != 0)
		aio_panic("_aio_do_request, pthread_setspecific()");
	(void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
	ASSERT(aiowp->work_req == NULL);

	/*
	 * We resume here when an operation is cancelled.
	 * On first entry, aiowp->work_req == NULL, so all
	 * we do is block SIGAIOCANCEL.
	 */
	(void) sigsetjmp(aiowp->work_jmp_buf, 0);
	ASSERT(self->ul_sigdefer == 0);

	sigoff(self);	/* block SIGAIOCANCEL */
	/*
	 * A non-NULL work_req here means we arrived via siglongjmp()
	 * from the cancellation signal handler; finish the interrupted
	 * request as canceled.
	 */
	if (aiowp->work_req != NULL)
		_aio_finish_request(aiowp, -1, ECANCELED);

	for (;;) {
		/*
		 * Put completed requests on aio_done_list.  This has
		 * to be done as part of the main loop to ensure that
		 * we don't artificially starve any aiowait'ers.
		 */
		if (aiowp->work_done1)
			_aio_work_done(aiowp);

top:
		/* consume any deferred SIGAIOCANCEL signal here */
		sigon(self);
		sigoff(self);

		/* Sleep (idle) until a request becomes available. */
		while ((reqp = _aio_req_get(aiowp)) == NULL) {
			if (_aio_idle(aiowp) != 0)
				goto top;
		}
		arg = &reqp->req_args;
		ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
		    reqp->req_state == AIO_REQ_CANCELED);
		error = 0;

		switch (reqp->req_op) {
		case AIOREAD:
		case AIOAREAD:
			/*
			 * SIGAIOCANCEL is unblocked only for the duration
			 * of the (possibly blocking) I/O system call, so
			 * that an in-progress request can be cancelled by
			 * siglongjmp() from the signal handler.
			 */
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pread(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					/*
					 * fd is not seekable (ESPIPE);
					 * fall back to a plain read at
					 * the current file offset.
					 */
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
		case AIOWRITE:
		case AIOAWRITE:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pwrite(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					/* non-seekable fd: plain write */
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
#if !defined(_LP64)
		case AIOAREAD64:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pread64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
		case AIOAWRITE64:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pwrite64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
#endif	/* !defined(_LP64) */
		case AIOFSYNC:
			/*
			 * Returns non-zero while other requests of the
			 * same aio_fsync() group are still pending; in
			 * that case this request was dropped and we go
			 * fetch more work.
			 */
			if (_aio_fsync_del(aiowp, reqp))
				goto top;
			ASSERT(reqp->req_head == NULL);
			/*
			 * All writes for this fsync request are now
			 * acknowledged.  Now make these writes visible
			 * and put the final request into the hash table.
			 */
			if (reqp->req_state == AIO_REQ_CANCELED) {
				/* EMPTY */;
			} else if (arg->offset == O_SYNC) {
				if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
					error = errno;
			} else {
				if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
					error = errno;
			}
			if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
				aio_panic("_aio_do_request(): AIOFSYNC: "
				    "request already in hash table");
			break;
		default:
			aio_panic("_aio_do_request, bad op");
		}

		_aio_finish_request(aiowp, retval, error);
	}
	/* NOTREACHED */
	return (NULL);
}
1143f841f6adSraf 
1144f841f6adSraf /*
1145f841f6adSraf  * Perform the tail processing for _aio_do_request().
1146f841f6adSraf  * The in-progress request may or may not have been cancelled.
1147f841f6adSraf  */
static void
_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	if ((reqp = aiowp->work_req) == NULL)
		sig_mutex_unlock(&aiowp->work_qlock1);	/* nothing to finish */
	else {
		aiowp->work_req = NULL;
		if (reqp->req_state == AIO_REQ_CANCELED) {
			/* Canceled while in progress; force the result. */
			retval = -1;
			error = ECANCELED;
		}
		if (!POSIX_AIO(reqp)) {
			int notify;
			sig_mutex_unlock(&aiowp->work_qlock1);
			sig_mutex_lock(&__aio_mutex);
			if (reqp->req_state == AIO_REQ_INPROGRESS)
				reqp->req_state = AIO_REQ_DONE;
			/*
			 * If it was canceled, this request will not be
			 * added to done list. Just free it.
			 */
			if (error == ECANCELED) {
				_aio_outstand_cnt--;
				_aio_req_free(reqp);
			} else {
				_aio_set_result(reqp, retval, error);
				_aio_req_done_cnt++;
			}
			/*
			 * Notify any thread that may have blocked
			 * because it saw an outstanding request.
			 */
			notify = 0;
			if (_aio_outstand_cnt == 0 && _aiowait_flag) {
				notify = 1;
			}
			/* Issue the wakeup only after dropping the lock. */
			sig_mutex_unlock(&__aio_mutex);
			if (notify) {
				(void) _kaio(AIONOTIFY);
			}
		} else {
			/* Posix aio: full completion semantics in _aiodone(). */
			if (reqp->req_state == AIO_REQ_INPROGRESS)
				reqp->req_state = AIO_REQ_DONE;
			sig_mutex_unlock(&aiowp->work_qlock1);
			_aiodone(reqp, retval, error);
		}
	}
}
1199f841f6adSraf 
1200f841f6adSraf void
1201f841f6adSraf _aio_req_mark_done(aio_req_t *reqp)
1202f841f6adSraf {
1203f841f6adSraf #if !defined(_LP64)
1204f841f6adSraf 	if (reqp->req_largefile)
1205f841f6adSraf 		((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
1206f841f6adSraf 	else
1207f841f6adSraf #endif
1208f841f6adSraf 		((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
1209f841f6adSraf }
1210f841f6adSraf 
1211f841f6adSraf /*
1212f841f6adSraf  * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
1213f841f6adSraf  * hopefully to consume one of our queued signals.
1214f841f6adSraf  */
1215f841f6adSraf static void
1216f841f6adSraf _aio_delay(int ticks)
1217f841f6adSraf {
1218f841f6adSraf 	(void) usleep(ticks * (MICROSEC / hz));
1219f841f6adSraf }
1220f841f6adSraf 
1221f841f6adSraf /*
1222f841f6adSraf  * Actually send the notifications.
1223f841f6adSraf  * We could block indefinitely here if the application
1224f841f6adSraf  * is not listening for the signal or port notifications.
1225f841f6adSraf  */
1226f841f6adSraf static void
1227f841f6adSraf send_notification(notif_param_t *npp)
1228f841f6adSraf {
1229f841f6adSraf 	extern int __sigqueue(pid_t pid, int signo,
1230f841f6adSraf 	    /* const union sigval */ void *value, int si_code, int block);
1231f841f6adSraf 
1232f841f6adSraf 	if (npp->np_signo)
1233f841f6adSraf 		(void) __sigqueue(__pid, npp->np_signo, npp->np_user,
1234f841f6adSraf 		    SI_ASYNCIO, 1);
1235f841f6adSraf 	else if (npp->np_port >= 0)
1236f841f6adSraf 		(void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
1237f841f6adSraf 		    npp->np_event, npp->np_object, npp->np_user);
1238f841f6adSraf 
1239f841f6adSraf 	if (npp->np_lio_signo)
1240f841f6adSraf 		(void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,
1241f841f6adSraf 		    SI_ASYNCIO, 1);
1242f841f6adSraf 	else if (npp->np_lio_port >= 0)
1243f841f6adSraf 		(void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
1244f841f6adSraf 		    npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
1245f841f6adSraf }
1246f841f6adSraf 
1247f841f6adSraf /*
1248f841f6adSraf  * Asynchronous notification worker.
1249f841f6adSraf  */
1250f841f6adSraf void *
1251f841f6adSraf _aio_do_notify(void *arg)
1252f841f6adSraf {
1253f841f6adSraf 	aio_worker_t *aiowp = (aio_worker_t *)arg;
1254f841f6adSraf 	aio_req_t *reqp;
1255f841f6adSraf 
1256f841f6adSraf 	/*
1257f841f6adSraf 	 * This isn't really necessary.  All signals are blocked.
1258f841f6adSraf 	 */
1259f841f6adSraf 	if (pthread_setspecific(_aio_key, aiowp) != 0)
1260f841f6adSraf 		aio_panic("_aio_do_notify, pthread_setspecific()");
1261f841f6adSraf 
1262f841f6adSraf 	/*
1263f841f6adSraf 	 * Notifications are never cancelled.
1264f841f6adSraf 	 * All signals remain blocked, forever.
1265f841f6adSraf 	 */
1266f841f6adSraf 	for (;;) {
1267f841f6adSraf 		while ((reqp = _aio_req_get(aiowp)) == NULL) {
1268f841f6adSraf 			if (_aio_idle(aiowp) != 0)
1269f841f6adSraf 				aio_panic("_aio_do_notify: _aio_idle() failed");
1270f841f6adSraf 		}
1271f841f6adSraf 		send_notification(&reqp->req_notify);
1272f841f6adSraf 		_aio_req_free(reqp);
1273f841f6adSraf 	}
1274f841f6adSraf 
1275f841f6adSraf 	/* NOTREACHED */
1276f841f6adSraf 	return (NULL);
1277f841f6adSraf }
1278f841f6adSraf 
1279f841f6adSraf /*
1280f841f6adSraf  * Do the completion semantics for a request that was either canceled
1281f841f6adSraf  * by _aio_cancel_req() or was completed by _aio_do_request().
1282f841f6adSraf  */
static void
_aiodone(aio_req_t *reqp, ssize_t retval, int error)
{
	aio_result_t *resultp = reqp->req_resultp;
	int notify = 0;
	aio_lio_t *head;
	int sigev_none;
	int sigev_signal;
	int sigev_thread;
	int sigev_port;
	notif_param_t np;

	/*
	 * We call _aiodone() only for Posix I/O.
	 */
	ASSERT(POSIX_AIO(reqp));

	sigev_none = 0;
	sigev_signal = 0;
	sigev_thread = 0;
	sigev_port = 0;
	np.np_signo = 0;
	np.np_port = -1;
	np.np_lio_signo = 0;
	np.np_lio_port = -1;

	/* Decode the request's notification type into local flags. */
	switch (reqp->req_sigevent.sigev_notify) {
	case SIGEV_NONE:
		sigev_none = 1;
		break;
	case SIGEV_SIGNAL:
		sigev_signal = 1;
		break;
	case SIGEV_THREAD:
		sigev_thread = 1;
		break;
	case SIGEV_PORT:
		sigev_port = 1;
		break;
	default:
		aio_panic("_aiodone: improper sigev_notify");
		break;
	}

	/*
	 * Figure out the notification parameters while holding __aio_mutex.
	 * Actually perform the notifications after dropping __aio_mutex.
	 * This allows us to sleep for a long time (if the notifications
	 * incur delays) without impeding other async I/O operations.
	 */

	sig_mutex_lock(&__aio_mutex);

	if (sigev_signal) {
		if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
			notify = 1;
		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
	} else if (sigev_thread | sigev_port) {
		/* thread and port notifications share the event-port path */
		if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
			notify = 1;
		np.np_event = reqp->req_op;
		if (np.np_event == AIOFSYNC && reqp->req_largefile)
			np.np_event = AIOFSYNC64;
		np.np_object = (uintptr_t)reqp->req_aiocbp;
		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
	}

	/* Set the result only if it has not been set already. */
	if (resultp->aio_errno == EINPROGRESS)
		_aio_set_result(reqp, retval, error);

	_aio_outstand_cnt--;

	head = reqp->req_head;
	reqp->req_head = NULL;

	/*
	 * With SIGEV_NONE the request lives on the done queue for
	 * later retrieval; otherwise it leaves the hash table now.
	 * reqp == NULL below means "no longer ours to free".
	 */
	if (sigev_none) {
		_aio_enq_doneq(reqp);
		reqp = NULL;
	} else {
		(void) _aio_hash_del(resultp);
		_aio_req_mark_done(reqp);
	}

	_aio_waitn_wakeup();

	/*
	 * __aio_waitn() sets AIO_WAIT_INPROGRESS and
	 * __aio_suspend() increments "_aio_kernel_suspend"
	 * when they are waiting in the kernel for completed I/Os.
	 *
	 * _kaio(AIONOTIFY) awakes the corresponding function
	 * in the kernel; then the corresponding __aio_waitn() or
	 * __aio_suspend() function could reap the recently
	 * completed I/Os (_aiodone()).
	 */
	if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
		(void) _kaio(AIONOTIFY);

	sig_mutex_unlock(&__aio_mutex);

	if (head != NULL) {
		/*
		 * If all the lio requests have completed,
		 * prepare to notify the waiting thread.
		 */
		sig_mutex_lock(&head->lio_mutex);
		ASSERT(head->lio_refcnt == head->lio_nent);
		if (head->lio_refcnt == 1) {
			/* This is the last member of the list. */
			int waiting = 0;
			if (head->lio_mode == LIO_WAIT) {
				if ((waiting = head->lio_waiting) != 0)
					(void) cond_signal(&head->lio_cond_cv);
			} else if (head->lio_port < 0) { /* none or signal */
				if ((np.np_lio_signo = head->lio_signo) != 0)
					notify = 1;
				np.np_lio_user = head->lio_sigval.sival_ptr;
			} else {			/* thread or port */
				notify = 1;
				np.np_lio_port = head->lio_port;
				np.np_lio_event = head->lio_event;
				np.np_lio_object =
				    (uintptr_t)head->lio_sigevent;
				np.np_lio_user = head->lio_sigval.sival_ptr;
			}
			head->lio_nent = head->lio_refcnt = 0;
			sig_mutex_unlock(&head->lio_mutex);
			/* A waiter frees the head itself after waking up. */
			if (waiting == 0)
				_aio_lio_free(head);
		} else {
			head->lio_nent--;
			head->lio_refcnt--;
			sig_mutex_unlock(&head->lio_mutex);
		}
	}

	/*
	 * The request is completed; now perform the notifications.
	 */
	if (notify) {
		if (reqp != NULL) {
			/*
			 * We usually put the request on the notification
			 * queue because we don't want to block and delay
			 * other operations behind us in the work queue.
			 * Also we must never block on a cancel notification
			 * because we are being called from an application
			 * thread in this case and that could lead to deadlock
			 * if no other thread is receiving notifications.
			 */
			reqp->req_notify = np;
			reqp->req_op = AIONOTIFY;
			_aio_req_add(reqp, &__workers_no, AIONOTIFY);
			reqp = NULL;
		} else {
			/*
			 * We already put the request on the done queue,
			 * so we can't queue it to the notification queue.
			 * Just do the notification directly.
			 */
			send_notification(&np);
		}
	}

	if (reqp != NULL)
		_aio_req_free(reqp);
}
1449f841f6adSraf 
1450f841f6adSraf /*
1451f841f6adSraf  * Delete fsync requests from list head until there is
1452f841f6adSraf  * only one left.  Return 0 when there is only one,
1453f841f6adSraf  * otherwise return a non-zero value.
1454f841f6adSraf  */
static int
_aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
{
	aio_lio_t *head = reqp->req_head;
	int rval = 0;

	ASSERT(reqp == aiowp->work_req);
	sig_mutex_lock(&aiowp->work_qlock1);
	sig_mutex_lock(&head->lio_mutex);
	if (head->lio_refcnt > 1) {
		/*
		 * Not the last member of the list: drop this request
		 * and tell the caller to go fetch more work.
		 */
		head->lio_refcnt--;
		head->lio_nent--;
		aiowp->work_req = NULL;
		sig_mutex_unlock(&head->lio_mutex);
		sig_mutex_unlock(&aiowp->work_qlock1);
		sig_mutex_lock(&__aio_mutex);
		_aio_outstand_cnt--;
		_aio_waitn_wakeup();
		sig_mutex_unlock(&__aio_mutex);
		_aio_req_free(reqp);
		return (1);
	}
	/* Only one request remains; it will carry the actual fsync. */
	ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
	reqp->req_head = NULL;
	if (head->lio_canned)
		reqp->req_state = AIO_REQ_CANCELED;
	if (head->lio_mode == LIO_DESTROY) {
		aiowp->work_req = NULL;
		rval = 1;
	}
	sig_mutex_unlock(&head->lio_mutex);
	sig_mutex_unlock(&aiowp->work_qlock1);
	/*
	 * We held the last reference (lio_refcnt == 1), so it is safe
	 * to finish tearing down the list head without the lock.
	 */
	head->lio_refcnt--;
	head->lio_nent--;
	_aio_lio_free(head);
	if (rval != 0)
		_aio_req_free(reqp);
	return (rval);
}
1494f841f6adSraf 
1495f841f6adSraf /*
1496f841f6adSraf  * A worker is set idle when its work queue is empty.
1497f841f6adSraf  * The worker checks again that it has no more work
1498f841f6adSraf  * and then goes to sleep waiting for more work.
1499f841f6adSraf  */
1500f841f6adSraf int
1501f841f6adSraf _aio_idle(aio_worker_t *aiowp)
1502f841f6adSraf {
1503f841f6adSraf 	int error = 0;
1504f841f6adSraf 
1505f841f6adSraf 	sig_mutex_lock(&aiowp->work_qlock1);
1506f841f6adSraf 	if (aiowp->work_count1 == 0) {
1507f841f6adSraf 		ASSERT(aiowp->work_minload1 == 0);
1508f841f6adSraf 		aiowp->work_idleflg = 1;
1509f841f6adSraf 		/*
1510f841f6adSraf 		 * A cancellation handler is not needed here.
1511f841f6adSraf 		 * aio worker threads are never cancelled via pthread_cancel().
1512f841f6adSraf 		 */
1513f841f6adSraf 		error = sig_cond_wait(&aiowp->work_idle_cv,
1514f841f6adSraf 		    &aiowp->work_qlock1);
1515f841f6adSraf 		/*
1516f841f6adSraf 		 * The idle flag is normally cleared before worker is awakened
1517f841f6adSraf 		 * by aio_req_add().  On error (EINTR), we clear it ourself.
1518f841f6adSraf 		 */
1519f841f6adSraf 		if (error)
1520f841f6adSraf 			aiowp->work_idleflg = 0;
1521f841f6adSraf 	}
1522f841f6adSraf 	sig_mutex_unlock(&aiowp->work_qlock1);
1523f841f6adSraf 	return (error);
1524f841f6adSraf }
1525f841f6adSraf 
1526f841f6adSraf /*
1527f841f6adSraf  * A worker's completed AIO requests are placed onto a global
1528f841f6adSraf  * done queue.  The application is only sent a SIGIO signal if
1529f841f6adSraf  * the process has a handler enabled and it is not waiting via
1530f841f6adSraf  * aiowait().
1531f841f6adSraf  */
1532f841f6adSraf static void
1533f841f6adSraf _aio_work_done(aio_worker_t *aiowp)
1534f841f6adSraf {
1535f841f6adSraf 	aio_req_t *reqp;
1536f841f6adSraf 
1537f841f6adSraf 	sig_mutex_lock(&aiowp->work_qlock1);
1538f841f6adSraf 	reqp = aiowp->work_prev1;
1539f841f6adSraf 	reqp->req_next = NULL;
1540f841f6adSraf 	aiowp->work_done1 = 0;
1541f841f6adSraf 	aiowp->work_tail1 = aiowp->work_next1;
1542f841f6adSraf 	if (aiowp->work_tail1 == NULL)
1543f841f6adSraf 		aiowp->work_head1 = NULL;
1544f841f6adSraf 	aiowp->work_prev1 = NULL;
1545f841f6adSraf 	sig_mutex_unlock(&aiowp->work_qlock1);
1546f841f6adSraf 	sig_mutex_lock(&__aio_mutex);
1547f841f6adSraf 	_aio_donecnt++;
1548f841f6adSraf 	_aio_outstand_cnt--;
1549f841f6adSraf 	_aio_req_done_cnt--;
1550f841f6adSraf 	ASSERT(_aio_donecnt > 0 &&
1551f841f6adSraf 	    _aio_outstand_cnt >= 0 &&
1552f841f6adSraf 	    _aio_req_done_cnt >= 0);
1553f841f6adSraf 	ASSERT(reqp != NULL);
1554f841f6adSraf 
1555f841f6adSraf 	if (_aio_done_tail == NULL) {
1556f841f6adSraf 		_aio_done_head = _aio_done_tail = reqp;
1557f841f6adSraf 	} else {
1558f841f6adSraf 		_aio_done_head->req_next = reqp;
1559f841f6adSraf 		_aio_done_head = reqp;
1560f841f6adSraf 	}
1561f841f6adSraf 
1562f841f6adSraf 	if (_aiowait_flag) {
1563f841f6adSraf 		sig_mutex_unlock(&__aio_mutex);
1564f841f6adSraf 		(void) _kaio(AIONOTIFY);
1565f841f6adSraf 	} else {
1566f841f6adSraf 		sig_mutex_unlock(&__aio_mutex);
1567f841f6adSraf 		if (_sigio_enabled)
1568f841f6adSraf 			(void) kill(__pid, SIGIO);
1569f841f6adSraf 	}
1570f841f6adSraf }
1571f841f6adSraf 
1572f841f6adSraf /*
1573f841f6adSraf  * The done queue consists of AIO requests that are in either the
1574f841f6adSraf  * AIO_REQ_DONE or AIO_REQ_CANCELED state.  Requests that were cancelled
1575f841f6adSraf  * are discarded.  If the done queue is empty then NULL is returned.
1576f841f6adSraf  * Otherwise the address of a done aio_result_t is returned.
1577f841f6adSraf  */
1578f841f6adSraf aio_result_t *
1579f841f6adSraf _aio_req_done(void)
1580f841f6adSraf {
1581f841f6adSraf 	aio_req_t *reqp;
1582f841f6adSraf 	aio_result_t *resultp;
1583f841f6adSraf 
1584f841f6adSraf 	ASSERT(MUTEX_HELD(&__aio_mutex));
1585f841f6adSraf 
1586f841f6adSraf 	if ((reqp = _aio_done_tail) != NULL) {
1587f841f6adSraf 		if ((_aio_done_tail = reqp->req_next) == NULL)
1588f841f6adSraf 			_aio_done_head = NULL;
1589f841f6adSraf 		ASSERT(_aio_donecnt > 0);
1590f841f6adSraf 		_aio_donecnt--;
1591f841f6adSraf 		(void) _aio_hash_del(reqp->req_resultp);
1592f841f6adSraf 		resultp = reqp->req_resultp;
1593f841f6adSraf 		ASSERT(reqp->req_state == AIO_REQ_DONE);
1594f841f6adSraf 		_aio_req_free(reqp);
1595f841f6adSraf 		return (resultp);
1596f841f6adSraf 	}
1597f841f6adSraf 	/* is queue empty? */
1598f841f6adSraf 	if (reqp == NULL && _aio_outstand_cnt == 0) {
1599f841f6adSraf 		return ((aio_result_t *)-1);
1600f841f6adSraf 	}
1601f841f6adSraf 	return (NULL);
1602f841f6adSraf }
1603f841f6adSraf 
1604f841f6adSraf /*
1605f841f6adSraf  * Set the return and errno values for the application's use.
1606f841f6adSraf  *
1607f841f6adSraf  * For the Posix interfaces, we must set the return value first followed
1608f841f6adSraf  * by the errno value because the Posix interfaces allow for a change
1609f841f6adSraf  * in the errno value from EINPROGRESS to something else to signal
1610f841f6adSraf  * the completion of the asynchronous request.
1611f841f6adSraf  *
1612f841f6adSraf  * The opposite is true for the Solaris interfaces.  These allow for
1613f841f6adSraf  * a change in the return value from AIO_INPROGRESS to something else
1614f841f6adSraf  * to signal the completion of the asynchronous request.
1615f841f6adSraf  */
1616f841f6adSraf void
1617f841f6adSraf _aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
1618f841f6adSraf {
1619f841f6adSraf 	aio_result_t *resultp = reqp->req_resultp;
1620f841f6adSraf 
1621f841f6adSraf 	if (POSIX_AIO(reqp)) {
1622f841f6adSraf 		resultp->aio_return = retval;
1623f841f6adSraf 		membar_producer();
1624f841f6adSraf 		resultp->aio_errno = error;
1625f841f6adSraf 	} else {
1626f841f6adSraf 		resultp->aio_errno = error;
1627f841f6adSraf 		membar_producer();
1628f841f6adSraf 		resultp->aio_return = retval;
1629f841f6adSraf 	}
1630f841f6adSraf }
1631f841f6adSraf 
1632f841f6adSraf /*
1633f841f6adSraf  * Add an AIO request onto the next work queue.
1634f841f6adSraf  * A circular list of workers is used to choose the next worker.
1635f841f6adSraf  */
1636f841f6adSraf void
1637f841f6adSraf _aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
1638f841f6adSraf {
1639f841f6adSraf 	ulwp_t *self = curthread;
1640f841f6adSraf 	aio_worker_t *aiowp;
1641f841f6adSraf 	aio_worker_t *first;
1642f841f6adSraf 	int load_bal_flg = 1;
1643f841f6adSraf 	int found;
1644f841f6adSraf 
1645f841f6adSraf 	ASSERT(reqp->req_state != AIO_REQ_DONEQ);
1646f841f6adSraf 	reqp->req_next = NULL;
1647f841f6adSraf 	/*
1648f841f6adSraf 	 * Try to acquire the next worker's work queue.  If it is locked,
1649f841f6adSraf 	 * then search the list of workers until a queue is found unlocked,
1650f841f6adSraf 	 * or until the list is completely traversed at which point another
1651f841f6adSraf 	 * worker will be created.
1652f841f6adSraf 	 */
1653f841f6adSraf 	sigoff(self);		/* defer SIGIO */
1654f841f6adSraf 	sig_mutex_lock(&__aio_mutex);
1655f841f6adSraf 	first = aiowp = *nextworker;
1656f841f6adSraf 	if (mode != AIONOTIFY)
1657f841f6adSraf 		_aio_outstand_cnt++;
1658f841f6adSraf 	sig_mutex_unlock(&__aio_mutex);
1659f841f6adSraf 
1660f841f6adSraf 	switch (mode) {
1661f841f6adSraf 	case AIOREAD:
1662f841f6adSraf 	case AIOWRITE:
1663f841f6adSraf 	case AIOAREAD:
1664f841f6adSraf 	case AIOAWRITE:
1665f841f6adSraf #if !defined(_LP64)
1666f841f6adSraf 	case AIOAREAD64:
1667f841f6adSraf 	case AIOAWRITE64:
1668f841f6adSraf #endif
1669f841f6adSraf 		/* try to find an idle worker */
1670f841f6adSraf 		found = 0;
1671f841f6adSraf 		do {
1672f841f6adSraf 			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
1673f841f6adSraf 				if (aiowp->work_idleflg) {
1674f841f6adSraf 					found = 1;
1675f841f6adSraf 					break;
1676f841f6adSraf 				}
1677f841f6adSraf 				sig_mutex_unlock(&aiowp->work_qlock1);
1678f841f6adSraf 			}
1679f841f6adSraf 		} while ((aiowp = aiowp->work_forw) != first);
1680f841f6adSraf 
1681f841f6adSraf 		if (found) {
1682f841f6adSraf 			aiowp->work_minload1++;
1683f841f6adSraf 			break;
1684f841f6adSraf 		}
1685f841f6adSraf 
1686f841f6adSraf 		/* try to acquire some worker's queue lock */
1687f841f6adSraf 		do {
1688f841f6adSraf 			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
1689f841f6adSraf 				found = 1;
1690f841f6adSraf 				break;
1691f841f6adSraf 			}
1692f841f6adSraf 		} while ((aiowp = aiowp->work_forw) != first);
1693f841f6adSraf 
1694f841f6adSraf 		/*
1695f841f6adSraf 		 * Create more workers when the workers appear overloaded.
1696f841f6adSraf 		 * Either all the workers are busy draining their queues
1697f841f6adSraf 		 * or no worker's queue lock could be acquired.
1698f841f6adSraf 		 */
1699f841f6adSraf 		if (!found) {
1700f841f6adSraf 			if (_aio_worker_cnt < _max_workers) {
1701f841f6adSraf 				if (_aio_create_worker(reqp, mode))
1702f841f6adSraf 					aio_panic("_aio_req_add: add worker");
1703f841f6adSraf 				sigon(self);	/* reenable SIGIO */
1704f841f6adSraf 				return;
1705f841f6adSraf 			}
1706f841f6adSraf 
1707f841f6adSraf 			/*
1708f841f6adSraf 			 * No worker available and we have created
1709f841f6adSraf 			 * _max_workers, keep going through the
1710f841f6adSraf 			 * list slowly until we get a lock
1711f841f6adSraf 			 */
1712f841f6adSraf 			while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
1713f841f6adSraf 				/*
1714f841f6adSraf 				 * give someone else a chance
1715f841f6adSraf 				 */
1716f841f6adSraf 				_aio_delay(1);
1717f841f6adSraf 				aiowp = aiowp->work_forw;
1718f841f6adSraf 			}
1719f841f6adSraf 		}
1720f841f6adSraf 
1721f841f6adSraf 		ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
1722f841f6adSraf 		if (_aio_worker_cnt < _max_workers &&
1723f841f6adSraf 		    aiowp->work_minload1 >= _minworkload) {
1724f841f6adSraf 			sig_mutex_unlock(&aiowp->work_qlock1);
1725f841f6adSraf 			sig_mutex_lock(&__aio_mutex);
1726f841f6adSraf 			*nextworker = aiowp->work_forw;
1727f841f6adSraf 			sig_mutex_unlock(&__aio_mutex);
1728f841f6adSraf 			if (_aio_create_worker(reqp, mode))
1729f841f6adSraf 				aio_panic("aio_req_add: add worker");
1730f841f6adSraf 			sigon(self);	/* reenable SIGIO */
1731f841f6adSraf 			return;
1732f841f6adSraf 		}
1733f841f6adSraf 		aiowp->work_minload1++;
1734f841f6adSraf 		break;
1735f841f6adSraf 	case AIOFSYNC:
1736f841f6adSraf 	case AIONOTIFY:
1737f841f6adSraf 		load_bal_flg = 0;
1738f841f6adSraf 		sig_mutex_lock(&aiowp->work_qlock1);
1739f841f6adSraf 		break;
1740f841f6adSraf 	default:
1741f841f6adSraf 		aio_panic("_aio_req_add: invalid mode");
1742f841f6adSraf 		break;
1743f841f6adSraf 	}
1744f841f6adSraf 	/*
1745f841f6adSraf 	 * Put request onto worker's work queue.
1746f841f6adSraf 	 */
1747f841f6adSraf 	if (aiowp->work_tail1 == NULL) {
1748f841f6adSraf 		ASSERT(aiowp->work_count1 == 0);
1749f841f6adSraf 		aiowp->work_tail1 = reqp;
1750f841f6adSraf 		aiowp->work_next1 = reqp;
1751f841f6adSraf 	} else {
1752f841f6adSraf 		aiowp->work_head1->req_next = reqp;
1753f841f6adSraf 		if (aiowp->work_next1 == NULL)
1754f841f6adSraf 			aiowp->work_next1 = reqp;
1755f841f6adSraf 	}
1756f841f6adSraf 	reqp->req_state = AIO_REQ_QUEUED;
1757f841f6adSraf 	reqp->req_worker = aiowp;
1758f841f6adSraf 	aiowp->work_head1 = reqp;
1759f841f6adSraf 	/*
1760f841f6adSraf 	 * Awaken worker if it is not currently active.
1761f841f6adSraf 	 */
1762f841f6adSraf 	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
1763f841f6adSraf 		aiowp->work_idleflg = 0;
1764f841f6adSraf 		(void) cond_signal(&aiowp->work_idle_cv);
1765f841f6adSraf 	}
1766f841f6adSraf 	sig_mutex_unlock(&aiowp->work_qlock1);
1767f841f6adSraf 
1768f841f6adSraf 	if (load_bal_flg) {
1769f841f6adSraf 		sig_mutex_lock(&__aio_mutex);
1770f841f6adSraf 		*nextworker = aiowp->work_forw;
1771f841f6adSraf 		sig_mutex_unlock(&__aio_mutex);
1772f841f6adSraf 	}
1773f841f6adSraf 	sigon(self);	/* reenable SIGIO */
1774f841f6adSraf }
1775f841f6adSraf 
1776f841f6adSraf /*
1777f841f6adSraf  * Get an AIO request for a specified worker.
1778f841f6adSraf  * If the work queue is empty, return NULL.
1779f841f6adSraf  */
1780f841f6adSraf aio_req_t *
1781f841f6adSraf _aio_req_get(aio_worker_t *aiowp)
1782f841f6adSraf {
1783f841f6adSraf 	aio_req_t *reqp;
1784f841f6adSraf 
1785f841f6adSraf 	sig_mutex_lock(&aiowp->work_qlock1);
1786f841f6adSraf 	if ((reqp = aiowp->work_next1) != NULL) {
1787f841f6adSraf 		/*
1788f841f6adSraf 		 * Remove a POSIX request from the queue; the
1789f841f6adSraf 		 * request queue is a singularly linked list
1790f841f6adSraf 		 * with a previous pointer.  The request is
1791f841f6adSraf 		 * removed by updating the previous pointer.
1792f841f6adSraf 		 *
1793f841f6adSraf 		 * Non-posix requests are left on the queue
1794f841f6adSraf 		 * to eventually be placed on the done queue.
1795f841f6adSraf 		 */
1796f841f6adSraf 
1797f841f6adSraf 		if (POSIX_AIO(reqp)) {
1798f841f6adSraf 			if (aiowp->work_prev1 == NULL) {
1799f841f6adSraf 				aiowp->work_tail1 = reqp->req_next;
1800f841f6adSraf 				if (aiowp->work_tail1 == NULL)
1801f841f6adSraf 					aiowp->work_head1 = NULL;
1802f841f6adSraf 			} else {
1803f841f6adSraf 				aiowp->work_prev1->req_next = reqp->req_next;
1804f841f6adSraf 				if (aiowp->work_head1 == reqp)
1805f841f6adSraf 					aiowp->work_head1 = reqp->req_next;
1806f841f6adSraf 			}
1807f841f6adSraf 
1808f841f6adSraf 		} else {
1809f841f6adSraf 			aiowp->work_prev1 = reqp;
1810f841f6adSraf 			ASSERT(aiowp->work_done1 >= 0);
1811f841f6adSraf 			aiowp->work_done1++;
1812f841f6adSraf 		}
1813f841f6adSraf 		ASSERT(reqp != reqp->req_next);
1814f841f6adSraf 		aiowp->work_next1 = reqp->req_next;
1815f841f6adSraf 		ASSERT(aiowp->work_count1 >= 1);
1816f841f6adSraf 		aiowp->work_count1--;
1817f841f6adSraf 		switch (reqp->req_op) {
1818f841f6adSraf 		case AIOREAD:
1819f841f6adSraf 		case AIOWRITE:
1820f841f6adSraf 		case AIOAREAD:
1821f841f6adSraf 		case AIOAWRITE:
1822f841f6adSraf #if !defined(_LP64)
1823f841f6adSraf 		case AIOAREAD64:
1824f841f6adSraf 		case AIOAWRITE64:
1825f841f6adSraf #endif
1826f841f6adSraf 			ASSERT(aiowp->work_minload1 > 0);
1827f841f6adSraf 			aiowp->work_minload1--;
1828f841f6adSraf 			break;
1829f841f6adSraf 		}
1830f841f6adSraf 		reqp->req_state = AIO_REQ_INPROGRESS;
1831f841f6adSraf 	}
1832f841f6adSraf 	aiowp->work_req = reqp;
1833f841f6adSraf 	ASSERT(reqp != NULL || aiowp->work_count1 == 0);
1834f841f6adSraf 	sig_mutex_unlock(&aiowp->work_qlock1);
1835f841f6adSraf 	return (reqp);
1836f841f6adSraf }
1837f841f6adSraf 
1838f841f6adSraf static void
1839f841f6adSraf _aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
1840f841f6adSraf {
1841f841f6adSraf 	aio_req_t **last;
1842f841f6adSraf 	aio_req_t *lastrp;
1843f841f6adSraf 	aio_req_t *next;
1844f841f6adSraf 
1845f841f6adSraf 	ASSERT(aiowp != NULL);
1846f841f6adSraf 	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
1847f841f6adSraf 	if (POSIX_AIO(reqp)) {
1848f841f6adSraf 		if (ostate != AIO_REQ_QUEUED)
1849f841f6adSraf 			return;
1850f841f6adSraf 	}
1851f841f6adSraf 	last = &aiowp->work_tail1;
1852f841f6adSraf 	lastrp = aiowp->work_tail1;
1853f841f6adSraf 	ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
1854f841f6adSraf 	while ((next = *last) != NULL) {
1855f841f6adSraf 		if (next == reqp) {
1856f841f6adSraf 			*last = next->req_next;
1857f841f6adSraf 			if (aiowp->work_next1 == next)
1858f841f6adSraf 				aiowp->work_next1 = next->req_next;
1859f841f6adSraf 
1860f841f6adSraf 			if ((next->req_next != NULL) ||
1861f841f6adSraf 			    (aiowp->work_done1 == 0)) {
1862f841f6adSraf 				if (aiowp->work_head1 == next)
1863f841f6adSraf 					aiowp->work_head1 = next->req_next;
1864f841f6adSraf 				if (aiowp->work_prev1 == next)
1865f841f6adSraf 					aiowp->work_prev1 = next->req_next;
1866f841f6adSraf 			} else {
1867f841f6adSraf 				if (aiowp->work_head1 == next)
1868f841f6adSraf 					aiowp->work_head1 = lastrp;
1869f841f6adSraf 				if (aiowp->work_prev1 == next)
1870f841f6adSraf 					aiowp->work_prev1 = lastrp;
1871f841f6adSraf 			}
1872f841f6adSraf 
1873f841f6adSraf 			if (ostate == AIO_REQ_QUEUED) {
1874f841f6adSraf 				ASSERT(aiowp->work_count1 >= 1);
1875f841f6adSraf 				aiowp->work_count1--;
1876f841f6adSraf 				ASSERT(aiowp->work_minload1 >= 1);
1877f841f6adSraf 				aiowp->work_minload1--;
1878f841f6adSraf 			} else {
1879f841f6adSraf 				ASSERT(ostate == AIO_REQ_INPROGRESS &&
1880f841f6adSraf 				    !POSIX_AIO(reqp));
1881f841f6adSraf 				aiowp->work_done1--;
1882f841f6adSraf 			}
1883f841f6adSraf 			return;
1884f841f6adSraf 		}
1885f841f6adSraf 		last = &next->req_next;
1886f841f6adSraf 		lastrp = next;
1887f841f6adSraf 	}
1888f841f6adSraf 	/* NOTREACHED */
1889f841f6adSraf }
1890f841f6adSraf 
1891f841f6adSraf static void
1892f841f6adSraf _aio_enq_doneq(aio_req_t *reqp)
1893f841f6adSraf {
1894f841f6adSraf 	if (_aio_doneq == NULL) {
1895f841f6adSraf 		_aio_doneq = reqp;
1896f841f6adSraf 		reqp->req_next = reqp->req_prev = reqp;
1897f841f6adSraf 	} else {
1898f841f6adSraf 		reqp->req_next = _aio_doneq;
1899f841f6adSraf 		reqp->req_prev = _aio_doneq->req_prev;
1900f841f6adSraf 		_aio_doneq->req_prev->req_next = reqp;
1901f841f6adSraf 		_aio_doneq->req_prev = reqp;
1902f841f6adSraf 	}
1903f841f6adSraf 	reqp->req_state = AIO_REQ_DONEQ;
1904f841f6adSraf 	_aio_doneq_cnt++;
1905f841f6adSraf }
1906f841f6adSraf 
1907f841f6adSraf /*
1908f841f6adSraf  * caller owns the _aio_mutex
1909f841f6adSraf  */
1910f841f6adSraf aio_req_t *
1911f841f6adSraf _aio_req_remove(aio_req_t *reqp)
1912f841f6adSraf {
1913f841f6adSraf 	if (reqp && reqp->req_state != AIO_REQ_DONEQ)
1914f841f6adSraf 		return (NULL);
1915f841f6adSraf 
1916f841f6adSraf 	if (reqp) {
1917f841f6adSraf 		/* request in done queue */
1918f841f6adSraf 		if (_aio_doneq == reqp)
1919f841f6adSraf 			_aio_doneq = reqp->req_next;
1920f841f6adSraf 		if (_aio_doneq == reqp) {
1921f841f6adSraf 			/* only one request on queue */
1922f841f6adSraf 			_aio_doneq = NULL;
1923f841f6adSraf 		} else {
1924f841f6adSraf 			aio_req_t *tmp = reqp->req_next;
1925f841f6adSraf 			reqp->req_prev->req_next = tmp;
1926f841f6adSraf 			tmp->req_prev = reqp->req_prev;
1927f841f6adSraf 		}
1928f841f6adSraf 	} else if ((reqp = _aio_doneq) != NULL) {
1929f841f6adSraf 		if (reqp == reqp->req_next) {
1930f841f6adSraf 			/* only one request on queue */
1931f841f6adSraf 			_aio_doneq = NULL;
1932f841f6adSraf 		} else {
1933f841f6adSraf 			reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
1934f841f6adSraf 			_aio_doneq->req_prev = reqp->req_prev;
1935f841f6adSraf 		}
1936f841f6adSraf 	}
1937f841f6adSraf 	if (reqp) {
1938f841f6adSraf 		_aio_doneq_cnt--;
1939f841f6adSraf 		reqp->req_next = reqp->req_prev = reqp;
1940f841f6adSraf 		reqp->req_state = AIO_REQ_DONE;
1941f841f6adSraf 	}
1942f841f6adSraf 	return (reqp);
1943f841f6adSraf }
1944f841f6adSraf 
1945f841f6adSraf /*
1946f841f6adSraf  * An AIO request is identified by an aio_result_t pointer.  The library
1947f841f6adSraf  * maps this aio_result_t pointer to its internal representation using a
1948f841f6adSraf  * hash table.  This function adds an aio_result_t pointer to the hash table.
1949f841f6adSraf  */
1950f841f6adSraf static int
1951f841f6adSraf _aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
1952f841f6adSraf {
1953f841f6adSraf 	aio_hash_t *hashp;
1954f841f6adSraf 	aio_req_t **prev;
1955f841f6adSraf 	aio_req_t *next;
1956f841f6adSraf 
1957f841f6adSraf 	hashp = _aio_hash + AIOHASH(resultp);
1958f841f6adSraf 	lmutex_lock(&hashp->hash_lock);
1959f841f6adSraf 	prev = &hashp->hash_ptr;
1960f841f6adSraf 	while ((next = *prev) != NULL) {
1961f841f6adSraf 		if (resultp == next->req_resultp) {
1962f841f6adSraf 			lmutex_unlock(&hashp->hash_lock);
1963f841f6adSraf 			return (-1);
1964f841f6adSraf 		}
1965f841f6adSraf 		prev = &next->req_link;
1966f841f6adSraf 	}
1967f841f6adSraf 	*prev = reqp;
1968f841f6adSraf 	ASSERT(reqp->req_link == NULL);
1969f841f6adSraf 	lmutex_unlock(&hashp->hash_lock);
1970f841f6adSraf 	return (0);
1971f841f6adSraf }
1972f841f6adSraf 
1973f841f6adSraf /*
1974f841f6adSraf  * Remove an entry from the hash table.
1975f841f6adSraf  */
1976f841f6adSraf aio_req_t *
1977f841f6adSraf _aio_hash_del(aio_result_t *resultp)
1978f841f6adSraf {
1979f841f6adSraf 	aio_hash_t *hashp;
1980f841f6adSraf 	aio_req_t **prev;
1981f841f6adSraf 	aio_req_t *next = NULL;
1982f841f6adSraf 
1983f841f6adSraf 	if (_aio_hash != NULL) {
1984f841f6adSraf 		hashp = _aio_hash + AIOHASH(resultp);
1985f841f6adSraf 		lmutex_lock(&hashp->hash_lock);
1986f841f6adSraf 		prev = &hashp->hash_ptr;
1987f841f6adSraf 		while ((next = *prev) != NULL) {
1988f841f6adSraf 			if (resultp == next->req_resultp) {
1989f841f6adSraf 				*prev = next->req_link;
1990f841f6adSraf 				next->req_link = NULL;
1991f841f6adSraf 				break;
1992f841f6adSraf 			}
1993f841f6adSraf 			prev = &next->req_link;
1994f841f6adSraf 		}
1995f841f6adSraf 		lmutex_unlock(&hashp->hash_lock);
1996f841f6adSraf 	}
1997f841f6adSraf 	return (next);
1998f841f6adSraf }
1999f841f6adSraf 
2000f841f6adSraf /*
2001f841f6adSraf  *  find an entry in the hash table
2002f841f6adSraf  */
2003f841f6adSraf aio_req_t *
2004f841f6adSraf _aio_hash_find(aio_result_t *resultp)
2005f841f6adSraf {
2006f841f6adSraf 	aio_hash_t *hashp;
2007f841f6adSraf 	aio_req_t **prev;
2008f841f6adSraf 	aio_req_t *next = NULL;
2009f841f6adSraf 
2010f841f6adSraf 	if (_aio_hash != NULL) {
2011f841f6adSraf 		hashp = _aio_hash + AIOHASH(resultp);
2012f841f6adSraf 		lmutex_lock(&hashp->hash_lock);
2013f841f6adSraf 		prev = &hashp->hash_ptr;
2014f841f6adSraf 		while ((next = *prev) != NULL) {
2015f841f6adSraf 			if (resultp == next->req_resultp)
2016f841f6adSraf 				break;
2017f841f6adSraf 			prev = &next->req_link;
2018f841f6adSraf 		}
2019f841f6adSraf 		lmutex_unlock(&hashp->hash_lock);
2020f841f6adSraf 	}
2021f841f6adSraf 	return (next);
2022f841f6adSraf }
2023f841f6adSraf 
2024f841f6adSraf /*
2025f841f6adSraf  * AIO interface for POSIX
2026f841f6adSraf  */
2027f841f6adSraf int
2028f841f6adSraf _aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
2029f841f6adSraf     int mode, int flg)
2030f841f6adSraf {
2031f841f6adSraf 	aio_req_t *reqp;
2032f841f6adSraf 	aio_args_t *ap;
2033f841f6adSraf 	int kerr;
2034f841f6adSraf 
2035f841f6adSraf 	if (aiocbp == NULL) {
2036f841f6adSraf 		errno = EINVAL;
2037f841f6adSraf 		return (-1);
2038f841f6adSraf 	}
2039f841f6adSraf 
2040f841f6adSraf 	/* initialize kaio */
2041f841f6adSraf 	if (!_kaio_ok)
2042f841f6adSraf 		_kaio_init();
2043f841f6adSraf 
2044f841f6adSraf 	aiocbp->aio_state = NOCHECK;
2045f841f6adSraf 
2046f841f6adSraf 	/*
2047f841f6adSraf 	 * If we have been called because a list I/O
2048f841f6adSraf 	 * kaio() failed, we dont want to repeat the
2049f841f6adSraf 	 * system call
2050f841f6adSraf 	 */
2051f841f6adSraf 
2052f841f6adSraf 	if (flg & AIO_KAIO) {
2053f841f6adSraf 		/*
2054f841f6adSraf 		 * Try kernel aio first.
2055f841f6adSraf 		 * If errno is ENOTSUP/EBADFD,
2056f841f6adSraf 		 * fall back to the thread implementation.
2057f841f6adSraf 		 */
2058f841f6adSraf 		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
2059f841f6adSraf 			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2060f841f6adSraf 			aiocbp->aio_state = CHECK;
2061f841f6adSraf 			kerr = (int)_kaio(mode, aiocbp);
2062f841f6adSraf 			if (kerr == 0)
2063f841f6adSraf 				return (0);
2064f841f6adSraf 			if (errno != ENOTSUP && errno != EBADFD) {
2065f841f6adSraf 				aiocbp->aio_resultp.aio_errno = errno;
2066f841f6adSraf 				aiocbp->aio_resultp.aio_return = -1;
2067f841f6adSraf 				aiocbp->aio_state = NOCHECK;
2068f841f6adSraf 				return (-1);
2069f841f6adSraf 			}
2070f841f6adSraf 			if (errno == EBADFD)
2071f841f6adSraf 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
2072f841f6adSraf 		}
2073f841f6adSraf 	}
2074f841f6adSraf 
2075f841f6adSraf 	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2076f841f6adSraf 	aiocbp->aio_state = USERAIO;
2077f841f6adSraf 
2078f841f6adSraf 	if (!__uaio_ok && __uaio_init() == -1)
2079f841f6adSraf 		return (-1);
2080f841f6adSraf 
2081f841f6adSraf 	if ((reqp = _aio_req_alloc()) == NULL) {
2082f841f6adSraf 		errno = EAGAIN;
2083f841f6adSraf 		return (-1);
2084f841f6adSraf 	}
2085f841f6adSraf 
2086f841f6adSraf 	/*
2087f841f6adSraf 	 * If an LIO request, add the list head to the aio request
2088f841f6adSraf 	 */
2089f841f6adSraf 	reqp->req_head = lio_head;
2090f841f6adSraf 	reqp->req_type = AIO_POSIX_REQ;
2091f841f6adSraf 	reqp->req_op = mode;
2092f841f6adSraf 	reqp->req_largefile = 0;
2093f841f6adSraf 
2094f841f6adSraf 	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
2095f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
2096f841f6adSraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2097f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
2098f841f6adSraf 		reqp->req_sigevent.sigev_signo =
2099f841f6adSraf 		    aiocbp->aio_sigevent.sigev_signo;
2100f841f6adSraf 		reqp->req_sigevent.sigev_value.sival_ptr =
2101f841f6adSraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
2102f841f6adSraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
2103f841f6adSraf 		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
2104f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
2105f841f6adSraf 		/*
2106f841f6adSraf 		 * Reuse the sigevent structure to contain the port number
2107f841f6adSraf 		 * and the user value.  Same for SIGEV_THREAD, below.
2108f841f6adSraf 		 */
2109f841f6adSraf 		reqp->req_sigevent.sigev_signo =
2110f841f6adSraf 		    pn->portnfy_port;
2111f841f6adSraf 		reqp->req_sigevent.sigev_value.sival_ptr =
2112f841f6adSraf 		    pn->portnfy_user;
2113f841f6adSraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
2114f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
2115f841f6adSraf 		/*
2116f841f6adSraf 		 * The sigevent structure contains the port number
2117f841f6adSraf 		 * and the user value.  Same for SIGEV_PORT, above.
2118f841f6adSraf 		 */
2119f841f6adSraf 		reqp->req_sigevent.sigev_signo =
2120f841f6adSraf 		    aiocbp->aio_sigevent.sigev_signo;
2121f841f6adSraf 		reqp->req_sigevent.sigev_value.sival_ptr =
2122f841f6adSraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
2123f841f6adSraf 	}
2124f841f6adSraf 
2125f841f6adSraf 	reqp->req_resultp = &aiocbp->aio_resultp;
2126f841f6adSraf 	reqp->req_aiocbp = aiocbp;
2127f841f6adSraf 	ap = &reqp->req_args;
2128f841f6adSraf 	ap->fd = aiocbp->aio_fildes;
2129f841f6adSraf 	ap->buf = (caddr_t)aiocbp->aio_buf;
2130f841f6adSraf 	ap->bufsz = aiocbp->aio_nbytes;
2131f841f6adSraf 	ap->offset = aiocbp->aio_offset;
2132f841f6adSraf 
2133f841f6adSraf 	if ((flg & AIO_NO_DUPS) &&
2134f841f6adSraf 	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2135f841f6adSraf 		aio_panic("_aio_rw(): request already in hash table");
2136f841f6adSraf 		_aio_req_free(reqp);
2137f841f6adSraf 		errno = EINVAL;
2138f841f6adSraf 		return (-1);
2139f841f6adSraf 	}
2140f841f6adSraf 	_aio_req_add(reqp, nextworker, mode);
2141f841f6adSraf 	return (0);
2142f841f6adSraf }
2143f841f6adSraf 
2144f841f6adSraf #if !defined(_LP64)
2145f841f6adSraf /*
2146f841f6adSraf  * 64-bit AIO interface for POSIX
2147f841f6adSraf  */
2148f841f6adSraf int
2149f841f6adSraf _aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
2150f841f6adSraf     int mode, int flg)
2151f841f6adSraf {
2152f841f6adSraf 	aio_req_t *reqp;
2153f841f6adSraf 	aio_args_t *ap;
2154f841f6adSraf 	int kerr;
2155f841f6adSraf 
2156f841f6adSraf 	if (aiocbp == NULL) {
2157f841f6adSraf 		errno = EINVAL;
2158f841f6adSraf 		return (-1);
2159f841f6adSraf 	}
2160f841f6adSraf 
2161f841f6adSraf 	/* initialize kaio */
2162f841f6adSraf 	if (!_kaio_ok)
2163f841f6adSraf 		_kaio_init();
2164f841f6adSraf 
2165f841f6adSraf 	aiocbp->aio_state = NOCHECK;
2166f841f6adSraf 
2167f841f6adSraf 	/*
2168f841f6adSraf 	 * If we have been called because a list I/O
2169f841f6adSraf 	 * kaio() failed, we dont want to repeat the
2170f841f6adSraf 	 * system call
2171f841f6adSraf 	 */
2172f841f6adSraf 
2173f841f6adSraf 	if (flg & AIO_KAIO) {
2174f841f6adSraf 		/*
2175f841f6adSraf 		 * Try kernel aio first.
2176f841f6adSraf 		 * If errno is ENOTSUP/EBADFD,
2177f841f6adSraf 		 * fall back to the thread implementation.
2178f841f6adSraf 		 */
2179f841f6adSraf 		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
2180f841f6adSraf 			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2181f841f6adSraf 			aiocbp->aio_state = CHECK;
2182f841f6adSraf 			kerr = (int)_kaio(mode, aiocbp);
2183f841f6adSraf 			if (kerr == 0)
2184f841f6adSraf 				return (0);
2185f841f6adSraf 			if (errno != ENOTSUP && errno != EBADFD) {
2186f841f6adSraf 				aiocbp->aio_resultp.aio_errno = errno;
2187f841f6adSraf 				aiocbp->aio_resultp.aio_return = -1;
2188f841f6adSraf 				aiocbp->aio_state = NOCHECK;
2189f841f6adSraf 				return (-1);
2190f841f6adSraf 			}
2191f841f6adSraf 			if (errno == EBADFD)
2192f841f6adSraf 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
2193f841f6adSraf 		}
2194f841f6adSraf 	}
2195f841f6adSraf 
2196f841f6adSraf 	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
2197f841f6adSraf 	aiocbp->aio_state = USERAIO;
2198f841f6adSraf 
2199f841f6adSraf 	if (!__uaio_ok && __uaio_init() == -1)
2200f841f6adSraf 		return (-1);
2201f841f6adSraf 
2202f841f6adSraf 	if ((reqp = _aio_req_alloc()) == NULL) {
2203f841f6adSraf 		errno = EAGAIN;
2204f841f6adSraf 		return (-1);
2205f841f6adSraf 	}
2206f841f6adSraf 
2207f841f6adSraf 	/*
2208f841f6adSraf 	 * If an LIO request, add the list head to the aio request
2209f841f6adSraf 	 */
2210f841f6adSraf 	reqp->req_head = lio_head;
2211f841f6adSraf 	reqp->req_type = AIO_POSIX_REQ;
2212f841f6adSraf 	reqp->req_op = mode;
2213f841f6adSraf 	reqp->req_largefile = 1;
2214f841f6adSraf 
2215f841f6adSraf 	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
2216f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
2217f841f6adSraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2218f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
2219f841f6adSraf 		reqp->req_sigevent.sigev_signo =
2220f841f6adSraf 		    aiocbp->aio_sigevent.sigev_signo;
2221f841f6adSraf 		reqp->req_sigevent.sigev_value.sival_ptr =
2222f841f6adSraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
2223f841f6adSraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
2224f841f6adSraf 		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
2225f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
2226f841f6adSraf 		reqp->req_sigevent.sigev_signo =
2227f841f6adSraf 		    pn->portnfy_port;
2228f841f6adSraf 		reqp->req_sigevent.sigev_value.sival_ptr =
2229f841f6adSraf 		    pn->portnfy_user;
2230f841f6adSraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
2231f841f6adSraf 		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
2232f841f6adSraf 		reqp->req_sigevent.sigev_signo =
2233f841f6adSraf 		    aiocbp->aio_sigevent.sigev_signo;
2234f841f6adSraf 		reqp->req_sigevent.sigev_value.sival_ptr =
2235f841f6adSraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
2236f841f6adSraf 	}
2237f841f6adSraf 
2238f841f6adSraf 	reqp->req_resultp = &aiocbp->aio_resultp;
2239f841f6adSraf 	reqp->req_aiocbp = aiocbp;
2240f841f6adSraf 	ap = &reqp->req_args;
2241f841f6adSraf 	ap->fd = aiocbp->aio_fildes;
2242f841f6adSraf 	ap->buf = (caddr_t)aiocbp->aio_buf;
2243f841f6adSraf 	ap->bufsz = aiocbp->aio_nbytes;
2244f841f6adSraf 	ap->offset = aiocbp->aio_offset;
2245f841f6adSraf 
2246f841f6adSraf 	if ((flg & AIO_NO_DUPS) &&
2247f841f6adSraf 	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2248f841f6adSraf 		aio_panic("_aio_rw64(): request already in hash table");
2249f841f6adSraf 		_aio_req_free(reqp);
2250f841f6adSraf 		errno = EINVAL;
2251f841f6adSraf 		return (-1);
2252f841f6adSraf 	}
2253f841f6adSraf 	_aio_req_add(reqp, nextworker, mode);
2254f841f6adSraf 	return (0);
2255f841f6adSraf }
2256f841f6adSraf #endif	/* !defined(_LP64) */
2257