xref: /illumos-gate/usr/src/uts/common/os/aio_subr.c (revision 09295472)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/proc.h>
31 #include <sys/file.h>
32 #include <sys/errno.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/cmn_err.h>
36 #include <sys/systm.h>
37 #include <vm/as.h>
38 #include <vm/page.h>
39 #include <sys/uio.h>
40 #include <sys/kmem.h>
41 #include <sys/debug.h>
42 #include <sys/aio_impl.h>
43 #include <sys/epm.h>
44 #include <sys/fs/snode.h>
45 #include <sys/siginfo.h>
46 #include <sys/cpuvar.h>
47 #include <sys/tnf_probe.h>
48 #include <sys/conf.h>
49 #include <sys/sdt.h>
50 
51 int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
52 void aio_done(struct buf *);
53 void aphysio_unlock(aio_req_t *);
54 void aio_cleanup(int);
55 void aio_cleanup_exit(void);
56 
57 /*
58  * private functions
59  */
60 static void aio_sigev_send(proc_t *, sigqueue_t *);
61 static void aio_hash_delete(aio_t *, aio_req_t *);
62 static void aio_lio_free(aio_t *, aio_lio_t *);
63 static void aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
64 static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
65 static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
66 static void aio_cleanup_portq(aio_t *, aio_req_t *, int);
67 
68 /*
69  * async version of physio() that doesn't wait for the driver's
70  * strategy routine to complete.
71  */
72 
73 int
74 aphysio(
75 	int (*strategy)(struct buf *),
76 	int (*cancel)(struct buf *),
77 	dev_t dev,
78 	int rw,
79 	void (*mincnt)(struct buf *),
80 	struct aio_req *aio)
81 {
82 	struct uio *uio = aio->aio_uio;
83 	aio_req_t *reqp = (aio_req_t *)aio->aio_private;
84 	struct buf *bp = &reqp->aio_req_buf;
85 	struct iovec *iov;
86 	struct as *as;
87 	char *a;
88 	int	error;
89 	size_t	c;
90 	struct page **pplist;
91 	struct dev_ops *ops = devopsp[getmajor(dev)];
92 
93 	if (uio->uio_loffset < 0)
94 		return (EINVAL);
95 #ifdef	_ILP32
96 	/*
97 	 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
98 	 * the maximum size that can be supported by the IO subsystem.
99 	 * XXX this code assumes a D_64BIT driver.
100 	 */
101 	if (uio->uio_loffset > SPEC_MAXOFFSET_T)
102 		return (EINVAL);
103 #endif	/* _ILP32 */
104 
105 	TNF_PROBE_5(aphysio_start, "kaio", /* CSTYLED */,
106 		tnf_opaque, bp, bp,
107 		tnf_device, device, dev,
108 		tnf_offset, blkno, btodt(uio->uio_loffset),
109 		tnf_size, size, uio->uio_iov->iov_len,
110 		tnf_bioflags, rw, rw);
111 
112 	if (rw == B_READ) {
113 		CPU_STATS_ADD_K(sys, phread, 1);
114 	} else {
115 		CPU_STATS_ADD_K(sys, phwrite, 1);
116 	}
117 
118 	iov = uio->uio_iov;
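	/*
	 * initialize the buf's b_sem and b_io semaphores by hand
	 * before handing it to the driver's strategy routine.
	 */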
119 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
120 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
121 
122 	bp->b_error = 0;
123 	bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
124 	bp->b_edev = dev;
125 	bp->b_dev = cmpdev(dev);
126 	bp->b_lblkno = btodt(uio->uio_loffset);
127 	bp->b_offset = uio->uio_loffset;
128 	(void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
129 	    (void *)bp->b_edev, (void **)&bp->b_dip);
130 
131 	/*
132 	 * Clustering: the clustering software can set the b_iodone, b_forw
133 	 * and b_proc fields to cluster-specific values.
134 	 */
135 	if (bp->b_iodone == NULL) {
136 		bp->b_iodone = (int (*)()) aio_done;
137 		/* b_forw points at an aio_req_t structure */
138 		bp->b_forw = (struct buf *)reqp;
139 		bp->b_proc = curproc;
140 	}
141 
142 	a = bp->b_un.b_addr = iov->iov_base;
143 	c = bp->b_bcount = iov->iov_len;
144 
145 	(*mincnt)(bp);
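	/*
	 * if the driver's mincnt routine clipped the transfer size,
	 * the request can't be done with a single strategy call, so
	 * fail it rather than splitting it up.
	 */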
146 	if (bp->b_bcount != iov->iov_len)
147 		return (ENOTSUP);
148 
149 	as = bp->b_proc->p_as;
150 
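	/*
	 * lock down the user's buffer for the duration of the I/O.
	 * a read from the device stores into the buffer, so the pages
	 * are locked for S_WRITE access, and vice versa for a write.
	 */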
151 	error = as_pagelock(as, &pplist, a,
152 	    c, rw == B_READ? S_WRITE : S_READ);
153 	if (error != 0) {
154 		bp->b_flags |= B_ERROR;
155 		bp->b_error = error;
156 		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
157 		return (error);
158 	}
159 	reqp->aio_req_flags |= AIO_PAGELOCKDONE;
160 	bp->b_shadow = pplist;
161 	if (pplist != NULL) {
162 		bp->b_flags |= B_SHADOW;
163 	}
164 
165 	if (cancel != anocancel)
166 		cmn_err(CE_PANIC,
167 		    "aphysio: cancellation not supported, use anocancel");
168 
169 	reqp->aio_req_cancel = cancel;
170 
171 	DTRACE_IO1(start, struct buf *, bp);
172 
173 	return ((*strategy)(bp));
174 }
175 
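/*
 * Default cancel routine for aphysio().  Cancellation of an in-progress
 * request is not supported, so this simply fails with ENXIO.
 */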
176 /*ARGSUSED*/
177 int
178 anocancel(struct buf *bp)
179 {
180 	return (ENXIO);
181 }
182 
183 /*
184  * Called from biodone().
185  * Notify process that a pending AIO has finished.
186  */
187 
188 /*
189  * Clustering: This function is made non-static as it is used
190  * by clustering s/w as contract private interface.
191  */
192 
193 void
194 aio_done(struct buf *bp)
195 {
196 	proc_t *p;
197 	struct as *as;
198 	aio_req_t *reqp;
199 	aio_lio_t *head = NULL;
200 	aio_t *aiop;
201 	sigqueue_t *sigev = NULL;
202 	sigqueue_t *lio_sigev = NULL;
203 	port_kevent_t *pkevp = NULL;
204 	port_kevent_t *lio_pkevp = NULL;
205 	int fd;
206 	int cleanupqflag;
207 	int pollqflag;
208 	int portevpend;
209 	void (*func)();
210 	int use_port = 0;
211 
212 	p = bp->b_proc;
213 	reqp = (aio_req_t *)bp->b_forw;
214 	fd = reqp->aio_req_fd;
215 
216 	TNF_PROBE_5(aphysio_end, "kaio", /* CSTYLED */,
217 		tnf_opaque, bp, bp,
218 		tnf_device, device, bp->b_edev,
219 		tnf_offset, blkno, btodt(reqp->aio_req_uio.uio_loffset),
220 		tnf_size, size, reqp->aio_req_uio.uio_iov->iov_len,
221 		tnf_bioflags, rw, (bp->b_flags & (B_READ|B_WRITE)));
222 
223 	/*
224 	 * mapout earlier so that more kmem is available when aio is
225 	 * heavily used. bug #1262082
226 	 */
227 	if (bp->b_flags & B_REMAPPED)
228 		bp_mapout(bp);
229 
230 	/* decrement fd's ref count by one, now that aio request is done. */
231 	areleasef(fd, P_FINFO(p));
232 
233 	aiop = p->p_aio;
234 	ASSERT(aiop != NULL);
235 
236 	mutex_enter(&aiop->aio_portq_mutex);
237 	mutex_enter(&aiop->aio_mutex);
238 	ASSERT(aiop->aio_pending > 0);
239 	ASSERT(reqp->aio_req_flags & AIO_PENDING);
240 	aiop->aio_pending--;
241 	reqp->aio_req_flags &= ~AIO_PENDING;
242 	if ((pkevp = reqp->aio_req_portkev) != NULL) {
243 		/* Event port notification is desired for this transaction */
244 		if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
245 			/*
246 			 * The port is being closed and it is waiting for
247 			 * pending asynchronous I/O transactions to complete.
248 			 */
249 			portevpend = --aiop->aio_portpendcnt;
250 			aio_deq(&aiop->aio_portpending, reqp);
251 			aio_enq(&aiop->aio_portq, reqp, 0);
252 			mutex_exit(&aiop->aio_mutex);
253 			mutex_exit(&aiop->aio_portq_mutex);
254 			port_send_event(pkevp);
255 			if (portevpend == 0)
256 				cv_broadcast(&aiop->aio_portcv);
257 			return;
258 		}
259 
260 		if (aiop->aio_flags & AIO_CLEANUP) {
261 			/*
262 			 * aio_cleanup_thread() is waiting for completion of
263 			 * transactions.
264 			 */
265 			as = p->p_as;
266 			mutex_enter(&as->a_contents);
267 			aio_deq(&aiop->aio_portpending, reqp);
268 			aio_enq(&aiop->aio_portcleanupq, reqp, 0);
269 			cv_signal(&aiop->aio_cleanupcv);
270 			mutex_exit(&as->a_contents);
271 			mutex_exit(&aiop->aio_mutex);
272 			mutex_exit(&aiop->aio_portq_mutex);
273 			return;
274 		}
275 
276 		aio_deq(&aiop->aio_portpending, reqp);
277 		aio_enq(&aiop->aio_portq, reqp, 0);
278 
279 		use_port = 1;
280 	} else {
281 		/*
282 		 * when the AIO_CLEANUP flag is enabled for this
283 		 * process, or when the AIO_POLL bit is set for
284 		 * this request, special handling is required.
285 		 * otherwise the request is put onto the doneq.
286 		 */
287 		cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
288 		pollqflag = (reqp->aio_req_flags & AIO_POLL);
289 		if (cleanupqflag | pollqflag) {
290 
291 			if (cleanupqflag) {
292 				as = p->p_as;
293 				mutex_enter(&as->a_contents);
294 			}
295 
296 			/*
297 			 * requests with their AIO_POLL bit set are put
298 			 * on the pollq, requests with sigevent structures
299 			 * or with listio heads are put on the notifyq, and
300 			 * the remaining requests don't require any special
301 			 * cleanup handling, so they're put onto the default
302 			 * cleanupq.
303 			 */
304 			if (pollqflag)
305 				aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
306 			else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
307 				aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
308 			else
309 				aio_enq(&aiop->aio_cleanupq, reqp,
310 				    AIO_CLEANUPQ);
311 
312 			if (cleanupqflag) {
313 				cv_signal(&aiop->aio_cleanupcv);
314 				mutex_exit(&as->a_contents);
315 				mutex_exit(&aiop->aio_mutex);
316 				mutex_exit(&aiop->aio_portq_mutex);
317 			} else {
318 				ASSERT(pollqflag);
319 				/* block aio_cleanup_exit until we're done */
320 				aiop->aio_flags |= AIO_DONE_ACTIVE;
321 				mutex_exit(&aiop->aio_mutex);
322 				mutex_exit(&aiop->aio_portq_mutex);
323 				/*
324 				 * let the cleanup processing happen from an AST
325 				 * set an AST on all threads in this process
326 				 */
327 				mutex_enter(&p->p_lock);
328 				set_proc_ast(p);
329 				mutex_exit(&p->p_lock);
330 				mutex_enter(&aiop->aio_mutex);
331 				/* wakeup anybody waiting in aiowait() */
332 				cv_broadcast(&aiop->aio_waitcv);
333 
334 				/* wakeup aio_cleanup_exit if needed */
335 				if (aiop->aio_flags & AIO_CLEANUP)
336 					cv_signal(&aiop->aio_cleanupcv);
337 				aiop->aio_flags &= ~AIO_DONE_ACTIVE;
338 				mutex_exit(&aiop->aio_mutex);
339 			}
340 			return;
341 		}
342 
343 		/*
344 		 * save req's sigevent pointer, and check its
345 		 * value after releasing aio_mutex lock.
346 		 */
347 		sigev = reqp->aio_req_sigqp;
348 		reqp->aio_req_sigqp = NULL;
349 
350 		/* put request on done queue. */
351 		aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
352 	} /* portkevent */
353 
354 	/*
355 	 * when list IO notification is enabled, a notification or
356 	 * signal is sent only when all entries in the list are done.
357 	 */
358 	if ((head = reqp->aio_req_lio) != NULL) {
359 		ASSERT(head->lio_refcnt > 0);
360 		if (--head->lio_refcnt == 0) {
361 			/*
362 			 * save lio's sigevent pointer, and check
363 			 * its value after releasing aio_mutex lock.
364 			 */
365 			lio_sigev = head->lio_sigqp;
366 			head->lio_sigqp = NULL;
367 			cv_signal(&head->lio_notify);
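			/*
			 * clear lio_port so that the list's port event
			 * is delivered only once.
			 */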
368 			if (head->lio_port >= 0 &&
369 			    (lio_pkevp = head->lio_portkev) != NULL)
370 				head->lio_port = -1;
371 		}
372 	}
373 
374 	/*
375 	 * if AIO_WAITN is set, wake up the waiting
376 	 * threads only when the requested number of
377 	 * I/Os has completed, or when all outstanding
378 	 * I/Os are done.
379 	 */
380 	if (aiop->aio_flags & AIO_WAITN) {
381 		if (aiop->aio_waitncnt > 0)
382 			aiop->aio_waitncnt--;
383 		if (aiop->aio_pending == 0 ||
384 		    aiop->aio_waitncnt == 0)
385 			cv_broadcast(&aiop->aio_waitcv);
386 	} else {
387 		cv_broadcast(&aiop->aio_waitcv);
388 	}
389 
390 	mutex_exit(&aiop->aio_mutex);
391 	mutex_exit(&aiop->aio_portq_mutex);
392 
393 	if (sigev)
394 		aio_sigev_send(p, sigev);
395 	else if (!use_port && head == NULL) {
396 		/*
397 		 * Send a SIGIO signal when the process has a handler enabled.
398 		 */
399 		if ((func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
400 		    func != SIG_IGN)
401 			psignal(p, SIGIO);
402 	}
403 	if (pkevp)
404 		port_send_event(pkevp);
405 	if (lio_sigev)
406 		aio_sigev_send(p, lio_sigev);
407 	if (lio_pkevp)
408 		port_send_event(lio_pkevp);
409 }
410 
411 /*
412  * send a queued signal to the specified process. the sigev
413  * argument must not be NULL; callers verify the sigevent
414  * pointer before calling, so this routine returns no
415  * status.
416  */
417 static void
418 aio_sigev_send(proc_t *p, sigqueue_t *sigev)
419 {
420 	ASSERT(sigev != NULL);
421 
422 	mutex_enter(&p->p_lock);
423 	sigaddqa(p, NULL, sigev);
424 	mutex_exit(&p->p_lock);
425 }
426 
427 /*
428  * special case handling for zero length requests. the aio request
429  * short circuits the normal completion path since all that's required
430  * to complete this request is to copyout a zero to the aio request's
431  * return value.
432  */
433 void
434 aio_zerolen(aio_req_t *reqp)
435 {
436 
437 	struct buf *bp = &reqp->aio_req_buf;
438 
439 	reqp->aio_req_flags |= AIO_ZEROLEN;
440 
441 	bp->b_forw = (struct buf *)reqp;
442 	bp->b_proc = curproc;
443 
444 	bp->b_resid = 0;
445 	bp->b_flags = 0;
446 
447 	aio_done(bp);
448 }
449 
450 /*
451  * unlock pages previously locked by as_pagelock
452  */
453 void
454 aphysio_unlock(aio_req_t *reqp)
455 {
456 	struct buf *bp;
457 	struct iovec *iov;
458 	int flags;
459 
460 	if (reqp->aio_req_flags & AIO_PHYSIODONE)
461 		return;
462 
463 	reqp->aio_req_flags |= AIO_PHYSIODONE;
464 
465 	if (reqp->aio_req_flags & AIO_ZEROLEN)
466 		return;
467 
468 	bp = &reqp->aio_req_buf;
469 	iov = reqp->aio_req_uio.uio_iov;
470 	flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
471 	if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
472 		as_pageunlock(bp->b_proc->p_as,
473 			bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
474 			iov->iov_base, iov->iov_len, flags);
475 		reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
476 	}
477 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
478 	bp->b_flags |= B_DONE;
479 }
480 
481 /*
482  * deletes a request's id from the hash table of outstanding io.
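 * the hash is keyed on the request's user-level result pointer
 * (aio_req_resultp).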
483  */
484 static void
485 aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
486 {
487 	long index;
488 	aio_result_t *resultp = reqp->aio_req_resultp;
489 	aio_req_t *current;
490 	aio_req_t **nextp;
491 
492 	index = AIO_HASH(resultp);
493 	nextp = (aiop->aio_hash + index);
494 	while ((current = *nextp) != NULL) {
495 		if (current->aio_req_resultp == resultp) {
496 			*nextp = current->aio_hash_next;
497 			return;
498 		}
499 		nextp = &current->aio_hash_next;
500 	}
501 }
502 
503 /*
504  * Put a list head struct onto its free list.
505  */
506 static void
507 aio_lio_free(aio_t *aiop, aio_lio_t *head)
508 {
509 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
510 
511 	if (head->lio_sigqp != NULL)
512 		kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
513 	head->lio_next = aiop->aio_lio_free;
514 	aiop->aio_lio_free = head;
515 }
516 
517 /*
518  * Put a reqp onto the freelist.
519  */
520 void
521 aio_req_free(aio_t *aiop, aio_req_t *reqp)
522 {
523 	aio_lio_t *liop;
524 
525 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
526 
527 	if (reqp->aio_req_portkev) {
528 		port_free_event(reqp->aio_req_portkev);
529 		reqp->aio_req_portkev = NULL;
530 	}
531 
532 	if ((liop = reqp->aio_req_lio) != NULL) {
533 		if (--liop->lio_nent == 0)
534 			aio_lio_free(aiop, liop);
535 		reqp->aio_req_lio = NULL;
536 	}
537 	if (reqp->aio_req_sigqp != NULL) {
538 		kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
539 		reqp->aio_req_sigqp = NULL;
540 	}
541 	reqp->aio_req_next = aiop->aio_free;
542 	reqp->aio_req_prev = NULL;
543 	aiop->aio_free = reqp;
544 	aiop->aio_outstanding--;
545 	if (aiop->aio_outstanding == 0)
546 		cv_broadcast(&aiop->aio_waitcv);
547 	aio_hash_delete(aiop, reqp);
548 }
549 
550 /*
551  * Put a reqp onto the freelist.
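 * This variant is used by the event port code, where the port event
 * has already been disposed of by the caller.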
552  */
553 void
554 aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
555 {
556 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
557 
558 	reqp->aio_req_next = aiop->aio_free;
559 	reqp->aio_req_prev = NULL;
560 	aiop->aio_free = reqp;
561 	aiop->aio_outstanding--;
562 	aio_hash_delete(aiop, reqp);
563 }
564 
565 
566 /*
567  * Verify the integrity of a queue.
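 * Optionally assert that entry_present appears on the queue and that
 * entry_missing does not.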
568  */
569 #if defined(DEBUG)
570 static void
571 aio_verify_queue(aio_req_t *head,
572 	aio_req_t *entry_present, aio_req_t *entry_missing)
573 {
574 	aio_req_t *reqp;
575 	int found = 0;
576 	int present = 0;
577 
578 	if ((reqp = head) != NULL) {
579 		do {
580 			ASSERT(reqp->aio_req_prev->aio_req_next == reqp);
581 			ASSERT(reqp->aio_req_next->aio_req_prev == reqp);
582 			if (entry_present == reqp)
583 				found++;
584 			if (entry_missing == reqp)
585 				present++;
586 		} while ((reqp = reqp->aio_req_next) != head);
587 	}
588 	ASSERT(entry_present == NULL || found == 1);
589 	ASSERT(entry_missing == NULL || present == 0);
590 }
591 #else
592 #define	aio_verify_queue(x, y, z)
593 #endif
594 
595 /*
596  * Put a request onto the tail of a queue.
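 * The queue is maintained as a circular, doubly-linked list.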
597  */
598 void
599 aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
600 {
601 	aio_req_t *head;
602 	aio_req_t *prev;
603 
604 	aio_verify_queue(*qhead, NULL, reqp);
605 
606 	if ((head = *qhead) == NULL) {
607 		reqp->aio_req_next = reqp;
608 		reqp->aio_req_prev = reqp;
609 		*qhead = reqp;
610 	} else {
611 		reqp->aio_req_next = head;
612 		reqp->aio_req_prev = prev = head->aio_req_prev;
613 		prev->aio_req_next = reqp;
614 		head->aio_req_prev = reqp;
615 	}
616 	reqp->aio_req_flags |= qflg_new;
617 }
618 
619 /*
620  * Remove a request from its queue.
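 * If the removed request was the only element, the queue head is cleared.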
621  */
622 void
623 aio_deq(aio_req_t **qhead, aio_req_t *reqp)
624 {
625 	aio_verify_queue(*qhead, reqp, NULL);
626 
627 	if (reqp->aio_req_next == reqp) {
628 		*qhead = NULL;
629 	} else {
630 		reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
631 		reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
632 		if (*qhead == reqp)
633 			*qhead = reqp->aio_req_next;
634 	}
635 	reqp->aio_req_next = NULL;
636 	reqp->aio_req_prev = NULL;
637 }
638 
639 /*
640  * concatenate a specified queue with the cleanupq. the specified
641  * queue is put onto the tail of the cleanupq. the queue flag in each
642  * element's aio_req_flags field is replaced with AIO_CLEANUPQ.
643  */
644 /*ARGSUSED*/
645 void
646 aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
647 {
648 	aio_req_t *cleanupqhead, *q2tail;
649 	aio_req_t *reqp = q2;
650 
651 	do {
652 		ASSERT(reqp->aio_req_flags & qflg);
653 		reqp->aio_req_flags &= ~qflg;
654 		reqp->aio_req_flags |= AIO_CLEANUPQ;
655 	} while ((reqp = reqp->aio_req_next) != q2);
656 
657 	cleanupqhead = aiop->aio_cleanupq;
658 	if (cleanupqhead == NULL)
659 		aiop->aio_cleanupq = q2;
660 	else {
661 		cleanupqhead->aio_req_prev->aio_req_next = q2;
662 		q2tail = q2->aio_req_prev;
663 		q2tail->aio_req_next = cleanupqhead;
664 		q2->aio_req_prev = cleanupqhead->aio_req_prev;
665 		cleanupqhead->aio_req_prev = q2tail;
666 	}
667 }
668 
669 /*
670  * cleanup aio requests that are on the per-process poll queue.
671  */
672 void
673 aio_cleanup(int flag)
674 {
675 	aio_t *aiop = curproc->p_aio;
676 	aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
677 	aio_req_t *cleanupport;
678 	aio_req_t *portq = NULL;
679 	void (*func)();
680 	int signalled = 0;
681 	int qflag = 0;
682 	int exitflg;
683 
684 	ASSERT(aiop != NULL);
685 
686 	if (flag == AIO_CLEANUP_EXIT)
687 		exitflg = AIO_CLEANUP_EXIT;
688 	else
689 		exitflg = 0;
690 
691 	/*
692 	 * We need to get the aio_cleanupq_mutex because we are calling
693 	 * aio_cleanup_cleanupq()
694 	 */
695 	mutex_enter(&aiop->aio_cleanupq_mutex);
696 	/*
697 	 * take all the requests off the cleanupq, the notifyq,
698 	 * and the pollq.
699 	 */
700 	mutex_enter(&aiop->aio_mutex);
701 	if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
702 		aiop->aio_cleanupq = NULL;
703 		qflag++;
704 	}
705 	if ((notifyqhead = aiop->aio_notifyq) != NULL) {
706 		aiop->aio_notifyq = NULL;
707 		qflag++;
708 	}
709 	if ((pollqhead = aiop->aio_pollq) != NULL) {
710 		aiop->aio_pollq = NULL;
711 		qflag++;
712 	}
713 	if (flag) {
714 		if ((portq = aiop->aio_portq) != NULL)
715 			qflag++;
716 
717 		if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
718 			aiop->aio_portcleanupq = NULL;
719 			qflag++;
720 		}
721 	}
722 	mutex_exit(&aiop->aio_mutex);
723 
724 	/*
725 	 * return immediately if cleanupq, pollq, and
726 	 * notifyq are all empty. someone else must have
727 	 * emptied them.
728 	 */
729 	if (!qflag) {
730 		mutex_exit(&aiop->aio_cleanupq_mutex);
731 		return;
732 	}
733 
734 	/*
735 	 * do cleanup for the various queues.
736 	 */
737 	if (cleanupqhead)
738 		aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
739 	mutex_exit(&aiop->aio_cleanupq_mutex);
740 	if (notifyqhead)
741 		signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
742 	if (pollqhead)
743 		aio_cleanup_pollq(aiop, pollqhead, exitflg);
744 	if (flag && (cleanupport || portq))
745 		aio_cleanup_portq(aiop, cleanupport, exitflg);
746 
747 	if (exitflg)
748 		return;
749 
750 	/*
751 	 * If we have an active aio_cleanup_thread it's possible for
752 	 * this routine to push something on to the done queue after
753 	 * an aiowait/aiosuspend thread has already decided to block.
754 	 * This being the case, we need a cv_broadcast here to wake
755 	 * these threads up. It is simpler and cleaner to do this
756 	 * broadcast here than in the individual cleanup routines.
757 	 */
758 
759 	mutex_enter(&aiop->aio_mutex);
760 	cv_broadcast(&aiop->aio_waitcv);
761 	mutex_exit(&aiop->aio_mutex);
762 
763 	/*
764 	 * Only if the process wasn't already signalled,
765 	 * determine if a SIGIO signal should be delivered.
766 	 */
767 	if (!signalled &&
768 	    (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
769 	    func != SIG_IGN)
770 		psignal(curproc, SIGIO);
771 }
772 
773 
774 /*
775  * Do cleanup for every element of the port cleanup queue.
776  */
777 static void
778 aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
779 {
780 	aio_req_t	*reqp;
781 	aio_req_t	*next;
782 	aio_req_t	*headp;
783 	aio_lio_t	*liop;
784 
785 	/* first check the portq */
786 	if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
787 		mutex_enter(&aiop->aio_mutex);
788 		if (aiop->aio_flags & AIO_CLEANUP)
789 			aiop->aio_flags |= AIO_CLEANUP_PORT;
790 		mutex_exit(&aiop->aio_mutex);
791 
792 		/*
793 		 * It is not allowed to hold locks during aphysio_unlock().
794 		 * The aio_done() interrupt function will try to acquire
795 		 * aio_mutex and aio_portq_mutex.  Therefore we disconnect
796 		 * the portq list from the aiop for the duration of the
797 		 * aphysio_unlock() loop below.
798 		 */
799 		mutex_enter(&aiop->aio_portq_mutex);
800 		headp = aiop->aio_portq;
801 		aiop->aio_portq = NULL;
802 		mutex_exit(&aiop->aio_portq_mutex);
803 		if ((reqp = headp) != NULL) {
804 			do {
805 				next = reqp->aio_req_next;
806 				aphysio_unlock(reqp);
807 				if (exitflag) {
808 					mutex_enter(&aiop->aio_mutex);
809 					aio_req_free(aiop, reqp);
810 					mutex_exit(&aiop->aio_mutex);
811 				}
812 			} while ((reqp = next) != headp);
813 		}
814 
815 		if (headp != NULL && exitflag == 0) {
816 			/* move unlocked requests back to the port queue */
817 			aio_req_t *newq;
818 
819 			mutex_enter(&aiop->aio_portq_mutex);
820 			if ((newq = aiop->aio_portq) != NULL) {
821 				aio_req_t *headprev = headp->aio_req_prev;
822 				aio_req_t *newqprev = newq->aio_req_prev;
823 
824 				headp->aio_req_prev = newqprev;
825 				newq->aio_req_prev = headprev;
826 				headprev->aio_req_next = newq;
827 				newqprev->aio_req_next = headp;
828 			}
829 			aiop->aio_portq = headp;
830 			cv_broadcast(&aiop->aio_portcv);
831 			mutex_exit(&aiop->aio_portq_mutex);
832 		}
833 	}
834 
835 	/* now check the port cleanup queue */
836 	if ((reqp = cleanupq) == NULL)
837 		return;
838 	do {
839 		next = reqp->aio_req_next;
840 		aphysio_unlock(reqp);
841 		if (exitflag) {
842 			mutex_enter(&aiop->aio_mutex);
843 			aio_req_free(aiop, reqp);
844 			mutex_exit(&aiop->aio_mutex);
845 		} else {
846 			mutex_enter(&aiop->aio_portq_mutex);
847 			aio_enq(&aiop->aio_portq, reqp, 0);
848 			mutex_exit(&aiop->aio_portq_mutex);
849 			port_send_event(reqp->aio_req_portkev);
850 			if ((liop = reqp->aio_req_lio) != NULL) {
851 				int send_event = 0;
852 
853 				mutex_enter(&aiop->aio_mutex);
854 				ASSERT(liop->lio_refcnt > 0);
855 				if (--liop->lio_refcnt == 0) {
856 					if (liop->lio_port >= 0 &&
857 					    liop->lio_portkev) {
858 						liop->lio_port = -1;
859 						send_event = 1;
860 					}
861 				}
862 				mutex_exit(&aiop->aio_mutex);
863 				if (send_event)
864 					port_send_event(liop->lio_portkev);
865 			}
866 		}
867 	} while ((reqp = next) != cleanupq);
868 }
869 
870 /*
871  * Do cleanup for every element of the cleanupq.
872  */
873 static void
874 aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
875 {
876 	aio_req_t *reqp, *next;
877 
878 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
879 
880 	/*
881 	 * Since aio_req_done() and aio_req_find() use the HASH list to find
882 	 * the required requests, they could potentially take away elements
883 	 * if they are already done (AIO_DONEQ is set).
884 	 * The aio_cleanupq_mutex protects the queue for the duration of the
885 	 * loop from aio_req_done() and aio_req_find().
886 	 */
887 	if ((reqp = qhead) == NULL)
888 		return;
889 	do {
890 		ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
891 		ASSERT(reqp->aio_req_portkev == NULL);
892 		next = reqp->aio_req_next;
893 		aphysio_unlock(reqp);
894 		mutex_enter(&aiop->aio_mutex);
895 		if (exitflg)
896 			aio_req_free(aiop, reqp);
897 		else
898 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
899 		mutex_exit(&aiop->aio_mutex);
900 	} while ((reqp = next) != qhead);
901 }
902 
903 /*
904  * do cleanup for every element of the notify queue.
905  */
906 static int
907 aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
908 {
909 	aio_req_t *reqp, *next;
910 	aio_lio_t *liohead;
911 	sigqueue_t *sigev, *lio_sigev = NULL;
912 	int signalled = 0;
913 
914 	if ((reqp = qhead) == NULL)
915 		return (0);
916 	do {
917 		ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
918 		next = reqp->aio_req_next;
919 		aphysio_unlock(reqp);
920 		if (exitflg) {
921 			mutex_enter(&aiop->aio_mutex);
922 			aio_req_free(aiop, reqp);
923 			mutex_exit(&aiop->aio_mutex);
924 		} else {
925 			mutex_enter(&aiop->aio_mutex);
926 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
927 			sigev = reqp->aio_req_sigqp;
928 			reqp->aio_req_sigqp = NULL;
929 			if ((liohead = reqp->aio_req_lio) != NULL) {
930 				ASSERT(liohead->lio_refcnt > 0);
931 				if (--liohead->lio_refcnt == 0) {
932 					cv_signal(&liohead->lio_notify);
933 					lio_sigev = liohead->lio_sigqp;
934 					liohead->lio_sigqp = NULL;
935 				}
936 			}
937 			mutex_exit(&aiop->aio_mutex);
938 			if (sigev) {
939 				signalled++;
940 				aio_sigev_send(reqp->aio_req_buf.b_proc,
941 				    sigev);
942 			}
943 			if (lio_sigev) {
944 				signalled++;
945 				aio_sigev_send(reqp->aio_req_buf.b_proc,
946 				    lio_sigev);
947 			}
948 		}
949 	} while ((reqp = next) != qhead);
950 
951 	return (signalled);
952 }
953 
954 /*
955  * Do cleanup for every element of the poll queue.
956  */
957 static void
958 aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
959 {
960 	aio_req_t *reqp, *next;
961 
962 	/*
963 	 * As no other threads should be accessing the queue at this point,
964 	 * it isn't necessary to hold aio_mutex while we traverse its elements.
965 	 */
966 	if ((reqp = qhead) == NULL)
967 		return;
968 	do {
969 		ASSERT(reqp->aio_req_flags & AIO_POLLQ);
970 		next = reqp->aio_req_next;
971 		aphysio_unlock(reqp);
972 		if (exitflg) {
973 			mutex_enter(&aiop->aio_mutex);
974 			aio_req_free(aiop, reqp);
975 			mutex_exit(&aiop->aio_mutex);
976 		} else {
977 			aio_copyout_result(reqp);
978 			mutex_enter(&aiop->aio_mutex);
979 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
980 			mutex_exit(&aiop->aio_mutex);
981 		}
982 	} while ((reqp = next) != qhead);
983 }
984 
985 /*
986  * called by exit(). waits for all outstanding kaio to finish
987  * before the kaio resources are freed.
988  */
989 void
990 aio_cleanup_exit(void)
991 {
992 	proc_t *p = curproc;
993 	aio_t *aiop = p->p_aio;
994 	aio_req_t *reqp, *next, *head;
995 	aio_lio_t *nxtlio, *liop;
996 
997 	/*
998 	 * wait for all outstanding kaio to complete. process
999 	 * is now single-threaded; no other kaio requests can
1000 	 * happen once aio_pending is zero.
1001 	 */
1002 	mutex_enter(&aiop->aio_mutex);
1003 	aiop->aio_flags |= AIO_CLEANUP;
1004 	while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
1005 		cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
1006 	mutex_exit(&aiop->aio_mutex);
1007 
1008 	/* cleanup the cleanup-thread queues. */
1009 	aio_cleanup(AIO_CLEANUP_EXIT);
1010 
1011 	/*
1012 	 * Although this process is now single-threaded, we
1013 	 * still need to protect ourselves against a race with
1014 	 * aio_cleanup_dr_delete_memory().
1015 	 */
1016 	mutex_enter(&p->p_lock);
1017 
1018 	/*
1019 	 * free up the done queue's resources.
1020 	 */
1021 	if ((head = aiop->aio_doneq) != NULL) {
1022 		aiop->aio_doneq = NULL;
1023 		reqp = head;
1024 		do {
1025 			next = reqp->aio_req_next;
1026 			aphysio_unlock(reqp);
1027 			kmem_free(reqp, sizeof (struct aio_req_t));
1028 		} while ((reqp = next) != head);
1029 	}
1030 	/*
1031 	 * release aio request freelist.
1032 	 */
1033 	for (reqp = aiop->aio_free; reqp != NULL; reqp = next) {
1034 		next = reqp->aio_req_next;
1035 		kmem_free(reqp, sizeof (struct aio_req_t));
1036 	}
1037 
1038 	/*
1039 	 * release io list head freelist.
1040 	 */
1041 	for (liop = aiop->aio_lio_free; liop != NULL; liop = nxtlio) {
1042 		nxtlio = liop->lio_next;
1043 		kmem_free(liop, sizeof (aio_lio_t));
1044 	}
1045 
1046 	if (aiop->aio_iocb)
1047 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
1048 
1049 	mutex_destroy(&aiop->aio_mutex);
1050 	mutex_destroy(&aiop->aio_portq_mutex);
1051 	mutex_destroy(&aiop->aio_cleanupq_mutex);
1052 	p->p_aio = NULL;
1053 	mutex_exit(&p->p_lock);
1054 	kmem_free(aiop, sizeof (struct aio));
1055 }
1056 
1057 /*
1058  * copy out aio request's result to a user-level result_t buffer.
1059  */
1060 void
1061 aio_copyout_result(aio_req_t *reqp)
1062 {
1063 	struct buf	*bp;
1064 	struct iovec	*iov;
1065 	void		*resultp;
1066 	int		error;
1067 	size_t		retval;
1068 
1069 	if (reqp->aio_req_flags & AIO_COPYOUTDONE)
1070 		return;
1071 
1072 	reqp->aio_req_flags |= AIO_COPYOUTDONE;
1073 
1074 	iov = reqp->aio_req_uio.uio_iov;
1075 	bp = &reqp->aio_req_buf;
1076 	/* "resultp" points to user-level result_t buffer */
1077 	resultp = (void *)reqp->aio_req_resultp;
1078 	if (bp->b_flags & B_ERROR) {
1079 		if (bp->b_error)
1080 			error = bp->b_error;
1081 		else
1082 			error = EIO;
1083 		retval = (size_t)-1;
1084 	} else {
1085 		error = 0;
1086 		retval = iov->iov_len - bp->b_resid;
1087 	}
1088 #ifdef	_SYSCALL32_IMPL
1089 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1090 		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1091 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1092 	} else {
1093 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1094 		    (int)retval);
1095 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1096 	}
1097 #else
1098 	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1099 	(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1100 #endif
1101 }
1102 
1103 
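/*
 * copy out an aio request's result to a user-level result_t buffer.
 * this is the event port variant of aio_copyout_result(); the caller
 * supplies the iovec, buf, and user result pointer directly.
 */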
1104 void
1105 aio_copyout_result_port(struct iovec *iov, struct buf *bp, void *resultp)
1106 {
1107 	int errno;
1108 	size_t retval;
1109 
1110 	if (bp->b_flags & B_ERROR) {
1111 		if (bp->b_error)
1112 			errno = bp->b_error;
1113 		else
1114 			errno = EIO;
1115 		retval = (size_t)-1;
1116 	} else {
1117 		errno = 0;
1118 		retval = iov->iov_len - bp->b_resid;
1119 	}
1120 #ifdef	_SYSCALL32_IMPL
1121 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1122 		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
1123 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1124 	} else {
1125 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1126 		    (int)retval);
1127 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, errno);
1128 	}
1129 #else
1130 	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
1131 	(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
1132 #endif
1133 }
1134 
1135 /*
1136  * This function is used to remove a request from the port done queue
1137  * (aio_portq).
1137  */
1138 
1139 void
1140 aio_req_remove_portq(aio_t *aiop, aio_req_t *reqp)
1141 {
1142 	ASSERT(MUTEX_HELD(&aiop->aio_portq_mutex));
1143 	while (aiop->aio_portq == NULL) {
1144 		/*
1145 		 * aio_portq is set to NULL when aio_cleanup_portq()
1146 		 * is working with the event queue.
1147 		 * The aio_cleanup_thread() uses aio_cleanup_portq()
1148 		 * to unlock all AIO buffers with completed transactions.
1149 		 * Wait here until aio_cleanup_portq() restores the
1150 		 * list of completed transactions in aio_portq.
1151 		 */
1152 		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1153 	}
1154 	aio_deq(&aiop->aio_portq, reqp);
1155 }
1156 
1157 /* ARGSUSED */
1158 void
1159 aio_close_port(void *arg, int port, pid_t pid, int lastclose)
1160 {
1161 	aio_t		*aiop;
1162 	aio_req_t 	*reqp;
1163 	aio_req_t 	*next;
1164 	aio_req_t	*headp;
1165 	int		counter;
1166 
1167 	if (arg == NULL)
1168 		aiop = curproc->p_aio;
1169 	else
1170 		aiop = (aio_t *)arg;
1171 
1172 	/*
1173 	 * The PORT_SOURCE_AIO source is always associated with every
1174 	 * newly created port by default.
1175 	 * If no asynchronous I/O transactions were associated with the port
1176 	 * then the aiop pointer will still be set to NULL.
1177 	 */
1178 	if (aiop == NULL)
1179 		return;
1180 
1181 	/*
1182 	 * Within a process event ports can be used to collect events other
1183 	 * than PORT_SOURCE_AIO events. At the same time the process can submit
1184 	 * asynchronous I/Os transactions which are not associated with the
1185 	 * current port.
1186 	 * The current process oriented model of AIO uses a sigle queue for
1187 	 * pending events. On close the pending queue (queue of asynchronous
1188 	 * I/O transactions using event port notification) must be scanned
1189 	 * to detect and handle pending I/Os using the current port.
1190 	 */
1191 	mutex_enter(&aiop->aio_portq_mutex);
1192 	mutex_enter(&aiop->aio_mutex);
1193 	counter = 0;
1194 	if ((headp = aiop->aio_portpending) != NULL) {
1195 		reqp = headp;
1196 		do {
1197 			if (reqp->aio_req_portkev &&
1198 			    reqp->aio_req_port == port) {
1199 				reqp->aio_req_flags |= AIO_CLOSE_PORT;
1200 				counter++;
1201 			}
1202 		} while ((reqp = reqp->aio_req_next) != headp);
1203 	}
1204 	if (counter == 0) {
1205 		/* no AIOs pending */
1206 		mutex_exit(&aiop->aio_mutex);
1207 		mutex_exit(&aiop->aio_portq_mutex);
1208 		return;
1209 	}
1210 	aiop->aio_portpendcnt += counter;
1211 	mutex_exit(&aiop->aio_mutex);
1212 	while (aiop->aio_portpendcnt)
1213 		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
1214 
1215 	/*
1216 	 * all pending AIOs are completed.
1217 	 * check port doneq
1218 	 */
1219 	headp = NULL;
1220 	if ((reqp = aiop->aio_portq) != NULL) {
1221 		do {
1222 			next = reqp->aio_req_next;
1223 			if (reqp->aio_req_port == port) {
1224 				/* dequeue request and discard event */
1225 				aio_req_remove_portq(aiop, reqp);
1226 				port_free_event(reqp->aio_req_portkev);
1227 				/* put request in temporary queue */
1228 				reqp->aio_req_next = headp;
1229 				headp = reqp;
1230 			}
1231 		} while ((reqp = next) != aiop->aio_portq);
1232 	}
1233 	mutex_exit(&aiop->aio_portq_mutex);
1234 
1235 	/* headp points to the list of requests to be discarded */
1236 	for (reqp = headp; reqp != NULL; reqp = next) {
1237 		next = reqp->aio_req_next;
1238 		aphysio_unlock(reqp);
1239 		mutex_enter(&aiop->aio_mutex);
1240 		aio_req_free_port(aiop, reqp);
1241 		mutex_exit(&aiop->aio_mutex);
1242 	}
1243 
1244 	if (aiop->aio_flags & AIO_CLEANUP)
1245 		cv_broadcast(&aiop->aio_waitcv);
1246 }
1247 
1248 /*
1249  * aio_cleanup_dr_delete_memory is used by dr's delete_memory_thread
1250  * to kick start the aio_cleanup_thread for the given process to do the
1251  * necessary cleanup.
1252  * This is needed so that delete_memory_thread can obtain writer locks
1253  * on pages that need to be relocated during a dr memory delete operation,
1254  * otherwise a deadly embrace may occur.
1255  */
1256 int
1257 aio_cleanup_dr_delete_memory(proc_t *procp)
1258 {
1259 	struct aio *aiop = procp->p_aio;
1260 	struct as *as = procp->p_as;
1261 	int ret = 0;
1262 
1263 	ASSERT(MUTEX_HELD(&procp->p_lock));
1264 
1265 	mutex_enter(&as->a_contents);
1266 
1267 	if (aiop != NULL) {
1268 		aiop->aio_rqclnup = 1;
1269 		cv_broadcast(&as->a_cv);
1270 		ret = 1;
1271 	}
1272 	mutex_exit(&as->a_contents);
1273 	return (ret);
1274 }
1275