/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $
 * $DragonFly: src/sys/kern/vfs_aio.c,v 1.42 2007/07/20 17:21:52 dillon Exp $
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <sys/file2.h>
#include <sys/buf2.h>
#include <sys/sysref2.h>
#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <machine/limits.h>
#include "opt_vfs_aio.h"

#ifdef VFS_AIO

/*
 * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
 * overflow.
 */
static long jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	4
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");

static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
    CTLFLAG_RW, &max_aio_procs, 0,
    "Maximum number of kernel threads to use for handling async IO");

static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
    CTLFLAG_RD, &num_aio_procs, 0,
    "Number of presently active kernel threads for async IO");

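/*
 * All of the compile-time defaults above are also exported through the
 * vfs.aio sysctl tree defined in this file, so the CTLFLAG_RW knobs can
 * be inspected and adjusted on a running system, e.g.:
 *
 *	sysctl vfs.aio.max_aio_procs=8
 *
 * The facility itself is only present when the kernel is built with
 * "options VFS_AIO"; without it the system calls below return ENOSYS.
 */
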
/*
 * The code will adjust the actual number of AIO processes towards this
 * number when it gets a chance.
 */
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
    0, "Preferred number of ready kernel threads for async IO");

static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
    "Maximum number of aio requests to queue, globally");

static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
    "Number of queued aio requests");

static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
    "Number of aio requests presently handled by the buf subsystem");

/* Number of async I/O threads in the process of being started */
/* XXX This should be local to _aio_aqueue() */
static int num_aio_resv_start = 0;

static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
    "Timeout value for synchronous aio operations");

static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
    "Maximum lifetime for idle aiod");

static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
    0, "Maximum active aio requests per process (stored in the process)");

static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
    &max_aio_queue_per_proc, 0,
    "Maximum queued aio requests per process (stored in the process)");

static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
    "Maximum buf aio requests per process (stored in the process)");

/*
 * AIO process info
 */
#define AIOP_FREE	0x1			/* proc on free queue */
#define AIOP_SCHED	0x2			/* proc explicitly scheduled */

struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY(aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */

static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(void);
static int	aio_aqueue(struct aiocb *job, int type);
static void	aio_physwakeup(struct bio *bio);
static int	aio_fphysio(struct aiocblist *aiocbe);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	aio_daemon(void *uproc, struct trapframe *frame);
static void	process_signal(void *aioj);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

/*
 * Zones for:
 *	kaio	Per process async io info
 *	aiop	async io thread data
 *	aiocb	async io jobs
 *	aiol	list io job pointer - internal to aio_suspend XXX
 *	aiolio	list io jobs
 */
static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;

/*
 * Startup initialization
 */
static void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof(struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.  The aioinfo limits are set
 * per-process for user limit (resource) management.
 */
static void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;
	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
		TAILQ_INIT(&ki->kaio_sockqueue);
	}

	while (num_aio_procs < target_aio_procs)
		aio_newproc();
}

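/*
 * The ceilings initialized above surface to userland as EAGAIN: once a
 * process has kaio_qallowed_count requests outstanding (or the global
 * queue is full), aio_aqueue() rejects new submissions before any I/O
 * is attempted.  A caller would typically treat that as back-pressure
 * (illustrative sketch; reap_some_completions is a hypothetical helper):
 *
 *	if (aio_read(&cb) == -1 && errno == EAGAIN)
 *		reap_some_completions();	(then resubmit)
 */
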
/*
 * Free a job entry.  Wait for completion if it is currently active, but don't
 * delay forever.  If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
static int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	struct proc *p;
	int error;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, 0, "jobwai", 0);
	}
	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;
	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;
	}

	/* aiocbe is going away, we need to destroy any knotes */
	/* XXX lwp knote wants a thread, but only cares about the process */
	knote_remove(&aiocbe->klist);

	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(aiocbe)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		crit_enter();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		crit_exit();
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		crit_enter();
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
		crit_exit();
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		crit_enter();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		crit_exit();
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp, NULL);
			aiocbe->bp = NULL;
		}
	}
	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	aiocbe->jobstate = JOBST_NULL;
	callout_stop(&aiocbe->timeout);
	fdrop(aiocbe->fd_file);
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	return 0;
}
#endif /* VFS_AIO */

/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
#ifndef VFS_AIO
	return;
#else
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;
	struct file *fp;
	struct socket *so;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
	    ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, 0, "kaiowt", aiod_timeout))
			break;
	}

	/*
	 * Move any aio ops that are waiting on socket I/O to the normal job
	 * queues so they are cleaned up with any others.
	 */
	crit_enter();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		fp = aiocbe->fd_file;
		if (fp != NULL) {
			so = (struct socket *)fp->f_data;
			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
			if (TAILQ_EMPTY(&so->so_aiojobq)) {
				so->so_snd.ssb_flags &= ~SSB_AIO;
				so->so_rcv.ssb_flags &= ~SSB_AIO;
			}
		}
		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
	}
	crit_exit();

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

restart3:
	crit_enter();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, 0, "aioprn", 0);
		crit_exit();
		goto restart3;
	}
	crit_exit();

restart4:
	crit_enter();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			crit_exit();
			goto restart4;
		}
	}
	crit_exit();

	/*
	 * If we've slept, jobs might have moved from one queue to another.
	 * Retry rundown if we didn't manage to empty the queues.
	 */
	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
		goto restart1;

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
		    0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#ifdef DIAGNOSTIC
			kprintf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
			    "QF:%d\n", lj->lioj_buffer_count,
			    lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count,
			    lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * Select a job to run (called by an AIO daemon).
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	struct aiocblist *aiocbe;
	struct kaioinfo *ki;
	struct proc *userp;

	crit_enter();
	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
	    TAILQ_NEXT(aiocbe, list)) {
		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			crit_exit();
			return aiocbe;
		}
	}
	crit_exit();

	return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the I/O request for
 * the non-physio version of the operations.  The normal vn operations are used,
 * and this code should work in all instances for every type of file, including
 * pipes, sockets, fifos, and regular files.
 */
static void
aio_process(struct aiocblist *aiocbe)
{
	struct thread *mytd;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	int cnt;
	int error;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	mytd = curthread;
	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = mytd;

	inblock_st = mytd->td_lwp->lwp_ru.ru_inblock;
	oublock_st = mytd->td_lwp->lwp_ru.ru_oublock;
	/*
	 * _aio_aqueue() acquires a reference to the file that is
	 * released in aio_free_entry().
	 */
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, O_FOFFSET);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, O_FOFFSET);
	}
	inblock_end = mytd->td_lwp->lwp_ru.ru_inblock;
	oublock_end = mytd->td_lwp->lwp_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if ((error) && (auio.uio_resid != cnt)) {
		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
			error = 0;
		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
			ksignal(aiocbe->userproc, SIGPIPE);
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;
}

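/*
 * The _aiocb_private.status and _aiocb_private.error fields set above
 * are exactly what sys_aio_error() and sys_aio_return() later report,
 * so from userland a daemon-serviced request completes like any other
 * POSIX AIO request (sketch; a blocking caller would use aio_suspend()):
 *
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;
 *	nbytes = aio_return(&cb);
 */
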
/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 *
 * The MP lock is held on entry.
 */
static void
aio_daemon(void *uproc, struct trapframe *frame)
{
	struct aio_liojob *lj;
	struct aiocb *cb;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct proc *mycp, *userp;
	struct vmspace *curvm;
	struct lwp *mylwp;
	struct ucred *cr;

	mylwp = curthread->td_lwp;
	mycp = mylwp->lwp_proc;

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one aiop structure
	 * per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;

	crit_enter();

	/*
	 * Place thread (lightweight process) onto the AIO free thread list.
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	crit_exit();

	/* Make up a name for the daemon. */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root".
	 */
	fdfree(mycp, NULL);
	cr = cratom(&mycp->p_ucred);
	cr->cr_uid = 0;
	uireplace(&cr->cr_uidinfo, uifind(0));
	cr->cr_ngroups = 1;
	cr->cr_groups[0] = 1;

	/* The daemon resides in its own pgrp. */
	enterpgrp(mycp, mycp->p_pid, 1);

	/* Mark special process type. */
	mycp->p_flag |= P_SYSTEM | P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	wakeup(mycp);
	curvm = NULL;

	for (;;) {
		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			crit_enter();
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			crit_exit();
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program.
			 */
			if (curvm != userp->p_vmspace) {
				pmap_setlwpvm(mylwp, userp->p_vmspace);
				if (curvm)
					sysref_put(&curvm->vm_sysref);
				curvm = userp->p_vmspace;
				sysref_get(&curvm->vm_sysref);
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;

			/* Do the I/O function. */
			aio_process(aiocbe);

			/* Decrement the active job count. */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons.
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj)
				lj->lioj_queue_finished_count++;
			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			crit_enter();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					ksignal(userp,
					    lj->lioj_signal.sigev_signo);
					lj->lioj_flags |=
					    LIOJ_SIGNAL_POSTED;
				}
			}
			crit_exit();

			aiocbe->jobstate = JOBST_JOBFINISHED;

			crit_enter();
			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
			crit_exit();
			KNOTE(&aiocbe->klist, 0);

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				ksignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space.
		 */
		if (curvm) {
			/* swap our original address space back in */
			pmap_setlwpvm(mylwp, mycp->p_vmspace);
			sysref_put(&curvm->vm_sysref);
			curvm = NULL;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		crit_enter();
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;
		crit_exit();

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
		    0, "aiordy", aiod_lifetime)) {
			crit_enter();
			if (TAILQ_EMPTY(&aio_jobs)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					crit_exit();
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#ifdef DIAGNOSTIC
					if (mycp->p_vmspace->vm_sysref.refcnt <= 1) {
						kprintf("AIOD: bad vm refcnt for"
						    " exiting daemon: %d\n",
						    mycp->p_vmspace->vm_sysref.refcnt);
					}
#endif
					exit1(0);
				}
			}
			crit_exit();
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc(void)
{
	int error;
	struct lwp *lp, *nlp;
	struct proc *np;

	lp = &lwp0;
	error = fork1(lp, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	nlp = ONLY_LWP_IN_PROC(np);
	cpu_set_fork_handler(nlp, aio_daemon, curproc);
	start_forked_proc(lp, np);

	/*
	 * Wait until daemon is started, but continue on just in case to
	 * handle error conditions.
	 */
	error = tsleep(np, 0, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance, low-overhead physio method for eligible
 * VCHR devices.  This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, _aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int notify;

	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If it's not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if (cb->aio_nbytes >
	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = getpbuf(NULL);
	BUF_KERNPROC(bp);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_bio1.bio_caller_info1.ptr = p;
	error = 0;

	bp->b_cmd = (cb->aio_lio_opcode == LIO_WRITE) ?
		    BUF_CMD_WRITE : BUF_CMD_READ;
	bp->b_bio1.bio_done = aio_physwakeup;
	bp->b_bio1.bio_flags |= BIO_SYNC;
	bp->b_bio1.bio_offset = cb->aio_offset;

	/* Bring buffer into kernel space. */
	if (vmapbuf(bp, __DEVOLATILE(char *, cb->aio_buf), cb->aio_nbytes) < 0) {
		error = EFAULT;
		goto doerror;
	}

	crit_enter();

	aiocbe->bp = bp;
	bp->b_bio1.bio_caller_info2.ptr = aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	crit_exit();

	/*
	 * Perform the transfer.  vn_strategy must be used even though we
	 * know we have a device in order to deal with requests which exceed
	 * device DMA limitations.
	 */
	vn_strategy(vp, &bp->b_bio1);

	notify = 0;
	crit_enter();

#if 0
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			notify = 1;
		}
	}
#endif
	crit_exit();
	if (notify)
		KNOTE(&aiocbe->klist, 0);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj)
		lj->lioj_buffer_count--;
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 */
static int
aio_fphysio(struct aiocblist *iocb)
{
	struct buf *bp;
	int error;

	bp = iocb->bp;

	error = biowait_timeout(&bp->b_bio1, "physstr", aiod_timeout);
	if (error == EWOULDBLOCK)
		return EINPROGRESS;

	/* Release mapping into kernel space. */
	vunmapbuf(bp);
	iocb->bp = NULL;

	error = 0;

	/* Check for an error. */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	relpbuf(bp, NULL);
	return (error);
}
#endif /* VFS_AIO */

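/*
 * To summarize the fast path above: aio_qphysio() accepts a request
 * only when it targets a VCHR disk device, is a multiple of the
 * device's physical block size, fits within MAXPHYS, and the process
 * is under its buffer quota; anything else returns -1 so _aio_aqueue()
 * falls back to the daemon path.  For example (device name illustrative),
 * a block-aligned request such as
 *
 *	cb.aio_fildes = open("/dev/da0", O_RDONLY);
 *	cb.aio_nbytes = 65536;
 *
 * is driven directly through vn_strategy() with no helper thread, while
 * the same read on a regular file is handed to an aiod.
 */
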
/*
 * Wake up aio requests that may be serviceable now.
 */
void
aio_swake(struct socket *so, struct signalsockbuf *ssb)
{
#ifndef VFS_AIO
	return;
#else
	struct aiocblist *cb, *cbn;
	struct proc *p;
	struct kaioinfo *ki = NULL;
	int opcode, wakecount = 0;
	struct aioproclist *aiop;

	if (ssb == &so->so_snd) {
		opcode = LIO_WRITE;
		so->so_snd.ssb_flags &= ~SSB_AIO;
	} else {
		opcode = LIO_READ;
		so->so_rcv.ssb_flags &= ~SSB_AIO;
	}

	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
		cbn = TAILQ_NEXT(cb, list);
		if (opcode == cb->uaiocb.aio_lio_opcode) {
			p = cb->userproc;
			ki = p->p_aioinfo;
			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
			wakecount++;
			if (cb->jobstate != JOBST_JOBQGLOBAL)
				panic("invalid queue value");
		}
	}

	while (wakecount--) {
		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			wakeup(aiop->aioproc);
		}
	}
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * Queue a new AIO request.  The choice between the threaded technique and
 * direct physio for VCHR devices is made here.
 */
static int
_aio_aqueue(struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct proc *p = curproc;
	struct file *fp;
	unsigned int fd;
	struct socket *so;
	int error;
	int opcode, user_opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct kevent kev;
	struct kqueue *kq;
	struct file *kq_fp;
	int fflags;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	else
		aiocbe = zalloc(aiocb_zone);

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;
	callout_init(&aiocbe->timeout);
	SLIST_INIT(&aiocbe->klist);

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
	if (error) {
		suword(&job->_aiocb_private.error, error);
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}
	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	    !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return EINVAL;
	}

	/* Save userspace address of the job info. */
	aiocbe->uuaiocb = job;

	/* Get the opcode. */
	user_opcode = aiocbe->uaiocb.aio_lio_opcode;
	if (type != LIO_NOP)
		aiocbe->uaiocb.aio_lio_opcode = type;
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Range check file descriptor.
	 */
	fflags = (opcode == LIO_WRITE) ? FWRITE : FREAD;
	fd = aiocbe->uaiocb.aio_fildes;
	fp = holdfp(p->p_fd, fd, fflags);
	if (fp == NULL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EBADF);
		return EBADF;
	}

	aiocbe->fd_file = fp;

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		error = EINVAL;
		goto aqueue_fail;
	}
	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		error = EINVAL;
		goto aqueue_fail;
	}
	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		fdrop(fp);
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}
	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		if (type == 0)
			suword(&job->_aiocb_private.status, 0);
		error = EINVAL;
		goto aqueue_fail;
	}

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
	} else {
		/*
		 * This method for requesting kevent-based notification won't
		 * work on the alpha, since we're passing in a pointer
		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
		 * based method instead.
		 */
		if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
		    user_opcode == LIO_WRITE)
			goto no_kqueue;

		error = copyin((struct kevent *)(uintptr_t)user_opcode,
		    &kev, sizeof(kev));
		if (error)
			goto aqueue_fail;
	}
	kq_fp = holdfp(p->p_fd, (int)kev.ident, -1);
	if (kq_fp == NULL || kq_fp->f_type != DTYPE_KQUEUE) {
		if (kq_fp) {
			fdrop(kq_fp);
			kq_fp = NULL;
		}
		error = EBADF;
		goto aqueue_fail;
	}
	kq = (struct kqueue *)kq_fp->f_data;
	kev.ident = (uintptr_t)aiocbe->uuaiocb;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
	kev.data = (intptr_t)aiocbe;
	error = kqueue_register(kq, &kev);
	fdrop(kq_fp);
aqueue_fail:
	if (error) {
		fdrop(fp);
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, error);
		goto done;
	}
no_kqueue:

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if (fp->f_type == DTYPE_SOCKET) {
		/*
		 * Alternate queueing for socket ops: Reach down into the
		 * descriptor to get the socket data.  Then check to see if the
		 * socket is ready to be read or written (based on the requested
		 * operation).
		 *
		 * If it is not ready for io, then queue the aiocbe on the
		 * socket, and set the flags so we get a call when ssb_notify()
		 * happens.
		 */
		so = (struct socket *)fp->f_data;
		crit_enter();
		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
		    LIO_WRITE) && (!sowriteable(so)))) {
			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
			if (opcode == LIO_READ)
				so->so_rcv.ssb_flags |= SSB_AIO;
			else
				so->so_snd.ssb_flags |= SSB_AIO;
			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
			ki->kaio_queue_count++;
			num_queue_count++;
			crit_exit();
			error = 0;
			goto done;
		}
		crit_exit();
	}

	if ((error = aio_qphysio(p, aiocbe)) == 0)
		goto done;
	if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		goto done;
	}

	/* No buffer for daemon I/O. */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj)
		lj->lioj_queue_count++;
	crit_enter();
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	crit_exit();
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota, then
	 * start one.  Otherwise, depend on the subsequent I/O completions to
	 * pick up this job.  If we don't successfully create the new process
	 * (thread) due to resource issues, we return an error for now (EAGAIN),
	 * which is likely not the correct thing to do.
	 */
	crit_enter();
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	crit_exit();
done:
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct aiocb *job, int type)
{
	struct proc *p = curproc;
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(job, NULL, type);
}
#endif /* VFS_AIO */

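/*
 * Everything above is the submission half of the facility; the system
 * calls below let userland reap results.  A minimal round trip looks
 * like this (sketch, error handling omitted):
 *
 *	char buf[512];
 *	struct aiocb cb;
 *	const struct aiocb *list[1] = { &cb };
 *
 *	bzero(&cb, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	aio_read(&cb);			(queues and returns at once)
 *	aio_suspend(list, 1, NULL);	(block until it completes)
 *	nbytes = aio_return(&cb);	(reap status, free kernel entry)
 */
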
/*
 * Support the aio_return system call.  As a side effect, kernel resources
 * are released.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_return(struct aio_return_args *uap)
{
#ifndef VFS_AIO
	return (ENOSYS);
#else
	struct proc *p = curproc;
	struct lwp *lp = curthread->td_lwp;
	long jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;
	int error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	get_mplock();
	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			if (ujob == cb->uuaiocb) {
				uap->sysmsg_result =
				    cb->uaiocb._aiocb_private.status;
			} else {
				uap->sysmsg_result = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				lp->lwp_ru.ru_oublock += cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				lp->lwp_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			error = 0;
			goto done;
		}
	}
	crit_enter();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
		    == jobref) {
			crit_exit();
			if (ujob == cb->uuaiocb) {
				uap->sysmsg_result =
				    cb->uaiocb._aiocb_private.status;
			} else {
				uap->sysmsg_result = EFAULT;
			}
			aio_free_entry(cb);
			error = 0;
			goto done;
		}
	}
	crit_exit();
	error = EINVAL;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * Allow a process to wakeup when any of the I/O requests are completed.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_suspend(struct aio_suspend_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, timo;
	long *ijoblist;
	struct aiocb **ujoblist;

	if ((u_int)uap->nent > AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/* Get timespec struct. */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
			return error;

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz_high(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	get_mplock();

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
		if (cbp == 0)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}

	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		error = 0;
		goto done;
	}

	error = 0;
	for (;;) {
		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)
				    cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					goto done;
				}
			}
		}

		crit_enter();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
		    TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)
				    cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					crit_exit();
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					goto done;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PCATCH, "aiospn", timo);
		crit_exit();

		if (error == ERESTART || error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			error = EINTR;
			goto done;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			error = EAGAIN;
			goto done;
		}
	}

	/* NOTREACHED */
	error = EINVAL;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

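/*
 * Userland view of the call above: block until one of a set of
 * outstanding requests finishes or the timeout expires, in which case
 * EAGAIN is reported (sketch):
 *
 *	const struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct timespec ts = { 0, 100000000 };	(100ms)
 *
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		;	(nothing completed within the timeout)
 */
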
/*
 * aio_cancel cancels any non-physio aio operations not currently in
 * progress.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_cancel(struct aio_cancel_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct kaioinfo *ki;
	struct aiocblist *cbe, *cbn;
	struct file *fp;
	struct socket *so;
	struct proc *po;
	int error;
	int cancelled = 0;
	int notcancelled = 0;
	struct vnode *vp;

	fp = holdfp(p->p_fd, uap->fd, -1);
	if (fp == NULL)
		return (EBADF);

	get_mplock();

	if (fp->f_type == DTYPE_VNODE) {
		vp = (struct vnode *)fp->f_data;

		if (vn_isdisk(vp, &error)) {
			uap->sysmsg_result = AIO_NOTCANCELED;
			error = 0;
			goto done2;
		}
	} else if (fp->f_type == DTYPE_SOCKET) {
		so = (struct socket *)fp->f_data;

		crit_enter();

		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
			cbn = TAILQ_NEXT(cbe, list);
			if ((uap->aiocbp == NULL) ||
			    (uap->aiocbp == cbe->uuaiocb)) {
				po = cbe->userproc;
				ki = po->p_aioinfo;
				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
				if (ki->kaio_flags & KAIO_WAKEUP) {
					wakeup(po);
				}
				cbe->jobstate = JOBST_JOBFINISHED;
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				cancelled++;
				/* XXX cancelled, knote? */
				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
				    SIGEV_SIGNAL)
					ksignal(cbe->userproc,
					    cbe->uaiocb.aio_sigevent.sigev_signo);
				if (uap->aiocbp)
					break;
			}
		}
		crit_exit();

		if ((cancelled) && (uap->aiocbp)) {
			uap->sysmsg_result = AIO_CANCELED;
			error = 0;
			goto done2;
		}
	}
	ki = p->p_aioinfo;
	if (ki == NULL)
		goto done;
	crit_enter();

	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
		cbn = TAILQ_NEXT(cbe, plist);

		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
		    ((uap->aiocbp == NULL) ||
		    (uap->aiocbp == cbe->uuaiocb))) {
			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
				TAILQ_REMOVE(&aio_jobs, cbe, list);
				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
				    plist);
				cancelled++;
				ki->kaio_queue_finished_count++;
				cbe->jobstate = JOBST_JOBFINISHED;
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				/* XXX cancelled, knote? */
				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
				    SIGEV_SIGNAL)
					ksignal(cbe->userproc,
					    cbe->uaiocb.aio_sigevent.sigev_signo);
			} else {
				notcancelled++;
			}
		}
	}
	crit_exit();
done:
	if (notcancelled)
		uap->sysmsg_result = AIO_NOTCANCELED;
	else if (cancelled)
		uap->sysmsg_result = AIO_CANCELED;
	else
		uap->sysmsg_result = AIO_ALLDONE;
	error = 0;
done2:
	rel_mplock();
	fdrop(fp);
	return error;
#endif /* VFS_AIO */
}

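/*
 * Userland triage of the three results produced above (sketch):
 *
 *	switch (aio_cancel(fd, &cb)) {
 *	case AIO_CANCELED:	request dequeued before it ran;
 *				aio_error() now reports ECANCELED
 *	case AIO_NOTCANCELED:	still in flight (e.g. physio or running);
 *				keep polling aio_error()
 *	case AIO_ALLDONE:	already complete; reap with aio_return()
 *	}
 */
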
/*
 * aio_error is implemented at the kernel level for compatibility purposes
 * only.  For a user mode async implementation, it would be best to do it
 * in a userland subroutine.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_error(struct aio_error_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	long jobref;
	int error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	get_mplock();
	error = 0;

	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = cb->uaiocb._aiocb_private.error;
			goto done;
		}
	}

	crit_enter();

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = EINPROGRESS;
			crit_exit();
			goto done;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = EINPROGRESS;
			crit_exit();
			goto done;
		}
	}
	crit_exit();

	crit_enter();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = cb->uaiocb._aiocb_private.error;
			crit_exit();
			goto done;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = EINPROGRESS;
			crit_exit();
			goto done;
		}
	}
	crit_exit();
	error = EINVAL;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * syscall - asynchronous read from a file (REALTIME)
 *
 * MPALMOSTSAFE
 */
int
sys_aio_read(struct aio_read_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int error;

	get_mplock();
	error = aio_aqueue(uap->aiocbp, LIO_READ);
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * syscall - asynchronous write to a file (REALTIME)
 *
 * MPALMOSTSAFE
 */
int
sys_aio_write(struct aio_write_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int error;

	get_mplock();
	error = aio_aqueue(uap->aiocbp, LIO_WRITE);
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * syscall - XXX undocumented
 *
 * MPALMOSTSAFE
 */
int
sys_lio_listio(struct lio_listio_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct lwp *lp = curthread->td_lwp;
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return EINVAL;

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX)
		return EINVAL;

	get_mplock();

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	if ((nent + num_queue_count) > max_queue_count) {
		error = EAGAIN;
		goto done;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		error = EAGAIN;
		goto done;
	}

	lj = zalloc(aiolio_zone);
	if (lj == NULL) {
		error = EAGAIN;
		goto done;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;

	/*
	 * Setup signal.
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal,
		    sizeof(lj->lioj_signal));
		if (error) {
			zfree(aiolio_zone, lj);
			goto done;
		}
		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
			zfree(aiolio_zone, lj);
			error = EINVAL;
			goto done;
		}
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else
		lj->lioj_flags &= ~LIOJ_SIGNAL;

	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
	/*
	 * Get pointers to the list of I/O requests.
	 */
	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
			error = _aio_aqueue(iocb, lj, 0);
			if (error == 0)
				nentqueued++;
			else
				nerror++;
		}
	}

	/*
	 * If we haven't queued any, then just return error.
	 */
	if (nentqueued == 0) {
		error = 0;
		goto done;
	}

	/*
	 * Calculate the appropriate error return.
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		int command, found, jobref;

		for (;;) {
			found = 0;
			for (i = 0; i < uap->nent; i++) {
				/*
				 * Fetch address of the control buf pointer in
				 * user space.
				 */
				iocb = (struct aiocb *)
				    (intptr_t)fuword(&cbptr[i]);
				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
				    == 0))
					continue;

				/*
				 * Fetch the associated command from user space.
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						if (cb->uaiocb.aio_lio_opcode
						    == LIO_WRITE) {
							lp->lwp_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode
						    == LIO_READ) {
							lp->lwp_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				crit_enter();
				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						found++;
						break;
					}
				}
				crit_exit();
			}

			/*
			 * If all I/Os have been disposed of, then we can
			 * return.
			 */
			if (found == nentqueued) {
				error = runningcode;
				goto done;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PCATCH, "aiospn", 0);

			if (error == EINTR) {
				goto done;
			} else if (error == EWOULDBLOCK) {
				error = EAGAIN;
				goto done;
			}
		}
	}

	error = runningcode;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

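/*
 * Userland sketch of a synchronous batch through the call above
 * (error handling omitted; field setup as for aio_read/aio_write):
 *
 *	struct aiocb cb0, cb1;
 *	struct aiocb *list[2] = { &cb0, &cb1 };
 *
 *	cb0.aio_lio_opcode = LIO_READ;
 *	cb1.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == 0)
 *		;	(both transfers have completed here)
 *
 * With LIO_NOWAIT and a struct sigevent in the last argument, the
 * LIOJ_SIGNAL machinery above delivers one signal when the whole
 * batch is done.
 */
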
#ifdef VFS_AIO
/*
 * This is a weird hack so that we can post a signal.  It is safe to do so from
 * a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *aioj)
{
	struct aiocblist *aiocbe = aioj;
	struct aio_liojob *lj = aiocbe->lio;
	struct aiocb *cb = &aiocbe->uaiocb;

	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
	    (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
		ksignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
	}

	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
		ksignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
}

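/*
 * process_signal() is the delivery half of SIGEV_SIGNAL notification,
 * which a request opts into per-aiocb (sketch):
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;
 *	aio_write(&cb);		(SIGUSR1 arrives on completion)
 */
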
/*
 * Interrupt handler for physio, performs the necessary process wakeups, and
 * signals.
 */
static void
aio_physwakeup(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;

	aiocbe = bio->bio_caller_info2.ptr;

	if (aiocbe) {
		p = bio->bio_caller_info1.ptr;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR)
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;

			/*
			 * wakeup/signal if all of the interrupt jobs are done.
			 */
			if (lj->lioj_buffer_finished_count ==
			    lj->lioj_buffer_count) {
				/*
				 * Post a signal if it is called for.
				 */
				if ((lj->lioj_flags &
				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					callout_reset(&aiocbe->timeout, 0,
					    process_signal, aiocbe);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);

			KNOTE(&aiocbe->klist, 0);
			/* Do the wakeup. */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}

		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
			callout_reset(&aiocbe->timeout, 0,
			    process_signal, aiocbe);
		}
	}
	biodone_sync(bio);
}
#endif /* VFS_AIO */

/*
 * syscall - wait for the next completion of an aio request
 *
 * MPALMOSTSAFE
 */
int
sys_aio_waitcomplete(struct aio_waitcomplete_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct lwp *lp = curthread->td_lwp;
	struct timeval atv;
	struct timespec ts;
	struct kaioinfo *ki;
	struct aiocblist *cb = NULL;
	int error, timo;

	suword(uap->aiocbp, 0);

	timo = 0;
	if (uap->timeout) {
		/* Get timespec struct. */
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return error;

		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz_high(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	get_mplock();

	for (;;) {
		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != NULL) {
			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
			uap->sysmsg_result = cb->uaiocb._aiocb_private.status;
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				lp->lwp_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				lp->lwp_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			/* Grab the error before the entry is recycled. */
			error = cb->uaiocb._aiocb_private.error;
			aio_free_entry(cb);
			break;
		}

		crit_enter();
		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != NULL) {
			crit_exit();
			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
			uap->sysmsg_result = cb->uaiocb._aiocb_private.status;
			/* Grab the error before the entry is recycled. */
			error = cb->uaiocb._aiocb_private.error;
			aio_free_entry(cb);
			break;
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PCATCH, "aiowc", timo);
		crit_exit();

		if (error == ERESTART) {
			error = EINTR;
			break;
		}
		if (error < 0)
			break;
		if (error == EINTR)
			break;
		if (error == EWOULDBLOCK) {
			error = EAGAIN;
			break;
		}
	}
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

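/*
 * Sketch of the SIGEV_KEVENT notification consumed by the filter code
 * below: the submitter names a kqueue in the sigevent and then collects
 * completions with kevent(2).  The returned ident is the userspace
 * aiocb pointer registered in _aio_aqueue() (names illustrative):
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = udata;
 *	aio_read(&cb);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	(here ev.filter == EVFILT_AIO and ev.ident == (uintptr_t)&cb)
 */
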
#ifndef VFS_AIO
static int
filt_aioattach(struct knote *kn)
{

	return (ENXIO);
}

struct filterops aio_filtops =
	{ 0, filt_aioattach, NULL, NULL };

#else
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;

	/*
	 * The aiocbe pointer must be validated before using it, so
	 * registration is restricted to the kernel; the user cannot
	 * set EV_FLAG1.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);
	kn->kn_flags &= ~EV_FLAG1;

	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);

	return (0);
}

/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;

	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
}

/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;

	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
	    aiocbe->jobstate != JOBST_JOBBFINISHED)
		return (0);
	kn->kn_flags |= EV_EOF;
	return (1);
}

struct filterops aio_filtops =
	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
#endif /* VFS_AIO */