/*
 * Copyright (c) 1997 John S. Dyson. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author. This software is distributed AS-IS.
 *
 * $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $
 * $DragonFly: src/sys/kern/vfs_aio.c,v 1.42 2007/07/20 17:21:52 dillon Exp $
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <sys/file2.h>
#include <sys/buf2.h>
#include <sys/sysref2.h>
#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <machine/limits.h>
#include "opt_vfs_aio.h"

#ifdef VFS_AIO

/*
 * Counter for allocating reference ids to new jobs. Wrapped to 1 on
 * overflow.
 */
static long jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	4
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");

static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0,
	"Maximum number of kernel threads to use for handling async IO");

static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0,
	"Number of presently active kernel threads for async IO");
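
/*
 * Tuning note: the knobs above and below are exported under the vfs.aio
 * sysctl node.  An administrator could, for example, grow the worker pool
 * and the global queue with (illustrative values only):
 *
 *	sysctl vfs.aio.max_aio_procs=64
 *	sysctl vfs.aio.max_aio_queue=4096
 *
 * The only hard relationship implied by the code is that the pool is grown
 * toward target_aio_procs and never past max_aio_procs.
 */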

/*
 * The code will adjust the actual number of AIO processes towards this
 * number when it gets a chance.
 */
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
	0, "Preferred number of ready kernel threads for async IO");

static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
	"Maximum number of aio requests to queue, globally");

static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
	"Number of queued aio requests");

static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
	"Number of aio requests presently handled by the buf subsystem");

/* Number of async I/O threads in the process of being started */
/* XXX This should be local to _aio_aqueue() */
static int num_aio_resv_start = 0;

static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
	"Timeout value for synchronous aio operations");

static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
	"Maximum lifetime for idle aiod");

static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
	0, "Maximum active aio requests per process (stored in the process)");

static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
	&max_aio_queue_per_proc, 0,
	"Maximum queued aio requests per process (stored in the process)");

static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
	"Maximum buf aio requests per process (stored in the process)");

/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int aioprocflags;		/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;	/* List of processes */
	struct proc *aioproc;		/* The AIO thread */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;
	int	lioj_buffer_count;
	int	lioj_buffer_finished_count;
	int	lioj_queue_count;
	int	lioj_queue_finished_count;
	struct	sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY(aio_liojob) lioj_list;
	struct	kaioinfo *lioj_ki;
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
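
/*
 * Note: a lio job tracks its daemon-queued requests in lioj_queue_count /
 * lioj_queue_finished_count and its physio requests in lioj_buffer_count /
 * lioj_buffer_finished_count.  The completion paths deliver lioj_signal at
 * most once (guarded by LIOJ_SIGNAL_POSTED), and only when both finished
 * counts have caught up with their totals.
 */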

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;		/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;	/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;	/* Pool of free jobs */

static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(void);
static int	aio_aqueue(struct aiocb *job, int type);
static void	aio_physwakeup(struct bio *bio);
static int	aio_fphysio(struct aiocblist *aiocbe);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	aio_daemon(void *uproc, struct trapframe *frame);
static void	process_signal(void *aioj);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

/*
 * Zones for:
 *	kaio	Per process async io info
 *	aiop	async io thread data
 *	aiocb	async io jobs
 *	aiol	list io job pointer - internal to aio_suspend XXX
 *	aiolio	list io jobs
 */
static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;

/*
 * Startup initialization
 */
static void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof(struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof(intptr_t), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure. The aioinfo limits are set
 * per-process for user limit (resource) management.
 */
static void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
		TAILQ_INIT(&ki->kaio_sockqueue);
	}

	while (num_aio_procs < target_aio_procs)
		aio_newproc();
}
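
/*
 * Job lifecycle, as implied by the transitions in the code below:
 *
 *	JOBST_NULL -> JOBST_JOBQGLOBAL	(_aio_aqueue, daemon path)
 *		   -> JOBST_JOBRUNNING	(picked up by an aiod)
 *		   -> JOBST_JOBFINISHED	(moved to kaio_jobdone)
 *
 *	JOBST_NULL -> JOBST_JOBQBUF	(aio_qphysio, direct physio path)
 *		   -> JOBST_JOBBFINISHED (aio_physwakeup/aio_fphysio)
 *
 * aio_free_entry() returns a job from any state to JOBST_NULL and recycles
 * it onto aio_freejobs.
 */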

/*
 * Free a job entry. Wait for completion if it is currently active, but don't
 * delay forever. If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
static int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	struct proc *p;
	int error;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, 0, "jobwai", 0);
	}
	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;
	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;
	}

	/* aiocbe is going away, we need to destroy any knotes */
	/* XXX lwp knote wants a thread, but only cares about the process */
	knote_empty(&aiocbe->klist);

	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(aiocbe)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		crit_enter();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		crit_exit();
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		crit_enter();
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
		crit_exit();
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	} else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		crit_enter();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		crit_exit();
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp, NULL);
			aiocbe->bp = NULL;
		}
	}
	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	aiocbe->jobstate = JOBST_NULL;
	callout_stop(&aiocbe->timeout);
	fdrop(aiocbe->fd_file);
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	return 0;
}
#endif /* VFS_AIO */
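
/*
 * Note: aio_proc_rundown() below is the external entry point, presumably
 * invoked from the process-exit path.  It must tolerate jobs in every state
 * above, which is why its scans restart whenever aio_free_entry() reports
 * that it had to sleep.
 */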

/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
#ifndef VFS_AIO
	return;
#else
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;
	struct file *fp;
	struct socket *so;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
	    ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, 0, "kaiowt", aiod_timeout))
			break;
	}

	/*
	 * Move any aio ops that are waiting on socket I/O to the normal job
	 * queues so they are cleaned up with any others.
	 */
	crit_enter();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		fp = aiocbe->fd_file;
		if (fp != NULL) {
			so = (struct socket *)fp->f_data;
			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
			if (TAILQ_EMPTY(&so->so_aiojobq)) {
				atomic_clear_int(&so->so_snd.ssb_flags,
				    SSB_AIO);
				atomic_clear_int(&so->so_rcv.ssb_flags,
				    SSB_AIO);
			}
		}
		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
	}
	crit_exit();

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

restart3:
	crit_enter();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, 0, "aioprn", 0);
		crit_exit();
		goto restart3;
	}
	crit_exit();

restart4:
	crit_enter();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			crit_exit();
			goto restart4;
		}
	}
	crit_exit();

	/*
	 * If we've slept, jobs might have moved from one queue to another.
	 * Retry rundown if we didn't manage to empty the queues.
	 */
	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
		goto restart1;

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
		    0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#ifdef DIAGNOSTIC
			kprintf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
			    "QF:%d\n", lj->lioj_buffer_count,
			    lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count,
			    lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
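/*
 * The code below implements the aiod worker pool.  The only scheduling
 * policy lives in aio_selectjob(): a FIFO scan of aio_jobs that skips any
 * job whose owning process already has kaio_maxactive_count requests in
 * flight, giving a simple per-process fairness bound.
 */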
/*
 * Select a job to run (called by an AIO daemon).
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	struct aiocblist *aiocbe;
	struct kaioinfo *ki;
	struct proc *userp;

	crit_enter();
	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
	    TAILQ_NEXT(aiocbe, list)) {
		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			crit_exit();
			return aiocbe;
		}
	}
	crit_exit();

	return NULL;
}

/*
 * The AIO processing activity. This is the code that does the I/O request for
 * the non-physio version of the operations. The normal vn operations are used,
 * and this code should work in all instances for every type of file, including
 * pipes, sockets, fifos, and regular files.
 */
static void
aio_process(struct aiocblist *aiocbe)
{
	struct thread *mytd;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	int cnt;
	int error;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	mytd = curthread;
	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = mytd;

	inblock_st = mytd->td_lwp->lwp_ru.ru_inblock;
	oublock_st = mytd->td_lwp->lwp_ru.ru_oublock;
	/*
	 * _aio_aqueue() acquires a reference to the file that is
	 * released in aio_free_entry().
	 */
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, O_FOFFSET);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, O_FOFFSET);
	}
	inblock_end = mytd->td_lwp->lwp_ru.ru_inblock;
	oublock_end = mytd->td_lwp->lwp_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if ((error) && (auio.uio_resid != cnt)) {
		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
			error = 0;
		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
			ksignal(aiocbe->userproc, SIGPIPE);
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;
}

/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc, struct trapframe *frame)
{
	struct aio_liojob *lj;
	struct aiocb *cb;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct proc *mycp, *userp;
	struct vmspace *curvm;
	struct lwp *mylwp;
	struct ucred *cr;

	/*
	 * mplock not held on entry but we aren't mpsafe yet.
	 */
	get_mplock();

	mylwp = curthread->td_lwp;
	mycp = mylwp->lwp_proc;

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info. There is one aiop structure
	 * per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;

	crit_enter();

	/*
	 * Place thread (lightweight process) onto the AIO free thread list.
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	crit_exit();

	/* Make up a name for the daemon. */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors. AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root".
	 */
	fdfree(mycp, NULL);
	cr = cratom(&mycp->p_ucred);
	cr->cr_uid = 0;
	uireplace(&cr->cr_uidinfo, uifind(0));
	cr->cr_ngroups = 1;
	cr->cr_groups[0] = 1;

	/* The daemon resides in its own pgrp. */
	enterpgrp(mycp, mycp->p_pid, 1);

	/* Mark special process type. */
	mycp->p_flag |= P_SYSTEM | P_KTHREADP;

	/*
	 * Wakeup parent process. (Parent sleeps to keep from blasting away
	 * and creating too many daemons.)
	 */
	wakeup(mycp);
	curvm = NULL;

	for (;;) {
		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			crit_enter();
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			crit_exit();
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program.
			 */
			if (curvm != userp->p_vmspace) {
				pmap_setlwpvm(mylwp, userp->p_vmspace);
				if (curvm)
					sysref_put(&curvm->vm_sysref);
				curvm = userp->p_vmspace;
				sysref_get(&curvm->vm_sysref);
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;

			/* Do the I/O function. */
			aio_process(aiocbe);

			/* Decrement the active job count. */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons.
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj)
				lj->lioj_queue_finished_count++;
			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			crit_enter();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					ksignal(userp,
					    lj->lioj_signal.sigev_signo);
					lj->lioj_flags |=
					    LIOJ_SIGNAL_POSTED;
				}
			}
			crit_exit();

			aiocbe->jobstate = JOBST_JOBFINISHED;

			crit_enter();
			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
			crit_exit();
			KNOTE(&aiocbe->klist, 0);

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				ksignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space.
		 */
		if (curvm) {
			/* swap our original address space back in */
			pmap_setlwpvm(mylwp, mycp->p_vmspace);
			sysref_put(&curvm->vm_sysref);
			curvm = NULL;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		crit_enter();
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;
		crit_exit();

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
		    0, "aiordy", aiod_lifetime)) {
			crit_enter();
			if (TAILQ_EMPTY(&aio_jobs)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					crit_exit();
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#ifdef DIAGNOSTIC
					if (mycp->p_vmspace->vm_sysref.refcnt <= 1) {
						kprintf("AIOD: bad vm refcnt for"
						    " exiting daemon: %d\n",
						    mycp->p_vmspace->vm_sysref.refcnt);
					}
#endif
					exit1(0);
				}
			}
			crit_exit();
		}
	}
}

/*
 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc(void)
{
	int error;
	struct lwp *lp, *nlp;
	struct proc *np;

	lp = &lwp0;
	error = fork1(lp, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	nlp = ONLY_LWP_IN_PROC(np);
	cpu_set_fork_handler(nlp, aio_daemon, curproc);
	start_forked_proc(lp, np);

	/*
	 * Wait until daemon is started, but continue on just in case to
	 * handle error conditions.
	 */
	error = tsleep(np, 0, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance, low-overhead physio method for eligible
 * VCHR devices. This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, _aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int notify;

	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If it's not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if (cb->aio_nbytes >
	    MAXPHYS - (((vm_offset_t)cb->aio_buf) & PAGE_MASK))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = getpbuf_kva(NULL);
	BUF_KERNPROC(bp);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_bio1.bio_caller_info1.ptr = p;
	error = 0;

	bp->b_cmd = (cb->aio_lio_opcode == LIO_WRITE) ?
	    BUF_CMD_WRITE : BUF_CMD_READ;
	bp->b_bio1.bio_done = aio_physwakeup;
	bp->b_bio1.bio_flags |= BIO_SYNC;
	bp->b_bio1.bio_offset = cb->aio_offset;

	/* Bring buffer into kernel space. */
	if (vmapbuf(bp, __DEVOLATILE(char *, cb->aio_buf), cb->aio_nbytes) < 0) {
		error = EFAULT;
		goto doerror;
	}

	crit_enter();

	aiocbe->bp = bp;
	bp->b_bio1.bio_caller_info2.ptr = aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	crit_exit();

	/*
	 * Perform the transfer. vn_strategy must be used even though we
	 * know we have a device in order to deal with requests which exceed
	 * device DMA limitations.
	 */
	vn_strategy(vp, &bp->b_bio1);

	notify = 0;
	crit_enter();

#if 0
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer. Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism. In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			notify = 1;
		}
	}
#endif
	crit_exit();
	if (notify)
		KNOTE(&aiocbe->klist, 0);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj)
		lj->lioj_buffer_count--;
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 */
static int
aio_fphysio(struct aiocblist *iocb)
{
	struct buf *bp;
	int error;

	bp = iocb->bp;

	error = biowait_timeout(&bp->b_bio1, "physstr", aiod_timeout);
	if (error == EWOULDBLOCK)
		return EINPROGRESS;

	/* Release mapping into kernel space. */
	vunmapbuf(bp);
	iocb->bp = NULL;

	error = 0;

	/* Check for an error. */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	relpbuf(bp, NULL);
	return (error);
}
#endif /* VFS_AIO */
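
/*
 * Note: aio_swake() below is the socket layer's hook; it is expected to be
 * called on a sockbuf wakeup while SSB_AIO is set (the flag is armed in
 * _aio_aqueue() when a request finds its socket not yet readable or
 * writable).
 */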

/*
 * Wake up aio requests that may be serviceable now.
 */
void
aio_swake(struct socket *so, struct signalsockbuf *ssb)
{
#ifndef VFS_AIO
	return;
#else
	struct aiocblist *cb, *cbn;
	struct proc *p;
	struct kaioinfo *ki = NULL;
	int opcode, wakecount = 0;
	struct aioproclist *aiop;

	if (ssb == &so->so_snd) {
		opcode = LIO_WRITE;
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_AIO);
	} else {
		opcode = LIO_READ;
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AIO);
	}

	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
		cbn = TAILQ_NEXT(cb, list);
		if (opcode == cb->uaiocb.aio_lio_opcode) {
			p = cb->userproc;
			ki = p->p_aioinfo;
			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
			wakecount++;
			if (cb->jobstate != JOBST_JOBQGLOBAL)
				panic("invalid queue value");
		}
	}

	while (wakecount--) {
		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			wakeup(aiop->aioproc);
		}
	}
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR
 * technique is done in this code.
 */
static int
_aio_aqueue(struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct proc *p = curproc;
	struct file *fp;
	unsigned int fd;
	struct socket *so;
	int error;
	int opcode, user_opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct kevent kev;
	struct kqueue *kq;
	struct file *kq_fp;
	int fflags;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	else
		aiocbe = zalloc(aiocb_zone);

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;
	callout_init(&aiocbe->timeout);
	SLIST_INIT(&aiocbe->klist);

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
	if (error) {
		suword(&job->_aiocb_private.error, error);
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}
	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	    !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return EINVAL;
	}

	/* Save userspace address of the job info. */
	aiocbe->uuaiocb = job;

	/* Get the opcode. */
	user_opcode = aiocbe->uaiocb.aio_lio_opcode;
	if (type != LIO_NOP)
		aiocbe->uaiocb.aio_lio_opcode = type;
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Range check file descriptor.
	 */
	fflags = (opcode == LIO_WRITE) ? FWRITE : FREAD;
	fd = aiocbe->uaiocb.aio_fildes;
	fp = holdfp(p->p_fd, fd, fflags);
	if (fp == NULL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EBADF);
		return EBADF;
	}

	aiocbe->fd_file = fp;

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		error = EINVAL;
		goto aqueue_fail;
	}
	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		error = EINVAL;
		goto aqueue_fail;
	}
	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		fdrop(fp);
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}
	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		if (type == 0)
			suword(&job->_aiocb_private.status, 0);
		error = EINVAL;
		goto aqueue_fail;
	}

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
	} else {
		/*
		 * This method for requesting kevent-based notification won't
		 * work on the alpha, since we're passing in a pointer
		 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
		 * based method instead.
		 */
		if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
		    user_opcode == LIO_WRITE)
			goto no_kqueue;

		error = copyin((struct kevent *)(uintptr_t)user_opcode,
		    &kev, sizeof(kev));
		if (error)
			goto aqueue_fail;
	}
	kq_fp = holdfp(p->p_fd, (int)kev.ident, -1);
	if (kq_fp == NULL || kq_fp->f_type != DTYPE_KQUEUE) {
		if (kq_fp) {
			fdrop(kq_fp);
			kq_fp = NULL;
		}
		error = EBADF;
		goto aqueue_fail;
	}
	kq = (struct kqueue *)kq_fp->f_data;
	kev.ident = (uintptr_t)aiocbe->uuaiocb;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
	kev.data = (intptr_t)aiocbe;
	error = kqueue_register(kq, &kev);
	fdrop(kq_fp);
aqueue_fail:
	if (error) {
		fdrop(fp);
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, error);
		goto done;
	}
no_kqueue:

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if (fp->f_type == DTYPE_SOCKET) {
		/*
		 * Alternate queueing for socket ops: Reach down into the
		 * descriptor to get the socket data. Then check to see if the
		 * socket is ready to be read or written (based on the requested
		 * operation).
		 *
		 * If it is not ready for io, then queue the aiocbe on the
		 * socket, and set the flags so we get a call when ssb_notify()
		 * happens.
		 */
		so = (struct socket *)fp->f_data;
		crit_enter();
		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
		    LIO_WRITE) && (!sowriteable(so)))) {
			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
			if (opcode == LIO_READ)
				atomic_set_int(&so->so_rcv.ssb_flags, SSB_AIO);
			else
				atomic_set_int(&so->so_snd.ssb_flags, SSB_AIO);
			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
			ki->kaio_queue_count++;
			num_queue_count++;
			crit_exit();
			error = 0;
			goto done;
		}
		crit_exit();
	}

	if ((error = aio_qphysio(p, aiocbe)) == 0)
		goto done;
	if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		goto done;
	}

	/* No buffer for daemon I/O. */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj)
		lj->lioj_queue_count++;
	crit_enter();
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	crit_exit();
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota, then
	 * start one. Otherwise, depend on the subsequent I/O completions to
	 * pick up this job. If we don't successfully create the new process
	 * (thread) due to resource issues, we return an error for now (EAGAIN),
	 * which is likely not the correct thing to do.
	 */
	crit_enter();
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	crit_exit();
done:
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct aiocb *job, int type)
{
	struct proc *p = curproc;
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(job, NULL, type);
}
#endif /* VFS_AIO */
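
/*
 * Dispatch summary for _aio_aqueue() above:
 *
 * 1. Socket not yet ready: park the job on so_aiojobq and kaio_sockqueue
 *    and arm SSB_AIO; aio_swake() requeues it when the socket wakes up.
 * 2. Eligible disk device: hand the request directly to the device via
 *    aio_qphysio(), bypassing the daemons entirely.
 * 3. Everything else: queue to aio_jobs for the aiod pool, growing the
 *    pool if no daemon is free and the quotas allow it.
 */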

/*
 * Support the aio_return system call. As a side effect, kernel
 * resources are released.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_return(struct aio_return_args *uap)
{
#ifndef VFS_AIO
	return (ENOSYS);
#else
	struct proc *p = curproc;
	struct lwp *lp = curthread->td_lwp;
	long jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;
	int error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	get_mplock();
	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			if (ujob == cb->uuaiocb) {
				uap->sysmsg_result =
				    cb->uaiocb._aiocb_private.status;
			} else {
				uap->sysmsg_result = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				lp->lwp_ru.ru_oublock += cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				lp->lwp_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			error = 0;
			goto done;
		}
	}
	crit_enter();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
		    == jobref) {
			crit_exit();
			if (ujob == cb->uuaiocb) {
				uap->sysmsg_result =
				    cb->uaiocb._aiocb_private.status;
			} else {
				uap->sysmsg_result = EFAULT;
			}
			aio_free_entry(cb);
			error = 0;
			goto done;
		}
	}
	crit_exit();
	error = EINVAL;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * Allow a process to wakeup when any of the I/O requests are completed.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_suspend(struct aio_suspend_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, timo;
	long *ijoblist;
	struct aiocb **ujoblist;

	if ((u_int)uap->nent > AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/* Get timespec struct. */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
			return error;

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz_high(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	get_mplock();

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
		if (cbp == NULL)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}

	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		error = 0;
		goto done;
	}

	error = 0;
	for (;;) {
		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)
				    cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					goto done;
				}
			}
		}

		crit_enter();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
		    TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((intptr_t)
				    cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					crit_exit();
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					goto done;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PCATCH, "aiospn", timo);
		crit_exit();

		if (error == ERESTART || error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			error = EINTR;
			goto done;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			error = EAGAIN;
			goto done;
		}
	}

	/* NOTREACHED */
	error = EINVAL;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}
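
/*
 * Illustrative userland usage (not part of this file; all names are
 * placeholders): a caller typically issues requests and then blocks in
 * aio_suspend() until at least one of them completes:
 *
 *	struct aiocb *list[1] = { &iocb };
 *	struct timespec ts = { 5, 0 };		// give up after 5 seconds
 *	ssize_t n;
 *
 *	if (aio_suspend((const struct aiocb *const *)list, 1, &ts) == 0)
 *		n = aio_return(&iocb);		// reap the completed request
 */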

/*
 * aio_cancel cancels any non-physio aio operations not currently in
 * progress.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_cancel(struct aio_cancel_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct kaioinfo *ki;
	struct aiocblist *cbe, *cbn;
	struct file *fp;
	struct socket *so;
	struct proc *po;
	int error;
	int cancelled = 0;
	int notcancelled = 0;
	struct vnode *vp;

	fp = holdfp(p->p_fd, uap->fd, -1);
	if (fp == NULL)
		return (EBADF);

	get_mplock();

	if (fp->f_type == DTYPE_VNODE) {
		vp = (struct vnode *)fp->f_data;

		if (vn_isdisk(vp, &error)) {
			uap->sysmsg_result = AIO_NOTCANCELED;
			error = 0;
			goto done2;
		}
	} else if (fp->f_type == DTYPE_SOCKET) {
		so = (struct socket *)fp->f_data;

		crit_enter();

		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
			cbn = TAILQ_NEXT(cbe, list);
			if ((uap->aiocbp == NULL) ||
			    (uap->aiocbp == cbe->uuaiocb)) {
				po = cbe->userproc;
				ki = po->p_aioinfo;
				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
				if (ki->kaio_flags & KAIO_WAKEUP) {
					wakeup(po);
				}
				cbe->jobstate = JOBST_JOBFINISHED;
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				cancelled++;
				/* XXX cancelled, knote? */
				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
				    SIGEV_SIGNAL)
					ksignal(cbe->userproc,
					    cbe->uaiocb.aio_sigevent.sigev_signo);
				if (uap->aiocbp)
					break;
			}
		}
		crit_exit();

		if ((cancelled) && (uap->aiocbp)) {
			uap->sysmsg_result = AIO_CANCELED;
			error = 0;
			goto done2;
		}
	}
	ki = p->p_aioinfo;
	if (ki == NULL)
		goto done;
	crit_enter();

	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
		cbn = TAILQ_NEXT(cbe, plist);

		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
		    ((uap->aiocbp == NULL) ||
		    (uap->aiocbp == cbe->uuaiocb))) {
			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
				TAILQ_REMOVE(&aio_jobs, cbe, list);
				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
				    plist);
				cancelled++;
				ki->kaio_queue_finished_count++;
				cbe->jobstate = JOBST_JOBFINISHED;
				cbe->uaiocb._aiocb_private.status = -1;
				cbe->uaiocb._aiocb_private.error = ECANCELED;
				/* XXX cancelled, knote? */
				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
				    SIGEV_SIGNAL)
					ksignal(cbe->userproc,
					    cbe->uaiocb.aio_sigevent.sigev_signo);
			} else {
				notcancelled++;
			}
		}
	}
	crit_exit();
done:
	if (notcancelled)
		uap->sysmsg_result = AIO_NOTCANCELED;
	else if (cancelled)
		uap->sysmsg_result = AIO_CANCELED;
	else
		uap->sysmsg_result = AIO_ALLDONE;
	error = 0;
done2:
	rel_mplock();
	fdrop(fp);
	return error;
#endif /* VFS_AIO */
}
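
/*
 * Result-code summary for sys_aio_cancel() above: AIO_CANCELED is returned
 * if at least one matching request was dequeued, AIO_NOTCANCELED if a
 * matching request was already running (or targets a raw disk, which this
 * implementation never cancels), and AIO_ALLDONE when nothing matching was
 * left to cancel.
 */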

/*
 * aio_error is implemented at the kernel level for compatibility purposes
 * only. For a user mode async implementation, it would be best to do it in
 * a userland subroutine.
 *
 * MPALMOSTSAFE
 */
int
sys_aio_error(struct aio_error_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	long jobref;
	int error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	get_mplock();
	error = 0;

	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = cb->uaiocb._aiocb_private.error;
			goto done;
		}
	}

	crit_enter();

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = EINPROGRESS;
			crit_exit();
			goto done;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = EINPROGRESS;
			crit_exit();
			goto done;
		}
	}
	crit_exit();

	crit_enter();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = cb->uaiocb._aiocb_private.error;
			crit_exit();
			goto done;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			uap->sysmsg_result = EINPROGRESS;
			crit_exit();
			goto done;
		}
	}
	crit_exit();
	error = EINVAL;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * syscall - asynchronous read from a file (REALTIME)
 *
 * MPALMOSTSAFE
 */
int
sys_aio_read(struct aio_read_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int error;

	get_mplock();
	error = aio_aqueue(uap->aiocbp, LIO_READ);
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

/*
 * syscall - asynchronous write to a file (REALTIME)
 *
 * MPALMOSTSAFE
 */
int
sys_aio_write(struct aio_write_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int error;

	get_mplock();
	error = aio_aqueue(uap->aiocbp, LIO_WRITE);
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}
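
/*
 * Illustrative userland usage of the wrappers above (not part of this
 * file; all names are placeholders):
 *
 *	struct aiocb iocb;
 *	ssize_t n;
 *
 *	bzero(&iocb, sizeof(iocb));
 *	iocb.aio_fildes = fd;
 *	iocb.aio_buf = buf;
 *	iocb.aio_nbytes = sizeof(buf);
 *	iocb.aio_offset = 0;
 *
 *	if (aio_read(&iocb) == 0) {
 *		while (aio_error(&iocb) == EINPROGRESS)
 *			;			// polling; aio_suspend() is kinder
 *		n = aio_return(&iocb);		// bytes transferred, or -1
 *	}
 */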

/*
 * syscall - XXX undocumented
 *
 * MPALMOSTSAFE
 */
int
sys_lio_listio(struct lio_listio_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct lwp *lp = curthread->td_lwp;
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return EINVAL;

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX)
		return EINVAL;

	get_mplock();

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	if ((nent + num_queue_count) > max_queue_count) {
		error = EAGAIN;
		goto done;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		error = EAGAIN;
		goto done;
	}

	lj = zalloc(aiolio_zone);
	if (lj == NULL) {
		error = EAGAIN;
		goto done;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;

	/*
	 * Setup signal.
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal,
		    sizeof(lj->lioj_signal));
		if (error) {
			zfree(aiolio_zone, lj);
			goto done;
		}
		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
			zfree(aiolio_zone, lj);
			error = EINVAL;
			goto done;
		}
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else {
		lj->lioj_flags &= ~LIOJ_SIGNAL;
	}

	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Get pointers to the list of I/O requests.
	 */
	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
			error = _aio_aqueue(iocb, lj, 0);
			if (error == 0)
				nentqueued++;
			else
				nerror++;
		}
	}

	/*
	 * If we haven't queued any, then just return error.
	 */
	if (nentqueued == 0) {
		error = 0;
		goto done;
	}

	/*
	 * Calculate the appropriate error return.
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		int command, found, jobref;

		for (;;) {
			found = 0;
			for (i = 0; i < uap->nent; i++) {
				/*
				 * Fetch address of the control buf pointer in
				 * user space.
				 */
				iocb = (struct aiocb *)
				    (intptr_t)fuword(&cbptr[i]);
				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
				    == 0))
					continue;

				/*
				 * Fetch the associated command from user space.
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						if (cb->uaiocb.aio_lio_opcode
						    == LIO_WRITE) {
							lp->lwp_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode
						    == LIO_READ) {
							lp->lwp_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				crit_enter();
				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						found++;
						break;
					}
				}
				crit_exit();
			}

			/*
			 * If all I/Os have been disposed of, then we can
			 * return.
			 */
			if (found == nentqueued) {
				error = runningcode;
				goto done;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PCATCH, "aiospn", 0);

			if (error == EINTR) {
				goto done;
			} else if (error == EWOULDBLOCK) {
				error = EAGAIN;
				goto done;
			}
		}
	}

	error = runningcode;
done:
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * This is a weird hack so that we can post a signal. It is safe to do so from
 * a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *aioj)
{
	struct aiocblist *aiocbe = aioj;
	struct aio_liojob *lj = aiocbe->lio;
	struct aiocb *cb = &aiocbe->uaiocb;

	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
	    (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
		ksignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
	}

	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
		ksignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
}

/*
 * Interrupt handler for physio, performs the necessary process wakeups, and
 * signals.
 */
static void
aio_physwakeup(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;

	aiocbe = bio->bio_caller_info2.ptr;
	get_mplock();

	if (aiocbe) {
		p = bio->bio_caller_info1.ptr;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR)
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;

			/*
			 * wakeup/signal if all of the interrupt jobs are done.
			 */
			if (lj->lioj_buffer_finished_count ==
			    lj->lioj_buffer_count) {
				/*
				 * Post a signal if it is called for.
				 */
				if ((lj->lioj_flags &
				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					callout_reset(&aiocbe->timeout, 0,
					    process_signal, aiocbe);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);

			KNOTE(&aiocbe->klist, 0);
			/* Do the wakeup. */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}

		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
			callout_reset(&aiocbe->timeout, 0,
			    process_signal, aiocbe);
		}
	}
	biodone_sync(bio);
	rel_mplock();
}
#endif /* VFS_AIO */

/*
 * syscall - wait for the next completion of an aio request
 *
 * MPALMOSTSAFE
 */
int
sys_aio_waitcomplete(struct aio_waitcomplete_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct proc *p = curproc;
	struct lwp *lp = curthread->td_lwp;
	struct timeval atv;
	struct timespec ts;
	struct kaioinfo *ki;
	struct aiocblist *cb = NULL;
	int error, timo;

	suword(uap->aiocbp, 0);

	timo = 0;
	if (uap->timeout) {
		/* Get timespec struct. */
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return error;

		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz_high(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	get_mplock();

	for (;;) {
		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != NULL) {
			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
			uap->sysmsg_result = cb->uaiocb._aiocb_private.status;
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				lp->lwp_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				lp->lwp_ru.ru_inblock += cb->inputcharge;
				cb->inputcharge = 0;
			}
			error = cb->uaiocb._aiocb_private.error;
			aio_free_entry(cb);
			break;
		}

		crit_enter();
		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != NULL) {
			crit_exit();
			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
			uap->sysmsg_result = cb->uaiocb._aiocb_private.status;
			error = cb->uaiocb._aiocb_private.error;
			aio_free_entry(cb);
			break;
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PCATCH, "aiowc", timo);
		crit_exit();

		if (error == ERESTART) {
			error = EINTR;
			break;
		}
		if (error < 0)
			break;
		if (error == EINTR)
			break;
		if (error == EWOULDBLOCK) {
			error = EAGAIN;
			break;
		}
	}
	rel_mplock();
	return (error);
#endif /* VFS_AIO */
}
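
/*
 * Illustrative userland usage (not part of this file): aio_waitcomplete()
 * reaps whichever queued request finishes next, so a submit-many/reap-any
 * loop needs no bookkeeping of its own:
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);	// NULL timeout: wait forever
 *
 * On return n is the transfer count and "done" points at the completed
 * control block; names are placeholders.
 */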

#ifndef VFS_AIO
static int
filt_aioattach(struct knote *kn)
{
	return (ENXIO);
}

struct filterops aio_filtops =
	{ 0, filt_aioattach, NULL, NULL };

#else
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;

	/*
	 * The aiocbe pointer must be validated before using it, so
	 * registration is restricted to the kernel; the user cannot
	 * set EV_FLAG1.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);
	kn->kn_flags &= ~EV_FLAG1;

	knote_insert(&aiocbe->klist, kn);

	return (0);
}

/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;

	knote_remove(&aiocbe->klist, kn);
}

/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;

	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
	    aiocbe->jobstate != JOBST_JOBBFINISHED)
		return (0);
	kn->kn_flags |= EV_EOF;
	return (1);
}

struct filterops aio_filtops =
	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
#endif /* VFS_AIO */
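
/*
 * Illustrative userland usage (not part of this file): the filterops above
 * back EVFILT_AIO.  A request can ask for kqueue delivery through its
 * sigevent, roughly:
 *
 *	iocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	iocb.aio_sigevent.sigev_notify_kqueue = kq;	// from kqueue()
 *	iocb.aio_sigevent.sigev_value.sival_ptr = &iocb;
 *	aio_read(&iocb);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	// ev.udata == &iocb when done
 *
 * Names are placeholders; the kernel side of this handshake is the
 * SIGEV_KEVENT branch in _aio_aqueue().
 */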