/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_subr.c      8.3 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/kern_subr.c,v 1.31.2.2 2002/04/21 08:09:37 bde Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/thread2.h>
#include <machine/limits.h>

#include <cpu/lwbuf.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
        "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

int
copyin_nofault(const void *udaddr, void *kaddr, size_t len)
{
        thread_t td = curthread;
        int error;

        atomic_set_int(&td->td_flags, TDF_NOFAULT);
        error = copyin(udaddr, kaddr, len);
        atomic_clear_int(&td->td_flags, TDF_NOFAULT);
        return error;
}

int
copyout_nofault(const void *kaddr, void *udaddr, size_t len)
{
        thread_t td = curthread;
        int error;

        atomic_set_int(&td->td_flags, TDF_NOFAULT);
        error = copyout(kaddr, udaddr, len);
        atomic_clear_int(&td->td_flags, TDF_NOFAULT);
        return error;
}
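/*
 * Usage note (illustrative sketch, not part of the original sources):
 * the *_nofault variants let a caller that cannot afford to take a VM
 * fault (for example while holding a spinlock or a busied page) probe
 * user memory and fall back to a slower path on failure.  The function
 * and variable names below are hypothetical:
 *
 *      int
 *      fetch_user_int_nofault(const void *uptr, int *valp)
 *      {
 *              int v;
 *
 *              if (copyin_nofault(uptr, &v, sizeof(v)) != 0)
 *                      return (EFAULT);        caller retries via slow path
 *              *valp = v;
 *              return (0);
 *      }
 */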
/*
 * UIO_READ:  copy the kernelspace cp to the user or kernelspace UIO
 * UIO_WRITE: copy the user or kernelspace UIO to the kernelspace cp
 *
 * For userspace UIO's, uio_td must be the current thread.
 *
 * The syscall interface is responsible for limiting the length to
 * ssize_t for things like read() or write() which return the bytes
 * read or written as ssize_t.  These functions work with unsigned
 * lengths.
 */
int
uiomove(caddr_t cp, size_t n, struct uio *uio)
{
        thread_t td = curthread;
        struct iovec *iov;
        size_t cnt;
        size_t tot;
        int error = 0;
        int save = 0;

        KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
                ("uiomove: mode"));
        KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
                ("uiomove proc"));

        crit_enter();
        save = td->td_flags & TDF_DEADLKTREAT;
        td->td_flags |= TDF_DEADLKTREAT;
        crit_exit();

        tot = 0;

        while (n > 0 && uio->uio_resid) {
                iov = uio->uio_iov;
                cnt = iov->iov_len;
                if (cnt == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }
                if (cnt > n)
                        cnt = n;
                tot += cnt;

                switch (uio->uio_segflg) {
                case UIO_USERSPACE:
                        if (tot > 1024*1024)
                                lwkt_user_yield();
                        if (uio->uio_rw == UIO_READ)
                                error = copyout(cp, iov->iov_base, cnt);
                        else
                                error = copyin(iov->iov_base, cp, cnt);
                        break;
                case UIO_SYSSPACE:
                        if (uio->uio_rw == UIO_READ)
                                bcopy(cp, iov->iov_base, cnt);
                        else
                                bcopy(iov->iov_base, cp, cnt);
                        break;
                case UIO_NOCOPY:
                        break;
                }

                if (error)
                        break;
                iov->iov_base = (char *)iov->iov_base + cnt;
                iov->iov_len -= cnt;
                uio->uio_resid -= cnt;
                uio->uio_offset += cnt;
                cp += cnt;
                n -= cnt;
        }
        crit_enter();
        td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
        crit_exit();

        return (error);
}

/*
 * This is the same as uiomove() except (cp, n) is within the bounds of
 * the passed, locked buffer.  Under certain circumstances a VM fault
 * occurring with a locked buffer held can result in a deadlock or an
 * attempt to recursively lock the buffer.
 *
 * This procedure deals with these cases.
 *
 * If the buffer represents a regular file, is B_CACHE, but the last VM page
 * is not fully valid we fix-up the last VM page.  This should handle the
 * recursive lock issue.
 *
 * Deadlocks are another issue.  We are holding the vp and the bp locked
 * and could deadlock against a different vp and/or bp if another thread is
 * trying to access us while we are accessing it.  The only solution here is
 * to release the bp and vnode lock and do the uio to/from a system buffer,
 * then regain the locks and copyback (if applicable).  XXX TODO.
 */
int
uiomovebp(struct buf *bp, caddr_t cp, size_t n, struct uio *uio)
{
        int count;
        vm_page_t m;

        if (bp->b_vp && bp->b_vp->v_type == VREG &&
            (bp->b_flags & B_CACHE) &&
            (count = bp->b_xio.xio_npages) != 0 &&
            (m = bp->b_xio.xio_pages[count-1])->valid != VM_PAGE_BITS_ALL) {
                vm_page_zero_invalid(m, TRUE);
        }
        return (uiomove(cp, n, uio));
}
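/*
 * Usage note (illustrative sketch, not part of the original sources): a
 * typical consumer copies between a kernel buffer and the caller's uio
 * with uiomove(), letting uio_offset and uio_resid track progress.  The
 * helper and buffer names below are hypothetical:
 *
 *      static char mydev_buf[4096];                    hypothetical buffer
 *
 *      static int
 *      mydev_copyout(struct uio *uio)                  hypothetical helper
 *      {
 *              size_t off = (size_t)uio->uio_offset;
 *              size_t len;
 *
 *              if (off >= sizeof(mydev_buf))
 *                      return (0);                     EOF
 *              len = sizeof(mydev_buf) - off;
 *              if (len > uio->uio_resid)
 *                      len = uio->uio_resid;
 *              return (uiomove(mydev_buf + off, len, uio));
 *      }
 *
 * The same pattern expressed through uiomove_frombuf() further below avoids
 * the manual offset and length clipping.
 */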
/*
 * uiomove() but fail for non-trivial VM faults, even if the VM fault is
 * valid.  Returns EFAULT if a VM fault occurred via the copyin/copyout
 * onfault code.
 *
 * This allows callers to hold e.g. a busy VM page, or a busy VM object,
 * or a locked vnode through the call and then fall-back to safer code
 * if we fail.
 */
int
uiomove_nofault(caddr_t cp, size_t n, struct uio *uio)
{
        thread_t td = curthread;
        int error;

        atomic_set_int(&td->td_flags, TDF_NOFAULT);
        error = uiomove(cp, n, uio);
        atomic_clear_int(&td->td_flags, TDF_NOFAULT);
        return error;
}

/*
 * Like uiomove() but copies zero-fill.  Only allowed for UIO_READ,
 * for obvious reasons.
 */
int
uiomovez(size_t n, struct uio *uio)
{
        struct iovec *iov;
        size_t cnt;
        int error = 0;

        KASSERT(uio->uio_rw == UIO_READ, ("uiomovez: mode"));
        KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
                ("uiomove proc"));

        while (n > 0 && uio->uio_resid) {
                iov = uio->uio_iov;
                cnt = iov->iov_len;
                if (cnt == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }
                if (cnt > n)
                        cnt = n;

                switch (uio->uio_segflg) {
                case UIO_USERSPACE:
                        error = copyout(ZeroPage, iov->iov_base, cnt);
                        break;
                case UIO_SYSSPACE:
                        bzero(iov->iov_base, cnt);
                        break;
                case UIO_NOCOPY:
                        break;
                }

                if (error)
                        break;
                iov->iov_base = (char *)iov->iov_base + cnt;
                iov->iov_len -= cnt;
                uio->uio_resid -= cnt;
                uio->uio_offset += cnt;
                n -= cnt;
        }
        return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  This function automatically indexes the buffer by
 * uio_offset and handles all range checking.
 */
int
uiomove_frombuf(void *buf, size_t buflen, struct uio *uio)
{
        size_t offset;

        offset = (size_t)uio->uio_offset;
        if ((off_t)offset != uio->uio_offset)
                return (EINVAL);
        if (buflen == 0 || offset >= buflen)
                return (0);
        return (uiomove((char *)buf + offset, buflen - offset, uio));
}

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
        struct iovec *iov;
        char *iov_base;

again:
        if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
                panic("ureadc");
        iov = uio->uio_iov;
        if (iov->iov_len == 0) {
                uio->uio_iovcnt--;
                uio->uio_iov++;
                goto again;
        }

        switch (uio->uio_segflg) {
        case UIO_USERSPACE:
                if (subyte(iov->iov_base, c) < 0)
                        return (EFAULT);
                break;
        case UIO_SYSSPACE:
                iov_base = iov->iov_base;
                *iov_base = c;
                iov->iov_base = iov_base;
                break;
        case UIO_NOCOPY:
                break;
        }

        iov->iov_base = (char *)iov->iov_base + 1;
        iov->iov_len--;
        uio->uio_resid--;
        uio->uio_offset++;
        return (0);
}
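/*
 * Usage note (illustrative sketch, not part of the original sources):
 * ureadc() hands a single byte to the reader and advances the uio the
 * same way uiomove() would.  A driver draining a small software FIFO one
 * byte at a time might use it as follows; the FIFO helpers are
 * hypothetical:
 *
 *      while (uio->uio_resid > 0 && !myfifo_empty(sc)) {
 *              error = ureadc(myfifo_getc(sc), uio);
 *              if (error)
 *                      break;
 *      }
 */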
/*
 * General routine to allocate a hash table.  Make the hash table size a
 * power of 2 greater or equal to the number of elements requested, and
 * store the masking value in *hashmask.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{
        long hashsize;
        LIST_HEAD(generic, generic) *hashtbl;
        int i;

        if (elements <= 0)
                panic("hashinit: bad elements");
        for (hashsize = 2; hashsize < elements; hashsize <<= 1)
                continue;
        hashtbl = kmalloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
        for (i = 0; i < hashsize; i++)
                LIST_INIT(&hashtbl[i]);
        *hashmask = hashsize - 1;
        return (hashtbl);
}

void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
        LIST_HEAD(generic, generic) *hashtbl, *hp;

        hashtbl = vhashtbl;
        for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
                KASSERT(LIST_EMPTY(hp), ("%s: hash not empty", __func__));
        kfree(hashtbl, type);
}

/*
 * This is a newer version which allocates a hash table of structures.
 *
 * The returned array will be zero'd.  The caller is responsible for
 * initializing the structures.
 */
void *
hashinit_ext(int elements, size_t size, struct malloc_type *type,
             u_long *hashmask)
{
        long hashsize;
        void *hashtbl;

        if (elements <= 0)
                panic("hashinit: bad elements");
        for (hashsize = 2; hashsize < elements; hashsize <<= 1)
                continue;
        hashtbl = kmalloc((size_t)hashsize * size, type, M_WAITOK | M_ZERO);
        *hashmask = hashsize - 1;
        return (hashtbl);
}

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
                        2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
                        7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define NPRIMES NELEM(primes)

/*
 * General routine to allocate a prime number sized hash table.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
        long hashsize;
        LIST_HEAD(generic, generic) *hashtbl;
        int i;

        if (elements <= 0)
                panic("phashinit: bad elements");
        for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
                i++;
                if (i == NPRIMES)
                        break;
                hashsize = primes[i];
        }
        hashsize = primes[i - 1];
        hashtbl = kmalloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
        for (i = 0; i < hashsize; i++)
                LIST_INIT(&hashtbl[i]);
        *nentries = hashsize;
        return (hashtbl);
}

/*
 * This is a newer version which allocates a hash table of structures
 * in a prime-number size.
 *
 * The returned array will be zero'd.  The caller is responsible for
 * initializing the structures.
 */
void *
phashinit_ext(int elements, size_t size, struct malloc_type *type,
              u_long *nentries)
{
        long hashsize;
        void *hashtbl;
        int i;

        if (elements <= 0)
                panic("phashinit: bad elements");
        for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
                i++;
                if (i == NPRIMES)
                        break;
                hashsize = primes[i];
        }
        hashsize = primes[i - 1];
        hashtbl = kmalloc((size_t)hashsize * size, type, M_WAITOK | M_ZERO);
        *nentries = hashsize;
        return (hashtbl);
}
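/*
 * Usage note (illustrative sketch, not part of the original sources):
 * hashinit() returns an array of empty LIST_HEADs plus a power-of-2 mask
 * for indexing it; hashdestroy() expects every bucket to be empty again.
 * The structure, field, and malloc type names below are hypothetical:
 *
 *      static LIST_HEAD(mybucket, myentry) *mytbl;
 *      static u_long mymask;
 *
 *      mytbl = hashinit(expected_elements, M_TEMP, &mymask);
 *      ...
 *      LIST_INSERT_HEAD(&mytbl[key & mymask], entry, my_link);
 *      ...
 *      hashdestroy(mytbl, M_TEMP, mymask);     all buckets must be empty
 *
 * phashinit() works the same way but returns the (prime) bucket count in
 * *nentries instead of a mask, so buckets are indexed with key % nentries.
 */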
/*
 * Copyin an iovec.  If the iovec array fits, use the preallocated small
 * iovec structure.  If it is too big, dynamically allocate an iovec array
 * of sufficient size.
 *
 * MPSAFE
 */
int
iovec_copyin(struct iovec *uiov, struct iovec **kiov, struct iovec *siov,
             size_t iov_cnt, size_t *iov_len)
{
        struct iovec *iovp;
        int error, i;
        size_t len;

        if (iov_cnt > UIO_MAXIOV)
                return EMSGSIZE;
        if (iov_cnt > UIO_SMALLIOV) {
                *kiov = kmalloc(sizeof(struct iovec) * iov_cnt, M_IOV,
                                M_WAITOK);
        } else {
                *kiov = siov;
        }
        error = copyin(uiov, *kiov, iov_cnt * sizeof(struct iovec));
        if (error == 0) {
                *iov_len = 0;
                for (i = 0, iovp = *kiov; i < iov_cnt; i++, iovp++) {
                        /*
                         * Check for both *iov_len overflows and out of
                         * range iovp->iov_len's.  We limit to the
                         * capabilities of signed integers.
                         *
                         * GCC4 - overflow check opt requires assign/test.
                         */
                        len = *iov_len + iovp->iov_len;
                        if (len < *iov_len)
                                error = EINVAL;
                        *iov_len = len;
                }
        }

        /*
         * From userland disallow iovec's which exceed the ssize_t size
         * limit as the system calls return ssize_t.
         *
         * NOTE: Internal kernel interfaces can handle the unsigned
         *       limit.
         */
        if (error == 0 && (ssize_t)*iov_len < 0)
                error = EINVAL;

        if (error)
                iovec_free(kiov, siov);
        return (error);
}
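/*
 * Usage note (illustrative sketch, not part of the original sources): a
 * readv()/writev()-style path copies the user's iovec array in through
 * iovec_copyin(), builds a struct uio around it, and releases it with
 * iovec_free() when done.  The syscall argument names are hypothetical:
 *
 *      struct iovec aiov[UIO_SMALLIOV];
 *      struct iovec *iov;
 *      struct uio auio;
 *      int error;
 *
 *      error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
 *                           &auio.uio_resid);
 *      if (error)
 *              return (error);
 *      auio.uio_iov = iov;
 *      auio.uio_iovcnt = uap->iovcnt;
 *      ...
 *      iovec_free(&iov, aiov);
 */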
/*
 * Copyright (c) 2004 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 1982, 1986, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_subr.c      8.3 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/i386/i386/uio_machdep.c,v 1.1 2004/03/21 20:28:36 alc Exp $
 */

/*
 * Implement uiomove(9) from physical memory using lwbuf's to reduce
 * the creation and destruction of ephemeral mappings.
 */
int
uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio)
{
        struct lwbuf lwb_cache;
        struct lwbuf *lwb;
        struct thread *td = curthread;
        struct iovec *iov;
        void *cp;
        vm_offset_t page_offset;
        vm_page_t m;
        size_t cnt;
        int error = 0;
        int save = 0;

        KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
                ("uiomove_fromphys: mode"));
        KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
                ("uiomove_fromphys proc"));

        crit_enter();
        save = td->td_flags & TDF_DEADLKTREAT;
        td->td_flags |= TDF_DEADLKTREAT;
        crit_exit();

        while (n > 0 && uio->uio_resid) {
                iov = uio->uio_iov;
                cnt = iov->iov_len;
                if (cnt == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }
                if (cnt > n)
                        cnt = n;
                page_offset = offset & PAGE_MASK;
                cnt = min(cnt, PAGE_SIZE - page_offset);
                m = ma[offset >> PAGE_SHIFT];
                lwb = lwbuf_alloc(m, &lwb_cache);
                cp = (char *)lwbuf_kva(lwb) + page_offset;

                switch (uio->uio_segflg) {
                case UIO_USERSPACE:
                        /*
                         * note: removed uioyield (it was the wrong place to
                         * put it).
                         */
                        if (uio->uio_rw == UIO_READ)
                                error = copyout(cp, iov->iov_base, cnt);
                        else
                                error = copyin(iov->iov_base, cp, cnt);
                        if (error) {
                                lwbuf_free(lwb);
                                goto out;
                        }
                        break;
                case UIO_SYSSPACE:
                        if (uio->uio_rw == UIO_READ)
                                bcopy(cp, iov->iov_base, cnt);
                        else
                                bcopy(iov->iov_base, cp, cnt);
                        break;
                case UIO_NOCOPY:
                        break;
                }
                lwbuf_free(lwb);
                iov->iov_base = (char *)iov->iov_base + cnt;
                iov->iov_len -= cnt;
                uio->uio_resid -= cnt;
                uio->uio_offset += cnt;
                offset += cnt;
                n -= cnt;
        }
out:
        if (save == 0) {
                crit_enter();
                td->td_flags &= ~TDF_DEADLKTREAT;
                crit_exit();
        }
        return (error);
}
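/*
 * Usage note (illustrative sketch, not part of the original sources):
 * callers that already hold an array of wired or busied VM pages (for
 * example a pager or a /dev/mem-style driver) can move data to or from
 * those pages without establishing a permanent kernel mapping:
 *
 *      error = uiomove_fromphys(pages, offset_within_run, nbytes, uio);
 *
 * where offset_within_run selects both the starting page
 * (offset >> PAGE_SHIFT) and the offset within it (offset & PAGE_MASK).
 */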