1 /* 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * @(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/kern_subr.c,v 1.31.2.2 2002/04/21 08:09:37 bde Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/thread2.h>
#include <machine/limits.h>

#include <cpu/lwbuf.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>

/*
 * Export the compile-time iovec limit as the read-only kern.iov_max
 * sysctl, backing userland's sysconf(_SC_IOV_MAX).
 */
SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

/*
 * UIO_READ:  copy the kernelspace cp to the user or kernelspace UIO
 * UIO_WRITE: copy the user or kernelspace UIO to the kernelspace cp
 *
 * For userspace UIO's, uio_td must be the current thread.
 *
 * The syscall interface is responsible for limiting the length to
 * ssize_t for things like read() or write() which return the bytes
 * read or written as ssize_t.  These functions work with unsigned
 * lengths.
 */
int
uiomove(caddr_t cp, size_t n, struct uio *uio)
{
	thread_t td = curthread;
	struct iovec *iov;
	size_t cnt;
	size_t tot;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
	    ("uiomove proc"));

	/*
	 * Set TDF_DEADLKTREAT for the duration of the copy, remembering
	 * whether the caller already had it set so it can be restored
	 * below.  NOTE(review): TDF_DEADLKTREAT presumably alters VM
	 * fault handling during copyin/copyout — confirm against the
	 * thread flag documentation.
	 */
	crit_enter();
	save = td->td_flags & TDF_DEADLKTREAT;
	td->td_flags |= TDF_DEADLKTREAT;
	crit_exit();

	tot = 0;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;
		tot += cnt;

		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			/*
			 * Yield periodically once more than 1MB has been
			 * copied so long user copies do not hog the cpu.
			 */
			if (tot > 1024*1024)
				lwkt_user_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			break;
		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			/* Advance offsets without moving any data. */
			break;
		}

		if (error)
			break;
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}

	/* Restore the caller's original TDF_DEADLKTREAT state. */
	crit_enter();
	td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
	crit_exit();

	return (error);
}

/*
 * This is the same as uiomove() except (cp, n) is within the bounds of
 * the passed, locked buffer.  Under certain circumstances a VM fault
 * occuring with a locked buffer held can result in a deadlock or an
 * attempt to recursively lock the buffer.
 *
 * This procedure deals with these cases.
 *
 * If the buffer represents a regular file, is B_CACHE, but the last VM page
 * is not fully valid we fix-up the last VM page.  This should handle the
 * recursive lock issue.
 *
 * Deadlocks are another issue.  We are holding the vp and the bp locked
 * and could deadlock against a different vp and/or bp if another thread is
 * trying to access us while we accessing it.  The only solution here is
 * to release the bp and vnode lock and do the uio to/from a system buffer,
 * then regain the locks and copyback (if applicable).  XXX TODO.
 */
int
uiomovebp(struct buf *bp, caddr_t cp, size_t n, struct uio *uio)
{
	int count;
	vm_page_t m;

	/*
	 * For a cached regular-file buffer whose last VM page is only
	 * partially valid, zero-fill the invalid portions up front so the
	 * copy below cannot recursively fault on our own locked buffer.
	 */
	if (bp->b_vp && bp->b_vp->v_type == VREG &&
	    (bp->b_flags & B_CACHE) &&
	    (count = bp->b_xio.xio_npages) != 0 &&
	    (m = bp->b_xio.xio_pages[count-1])->valid != VM_PAGE_BITS_ALL) {
		vm_page_zero_invalid(m, TRUE);
	}
	return (uiomove(cp, n, uio));
}

/*
 * uiomove() but fail for non-trivial VM faults, even if the VM fault is
 * valid.  Returns EFAULT if a VM fault occurred via the copyin/copyout
 * onfault code.
 *
 * This allows callers to hold e.g. a busy VM page, or a busy VM object,
 * or a locked vnode through the call and then fall-back to safer code
 * if we fail.
 */
int
uiomove_nofault(caddr_t cp, size_t n, struct uio *uio)
{
	thread_t td = curthread;
	int error;

	/*
	 * TDF_NOFAULT causes the copyin/copyout onfault code to return
	 * EFAULT rather than taking a non-trivial VM fault (see the
	 * function comment above).
	 */
	atomic_set_int(&td->td_flags, TDF_NOFAULT);
	error = uiomove(cp, n, uio);
	atomic_clear_int(&td->td_flags, TDF_NOFAULT);
	return error;
}

/*
 * Like uiomove() but copies zero-fill.  Only allowed for UIO_READ,
 * for obvious reasons.
 */
int
uiomovez(size_t n, struct uio *uio)
{
	struct iovec *iov;
	size_t cnt;
	int error = 0;

	KASSERT(uio->uio_rw == UIO_READ, ("uiomovez: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			/*
			 * Copy zeros from the pre-zeroed kernel page.
			 * NOTE(review): this assumes cnt never exceeds the
			 * size of ZeroPage — verify, as cnt is only clamped
			 * to n and iov_len here.
			 */
			error = copyout(ZeroPage, iov->iov_base, cnt);
			break;
		case UIO_SYSSPACE:
			bzero(iov->iov_base, cnt);
			break;
		case UIO_NOCOPY:
			/* Advance offsets without writing anything. */
			break;
		}

		if (error)
			break;
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		n -= cnt;
	}
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  This function automatically indexes the buffer by
 * uio_offset and handles all range checking.
 */
int
uiomove_frombuf(void *buf, size_t buflen, struct uio *uio)
{
	size_t offset;

	/*
	 * Reject offsets that do not round-trip through size_t (e.g.
	 * negative uio_offset values).
	 */
	offset = (size_t)uio->uio_offset;
	if ((off_t)offset != uio->uio_offset)
		return (EINVAL);
	/* Nothing to do when the offset is at or beyond the end. */
	if (buflen == 0 || offset >= buflen)
		return (0);
	return (uiomove((char *)buf + offset, buflen - offset, uio));
}

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		/* Advance past an exhausted iovec entry and retry. */
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}

	switch (uio->uio_segflg) {
	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;
	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;
	case UIO_NOCOPY:
		break;
	}

	/* Consume exactly one byte of the uio. */
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

/*
 * General routine to allocate a hash table.  Make the hash table size a
 * power of 2 greater or equal to the number of elements requested, and
 * store the masking value in *hashmask.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	/* Round up to the next power of 2 (minimum 2). */
	for (hashsize = 2; hashsize < elements; hashsize <<= 1)
		continue;
	hashtbl = kmalloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	/* hashsize is a power of 2, so hashsize - 1 is a valid mask. */
	*hashmask = hashsize - 1;
	return (hashtbl);
}

/*
 * Free a hash table previously allocated with hashinit(), asserting
 * that every chain is empty first.
 */
void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		KASSERT(LIST_EMPTY(hp), ("%s: hash not empty", __func__));
	kfree(hashtbl, type);
}

/*
 * This is a newer version which allocates a hash table of structures.
 *
 * The returned array will be zero'd.  The caller is responsible for
 * initializing the structures.
 */
void *
hashinit_ext(int elements, size_t size, struct malloc_type *type,
	     u_long *hashmask)
{
	long hashsize;
	void *hashtbl;

	if (elements <= 0)
		panic("hashinit: bad elements");
	/* Round up to the next power of 2 (minimum 2). */
	for (hashsize = 2; hashsize < elements; hashsize <<= 1)
		continue;
	/* Zero the array; the caller initializes the structures. */
	hashtbl = kmalloc((size_t)hashsize * size, type, M_WAITOK | M_ZERO);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

/*
 * Prime table used for prime-number sized hash tables.  primes[0] is a
 * fallback for very small element counts; the selection loops below
 * start at primes[1].
 */
static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define NPRIMES NELEM(primes)

/*
 * General routine to allocate a prime number sized hash table.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	/*
	 * Scan until a prime exceeding elements (or the table end) is
	 * found; primes[i - 1] is then the largest table prime that is
	 * <= elements (falling back to 1 for very small requests).
	 */
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	hashtbl = kmalloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	/* Prime sizes have no mask; report the entry count instead. */
	*nentries = hashsize;
	return (hashtbl);
}

/*
 * This is a newer version which allocates a hash table of structures
 * in a prime-number size.
 *
 * The returned array will be zero'd.  The caller is responsible for
 * initializing the structures.
 */
void *
phashinit_ext(int elements, size_t size, struct malloc_type *type,
	      u_long *nentries)
{
	long hashsize;
	void *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	/*
	 * Select the largest table prime that is <= elements (falling
	 * back to 1 for very small requests); see phashinit().
	 */
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	/* Zero the array; the caller initializes the structures. */
	hashtbl = kmalloc((size_t)hashsize * size, type, M_WAITOK | M_ZERO);
	*nentries = hashsize;
	return (hashtbl);
}

/*
 * Copyin an iovec.  If the iovec array fits, use the preallocated small
 * iovec structure.  If it is too big, dynamically allocate an iovec array
 * of sufficient size.
 *
 * MPSAFE
 */
int
iovec_copyin(struct iovec *uiov, struct iovec **kiov, struct iovec *siov,
	     size_t iov_cnt, size_t *iov_len)
{
	struct iovec *iovp;
	int error, i;
	size_t len;

	if (iov_cnt > UIO_MAXIOV)
		return EMSGSIZE;
	if (iov_cnt > UIO_SMALLIOV) {
		/* Too big for the caller's stack buffer; allocate. */
		*kiov = kmalloc(sizeof(struct iovec) * iov_cnt, M_IOV,
				M_WAITOK);
	} else {
		*kiov = siov;
	}
	error = copyin(uiov, *kiov, iov_cnt * sizeof(struct iovec));
	if (error == 0) {
		/* Total the iovec lengths, watching for overflow. */
		*iov_len = 0;
		for (i = 0, iovp = *kiov; i < iov_cnt; i++, iovp++) {
			/*
			 * Check for both *iov_len overflows and out of
			 * range iovp->iov_len's.  We limit to the
			 * capabilities of signed integers.
			 *
			 * GCC4 - overflow check opt requires assign/test.
			 */
			len = *iov_len + iovp->iov_len;
			if (len < *iov_len)
				error = EINVAL;
			*iov_len = len;
		}
	}

	/*
	 * From userland disallow iovec's which exceed the sized size
	 * limit as the system calls return ssize_t.
	 *
	 * NOTE: Internal kernel interfaces can handle the unsigned
	 *	 limit.
	 */
	if (error == 0 && (ssize_t)*iov_len < 0)
		error = EINVAL;

	/* On any failure release the kmalloc'd array (if any). */
	if (error)
		iovec_free(kiov, siov);
	return (error);
}


/*
 * Copyright (c) 2004 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/i386/i386/uio_machdep.c,v 1.1 2004/03/21 20:28:36 alc Exp $
 */

/*
 * Implement uiomove(9) from physical memory using lwbuf's to reduce
 * the creation and destruction of ephemeral mappings.
 */
int
uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio)
{
	struct lwbuf lwb_cache;
	struct lwbuf *lwb;
	struct thread *td = curthread;
	struct iovec *iov;
	void *cp;
	vm_offset_t page_offset;
	vm_page_t m;
	size_t cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove_fromphys: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove_fromphys proc"));

	/*
	 * Set TDF_DEADLKTREAT for the duration of the copy, remembering
	 * whether the caller already had it set (see uiomove()).
	 */
	crit_enter();
	save = td->td_flags & TDF_DEADLKTREAT;
	td->td_flags |= TDF_DEADLKTREAT;
	crit_exit();

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		/*
		 * Clamp the copy to the current physical page and map it
		 * via a lightweight buffer (lwbuf) to obtain a KVA.
		 */
		page_offset = offset & PAGE_MASK;
		cnt = min(cnt, PAGE_SIZE - page_offset);
		m = ma[offset >> PAGE_SHIFT];
		lwb = lwbuf_alloc(m, &lwb_cache);
		cp = (char *)lwbuf_kva(lwb) + page_offset;

		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			/*
			 * note: removed uioyield (it was the wrong place to
			 * put it).
			 */
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error) {
				/* Release the mapping before bailing out. */
				lwbuf_free(lwb);
				goto out;
			}
			break;
		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		lwbuf_free(lwb);
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		offset += cnt;
		n -= cnt;
	}
out:
	/* Only clear TDF_DEADLKTREAT if the caller did not have it set. */
	if (save == 0) {
		crit_enter();
		td->td_flags &= ~TDF_DEADLKTREAT;
		crit_exit();
	}
	return (error);
}