1 /* $NetBSD: subr_disk.c,v 1.49 2002/11/06 02:31:34 enami Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1982, 1986, 1988, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. All advertising materials mentioning features or use of this software 58 * must display the following acknowledgement: 59 * This product includes software developed by the University of 60 * California, Berkeley and its contributors. 61 * 4. Neither the name of the University nor the names of its contributors 62 * may be used to endorse or promote products derived from this software 63 * without specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 66 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 69 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 70 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 71 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 72 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 73 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 74 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 75 * SUCH DAMAGE. 76 * 77 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 78 */ 79 80 #include <sys/cdefs.h> 81 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.49 2002/11/06 02:31:34 enami Exp $"); 82 83 #include "opt_compat_netbsd.h" 84 85 #include <sys/param.h> 86 #include <sys/kernel.h> 87 #include <sys/malloc.h> 88 #include <sys/buf.h> 89 #include <sys/syslog.h> 90 #include <sys/disklabel.h> 91 #include <sys/disk.h> 92 #include <sys/sysctl.h> 93 #include <lib/libkern/libkern.h> 94 95 /* 96 * A global list of all disks attached to the system. May grow or 97 * shrink over time. 98 */ 99 struct disklist_head disklist; /* TAILQ_HEAD */ 100 int disk_count; /* number of drives in global disklist */ 101 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 102 103 /* 104 * Compute checksum for disk label. 105 */ 106 u_int 107 dkcksum(struct disklabel *lp) 108 { 109 u_short *start, *end; 110 u_short sum = 0; 111 112 start = (u_short *)lp; 113 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 114 while (start < end) 115 sum ^= *start++; 116 return (sum); 117 } 118 119 /* 120 * Disk error is the preface to plaintive error messages 121 * about failing disk transfers. It prints messages of the form 122 123 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 124 125 * if the offset of the error in the transfer and a disk label 126 * are both available. blkdone should be -1 if the position of the error 127 * is unknown; the disklabel pointer may be null from drivers that have not 128 * been converted to use them. The message is printed with printf 129 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 130 * The message should be completed (with at least a newline) with printf 131 * or addlog, respectively. There is no trailing space. 132 */ 133 void 134 diskerr(const struct buf *bp, const char *dname, const char *what, int pri, 135 int blkdone, const struct disklabel *lp) 136 { 137 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 138 void (*pr)(const char *, ...); 139 char partname = 'a' + part; 140 int sn; 141 142 if (pri != LOG_PRINTF) { 143 static const char fmt[] = ""; 144 log(pri, fmt); 145 pr = addlog; 146 } else 147 pr = printf; 148 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 149 bp->b_flags & B_READ ? "read" : "writ"); 150 sn = bp->b_blkno; 151 if (bp->b_bcount <= DEV_BSIZE) 152 (*pr)("%d", sn); 153 else { 154 if (blkdone >= 0) { 155 sn += blkdone; 156 (*pr)("%d of ", sn); 157 } 158 (*pr)("%d-%d", bp->b_blkno, 159 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 160 } 161 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 162 sn += lp->d_partitions[part].p_offset; 163 (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, 164 sn / lp->d_secpercyl); 165 sn %= lp->d_secpercyl; 166 (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, 167 sn % lp->d_nsectors); 168 } 169 } 170 171 /* 172 * Initialize the disklist. Called by main() before autoconfiguration. 173 */ 174 void 175 disk_init(void) 176 { 177 178 TAILQ_INIT(&disklist); 179 disk_count = 0; 180 } 181 182 /* 183 * Searches the disklist for the disk corresponding to the 184 * name provided. 185 */ 186 struct disk * 187 disk_find(char *name) 188 { 189 struct disk *diskp; 190 191 if ((name == NULL) || (disk_count <= 0)) 192 return (NULL); 193 194 simple_lock(&disklist_slock); 195 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 196 diskp = TAILQ_NEXT(diskp, dk_link)) 197 if (strcmp(diskp->dk_name, name) == 0) { 198 simple_unlock(&disklist_slock); 199 return (diskp); 200 } 201 simple_unlock(&disklist_slock); 202 203 return (NULL); 204 } 205 206 /* 207 * Attach a disk. 208 */ 209 void 210 disk_attach(struct disk *diskp) 211 { 212 int s; 213 214 /* 215 * Allocate and initialize the disklabel structures. Note that 216 * it's not safe to sleep here, since we're probably going to be 217 * called during autoconfiguration. 218 */ 219 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 220 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 221 M_NOWAIT); 222 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 223 panic("disk_attach: can't allocate storage for disklabel"); 224 225 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 226 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 227 228 /* 229 * Set the attached timestamp. 230 */ 231 s = splclock(); 232 diskp->dk_attachtime = mono_time; 233 splx(s); 234 235 /* 236 * Link into the disklist. 237 */ 238 simple_lock(&disklist_slock); 239 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 240 simple_unlock(&disklist_slock); 241 ++disk_count; 242 } 243 244 /* 245 * Detach a disk. 246 */ 247 void 248 disk_detach(struct disk *diskp) 249 { 250 251 /* 252 * Remove from the disklist. 253 */ 254 if (--disk_count < 0) 255 panic("disk_detach: disk_count < 0"); 256 simple_lock(&disklist_slock); 257 TAILQ_REMOVE(&disklist, diskp, dk_link); 258 simple_unlock(&disklist_slock); 259 260 /* 261 * Free the space used by the disklabel structures. 262 */ 263 free(diskp->dk_label, M_DEVBUF); 264 free(diskp->dk_cpulabel, M_DEVBUF); 265 } 266 267 /* 268 * Increment a disk's busy counter. If the counter is going from 269 * 0 to 1, set the timestamp. 270 */ 271 void 272 disk_busy(struct disk *diskp) 273 { 274 int s; 275 276 /* 277 * XXX We'd like to use something as accurate as microtime(), 278 * but that doesn't depend on the system TOD clock. 279 */ 280 if (diskp->dk_busy++ == 0) { 281 s = splclock(); 282 diskp->dk_timestamp = mono_time; 283 splx(s); 284 } 285 } 286 287 /* 288 * Decrement a disk's busy counter, increment the byte count, total busy 289 * time, and reset the timestamp. 290 */ 291 void 292 disk_unbusy(struct disk *diskp, long bcount, int read) 293 { 294 int s; 295 struct timeval dv_time, diff_time; 296 297 if (diskp->dk_busy-- == 0) { 298 printf("%s: dk_busy < 0\n", diskp->dk_name); 299 panic("disk_unbusy"); 300 } 301 302 s = splclock(); 303 dv_time = mono_time; 304 splx(s); 305 306 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 307 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 308 309 diskp->dk_timestamp = dv_time; 310 if (bcount > 0) { 311 if (read) { 312 diskp->dk_rbytes += bcount; 313 diskp->dk_rxfer++; 314 } else { 315 diskp->dk_wbytes += bcount; 316 diskp->dk_wxfer++; 317 } 318 } 319 } 320 321 /* 322 * Reset the metrics counters on the given disk. Note that we cannot 323 * reset the busy counter, as it may case a panic in disk_unbusy(). 324 * We also must avoid playing with the timestamp information, as it 325 * may skew any pending transfer results. 326 */ 327 void 328 disk_resetstat(struct disk *diskp) 329 { 330 int s = splbio(), t; 331 332 diskp->dk_rxfer = 0; 333 diskp->dk_rbytes = 0; 334 diskp->dk_wxfer = 0; 335 diskp->dk_wbytes = 0; 336 337 t = splclock(); 338 diskp->dk_attachtime = mono_time; 339 splx(t); 340 341 timerclear(&diskp->dk_time); 342 343 splx(s); 344 } 345 346 int 347 sysctl_disknames(void *vwhere, size_t *sizep) 348 { 349 char buf[DK_DISKNAMELEN + 1]; 350 char *where = vwhere; 351 struct disk *diskp; 352 size_t needed, left, slen; 353 int error, first; 354 355 first = 1; 356 error = 0; 357 needed = 0; 358 left = *sizep; 359 360 simple_lock(&disklist_slock); 361 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 362 diskp = TAILQ_NEXT(diskp, dk_link)) { 363 if (where == NULL) 364 needed += strlen(diskp->dk_name) + 1; 365 else { 366 memset(buf, 0, sizeof(buf)); 367 if (first) { 368 strncpy(buf, diskp->dk_name, sizeof(buf)); 369 first = 0; 370 } else { 371 buf[0] = ' '; 372 strncpy(buf + 1, diskp->dk_name, 373 sizeof(buf) - 1); 374 } 375 buf[DK_DISKNAMELEN] = '\0'; 376 slen = strlen(buf); 377 if (left < slen + 1) 378 break; 379 /* +1 to copy out the trailing NUL byte */ 380 error = copyout(buf, where, slen + 1); 381 if (error) 382 break; 383 where += slen; 384 needed += slen; 385 left -= slen; 386 } 387 } 388 simple_unlock(&disklist_slock); 389 *sizep = needed; 390 return (error); 391 } 392 393 int 394 sysctl_diskstats(int *name, u_int namelen, void *vwhere, size_t *sizep) 395 { 396 struct disk_sysctl sdisk; 397 struct disk *diskp; 398 char *where = vwhere; 399 size_t tocopy, left; 400 int error; 401 402 /* 403 * The original hw.diskstats call was broken and did not require 404 * the userland to pass in it's size of struct disk_sysctl. This 405 * was fixed after NetBSD 1.6 was released, and any applications 406 * that do not pass in the size are given an error only, unless 407 * we care about 1.6 compatibility. 408 */ 409 if (namelen == 0) 410 #ifdef COMPAT_16 411 tocopy = offsetof(struct disk_sysctl, dk_rxfer); 412 #else 413 return (EINVAL); 414 #endif 415 else 416 tocopy = name[0]; 417 418 if (where == NULL) { 419 *sizep = disk_count * tocopy; 420 return (0); 421 } 422 423 error = 0; 424 left = *sizep; 425 memset(&sdisk, 0, sizeof(sdisk)); 426 *sizep = 0; 427 428 simple_lock(&disklist_slock); 429 TAILQ_FOREACH(diskp, &disklist, dk_link) { 430 if (left < tocopy) 431 break; 432 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name)); 433 sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer; 434 sdisk.dk_rxfer = diskp->dk_rxfer; 435 sdisk.dk_wxfer = diskp->dk_wxfer; 436 sdisk.dk_seek = diskp->dk_seek; 437 sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes; 438 sdisk.dk_rbytes = diskp->dk_rbytes; 439 sdisk.dk_wbytes = diskp->dk_wbytes; 440 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec; 441 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec; 442 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec; 443 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec; 444 sdisk.dk_time_sec = diskp->dk_time.tv_sec; 445 sdisk.dk_time_usec = diskp->dk_time.tv_usec; 446 sdisk.dk_busy = diskp->dk_busy; 447 448 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk))); 449 if (error) 450 break; 451 where += tocopy; 452 *sizep += tocopy; 453 left -= tocopy; 454 } 455 simple_unlock(&disklist_slock); 456 return (error); 457 } 458 459 struct bufq_fcfs { 460 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 461 }; 462 463 struct bufq_disksort { 464 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 465 }; 466 467 #define PRIO_READ_BURST 48 468 #define PRIO_WRITE_REQ 16 469 470 struct bufq_prio { 471 TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */ 472 struct buf *bq_write_next; /* next request in bq_write */ 473 struct buf *bq_next; /* current request */ 474 int bq_read_burst; /* # of consecutive reads */ 475 }; 476 477 478 /* 479 * Check if two buf's are in ascending order. 480 */ 481 static __inline int 482 buf_inorder(struct buf *bp, struct buf *bq, int sortby) 483 { 484 int r; 485 486 if (bp == NULL || bq == NULL) 487 return (bq == NULL); 488 489 if (sortby == BUFQ_SORT_CYLINDER) 490 r = bp->b_cylinder - bq->b_cylinder; 491 else 492 r = 0; 493 494 if (r == 0) 495 r = bp->b_rawblkno - bq->b_rawblkno; 496 497 return (r <= 0); 498 } 499 500 501 /* 502 * First-come first-served sort for disks. 503 * 504 * Requests are appended to the queue without any reordering. 505 */ 506 static void 507 bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp) 508 { 509 struct bufq_fcfs *fcfs = bufq->bq_private; 510 511 TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq); 512 } 513 514 static struct buf * 515 bufq_fcfs_get(struct bufq_state *bufq, int remove) 516 { 517 struct bufq_fcfs *fcfs = bufq->bq_private; 518 struct buf *bp; 519 520 bp = TAILQ_FIRST(&fcfs->bq_head); 521 522 if (bp != NULL && remove) 523 TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq); 524 525 return (bp); 526 } 527 528 529 /* 530 * Seek sort for disks. 531 * 532 * There are actually two queues, sorted in ascendening order. The first 533 * queue holds those requests which are positioned after the current block; 534 * the second holds requests which came in after their position was passed. 535 * Thus we implement a one-way scan, retracting after reaching the end of 536 * the drive to the first request on the second queue, at which time it 537 * becomes the first queue. 538 * 539 * A one-way scan is natural because of the way UNIX read-ahead blocks are 540 * allocated. 541 */ 542 static void 543 bufq_disksort_put(struct bufq_state *bufq, struct buf *bp) 544 { 545 struct bufq_disksort *disksort = bufq->bq_private; 546 struct buf *bq, *nbq; 547 int sortby; 548 549 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 550 551 bq = TAILQ_FIRST(&disksort->bq_head); 552 553 /* 554 * If the queue is empty it's easy; we just go on the end. 555 */ 556 if (bq == NULL) { 557 TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq); 558 return; 559 } 560 561 /* 562 * If we lie before the currently active request, then we 563 * must locate the second request list and add ourselves to it. 564 */ 565 if (buf_inorder(bp, bq, sortby)) { 566 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 567 /* 568 * Check for an ``inversion'' in the normally ascending 569 * block numbers, indicating the start of the second 570 * request list. 571 */ 572 if (buf_inorder(nbq, bq, sortby)) { 573 /* 574 * Search the second request list for the first 575 * request at a larger block number. We go 576 * after that; if there is no such request, we 577 * go at the end. 578 */ 579 do { 580 if (buf_inorder(bp, nbq, sortby)) 581 goto insert; 582 bq = nbq; 583 } while ((nbq = 584 TAILQ_NEXT(bq, b_actq)) != NULL); 585 goto insert; /* after last */ 586 } 587 bq = nbq; 588 } 589 /* 590 * No inversions... we will go after the last, and 591 * be the first request in the second request list. 592 */ 593 goto insert; 594 } 595 /* 596 * Request is at/after the current request... 597 * sort in the first request list. 598 */ 599 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 600 /* 601 * We want to go after the current request if there is an 602 * inversion after it (i.e. it is the end of the first 603 * request list), or if the next request is a larger cylinder 604 * than our request. 605 */ 606 if (buf_inorder(nbq, bq, sortby) || 607 buf_inorder(bp, nbq, sortby)) 608 goto insert; 609 bq = nbq; 610 } 611 /* 612 * Neither a second list nor a larger request... we go at the end of 613 * the first list, which is the same as the end of the whole schebang. 614 */ 615 insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq); 616 } 617 618 static struct buf * 619 bufq_disksort_get(struct bufq_state *bufq, int remove) 620 { 621 struct bufq_disksort *disksort = bufq->bq_private; 622 struct buf *bp; 623 624 bp = TAILQ_FIRST(&disksort->bq_head); 625 626 if (bp != NULL && remove) 627 TAILQ_REMOVE(&disksort->bq_head, bp, b_actq); 628 629 return (bp); 630 } 631 632 633 /* 634 * Seek sort for disks. 635 * 636 * There are two queues. The first queue holds read requests; the second 637 * holds write requests. The read queue is first-come first-served; the 638 * write queue is sorted in ascendening block order. 639 * The read queue is processed first. After PRIO_READ_BURST consecutive 640 * read requests with non-empty write queue PRIO_WRITE_REQ requests from 641 * the write queue will be processed. 642 */ 643 static void 644 bufq_prio_put(struct bufq_state *bufq, struct buf *bp) 645 { 646 struct bufq_prio *prio = bufq->bq_private; 647 struct buf *bq; 648 int sortby; 649 650 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 651 652 /* 653 * If it's a read request append it to the list. 654 */ 655 if ((bp->b_flags & B_READ) == B_READ) { 656 TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq); 657 return; 658 } 659 660 bq = TAILQ_FIRST(&prio->bq_write); 661 662 /* 663 * If the write list is empty, simply append it to the list. 664 */ 665 if (bq == NULL) { 666 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 667 prio->bq_write_next = bp; 668 return; 669 } 670 671 /* 672 * If we lie after the next request, insert after this request. 673 */ 674 if (buf_inorder(prio->bq_write_next, bp, sortby)) 675 bq = prio->bq_write_next; 676 677 /* 678 * Search for the first request at a larger block number. 679 * We go before this request if it exists. 680 */ 681 while (bq != NULL && buf_inorder(bq, bp, sortby)) 682 bq = TAILQ_NEXT(bq, b_actq); 683 684 if (bq != NULL) 685 TAILQ_INSERT_BEFORE(bq, bp, b_actq); 686 else 687 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 688 } 689 690 static struct buf * 691 bufq_prio_get(struct bufq_state *bufq, int remove) 692 { 693 struct bufq_prio *prio = bufq->bq_private; 694 struct buf *bp; 695 696 /* 697 * If no current request, get next from the lists. 698 */ 699 if (prio->bq_next == NULL) { 700 /* 701 * If at least one list is empty, select the other. 702 */ 703 if (TAILQ_FIRST(&prio->bq_read) == NULL) { 704 prio->bq_next = prio->bq_write_next; 705 prio->bq_read_burst = 0; 706 } else if (prio->bq_write_next == NULL) { 707 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 708 prio->bq_read_burst = 0; 709 } else { 710 /* 711 * Both list have requests. Select the read list up 712 * to PRIO_READ_BURST times, then select the write 713 * list PRIO_WRITE_REQ times. 714 */ 715 if (prio->bq_read_burst++ < PRIO_READ_BURST) 716 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 717 else if (prio->bq_read_burst < 718 PRIO_READ_BURST + PRIO_WRITE_REQ) 719 prio->bq_next = prio->bq_write_next; 720 else { 721 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 722 prio->bq_read_burst = 0; 723 } 724 } 725 } 726 727 bp = prio->bq_next; 728 729 if (bp != NULL && remove) { 730 if ((bp->b_flags & B_READ) == B_READ) 731 TAILQ_REMOVE(&prio->bq_read, bp, b_actq); 732 else { 733 /* 734 * Advance the write pointer before removing 735 * bp since it is actually prio->bq_write_next. 736 */ 737 prio->bq_write_next = 738 TAILQ_NEXT(prio->bq_write_next, b_actq); 739 TAILQ_REMOVE(&prio->bq_write, bp, b_actq); 740 if (prio->bq_write_next == NULL) 741 prio->bq_write_next = 742 TAILQ_FIRST(&prio->bq_write); 743 } 744 745 prio->bq_next = NULL; 746 } 747 748 return (bp); 749 } 750 751 /* 752 * Create a device buffer queue. 753 */ 754 void 755 bufq_alloc(struct bufq_state *bufq, int flags) 756 { 757 struct bufq_fcfs *fcfs; 758 struct bufq_disksort *disksort; 759 struct bufq_prio *prio; 760 761 bufq->bq_flags = flags; 762 763 switch (flags & BUFQ_SORT_MASK) { 764 case BUFQ_SORT_RAWBLOCK: 765 case BUFQ_SORT_CYLINDER: 766 break; 767 case 0: 768 if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS) 769 break; 770 /* FALLTHROUGH */ 771 default: 772 panic("bufq_alloc: sort out of range"); 773 } 774 775 switch (flags & BUFQ_METHOD_MASK) { 776 case BUFQ_FCFS: 777 bufq->bq_get = bufq_fcfs_get; 778 bufq->bq_put = bufq_fcfs_put; 779 MALLOC(bufq->bq_private, struct bufq_fcfs *, 780 sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO); 781 fcfs = (struct bufq_fcfs *)bufq->bq_private; 782 TAILQ_INIT(&fcfs->bq_head); 783 break; 784 case BUFQ_DISKSORT: 785 bufq->bq_get = bufq_disksort_get; 786 bufq->bq_put = bufq_disksort_put; 787 MALLOC(bufq->bq_private, struct bufq_disksort *, 788 sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO); 789 disksort = (struct bufq_disksort *)bufq->bq_private; 790 TAILQ_INIT(&disksort->bq_head); 791 break; 792 case BUFQ_READ_PRIO: 793 bufq->bq_get = bufq_prio_get; 794 bufq->bq_put = bufq_prio_put; 795 MALLOC(bufq->bq_private, struct bufq_prio *, 796 sizeof(struct bufq_prio), M_DEVBUF, M_ZERO); 797 prio = (struct bufq_prio *)bufq->bq_private; 798 TAILQ_INIT(&prio->bq_read); 799 TAILQ_INIT(&prio->bq_write); 800 break; 801 default: 802 panic("bufq_alloc: method out of range"); 803 } 804 } 805 806 /* 807 * Destroy a device buffer queue. 808 */ 809 void 810 bufq_free(struct bufq_state *bufq) 811 { 812 813 KASSERT(bufq->bq_private != NULL); 814 KASSERT(BUFQ_PEEK(bufq) == NULL); 815 816 FREE(bufq->bq_private, M_DEVBUF); 817 bufq->bq_get = NULL; 818 bufq->bq_put = NULL; 819 } 820