/*	$NetBSD: subr_disk.c,v 1.37 2002/02/16 02:11:43 enami Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
64 * 65 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 66 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 69 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 70 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 71 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 72 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 73 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 74 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 75 * SUCH DAMAGE. 76 * 77 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 78 */ 79 80 #include <sys/cdefs.h> 81 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.37 2002/02/16 02:11:43 enami Exp $"); 82 83 #include <sys/param.h> 84 #include <sys/kernel.h> 85 #include <sys/malloc.h> 86 #include <sys/buf.h> 87 #include <sys/syslog.h> 88 #include <sys/disklabel.h> 89 #include <sys/disk.h> 90 #include <sys/sysctl.h> 91 92 /* 93 * A global list of all disks attached to the system. May grow or 94 * shrink over time. 95 */ 96 struct disklist_head disklist; /* TAILQ_HEAD */ 97 int disk_count; /* number of drives in global disklist */ 98 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 99 100 /* 101 * Seek sort for disks. We depend on the driver which calls us using b_resid 102 * as the current cylinder number. 103 * 104 * The argument bufq is an I/O queue for the device, on which there are 105 * actually two queues, sorted in ascending cylinder order. The first 106 * queue holds those requests which are positioned after the current 107 * cylinder (in the first request); the second holds requests which came 108 * in after their cylinder number was passed. 
Thus we implement a one-way 109 * scan, retracting after reaching the end of the drive to the first request 110 * on the second queue, at which time it becomes the first queue. 111 * 112 * A one-way scan is natural because of the way UNIX read-ahead blocks are 113 * allocated. 114 * 115 * This is further adjusted by any `barriers' which may exist in the queue. 116 * The bufq points to the last such ordered request. 117 */ 118 void 119 disksort_cylinder(struct buf_queue *bufq, struct buf *bp) 120 { 121 struct buf *bq, *nbq; 122 123 /* 124 * If there are ordered requests on the queue, we must start 125 * the elevator sort after the last of these. 126 */ 127 if ((bq = bufq->bq_barrier) == NULL) 128 bq = BUFQ_FIRST(bufq); 129 130 /* 131 * If the queue is empty, of if it's an ordered request, 132 * it's easy; we just go on the end. 133 */ 134 if (bq == NULL || (bp->b_flags & B_ORDERED) != 0) { 135 BUFQ_INSERT_TAIL(bufq, bp); 136 return; 137 } 138 139 /* 140 * If we lie after the first (currently active) request, then we 141 * must locate the second request list and add ourselves to it. 142 */ 143 if (bp->b_cylinder < bq->b_cylinder || 144 (bp->b_cylinder == bq->b_cylinder && 145 bp->b_rawblkno < bq->b_rawblkno)) { 146 while ((nbq = BUFQ_NEXT(bq)) != NULL) { 147 /* 148 * Check for an ``inversion'' in the normally ascending 149 * cylinder numbers, indicating the start of the second 150 * request list. 151 */ 152 if (nbq->b_cylinder < bq->b_cylinder) { 153 /* 154 * Search the second request list for the first 155 * request at a larger cylinder number. We go 156 * before that; if there is no such request, we 157 * go at end. 158 */ 159 do { 160 if (bp->b_cylinder < nbq->b_cylinder) 161 goto insert; 162 if (bp->b_cylinder == nbq->b_cylinder && 163 bp->b_rawblkno < nbq->b_rawblkno) 164 goto insert; 165 bq = nbq; 166 } while ((nbq = BUFQ_NEXT(bq)) != NULL); 167 goto insert; /* after last */ 168 } 169 bq = nbq; 170 } 171 /* 172 * No inversions... 
we will go after the last, and 173 * be the first request in the second request list. 174 */ 175 goto insert; 176 } 177 /* 178 * Request is at/after the current request... 179 * sort in the first request list. 180 */ 181 while ((nbq = BUFQ_NEXT(bq)) != NULL) { 182 /* 183 * We want to go after the current request if there is an 184 * inversion after it (i.e. it is the end of the first 185 * request list), or if the next request is a larger cylinder 186 * than our request. 187 */ 188 if (nbq->b_cylinder < bq->b_cylinder || 189 bp->b_cylinder < nbq->b_cylinder || 190 (bp->b_cylinder == nbq->b_cylinder && 191 bp->b_rawblkno < nbq->b_rawblkno)) 192 goto insert; 193 bq = nbq; 194 } 195 /* 196 * Neither a second list nor a larger request... we go at the end of 197 * the first list, which is the same as the end of the whole schebang. 198 */ 199 insert: BUFQ_INSERT_AFTER(bufq, bq, bp); 200 } 201 202 /* 203 * Seek sort for disks. This version sorts based on b_rawblkno, which 204 * indicates the block number. 205 * 206 * As before, there are actually two queues, sorted in ascendening block 207 * order. The first queue holds those requests which are positioned after 208 * the current block (in the first request); the second holds requests which 209 * came in after their block number was passed. Thus we implement a one-way 210 * scan, retracting after reaching the end of the driver to the first request 211 * on the second queue, at which time it becomes the first queue. 212 * 213 * A one-way scan is natural because of the way UNIX read-ahead blocks are 214 * allocated. 215 * 216 * This is further adjusted by any `barriers' which may exist in the queue. 217 * The bufq points to the last such ordered request. 218 */ 219 void 220 disksort_blkno(struct buf_queue *bufq, struct buf *bp) 221 { 222 struct buf *bq, *nbq; 223 224 /* 225 * If there are ordered requests on the queue, we must start 226 * the elevator sort after the last of these. 
227 */ 228 if ((bq = bufq->bq_barrier) == NULL) 229 bq = BUFQ_FIRST(bufq); 230 231 /* 232 * If the queue is empty, or if it's an ordered request, 233 * it's easy; we just go on the end. 234 */ 235 if (bq == NULL || (bp->b_flags & B_ORDERED) != 0) { 236 BUFQ_INSERT_TAIL(bufq, bp); 237 return; 238 } 239 240 /* 241 * If we lie after the first (currently active) request, then we 242 * must locate the second request list and add ourselves to it. 243 */ 244 if (bp->b_rawblkno < bq->b_rawblkno) { 245 while ((nbq = BUFQ_NEXT(bq)) != NULL) { 246 /* 247 * Check for an ``inversion'' in the normally ascending 248 * block numbers, indicating the start of the second 249 * request list. 250 */ 251 if (nbq->b_rawblkno < bq->b_rawblkno) { 252 /* 253 * Search the second request list for the first 254 * request at a larger block number. We go 255 * after that; if there is no such request, we 256 * go at the end. 257 */ 258 do { 259 if (bp->b_rawblkno < nbq->b_rawblkno) 260 goto insert; 261 bq = nbq; 262 } while ((nbq = BUFQ_NEXT(bq)) != NULL); 263 goto insert; /* after last */ 264 } 265 bq = nbq; 266 } 267 /* 268 * No inversions... we will go after the last, and 269 * be the first request in the second request list. 270 */ 271 goto insert; 272 } 273 /* 274 * Request is at/after the current request... 275 * sort in the first request list. 276 */ 277 while ((nbq = BUFQ_NEXT(bq)) != NULL) { 278 /* 279 * We want to go after the current request if there is an 280 * inversion after it (i.e. it is the end of the first 281 * request list), or if the next request is a larger cylinder 282 * than our request. 283 */ 284 if (nbq->b_rawblkno < bq->b_rawblkno || 285 bp->b_rawblkno < nbq->b_rawblkno) 286 goto insert; 287 bq = nbq; 288 } 289 /* 290 * Neither a second list nor a larger request... we go at the end of 291 * the first list, which is the same as the end of the whole schebang. 292 */ 293 insert: BUFQ_INSERT_AFTER(bufq, bq, bp); 294 } 295 296 /* 297 * Seek non-sort for disks. 
This version simply inserts requests at 298 * the tail of the queue. 299 */ 300 void 301 disksort_tail(struct buf_queue *bufq, struct buf *bp) 302 { 303 304 BUFQ_INSERT_TAIL(bufq, bp); 305 } 306 307 /* 308 * Compute checksum for disk label. 309 */ 310 u_int 311 dkcksum(struct disklabel *lp) 312 { 313 u_short *start, *end; 314 u_short sum = 0; 315 316 start = (u_short *)lp; 317 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 318 while (start < end) 319 sum ^= *start++; 320 return (sum); 321 } 322 323 /* 324 * Disk error is the preface to plaintive error messages 325 * about failing disk transfers. It prints messages of the form 326 327 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 328 329 * if the offset of the error in the transfer and a disk label 330 * are both available. blkdone should be -1 if the position of the error 331 * is unknown; the disklabel pointer may be null from drivers that have not 332 * been converted to use them. The message is printed with printf 333 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 334 * The message should be completed (with at least a newline) with printf 335 * or addlog, respectively. There is no trailing space. 336 */ 337 void 338 diskerr(struct buf *bp, char *dname, char *what, int pri, int blkdone, 339 struct disklabel *lp) 340 { 341 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 342 void (*pr)(const char *, ...); 343 char partname = 'a' + part; 344 int sn; 345 346 if (pri != LOG_PRINTF) { 347 static const char fmt[] = ""; 348 log(pri, fmt); 349 pr = addlog; 350 } else 351 pr = printf; 352 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 353 bp->b_flags & B_READ ? 
"read" : "writ"); 354 sn = bp->b_blkno; 355 if (bp->b_bcount <= DEV_BSIZE) 356 (*pr)("%d", sn); 357 else { 358 if (blkdone >= 0) { 359 sn += blkdone; 360 (*pr)("%d of ", sn); 361 } 362 (*pr)("%d-%d", bp->b_blkno, 363 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 364 } 365 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 366 sn += lp->d_partitions[part].p_offset; 367 (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, 368 sn / lp->d_secpercyl); 369 sn %= lp->d_secpercyl; 370 (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, 371 sn % lp->d_nsectors); 372 } 373 } 374 375 /* 376 * Initialize the disklist. Called by main() before autoconfiguration. 377 */ 378 void 379 disk_init(void) 380 { 381 382 TAILQ_INIT(&disklist); 383 disk_count = 0; 384 } 385 386 /* 387 * Searches the disklist for the disk corresponding to the 388 * name provided. 389 */ 390 struct disk * 391 disk_find(char *name) 392 { 393 struct disk *diskp; 394 395 if ((name == NULL) || (disk_count <= 0)) 396 return (NULL); 397 398 simple_lock(&disklist_slock); 399 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 400 diskp = TAILQ_NEXT(diskp, dk_link)) 401 if (strcmp(diskp->dk_name, name) == 0) { 402 simple_unlock(&disklist_slock); 403 return (diskp); 404 } 405 simple_unlock(&disklist_slock); 406 407 return (NULL); 408 } 409 410 /* 411 * Attach a disk. 412 */ 413 void 414 disk_attach(struct disk *diskp) 415 { 416 int s; 417 418 /* 419 * Allocate and initialize the disklabel structures. Note that 420 * it's not safe to sleep here, since we're probably going to be 421 * called during autoconfiguration. 
422 */ 423 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 424 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 425 M_NOWAIT); 426 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 427 panic("disk_attach: can't allocate storage for disklabel"); 428 429 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 430 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 431 432 /* 433 * Set the attached timestamp. 434 */ 435 s = splclock(); 436 diskp->dk_attachtime = mono_time; 437 splx(s); 438 439 /* 440 * Link into the disklist. 441 */ 442 simple_lock(&disklist_slock); 443 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 444 simple_unlock(&disklist_slock); 445 ++disk_count; 446 } 447 448 /* 449 * Detach a disk. 450 */ 451 void 452 disk_detach(struct disk *diskp) 453 { 454 455 /* 456 * Remove from the disklist. 457 */ 458 if (--disk_count < 0) 459 panic("disk_detach: disk_count < 0"); 460 simple_lock(&disklist_slock); 461 TAILQ_REMOVE(&disklist, diskp, dk_link); 462 simple_unlock(&disklist_slock); 463 464 /* 465 * Free the space used by the disklabel structures. 466 */ 467 free(diskp->dk_label, M_DEVBUF); 468 free(diskp->dk_cpulabel, M_DEVBUF); 469 } 470 471 /* 472 * Increment a disk's busy counter. If the counter is going from 473 * 0 to 1, set the timestamp. 474 */ 475 void 476 disk_busy(struct disk *diskp) 477 { 478 int s; 479 480 /* 481 * XXX We'd like to use something as accurate as microtime(), 482 * but that doesn't depend on the system TOD clock. 483 */ 484 if (diskp->dk_busy++ == 0) { 485 s = splclock(); 486 diskp->dk_timestamp = mono_time; 487 splx(s); 488 } 489 } 490 491 /* 492 * Decrement a disk's busy counter, increment the byte count, total busy 493 * time, and reset the timestamp. 
494 */ 495 void 496 disk_unbusy(struct disk *diskp, long bcount) 497 { 498 int s; 499 struct timeval dv_time, diff_time; 500 501 if (diskp->dk_busy-- == 0) { 502 printf("%s: dk_busy < 0\n", diskp->dk_name); 503 panic("disk_unbusy"); 504 } 505 506 s = splclock(); 507 dv_time = mono_time; 508 splx(s); 509 510 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 511 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 512 513 diskp->dk_timestamp = dv_time; 514 if (bcount > 0) { 515 diskp->dk_bytes += bcount; 516 diskp->dk_xfer++; 517 } 518 } 519 520 /* 521 * Reset the metrics counters on the given disk. Note that we cannot 522 * reset the busy counter, as it may case a panic in disk_unbusy(). 523 * We also must avoid playing with the timestamp information, as it 524 * may skew any pending transfer results. 525 */ 526 void 527 disk_resetstat(struct disk *diskp) 528 { 529 int s = splbio(), t; 530 531 diskp->dk_xfer = 0; 532 diskp->dk_bytes = 0; 533 534 t = splclock(); 535 diskp->dk_attachtime = mono_time; 536 splx(t); 537 538 timerclear(&diskp->dk_time); 539 540 splx(s); 541 } 542 543 int 544 sysctl_disknames(void *vwhere, size_t *sizep) 545 { 546 char buf[DK_DISKNAMELEN + 1]; 547 char *where = vwhere; 548 struct disk *diskp; 549 size_t needed, left, slen; 550 int error, first; 551 552 first = 1; 553 error = 0; 554 needed = 0; 555 left = *sizep; 556 557 simple_lock(&disklist_slock); 558 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 559 diskp = TAILQ_NEXT(diskp, dk_link)) { 560 if (where == NULL) 561 needed += strlen(diskp->dk_name) + 1; 562 else { 563 memset(buf, 0, sizeof(buf)); 564 if (first) { 565 strncpy(buf, diskp->dk_name, sizeof(buf)); 566 first = 0; 567 } else { 568 buf[0] = ' '; 569 strncpy(buf + 1, diskp->dk_name, 570 sizeof(buf) - 1); 571 } 572 buf[DK_DISKNAMELEN] = '\0'; 573 slen = strlen(buf); 574 if (left < slen + 1) 575 break; 576 /* +1 to copy out the trailing NUL byte */ 577 error = copyout(buf, where, slen + 1); 578 if (error) 579 break; 580 
where += slen; 581 needed += slen; 582 left -= slen; 583 } 584 } 585 simple_unlock(&disklist_slock); 586 *sizep = needed; 587 return (error); 588 } 589 590 int 591 sysctl_diskstats(int *name, u_int namelen, void *vwhere, size_t *sizep) 592 { 593 struct disk_sysctl sdisk; 594 struct disk *diskp; 595 char *where = vwhere; 596 size_t tocopy, left; 597 int error; 598 599 if (where == NULL) { 600 *sizep = disk_count * sizeof(struct disk_sysctl); 601 return (0); 602 } 603 604 if (namelen == 0) 605 tocopy = sizeof(sdisk); 606 else 607 tocopy = name[0]; 608 609 error = 0; 610 left = *sizep; 611 memset(&sdisk, 0, sizeof(sdisk)); 612 *sizep = 0; 613 614 simple_lock(&disklist_slock); 615 TAILQ_FOREACH(diskp, &disklist, dk_link) { 616 if (left < sizeof(struct disk_sysctl)) 617 break; 618 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name)); 619 sdisk.dk_xfer = diskp->dk_xfer; 620 sdisk.dk_seek = diskp->dk_seek; 621 sdisk.dk_bytes = diskp->dk_bytes; 622 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec; 623 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec; 624 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec; 625 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec; 626 sdisk.dk_time_sec = diskp->dk_time.tv_sec; 627 sdisk.dk_time_usec = diskp->dk_time.tv_usec; 628 sdisk.dk_busy = diskp->dk_busy; 629 630 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk))); 631 if (error) 632 break; 633 where += tocopy; 634 *sizep += tocopy; 635 left -= tocopy; 636 } 637 simple_unlock(&disklist_slock); 638 return (error); 639 } 640