xref: /openbsd/sys/kern/subr_disk.c (revision db3296cf)
1 /*	$OpenBSD: subr_disk.c,v 1.23 2003/06/25 20:52:57 tedu Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/time.h>
50 #include <sys/disklabel.h>
51 #include <sys/conf.h>
52 #include <sys/lock.h>
53 #include <sys/disk.h>
54 #include <sys/dkio.h>
55 #include <sys/dkstat.h>		/* XXX */
56 #include <sys/proc.h>
57 
58 #include <dev/rndvar.h>
59 
60 /*
61  * A global list of all disks attached to the system.  May grow or
62  * shrink over time.
63  */
64 struct	disklist_head disklist;	/* TAILQ_HEAD */
65 int	disk_count;		/* number of drives in global disklist */
66 int	disk_change;		/* set if a disk has been attached/detached
67 				 * since last we looked at this variable. This
68 				 * is reset by hw_sysctl()
69 				 */
70 
71 /*
72  * Seek sort for disks.  We depend on the driver which calls us using b_resid
73  * as the current cylinder number.
74  *
75  * The argument ap structure holds a b_actf activity chain pointer on which we
76  * keep two queues, sorted in ascending cylinder order.  The first queue holds
77  * those requests which are positioned after the current cylinder (in the first
78  * request); the second holds requests which came in after their cylinder number
79  * was passed.  Thus we implement a one way scan, retracting after reaching the
80  * end of the drive to the first request on the second queue, at which time it
81  * becomes the first queue.
82  *
83  * A one-way scan is natural because of the way UNIX read-ahead blocks are
84  * allocated.
85  */
86 
87 void
88 disksort(ap, bp)
89 	register struct buf *ap, *bp;
90 {
91 	register struct buf *bq;
92 
93 	/* If the queue is empty, then it's easy. */
94 	if (ap->b_actf == NULL) {
95 		bp->b_actf = NULL;
96 		ap->b_actf = bp;
97 		return;
98 	}
99 
100 	/*
101 	 * If we lie after the first (currently active) request, then we
102 	 * must locate the second request list and add ourselves to it.
103 	 */
104 	bq = ap->b_actf;
105 	if (bp->b_cylinder < bq->b_cylinder) {
106 		while (bq->b_actf) {
107 			/*
108 			 * Check for an ``inversion'' in the normally ascending
109 			 * cylinder numbers, indicating the start of the second
110 			 * request list.
111 			 */
112 			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
113 				/*
114 				 * Search the second request list for the first
115 				 * request at a larger cylinder number.  We go
116 				 * before that; if there is no such request, we
117 				 * go at end.
118 				 */
119 				do {
120 					if (bp->b_cylinder <
121 					    bq->b_actf->b_cylinder)
122 						goto insert;
123 					if (bp->b_cylinder ==
124 					    bq->b_actf->b_cylinder &&
125 					    bp->b_blkno < bq->b_actf->b_blkno)
126 						goto insert;
127 					bq = bq->b_actf;
128 				} while (bq->b_actf);
129 				goto insert;		/* after last */
130 			}
131 			bq = bq->b_actf;
132 		}
133 		/*
134 		 * No inversions... we will go after the last, and
135 		 * be the first request in the second request list.
136 		 */
137 		goto insert;
138 	}
139 	/*
140 	 * Request is at/after the current request...
141 	 * sort in the first request list.
142 	 */
143 	while (bq->b_actf) {
144 		/*
145 		 * We want to go after the current request if there is an
146 		 * inversion after it (i.e. it is the end of the first
147 		 * request list), or if the next request is a larger cylinder
148 		 * than our request.
149 		 */
150 		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
151 		    bp->b_cylinder < bq->b_actf->b_cylinder ||
152 		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
153 		    bp->b_blkno < bq->b_actf->b_blkno))
154 			goto insert;
155 		bq = bq->b_actf;
156 	}
157 	/*
158 	 * Neither a second list nor a larger request... we go at the end of
159 	 * the first list, which is the same as the end of the whole schebang.
160 	 */
161 insert:	bp->b_actf = bq->b_actf;
162 	bq->b_actf = bp;
163 }
164 
165 /*
166  * Compute checksum for disk label.
167  */
168 u_int
169 dkcksum(lp)
170 	register struct disklabel *lp;
171 {
172 	register u_int16_t *start, *end;
173 	register u_int16_t sum = 0;
174 
175 	start = (u_int16_t *)lp;
176 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
177 	while (start < end)
178 		sum ^= *start++;
179 	return (sum);
180 }
181 
182 /*
183  * Disk error is the preface to plaintive error messages
184  * about failing disk transfers.  It prints messages of the form
185 
186 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
187 
188  * if the offset of the error in the transfer and a disk label
189  * are both available.  blkdone should be -1 if the position of the error
190  * is unknown; the disklabel pointer may be null from drivers that have not
191  * been converted to use them.  The message is printed with printf
192  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
193  * The message should be completed (with at least a newline) with printf
194  * or addlog, respectively.  There is no trailing space.
195  */
196 void
197 diskerr(bp, dname, what, pri, blkdone, lp)
198 	register struct buf *bp;
199 	char *dname, *what;
200 	int pri, blkdone;
201 	register struct disklabel *lp;
202 {
203 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
204 	register int (*pr)(const char *, ...);
205 	char partname = 'a' + part;
206 	int sn;
207 
208 	if (pri != LOG_PRINTF) {
209 		static const char fmt[] = "";
210 		log(pri, fmt);
211 		pr = addlog;
212 	} else
213 		pr = printf;
214 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
215 	    bp->b_flags & B_READ ? "read" : "writ");
216 	sn = bp->b_blkno;
217 	if (bp->b_bcount <= DEV_BSIZE)
218 		(*pr)("%d", sn);
219 	else {
220 		if (blkdone >= 0) {
221 			sn += blkdone;
222 			(*pr)("%d of ", sn);
223 		}
224 		(*pr)("%d-%d", bp->b_blkno,
225 		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
226 	}
227 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
228 #ifdef tahoe
229 		sn *= DEV_BSIZE / lp->d_secsize;		/* XXX */
230 #endif
231 		sn += lp->d_partitions[part].p_offset;
232 		(*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
233 		    sn / lp->d_secpercyl);
234 		sn %= lp->d_secpercyl;
235 		(*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
236 	}
237 }
238 
239 /*
240  * Initialize the disklist.  Called by main() before autoconfiguration.
241  */
242 void
243 disk_init()
244 {
245 
246 	TAILQ_INIT(&disklist);
247 	disk_count = disk_change = 0;
248 }
249 
250 /*
251  * Searches the disklist for the disk corresponding to the
252  * name provided.
253  */
254 struct disk *
255 disk_find(name)
256 	char *name;
257 {
258 	struct disk *diskp;
259 
260 	if ((name == NULL) || (disk_count <= 0))
261 		return (NULL);
262 
263 	for (diskp = disklist.tqh_first; diskp != NULL;
264 	    diskp = diskp->dk_link.tqe_next)
265 		if (strcmp(diskp->dk_name, name) == 0)
266 			return (diskp);
267 
268 	return (NULL);
269 }
270 
271 int
272 disk_construct(diskp, lockname)
273 	struct disk *diskp;
274 	char *lockname;
275 {
276 	lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname,
277 		 0, LK_CANRECURSE);
278 
279 	diskp->dk_flags |= DKF_CONSTRUCTED;
280 
281 	return (0);
282 }
283 
284 /*
285  * Attach a disk.
286  */
287 void
288 disk_attach(diskp)
289 	struct disk *diskp;
290 {
291 	int s;
292 
293 	if (!diskp->dk_flags & DKF_CONSTRUCTED)
294 		disk_construct(diskp, diskp->dk_name);
295 
296 	/*
297 	 * Allocate and initialize the disklabel structures.  Note that
298 	 * it's not safe to sleep here, since we're probably going to be
299 	 * called during autoconfiguration.
300 	 */
301 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
302 	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
303 	    M_NOWAIT);
304 	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
305 		panic("disk_attach: can't allocate storage for disklabel");
306 
307 	bzero(diskp->dk_label, sizeof(struct disklabel));
308 	bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel));
309 
310 	/*
311 	 * Set the attached timestamp.
312 	 */
313 	s = splclock();
314 	diskp->dk_attachtime = mono_time;
315 	splx(s);
316 
317 	/*
318 	 * Link into the disklist.
319 	 */
320 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
321 	++disk_count;
322 	disk_change = 1;
323 }
324 
325 /*
326  * Detach a disk.
327  */
328 void
329 disk_detach(diskp)
330 	struct disk *diskp;
331 {
332 
333 	/*
334 	 * Free the space used by the disklabel structures.
335 	 */
336 	free(diskp->dk_label, M_DEVBUF);
337 	free(diskp->dk_cpulabel, M_DEVBUF);
338 
339 	/*
340 	 * Remove from the disklist.
341 	 */
342 	TAILQ_REMOVE(&disklist, diskp, dk_link);
343 	disk_change = 1;
344 	if (--disk_count < 0)
345 		panic("disk_detach: disk_count < 0");
346 }
347 
348 /*
349  * Increment a disk's busy counter.  If the counter is going from
350  * 0 to 1, set the timestamp.
351  */
352 void
353 disk_busy(diskp)
354 	struct disk *diskp;
355 {
356 	int s;
357 
358 	/*
359 	 * XXX We'd like to use something as accurate as microtime(),
360 	 * but that doesn't depend on the system TOD clock.
361 	 */
362 	if (diskp->dk_busy++ == 0) {
363 		s = splclock();
364 		diskp->dk_timestamp = mono_time;
365 		splx(s);
366 	}
367 }
368 
369 /*
370  * Decrement a disk's busy counter, increment the byte count, total busy
371  * time, and reset the timestamp.
372  */
373 void
374 disk_unbusy(diskp, bcount)
375 	struct disk *diskp;
376 	long bcount;
377 {
378 	int s;
379 	struct timeval dv_time, diff_time;
380 
381 	if (diskp->dk_busy-- == 0)
382 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
383 
384 	s = splclock();
385 	dv_time = mono_time;
386 	splx(s);
387 
388 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
389 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
390 
391 	diskp->dk_timestamp = dv_time;
392 	if (bcount > 0) {
393 		diskp->dk_bytes += bcount;
394 		diskp->dk_xfer++;
395 	}
396 	diskp->dk_seek++;
397 
398 	add_disk_randomness(bcount ^ diff_time.tv_usec);
399 }
400 
401 
402 int
403 disk_lock(dk)
404 	struct disk *dk;
405 {
406 	int error;
407 
408 	error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc);
409 
410 	return (error);
411 }
412 
413 void
414 disk_unlock(dk)
415 	struct disk *dk;
416 {
417 	lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc);
418 }
419 
420 
421 /*
422  * Reset the metrics counters on the given disk.  Note that we cannot
423  * reset the busy counter, as it may case a panic in disk_unbusy().
424  * We also must avoid playing with the timestamp information, as it
425  * may skew any pending transfer results.
426  */
427 void
428 disk_resetstat(diskp)
429 	struct disk *diskp;
430 {
431 	int s = splbio(), t;
432 
433 	diskp->dk_xfer = 0;
434 	diskp->dk_bytes = 0;
435 	diskp->dk_seek = 0;
436 
437 	t = splclock();
438 	diskp->dk_attachtime = mono_time;
439 	splx(t);
440 
441 	timerclear(&diskp->dk_time);
442 
443 	splx(s);
444 }
445 
446 
447 int
448 dk_mountroot()
449 {
450 	dev_t rawdev, rrootdev;
451 	int part = DISKPART(rootdev);
452 	int (*mountrootfn)(void);
453 	struct disklabel dl;
454 	int error;
455 
456 	rrootdev = blktochr(rootdev);
457 	rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART);
458 	printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev,
459 	    rrootdev, rawdev);
460 
461 	/*
462 	 * open device, ioctl for the disklabel, and close it.
463 	 */
464 	error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD,
465 	    S_IFCHR, curproc);
466 	if (error)
467 		panic("cannot open disk, 0x%x/0x%x, error %d",
468 		    rootdev, rrootdev, error);
469 	error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO,
470 	    (caddr_t)&dl, FREAD, curproc);
471 	if (error)
472 		panic("cannot read disk label, 0x%x/0x%x, error %d",
473 		    rootdev, rrootdev, error);
474 	(void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD,
475 	    S_IFCHR, curproc);
476 
477 	if (dl.d_partitions[part].p_size == 0)
478 		panic("root filesystem has size 0");
479 	switch (dl.d_partitions[part].p_fstype) {
480 #ifdef EXT2FS
481 	case FS_EXT2FS:
482 		{
483 		extern int ext2fs_mountroot(void);
484 		mountrootfn = ext2fs_mountroot;
485 		}
486 		break;
487 #endif
488 #ifdef FFS
489 	case FS_BSDFFS:
490 		{
491 		extern int ffs_mountroot(void);
492 		mountrootfn = ffs_mountroot;
493 		}
494 		break;
495 #endif
496 #ifdef LFS
497 	case FS_BSDLFS:
498 		{
499 		extern int lfs_mountroot(void);
500 		mountrootfn = lfs_mountroot;
501 		}
502 		break;
503 #endif
504 #ifdef CD9660
505 	case FS_ISO9660:
506 		{
507 		extern int cd9660_mountroot(void);
508 		mountrootfn = cd9660_mountroot;
509 		}
510 		break;
511 #endif
512 	default:
513 #ifdef FFS
514 		{
515 		extern int ffs_mountroot(void);
516 
517 		printf("filesystem type %d not known.. assuming ffs\n",
518 		    dl.d_partitions[part].p_fstype);
519 		mountrootfn = ffs_mountroot;
520 		}
521 #else
522 		panic("disk 0x%x/0x%x filesystem type %d not known",
523 		    rootdev, rrootdev, dl.d_partitions[part].p_fstype);
524 #endif
525 	}
526 	return (*mountrootfn)();
527 }
528 
529 void
530 bufq_default_add(struct bufq *bq, struct buf *bp)
531 {
532 	struct bufq_default *bufq = (struct bufq_default *)bq;
533 	struct proc *p = bp->b_proc;
534 	struct buf *head;
535 
536 	if (p == NULL || p->p_nice < NZERO)
537 		head = &bufq->bufq_head[0];
538 	else if (p->p_nice == NZERO)
539 		head = &bufq->bufq_head[1];
540 	else
541 		head = &bufq->bufq_head[2];
542 
543 	disksort(head, bp);
544 }
545 
546 struct buf *
547 bufq_default_get(struct bufq *bq)
548 {
549 	struct bufq_default *bufq = (struct bufq_default *)bq;
550 	struct buf *bp, *head;
551 	int i;
552 
553 	for (i = 0; i < 3; i++) {
554 		head = &bufq->bufq_head[i];
555 		if ((bp = head->b_actf))
556 			break;
557 	}
558 	if (bp == NULL)
559 		return (NULL);
560 	head->b_actf = bp->b_actf;
561 	return (bp);
562 }
563