xref: /netbsd/sys/dev/raidframe/rf_diskqueue.c (revision c4a72b64)
1 /*	$NetBSD: rf_diskqueue.c,v 1.21 2002/09/17 02:55:12 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Mark Holland
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /****************************************************************************
30  *
31  * rf_diskqueue.c -- higher-level disk queue code
32  *
33  * the routines here are a generic wrapper around the actual queueing
34  * routines.  The code here implements thread scheduling, synchronization,
35  * and locking ops (see below) on top of the lower-level queueing code.
36  *
37  * to support atomic RMW, we implement "locking operations".  When a
38  * locking op is dispatched to the lower levels of the driver, the
39  * queue is locked, and no further I/Os are dispatched until the queue
40  * receives & completes a corresponding "unlocking operation".  This
41  * code relies on the higher layers to guarantee that a locking op
42  * will always be eventually followed by an unlocking op.  The model
43  * is that the higher layers are structured so locking and unlocking
44  * ops occur in pairs, i.e.  an unlocking op cannot be generated until
45  * after a locking op reports completion.  There is no good way to
46  * check to see that an unlocking op "corresponds" to the op that
47  * currently has the queue locked, so we make no such attempt.  Since
48  * by definition there can be only one locking op outstanding on a
49  * disk, this should not be a problem.
50  *
51  * In the kernel, we allow multiple I/Os to be concurrently dispatched
52  * to the disk driver.  In order to support locking ops in this
53  * environment, when we decide to do a locking op, we stop dispatching
54  * new I/Os and wait until all dispatched I/Os have completed before
55  * dispatching the locking op.
56  *
57  * Unfortunately, the code is different in the 3 different operating
58  * states (user level, kernel, simulator).  In the kernel, I/O is
59  * non-blocking, and we have no disk threads to dispatch for us.
60  * Therefore, we have to dispatch new I/Os to the scsi driver at the
61  * time of enqueue, and also at the time of completion.  At user
62  * level, I/O is blocking, and so only the disk threads may dispatch
63  * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
64  * and wake up the disk thread to do the dispatch.
65  *
66  ****************************************************************************/
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.21 2002/09/17 02:55:12 oster Exp $");
70 
71 #include <dev/raidframe/raidframevar.h>
72 
73 #include "rf_threadstuff.h"
74 #include "rf_raid.h"
75 #include "rf_diskqueue.h"
76 #include "rf_alloclist.h"
77 #include "rf_acctrace.h"
78 #include "rf_etimer.h"
79 #include "rf_general.h"
80 #include "rf_freelist.h"
81 #include "rf_debugprint.h"
82 #include "rf_shutdown.h"
83 #include "rf_cvscan.h"
84 #include "rf_sstf.h"
85 #include "rf_fifo.h"
86 #include "rf_kintf.h"
87 
88 static int init_dqd(RF_DiskQueueData_t *);
89 static void clean_dqd(RF_DiskQueueData_t *);
90 static void rf_ShutdownDiskQueueSystem(void *);
91 
92 #ifndef RF_DEBUG_DISKQUEUE
93 #define RF_DEBUG_DISKQUEUE 0
94 #endif
95 
96 #if RF_DEBUG_DISKQUEUE
97 #define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
98 #define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
99 #define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
100 #else
101 #define Dprintf1(s,a)
102 #define Dprintf2(s,a,b)
103 #define Dprintf3(s,a,b,c)
104 #endif
105 
106 /*****************************************************************************
107  *
108  * the disk queue switch defines all the functions used in the
109  * different queueing disciplines queue ID, init routine, enqueue
110  * routine, dequeue routine
111  *
112  ****************************************************************************/
113 
114 static RF_DiskQueueSW_t diskqueuesw[] = {
115 	{"fifo",		/* FIFO */
116 		rf_FifoCreate,
117 		rf_FifoEnqueue,
118 		rf_FifoDequeue,
119 		rf_FifoPeek,
120 	rf_FifoPromote},
121 
122 	{"cvscan",		/* cvscan */
123 		rf_CvscanCreate,
124 		rf_CvscanEnqueue,
125 		rf_CvscanDequeue,
126 		rf_CvscanPeek,
127 	rf_CvscanPromote},
128 
129 	{"sstf",		/* shortest seek time first */
130 		rf_SstfCreate,
131 		rf_SstfEnqueue,
132 		rf_SstfDequeue,
133 		rf_SstfPeek,
134 	rf_SstfPromote},
135 
136 	{"scan",		/* SCAN (two-way elevator) */
137 		rf_ScanCreate,
138 		rf_SstfEnqueue,
139 		rf_ScanDequeue,
140 		rf_ScanPeek,
141 	rf_SstfPromote},
142 
143 	{"cscan",		/* CSCAN (one-way elevator) */
144 		rf_CscanCreate,
145 		rf_SstfEnqueue,
146 		rf_CscanDequeue,
147 		rf_CscanPeek,
148 	rf_SstfPromote},
149 
150 };
151 #define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
152 
153 static RF_FreeList_t *rf_dqd_freelist;
154 
155 #define RF_MAX_FREE_DQD 256
156 #define RF_DQD_INC       16
157 #define RF_DQD_INITIAL   64
158 
159 #include <sys/buf.h>
160 
161 static int
162 init_dqd(dqd)
163 	RF_DiskQueueData_t *dqd;
164 {
165 
166 	dqd->bp = (struct buf *) malloc(sizeof(struct buf),
167 					M_RAIDFRAME, M_NOWAIT);
168 	if (dqd->bp == NULL) {
169 		return (ENOMEM);
170 	}
171 	memset(dqd->bp, 0, sizeof(struct buf));	/* if you don't do it, nobody
172 						 * else will.. */
173 	return (0);
174 }
175 
176 static void
177 clean_dqd(dqd)
178 	RF_DiskQueueData_t *dqd;
179 {
180 	free(dqd->bp, M_RAIDFRAME);
181 }
182 /* configures a single disk queue */
183 
184 int
185 rf_ConfigureDiskQueue(
186       RF_Raid_t * raidPtr,
187       RF_DiskQueue_t * diskqueue,
188       RF_RowCol_t r,		/* row & col -- debug only.  BZZT not any
189 				 * more... */
190       RF_RowCol_t c,
191       RF_DiskQueueSW_t * p,
192       RF_SectorCount_t sectPerDisk,
193       dev_t dev,
194       int maxOutstanding,
195       RF_ShutdownList_t ** listp,
196       RF_AllocListElem_t * clList)
197 {
198 	int     rc;
199 
200 	diskqueue->row = r;
201 	diskqueue->col = c;
202 	diskqueue->qPtr = p;
203 	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
204 	diskqueue->dev = dev;
205 	diskqueue->numOutstanding = 0;
206 	diskqueue->queueLength = 0;
207 	diskqueue->maxOutstanding = maxOutstanding;
208 	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
209 	diskqueue->nextLockingOp = NULL;
210 	diskqueue->numWaiting = 0;
211 	diskqueue->flags = 0;
212 	diskqueue->raidPtr = raidPtr;
213 	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
214 	rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
215 	if (rc) {
216 		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
217 		return (rc);
218 	}
219 	rc = rf_create_managed_cond(listp, &diskqueue->cond);
220 	if (rc) {
221 		rf_print_unable_to_init_cond(__FILE__, __LINE__, rc);
222 		return (rc);
223 	}
224 	return (0);
225 }
226 
227 static void
228 rf_ShutdownDiskQueueSystem(ignored)
229 	void   *ignored;
230 {
231 	RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next, (RF_DiskQueueData_t *), clean_dqd);
232 }
233 
234 int
235 rf_ConfigureDiskQueueSystem(listp)
236 	RF_ShutdownList_t **listp;
237 {
238 	int     rc;
239 
240 	RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
241 	    RF_DQD_INC, sizeof(RF_DiskQueueData_t));
242 	if (rf_dqd_freelist == NULL)
243 		return (ENOMEM);
244 	rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
245 	if (rc) {
246 		rf_print_unable_to_add_shutdown( __FILE__, __LINE__, rc);
247 		rf_ShutdownDiskQueueSystem(NULL);
248 		return (rc);
249 	}
250 	RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
251 	    (RF_DiskQueueData_t *), init_dqd);
252 	return (0);
253 }
254 
255 int
256 rf_ConfigureDiskQueues(
257     RF_ShutdownList_t ** listp,
258     RF_Raid_t * raidPtr,
259     RF_Config_t * cfgPtr)
260 {
261 	RF_DiskQueue_t **diskQueues, *spareQueues;
262 	RF_DiskQueueSW_t *p;
263 	RF_RowCol_t r, c;
264 	int     rc, i;
265 
266 	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;
267 
268 	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
269 		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
270 			p = &diskqueuesw[i];
271 			break;
272 		}
273 	}
274 	if (p == NULL) {
275 		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
276 		p = &diskqueuesw[0];
277 	}
278 	raidPtr->qType = p;
279 	RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList);
280 	if (diskQueues == NULL) {
281 		return (ENOMEM);
282 	}
283 	raidPtr->Queues = diskQueues;
284 	for (r = 0; r < raidPtr->numRow; r++) {
285 		RF_CallocAndAdd(diskQueues[r], raidPtr->numCol +
286 				 ((r == 0) ? RF_MAXSPARE : 0),
287 				sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
288 				raidPtr->cleanupList);
289 		if (diskQueues[r] == NULL)
290 			return (ENOMEM);
291 		for (c = 0; c < raidPtr->numCol; c++) {
292 			rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c],
293 						   r, c, p,
294 						   raidPtr->sectorsPerDisk,
295 						   raidPtr->Disks[r][c].dev,
296 						   cfgPtr->maxOutstandingDiskReqs,
297 						   listp, raidPtr->cleanupList);
298 			if (rc)
299 				return (rc);
300 		}
301 	}
302 
303 	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
304 	for (r = 0; r < raidPtr->numSpare; r++) {
305 		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
306 		    0, raidPtr->numCol + r, p,
307 		    raidPtr->sectorsPerDisk,
308 		    raidPtr->Disks[0][raidPtr->numCol + r].dev,
309 		    cfgPtr->maxOutstandingDiskReqs, listp,
310 		    raidPtr->cleanupList);
311 		if (rc)
312 			return (rc);
313 	}
314 	return (0);
315 }
316 /* Enqueue a disk I/O
317  *
318  * Unfortunately, we have to do things differently in the different
319  * environments (simulator, user-level, kernel).
320  * At user level, all I/O is blocking, so we have 1 or more threads/disk
321  * and the thread that enqueues is different from the thread that dequeues.
322  * In the kernel, I/O is non-blocking and so we'd like to have multiple
323  * I/Os outstanding on the physical disks when possible.
324  *
325  * when any request arrives at a queue, we have two choices:
326  *    dispatch it to the lower levels
327  *    queue it up
328  *
329  * kernel rules for when to do what:
330  *    locking request:  queue empty => dispatch and lock queue,
331  *                      else queue it
332  *    unlocking req  :  always dispatch it
333  *    normal req     :  queue empty => dispatch it & set priority
334  *                      queue not full & priority is ok => dispatch it
335  *                      else queue it
336  *
337  * user-level rules:
338  *    always enqueue.  In the special case of an unlocking op, enqueue
339  *    in a special way that will cause the unlocking op to be the next
340  *    thing dequeued.
341  *
342  * simulator rules:
343  *    Do the same as at user level, with the sleeps and wakeups suppressed.
344  */
345 void
346 rf_DiskIOEnqueue(queue, req, pri)
347 	RF_DiskQueue_t *queue;
348 	RF_DiskQueueData_t *req;
349 	int     pri;
350 {
351 	RF_ETIMER_START(req->qtime);
352 	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
353 	req->priority = pri;
354 
355 #if RF_DEBUG_DISKQUEUE
356 	if (rf_queueDebug && (req->numSector == 0)) {
357 		printf("Warning: Enqueueing zero-sector access\n");
358 	}
359 #endif
360 	/*
361          * kernel
362          */
363 	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
364 	/* locking request */
365 	if (RF_LOCKING_REQ(req)) {
366 		if (RF_QUEUE_EMPTY(queue)) {
367 			Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col);
368 			RF_LOCK_QUEUE(queue);
369 			rf_DispatchKernelIO(queue, req);
370 		} else {
371 			queue->queueLength++;	/* increment count of number
372 						 * of requests waiting in this
373 						 * queue */
374 			Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col);
375 			req->queue = (void *) queue;
376 			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
377 		}
378 	}
379 	/* unlocking request */
380 	else
381 		if (RF_UNLOCKING_REQ(req)) {	/* we'll do the actual unlock
382 						 * when this I/O completes */
383 			Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col);
384 			RF_ASSERT(RF_QUEUE_LOCKED(queue));
385 			rf_DispatchKernelIO(queue, req);
386 		}
387 	/* normal request */
388 		else
389 			if (RF_OK_TO_DISPATCH(queue, req)) {
390 				Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col);
391 				rf_DispatchKernelIO(queue, req);
392 			} else {
393 				queue->queueLength++;	/* increment count of
394 							 * number of requests
395 							 * waiting in this queue */
396 				Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col);
397 				req->queue = (void *) queue;
398 				(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
399 			}
400 	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
401 }
402 
403 
404 /* get the next set of I/Os started, kernel version only */
405 void
406 rf_DiskIOComplete(queue, req, status)
407 	RF_DiskQueue_t *queue;
408 	RF_DiskQueueData_t *req;
409 	int     status;
410 {
411 	int     done = 0;
412 
413 	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
414 
415 	/* unlock the queue: (1) after an unlocking req completes (2) after a
416 	 * locking req fails */
417 	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
418 		Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
419 		RF_ASSERT(RF_QUEUE_LOCKED(queue));
420 		RF_UNLOCK_QUEUE(queue);
421 	}
422 	queue->numOutstanding--;
423 	RF_ASSERT(queue->numOutstanding >= 0);
424 
425 	/* dispatch requests to the disk until we find one that we can't. */
426 	/* no reason to continue once we've filled up the queue */
427 	/* no reason to even start if the queue is locked */
428 
429 	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
430 		if (queue->nextLockingOp) {
431 			req = queue->nextLockingOp;
432 			queue->nextLockingOp = NULL;
433 			Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col);
434 		} else {
435 			req = (queue->qPtr->Dequeue) (queue->qHdr);
436 			if (req != NULL) {
437 				Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col);
438 			} else {
439 				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
440 			}
441 		}
442 		if (req) {
443 			queue->queueLength--;	/* decrement count of number
444 						 * of requests waiting in this
445 						 * queue */
446 			RF_ASSERT(queue->queueLength >= 0);
447 		}
448 		if (!req)
449 			done = 1;
450 		else
451 			if (RF_LOCKING_REQ(req)) {
452 				if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
453 					Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col);
454 					RF_LOCK_QUEUE(queue);
455 					rf_DispatchKernelIO(queue, req);
456 					done = 1;
457 				} else {	/* put it aside to wait for
458 						 * the queue to drain */
459 					Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col);
460 					RF_ASSERT(queue->nextLockingOp == NULL);
461 					queue->nextLockingOp = req;
462 					done = 1;
463 				}
464 			} else
465 				if (RF_UNLOCKING_REQ(req)) {	/* should not happen:
466 								 * unlocking ops should
467 								 * not get queued */
468 					RF_ASSERT(RF_QUEUE_LOCKED(queue));	/* support it anyway for
469 										 * the future */
470 					Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col);
471 					rf_DispatchKernelIO(queue, req);
472 					done = 1;
473 				} else
474 					if (RF_OK_TO_DISPATCH(queue, req)) {
475 						Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col);
476 						rf_DispatchKernelIO(queue, req);
477 					} else {	/* we can't dispatch it,
478 							 * so just re-enqueue
479 							 * it.  */
480 						/* potential trouble here if
481 						 * disk queues batch reqs */
482 						Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col);
483 						queue->queueLength++;
484 						(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
485 						done = 1;
486 					}
487 	}
488 
489 	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
490 }
491 /* promotes accesses tagged with the given parityStripeID from low priority
492  * to normal priority.  This promotion is optional, meaning that a queue
493  * need not implement it.  If there is no promotion routine associated with
494  * a queue, this routine does nothing and returns -1.
495  */
496 int
497 rf_DiskIOPromote(queue, parityStripeID, which_ru)
498 	RF_DiskQueue_t *queue;
499 	RF_StripeNum_t parityStripeID;
500 	RF_ReconUnitNum_t which_ru;
501 {
502 	int     retval;
503 
504 	if (!queue->qPtr->Promote)
505 		return (-1);
506 	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
507 	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
508 	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
509 	return (retval);
510 }
511 
512 RF_DiskQueueData_t *
513 rf_CreateDiskQueueData(
514     RF_IoType_t typ,
515     RF_SectorNum_t ssect,
516     RF_SectorCount_t nsect,
517     caddr_t buf,
518     RF_StripeNum_t parityStripeID,
519     RF_ReconUnitNum_t which_ru,
520     int (*wakeF) (void *, int),
521     void *arg,
522     RF_DiskQueueData_t * next,
523     RF_AccTraceEntry_t * tracerec,
524     void *raidPtr,
525     RF_DiskQueueDataFlags_t flags,
526     void *kb_proc)
527 {
528 	RF_DiskQueueData_t *p;
529 
530 	RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);
531 
532 	p->sectorOffset = ssect + rf_protectedSectors;
533 	p->numSector = nsect;
534 	p->type = typ;
535 	p->buf = buf;
536 	p->parityStripeID = parityStripeID;
537 	p->which_ru = which_ru;
538 	p->CompleteFunc = wakeF;
539 	p->argument = arg;
540 	p->next = next;
541 	p->tracerec = tracerec;
542 	p->priority = RF_IO_NORMAL_PRIORITY;
543 	p->raidPtr = raidPtr;
544 	p->flags = flags;
545 	p->b_proc = kb_proc;
546 	return (p);
547 }
548 
549 void
550 rf_FreeDiskQueueData(p)
551 	RF_DiskQueueData_t *p;
552 {
553 	RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);
554 }
555