/*	$NetBSD: rf_diskqueue.c,v 1.63 2021/12/14 00:46:43 mrg Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e.  an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
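
/*
 * Illustrative example of the locking-op model described above (not a
 * literal transcript of the code below): an atomic read-modify-write of
 * a parity sector can be issued as a locking read of the old parity
 * followed, once the new parity has been computed, by an unlocking
 * write.  While the queue is locked, no other I/O is dispatched to that
 * disk, which is what makes the read/write pair atomic with respect to
 * other RAIDframe accesses to the same disk.
 */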

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.63 2021/12/14 00:46:43 mrg Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

#include <sys/buf.h>

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines: queue type name, create routine,
 * enqueue routine, dequeue routine, and promote routine
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))

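/*
 * The queueing discipline is selected by name at configure time:
 * rf_ConfigureDiskQueues() below matches cfgPtr->diskQueueType against
 * the queueType strings in this table.  With a standard raidctl(8)
 * configuration file that string comes from the "START queue" section,
 * e.g. (illustrative):
 *
 *	START queue
 *	fifo 100
 *
 * which selects the FIFO discipline with at most 100 outstanding
 * requests per component.
 */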

#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64

/* XXX: scale these... */
#define RF_MAX_FREE_BUFIO 256
#define RF_MIN_FREE_BUFIO  64
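
/*
 * Sizing hints for the per-array "dqd" (RF_DiskQueueData_t) and "bufio"
 * (struct buf) pools created in rf_ConfigureDiskQueueSystem() below;
 * they are passed to rf_pool_init() as the minimum and maximum number
 * of free items to keep cached.
 */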



/* configures a single disk queue */

static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;

	pool_destroy(&raidPtr->pools.dqd);
	pool_destroy(&raidPtr->pools.bufio);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
			    RF_Config_t *cfgPtr)

{

	rf_pool_init(raidPtr, raidPtr->poolNames.dqd, &raidPtr->pools.dqd, sizeof(RF_DiskQueueData_t),
		     "dqd", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_pool_init(raidPtr, raidPtr->poolNames.bufio, &raidPtr->pools.bufio, sizeof(buf_t),
		     "bufio", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, raidPtr);

	return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r,c;
	int     rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	diskQueues = RF_MallocAndAdd(
	    (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
	    raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
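
/*
 * Note on queue layout: raidPtr->Queues holds numCol + RF_MAXSPARE
 * entries.  The queues for the data/parity components occupy indices
 * 0 .. numCol-1, and the spare queues configured in the second loop
 * above sit immediately after them, at indices numCol .. numCol +
 * numSpare - 1.
 */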
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 */
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
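
/*
 * RF_OK_TO_DISPATCH() and RF_QUEUE_FULL() are macros from
 * rf_diskqueue.h.  Roughly, and paraphrasing the dispatch rules in the
 * comment above rf_DiskIOEnqueue(): a request may be dispatched
 * immediately while fewer than maxOutstanding requests are outstanding
 * and its priority is acceptable, and RF_QUEUE_FULL() becomes true once
 * numOutstanding reaches maxOutstanding.
 */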


/* get the next set of I/Os started */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int     done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue)) {
		req = (queue->qPtr->Dequeue) (queue->qHdr);
		if (req) {
			Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			queue->queueLength--;	/* decrement count of number of requests waiting in this queue */
			RF_ASSERT(queue->queueLength >= 0);
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				/* we can't dispatch it, so just re-enqueue it.
				   potential trouble here if disk queues batch reqs */
				Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
				queue->queueLength++;
				(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
				done = 1;
			}
		} else {
			Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int     retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}
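
/*
 * Promotion is used by the reconstruction code (rf_reconstruct.c):
 * requests issued on behalf of reconstruction run at low priority, and
 * when a user access is forced to wait on a particular parity stripe /
 * reconstruction unit, the pending low-priority requests for that
 * stripe can be promoted so that they complete sooner.
 */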

RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, void *bf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       void (*wakeF) (void *, int), void *arg,
		       RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
		       RF_DiskQueueDataFlags_t flags, const struct buf *mbp)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&raidPtr->pools.dqd, PR_WAITOK | PR_ZERO);
	KASSERT(p != NULL);

	/* Obtain a buffer from our own pool.  It is possible for the
	   regular getiobuf() to run out of memory and return NULL.
	   We need to guarantee that never happens, as RAIDframe
	   doesn't have a good way to recover if memory allocation
	   fails here.
	*/
	p->bp = pool_get(&raidPtr->pools.bufio, PR_WAITOK | PR_ZERO);
	KASSERT(p->bp != NULL);

	buf_init(p->bp);

	SET(p->bp->b_cflags, BC_BUSY);	/* mark buffer busy */
	if (mbp) {
		SET(p->bp->b_flags, mbp->b_flags & rf_b_pass);
		p->bp->b_proc = mbp->b_proc;
	}

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = bf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = NULL;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	return (p);
}

void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{

	buf_destroy(p->bp);

	pool_put(&p->raidPtr->pools.bufio, p->bp);
	pool_put(&p->raidPtr->pools.dqd, p);
}
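
/*
 * Typical lifecycle (illustrative): a caller such as the DAG execution
 * code builds a request with rf_CreateDiskQueueData(), hands it to
 * rf_DiskIOEnqueue(), and is notified through the wakeF callback once
 * the underlying I/O finishes; the completion path also calls
 * rf_DiskIOComplete() so that further queued requests can be
 * dispatched, and the request is finally released with
 * rf_FreeDiskQueueData().
 */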
425