1 /* $NetBSD: rf_diskqueue.c,v 1.21 2002/09/17 02:55:12 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /**************************************************************************** 30 * 31 * rf_diskqueue.c -- higher-level disk queue code 32 * 33 * the routines here are a generic wrapper around the actual queueing 34 * routines. The code here implements thread scheduling, synchronization, 35 * and locking ops (see below) on top of the lower-level queueing code. 36 * 37 * to support atomic RMW, we implement "locking operations". When a 38 * locking op is dispatched to the lower levels of the driver, the 39 * queue is locked, and no further I/Os are dispatched until the queue 40 * receives & completes a corresponding "unlocking operation". This 41 * code relies on the higher layers to guarantee that a locking op 42 * will always be eventually followed by an unlocking op. 
The model 43 * is that the higher layers are structured so locking and unlocking 44 * ops occur in pairs, i.e. an unlocking op cannot be generated until 45 * after a locking op reports completion. There is no good way to 46 * check to see that an unlocking op "corresponds" to the op that 47 * currently has the queue locked, so we make no such attempt. Since 48 * by definition there can be only one locking op outstanding on a 49 * disk, this should not be a problem. 50 * 51 * In the kernel, we allow multiple I/Os to be concurrently dispatched 52 * to the disk driver. In order to support locking ops in this 53 * environment, when we decide to do a locking op, we stop dispatching 54 * new I/Os and wait until all dispatched I/Os have completed before 55 * dispatching the locking op. 56 * 57 * Unfortunately, the code is different in the 3 different operating 58 * states (user level, kernel, simulator). In the kernel, I/O is 59 * non-blocking, and we have no disk threads to dispatch for us. 60 * Therefore, we have to dispatch new I/Os to the scsi driver at the 61 * time of enqueue, and also at the time of completion. At user 62 * level, I/O is blocking, and so only the disk threads may dispatch 63 * I/Os. Thus at user level, all we can do at enqueue time is enqueue 64 * and wake up the disk thread to do the dispatch. 
 *
 ****************************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.21 2002/09/17 02:55:12 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_freelist.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

/* file-local helpers, defined below */
static int init_dqd(RF_DiskQueueData_t *);
static void clean_dqd(RF_DiskQueueData_t *);
static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

/* Dprintf[1-3]: debug printfs with 1-3 arguments.  They expand to nothing
 * unless queue debugging is compiled in (RF_DEBUG_DISKQUEUE) and enabled
 * at run time (rf_queueDebug). */
#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines queue ID, init routine, enqueue
 * routine, dequeue routine
 *
 ****************************************************************************/

static RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,	/* scan/cscan share the sstf enqueue/promote */
		rf_ScanDequeue,
		rf_ScanPeek,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))

/* global freelist of RF_DiskQueueData_t structures, shared by all arrays */
static RF_FreeList_t *rf_dqd_freelist;

#define RF_MAX_FREE_DQD 256	/* upper bound on freelist size */
#define RF_DQD_INC 16		/* grow increment */
#define RF_DQD_INITIAL 64	/* entries primed at configuration time */

#include <sys/buf.h>

/* Allocate and zero the struct buf carried by each RF_DiskQueueData_t.
 * Returns 0 on success, ENOMEM if the (non-sleeping) allocation fails. */
static int
init_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{

	dqd->bp = (struct buf *) malloc(sizeof(struct buf),
	    M_RAIDFRAME, M_NOWAIT);
	if (dqd->bp == NULL) {
		return (ENOMEM);
	}
	memset(dqd->bp, 0, sizeof(struct buf));	/* if you don't do it, nobody
						 * else will.. */
	return (0);
}

/* Release the struct buf allocated by init_dqd(). */
static void
clean_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{
	free(dqd->bp, M_RAIDFRAME);
}
/* configures a single disk queue */

int
rf_ConfigureDiskQueue(
    RF_Raid_t * raidPtr,
    RF_DiskQueue_t * diskqueue,
    RF_RowCol_t r,		/* row & col -- debug only.  BZZT not any
				 * more... */
    RF_RowCol_t c,
    RF_DiskQueueSW_t * p,
    RF_SectorCount_t sectPerDisk,
    dev_t dev,
    int maxOutstanding,
    RF_ShutdownList_t ** listp,
    RF_AllocListElem_t * clList)
{
	int rc;

	diskqueue->row = r;
	diskqueue->col = c;
	diskqueue->qPtr = p;
	/* discipline-specific queue header, built by the switch's Create op */
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->nextLockingOp = NULL;
	diskqueue->numWaiting = 0;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
	/* mutex/cond are "managed": registered on listp for auto-teardown */
	rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		return (rc);
	}
	rc = rf_create_managed_cond(listp, &diskqueue->cond);
	if (rc) {
		rf_print_unable_to_init_cond(__FILE__, __LINE__, rc);
		return (rc);
	}
	return (0);
}

/* Shutdown hook: destroy the DQD freelist, releasing each entry's buf
 * via clean_dqd. */
static void
rf_ShutdownDiskQueueSystem(ignored)
	void *ignored;
{
	RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next, (RF_DiskQueueData_t *), clean_dqd);
}

/* Create and prime the global DQD freelist and register the shutdown
 * hook above.  Returns 0 on success, ENOMEM if the freelist cannot be
 * created, or the error from rf_ShutdownCreate (in which case the
 * freelist is torn down again before returning). */
int
rf_ConfigureDiskQueueSystem(listp)
	RF_ShutdownList_t **listp;
{
	int rc;

	RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
	    RF_DQD_INC, sizeof(RF_DiskQueueData_t));
	if (rf_dqd_freelist == NULL)
		return (ENOMEM);
	rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
	if (rc) {
		rf_print_unable_to_add_shutdown( __FILE__, __LINE__, rc);
		rf_ShutdownDiskQueueSystem(NULL);
		return (rc);
	}
	RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
	    (RF_DiskQueueData_t *), init_dqd);
	return (0);
}

/* Configure one disk queue per disk in the array, plus the spare-disk
 * queues (which live in row 0 past the last regular column).  The
 * queueing discipline is selected by name from diskqueuesw; an unknown
 * name falls back to the first entry (fifo) with a warning.  Returns 0
 * on success or the first error encountered. */
int
rf_ConfigureDiskQueues(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	RF_DiskQueue_t **diskQueues, *spareQueues;
	RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	/* look the requested discipline up by name in the queue switch */
	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;
	RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList);
	if (diskQueues == NULL) {
		return (ENOMEM);
	}
	raidPtr->Queues = diskQueues;
	for (r = 0; r < raidPtr->numRow; r++) {
		/* row 0 gets RF_MAXSPARE extra slots for the spare queues */
		RF_CallocAndAdd(diskQueues[r], raidPtr->numCol +
		    ((r == 0) ? RF_MAXSPARE : 0),
		    sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
		    raidPtr->cleanupList);
		if (diskQueues[r] == NULL)
			return (ENOMEM);
		for (c = 0; c < raidPtr->numCol; c++) {
			rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c],
			    r, c, p,
			    raidPtr->sectorsPerDisk,
			    raidPtr->Disks[r][c].dev,
			    cfgPtr->maxOutstandingDiskReqs,
			    listp, raidPtr->cleanupList);
			if (rc)
				return (rc);
		}
	}

	/* spare queues follow the regular columns of row 0 */
	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
		    0, raidPtr->numCol + r, p,
		    raidPtr->sectorsPerDisk,
		    raidPtr->Disks[0][raidPtr->numCol + r].dev,
		    cfgPtr->maxOutstandingDiskReqs, listp,
		    raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * Unfortunately, we have to do things differently in the different
 * environments (simulator, user-level, kernel).
 * At user level, all I/O is blocking, so we have 1 or more threads/disk
 * and the thread that enqueues is different from the thread that dequeues.
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    locking request:  queue empty => dispatch and lock queue,
 *                      else queue it
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 *
 * user-level rules:
 *    always enqueue.  In the special case of an unlocking op, enqueue
 *    in a special way that will cause the unlocking op to be the next
 *    thing dequeued.
 *
 * simulator rules:
 *    Do the same as at user level, with the sleeps and wakeups suppressed.
 */
void
rf_DiskIOEnqueue(queue, req, pri)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int pri;
{
	RF_ETIMER_START(req->qtime);
	/* anything but a NOP must touch at least one sector */
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	/*
	 * kernel
	 */
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	/* locking request: dispatch (and lock the queue) only if the queue
	 * is idle, otherwise park it in the discipline's queue */
	if (RF_LOCKING_REQ(req)) {
		if (RF_QUEUE_EMPTY(queue)) {
			Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col);
			RF_LOCK_QUEUE(queue);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of number
						 * of requests waiting in this
						 * queue */
			Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	}
	/* unlocking request: always dispatched immediately */
	else
		if (RF_UNLOCKING_REQ(req)) {	/* we'll do the actual unlock
						 * when this I/O completes */
			Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col);
			RF_ASSERT(RF_QUEUE_LOCKED(queue));
			rf_DispatchKernelIO(queue, req);
		}
	/* normal request: dispatch if the queue has room and priority
	 * allows, otherwise enqueue */
		else
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				queue->queueLength++;	/* increment count of
							 * number of requests
							 * waiting in this queue */
				Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col);
				req->queue = (void *) queue;
				(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
			}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}


/* get the next set of I/Os started, kernel version only.
 *
 * Called when the I/O described by req has completed with the given
 * status; accounts for the completion, releases the queue lock where the
 * protocol requires it, and keeps dispatching queued requests until the
 * queue is full, becomes locked, or runs dry. */
void
rf_DiskIOComplete(queue, req, status)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int status;
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

	/* unlock the queue: (1) after an unlocking req completes (2) after a
	 * locking req fails */
	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
		Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue));
		RF_UNLOCK_QUEUE(queue);
	}
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
		/* a postponed locking op takes precedence over the queue */
		if (queue->nextLockingOp) {
			req = queue->nextLockingOp;
			queue->nextLockingOp = NULL;
			Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col);
		} else {
			req = (queue->qPtr->Dequeue) (queue->qHdr);
			if (req != NULL) {
				Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col);
			} else {
				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			}
		}
		if (req) {
			queue->queueLength--;	/* decrement count of number
						 * of requests waiting in this
						 * queue */
			RF_ASSERT(queue->queueLength >= 0);
		}
		if (!req)
			done = 1;
		else
			if (RF_LOCKING_REQ(req)) {
				if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
					Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col);
					RF_LOCK_QUEUE(queue);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else {	/* put it aside to wait for
						 * the queue to drain */
					Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col);
					RF_ASSERT(queue->nextLockingOp == NULL);
					queue->nextLockingOp = req;
					done = 1;
				}
			} else
				if (RF_UNLOCKING_REQ(req)) {	/* should not happen:
								 * unlocking ops should
								 * not get queued */
					RF_ASSERT(RF_QUEUE_LOCKED(queue));	/* support it anyway for
										 * the future */
					Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else
					if (RF_OK_TO_DISPATCH(queue, req)) {
						Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col);
						rf_DispatchKernelIO(queue, req);
					} else {	/* we can't dispatch it,
							 * so just re-enqueue
							 * it.  */
						/* potential trouble here if
						 * disk queues batch reqs */
						Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col);
						queue->queueLength++;
						(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
						done = 1;
					}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.  Otherwise returns
 * the discipline's Promote result. */
int
rf_DiskIOPromote(queue, parityStripeID, which_ru)
	RF_DiskQueue_t *queue;
	RF_StripeNum_t parityStripeID;
	RF_ReconUnitNum_t which_ru;
{
	int retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}

/* Allocate a DiskQueueData from the freelist (running init_dqd on fresh
 * entries) and fill it in for a single disk I/O.  The sector offset is
 * biased by rf_protectedSectors.  wakeF/arg form the completion callback.
 * The caller owns the result and releases it with rf_FreeDiskQueueData(). */
RF_DiskQueueData_t *
rf_CreateDiskQueueData(
    RF_IoType_t typ,
    RF_SectorNum_t ssect,
    RF_SectorCount_t nsect,
    caddr_t buf,
    RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru,
    int (*wakeF) (void *, int),
    void *arg,
    RF_DiskQueueData_t * next,
    RF_AccTraceEntry_t * tracerec,
    void *raidPtr,
    RF_DiskQueueDataFlags_t flags,
    void *kb_proc)
{
	RF_DiskQueueData_t *p;

	RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = buf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = next;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

/* Return a DiskQueueData to the freelist; clean_dqd releases the entry's
 * buf when the freelist macro decides not to retain the entry (see
 * rf_freelist.h for the exact retention policy). */
void
rf_FreeDiskQueueData(p)
	RF_DiskQueueData_t *p;
{
	RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);
}