1 /* $NetBSD: rf_paritylogging.c,v 1.14 2002/09/23 02:40:08 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 30 /* 31 parity logging configuration, dag selection, and mapping is implemented here 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.14 2002/09/23 02:40:08 oster Exp $"); 36 37 #include "rf_archs.h" 38 39 #if RF_INCLUDE_PARITYLOGGING > 0 40 41 #include <dev/raidframe/raidframevar.h> 42 43 #include "rf_raid.h" 44 #include "rf_dag.h" 45 #include "rf_dagutils.h" 46 #include "rf_dagfuncs.h" 47 #include "rf_dagffrd.h" 48 #include "rf_dagffwr.h" 49 #include "rf_dagdegrd.h" 50 #include "rf_dagdegwr.h" 51 #include "rf_paritylog.h" 52 #include "rf_paritylogDiskMgr.h" 53 #include "rf_paritylogging.h" 54 #include "rf_parityloggingdags.h" 55 #include "rf_general.h" 56 #include "rf_map.h" 57 #include "rf_utils.h" 58 #include "rf_shutdown.h" 59 60 typedef struct RF_ParityLoggingConfigInfo_s { 61 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by 62 * IdentifyStripe */ 63 } RF_ParityLoggingConfigInfo_t; 64 65 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID); 66 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); 67 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); 68 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); 69 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); 70 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); 71 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); 72 73 int 74 rf_ConfigureParityLogging( 75 RF_ShutdownList_t ** listp, 76 RF_Raid_t * raidPtr, 77 RF_Config_t * cfgPtr) 78 { 79 int i, j, startdisk, rc; 80 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; 81 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; 82 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 83 RF_ParityLoggingConfigInfo_t *info; 84 RF_ParityLog_t *l = NULL, *next; 85 caddr_t lHeapPtr; 86 87 if (rf_numParityRegions <= 0) 88 return(EINVAL); 89 90 /* 91 * We create multiple entries on the shutdown list here, since 92 * this configuration routine is fairly complicated in and of 93 * itself, and this makes backing out of a failed configuration 94 * much simpler. 95 */ 96 97 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; 98 99 /* create a parity logging configuration structure */ 100 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), 101 (RF_ParityLoggingConfigInfo_t *), 102 raidPtr->cleanupList); 103 if (info == NULL) 104 return (ENOMEM); 105 layoutPtr->layoutSpecificInfo = (void *) info; 106 107 RF_ASSERT(raidPtr->numRow == 1); 108 109 /* the stripe identifier must identify the disks in each stripe, IN 110 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 111 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), 112 (raidPtr->numCol), 113 raidPtr->cleanupList); 114 if (info->stripeIdentifier == NULL) 115 return (ENOMEM); 116 117 startdisk = 0; 118 for (i = 0; i < (raidPtr->numCol); i++) { 119 for (j = 0; j < (raidPtr->numCol); j++) { 120 info->stripeIdentifier[i][j] = (startdisk + j) % 121 (raidPtr->numCol - 1); 122 } 123 if ((--startdisk) < 0) 124 startdisk = raidPtr->numCol - 1 - 1; 125 } 126 127 /* fill in the remaining layout parameters */ 128 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 129 layoutPtr->numParityCol = 1; 130 layoutPtr->numParityLogCol = 1; 131 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - 132 layoutPtr->numParityLogCol; 133 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * 134 layoutPtr->sectorsPerStripeUnit; 135 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 136 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * 137 layoutPtr->sectorsPerStripeUnit; 138 139 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * 140 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 141 142 /* configure parity log parameters 143 * 144 * parameter comment/constraints 145 * ------------------------------------------- 146 * numParityRegions* all regions (except possibly last) 147 * of equal size 148 * totalInCoreLogCapacity* amount of memory in bytes available 149 * for in-core logs (default 1 MB) 150 * numSectorsPerLog# capacity of an in-core log in sectors 151 * (1 * disk track) 152 * numParityLogs total number of in-core logs, 153 * should be at least numParityRegions 154 * regionLogCapacity size of a region log (except possibly 155 * last one) in sectors 156 * totalLogCapacity total amount of log space in sectors 157 * 158 * where '*' denotes a user settable parameter. 159 * Note that logs are fixed to be the size of a disk track, 160 * value #defined in rf_paritylog.h 161 * 162 */ 163 164 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; 165 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; 166 if (rf_parityLogDebug) 167 printf("bytes per sector %d\n", raidPtr->bytesPerSector); 168 169 /* reduce fragmentation within a disk region by adjusting the number 170 * of regions in an attempt to allow an integral number of logs to fit 171 * into a disk region */ 172 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; 173 if (fragmentation > 0) 174 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { 175 if (((totalLogCapacity / (rf_numParityRegions + i)) % 176 raidPtr->numSectorsPerLog) < fragmentation) { 177 rf_numParityRegions++; 178 raidPtr->regionLogCapacity = totalLogCapacity / 179 rf_numParityRegions; 180 fragmentation = raidPtr->regionLogCapacity % 181 raidPtr->numSectorsPerLog; 182 } 183 if (((totalLogCapacity / (rf_numParityRegions - i)) % 184 raidPtr->numSectorsPerLog) < fragmentation) { 185 rf_numParityRegions--; 186 raidPtr->regionLogCapacity = totalLogCapacity / 187 rf_numParityRegions; 188 fragmentation = raidPtr->regionLogCapacity % 189 raidPtr->numSectorsPerLog; 190 } 191 } 192 /* ensure integral number of regions per log */ 193 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / 194 raidPtr->numSectorsPerLog) * 195 raidPtr->numSectorsPerLog; 196 197 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / 198 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); 199 /* to avoid deadlock, must ensure that enough logs exist for each 200 * region to have one simultaneously */ 201 if (raidPtr->numParityLogs < rf_numParityRegions) 202 raidPtr->numParityLogs = rf_numParityRegions; 203 204 /* create region information structs */ 205 printf("Allocating %d bytes for in-core parity region info\n", 206 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); 207 RF_Malloc(raidPtr->regionInfo, 208 (rf_numParityRegions * sizeof(RF_RegionInfo_t)), 209 (RF_RegionInfo_t *)); 210 if (raidPtr->regionInfo == NULL) 211 return (ENOMEM); 212 213 /* last region may not be full capacity */ 214 lastRegionCapacity = raidPtr->regionLogCapacity; 215 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + 216 lastRegionCapacity > totalLogCapacity) 217 lastRegionCapacity = lastRegionCapacity - 218 raidPtr->numSectorsPerLog; 219 220 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / 221 rf_numParityRegions; 222 maxRegionParityRange = raidPtr->regionParityRange; 223 224 /* i can't remember why this line is in the code -wvcii 6/30/95 */ 225 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) 226 regionParityRange++; */ 227 228 /* build pool of unused parity logs */ 229 printf("Allocating %d bytes for %d parity logs\n", 230 raidPtr->numParityLogs * raidPtr->numSectorsPerLog * 231 raidPtr->bytesPerSector, 232 raidPtr->numParityLogs); 233 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 234 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, 235 (caddr_t)); 236 if (raidPtr->parityLogBufferHeap == NULL) 237 return (ENOMEM); 238 lHeapPtr = raidPtr->parityLogBufferHeap; 239 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex); 240 if (rc) { 241 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 242 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 243 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 244 return (ENOMEM); 245 } 246 for (i = 0; i < raidPtr->numParityLogs; i++) { 247 if (i == 0) { 248 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, 249 sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); 250 if (raidPtr->parityLogPool.parityLogs == NULL) { 251 RF_Free(raidPtr->parityLogBufferHeap, 252 raidPtr->numParityLogs * 253 raidPtr->numSectorsPerLog * 254 raidPtr->bytesPerSector); 255 return (ENOMEM); 256 } 257 l = raidPtr->parityLogPool.parityLogs; 258 } else { 259 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), 260 (RF_ParityLog_t *)); 261 if (l->next == NULL) { 262 RF_Free(raidPtr->parityLogBufferHeap, 263 raidPtr->numParityLogs * 264 raidPtr->numSectorsPerLog * 265 raidPtr->bytesPerSector); 266 for (l = raidPtr->parityLogPool.parityLogs; 267 l; 268 l = next) { 269 next = l->next; 270 if (l->records) 271 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); 272 RF_Free(l, sizeof(RF_ParityLog_t)); 273 } 274 return (ENOMEM); 275 } 276 l = l->next; 277 } 278 l->bufPtr = lHeapPtr; 279 lHeapPtr += raidPtr->numSectorsPerLog * 280 raidPtr->bytesPerSector; 281 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * 282 sizeof(RF_ParityLogRecord_t)), 283 (RF_ParityLogRecord_t *)); 284 if (l->records == NULL) { 285 RF_Free(raidPtr->parityLogBufferHeap, 286 raidPtr->numParityLogs * 287 raidPtr->numSectorsPerLog * 288 raidPtr->bytesPerSector); 289 for (l = raidPtr->parityLogPool.parityLogs; 290 l; 291 l = next) { 292 next = l->next; 293 if (l->records) 294 RF_Free(l->records, 295 (raidPtr->numSectorsPerLog * 296 sizeof(RF_ParityLogRecord_t))); 297 RF_Free(l, sizeof(RF_ParityLog_t)); 298 } 299 return (ENOMEM); 300 } 301 } 302 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); 303 if (rc) { 304 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 305 __LINE__, rc); 306 rf_ShutdownParityLoggingPool(raidPtr); 307 return (rc); 308 } 309 /* build pool of region buffers */ 310 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex); 311 if (rc) { 312 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 313 return (ENOMEM); 314 } 315 rc = rf_cond_init(&raidPtr->regionBufferPool.cond); 316 if (rc) { 317 rf_print_unable_to_init_cond(__FILE__, __LINE__, rc); 318 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 319 return (ENOMEM); 320 } 321 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * 322 raidPtr->bytesPerSector; 323 printf("regionBufferPool.bufferSize %d\n", 324 raidPtr->regionBufferPool.bufferSize); 325 326 /* for now, only one region at a time may be reintegrated */ 327 raidPtr->regionBufferPool.totalBuffers = 1; 328 329 raidPtr->regionBufferPool.availableBuffers = 330 raidPtr->regionBufferPool.totalBuffers; 331 raidPtr->regionBufferPool.availBuffersIndex = 0; 332 raidPtr->regionBufferPool.emptyBuffersIndex = 0; 333 printf("Allocating %d bytes for regionBufferPool\n", 334 (int) (raidPtr->regionBufferPool.totalBuffers * 335 sizeof(caddr_t))); 336 RF_Malloc(raidPtr->regionBufferPool.buffers, 337 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), 338 (caddr_t *)); 339 if (raidPtr->regionBufferPool.buffers == NULL) { 340 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 341 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 342 return (ENOMEM); 343 } 344 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { 345 printf("Allocating %d bytes for regionBufferPool#%d\n", 346 (int) (raidPtr->regionBufferPool.bufferSize * 347 sizeof(char)), i); 348 RF_Malloc(raidPtr->regionBufferPool.buffers[i], 349 raidPtr->regionBufferPool.bufferSize * sizeof(char), 350 (caddr_t)); 351 if (raidPtr->regionBufferPool.buffers[i] == NULL) { 352 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 353 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 354 for (j = 0; j < i; j++) { 355 RF_Free(raidPtr->regionBufferPool.buffers[i], 356 raidPtr->regionBufferPool.bufferSize * 357 sizeof(char)); 358 } 359 RF_Free(raidPtr->regionBufferPool.buffers, 360 raidPtr->regionBufferPool.totalBuffers * 361 sizeof(caddr_t)); 362 return (ENOMEM); 363 } 364 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, 365 (long) raidPtr->regionBufferPool.buffers[i]); 366 } 367 rc = rf_ShutdownCreate(listp, 368 rf_ShutdownParityLoggingRegionBufferPool, 369 raidPtr); 370 if (rc) { 371 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 372 __LINE__, rc); 373 rf_ShutdownParityLoggingRegionBufferPool(raidPtr); 374 return (rc); 375 } 376 /* build pool of parity buffers */ 377 parityBufferCapacity = maxRegionParityRange; 378 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex); 379 if (rc) { 380 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 381 return (rc); 382 } 383 rc = rf_cond_init(&raidPtr->parityBufferPool.cond); 384 if (rc) { 385 rf_print_unable_to_init_cond(__FILE__, __LINE__, rc); 386 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 387 return (ENOMEM); 388 } 389 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * 390 raidPtr->bytesPerSector; 391 printf("parityBufferPool.bufferSize %d\n", 392 raidPtr->parityBufferPool.bufferSize); 393 394 /* for now, only one region at a time may be reintegrated */ 395 raidPtr->parityBufferPool.totalBuffers = 1; 396 397 raidPtr->parityBufferPool.availableBuffers = 398 raidPtr->parityBufferPool.totalBuffers; 399 raidPtr->parityBufferPool.availBuffersIndex = 0; 400 raidPtr->parityBufferPool.emptyBuffersIndex = 0; 401 printf("Allocating %d bytes for parityBufferPool of %d units\n", 402 (int) (raidPtr->parityBufferPool.totalBuffers * 403 sizeof(caddr_t)), 404 raidPtr->parityBufferPool.totalBuffers ); 405 RF_Malloc(raidPtr->parityBufferPool.buffers, 406 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), 407 (caddr_t *)); 408 if (raidPtr->parityBufferPool.buffers == NULL) { 409 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 410 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 411 return (ENOMEM); 412 } 413 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { 414 printf("Allocating %d bytes for parityBufferPool#%d\n", 415 (int) (raidPtr->parityBufferPool.bufferSize * 416 sizeof(char)),i); 417 RF_Malloc(raidPtr->parityBufferPool.buffers[i], 418 raidPtr->parityBufferPool.bufferSize * sizeof(char), 419 (caddr_t)); 420 if (raidPtr->parityBufferPool.buffers == NULL) { 421 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 422 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 423 for (j = 0; j < i; j++) { 424 RF_Free(raidPtr->parityBufferPool.buffers[i], 425 raidPtr->regionBufferPool.bufferSize * 426 sizeof(char)); 427 } 428 RF_Free(raidPtr->parityBufferPool.buffers, 429 raidPtr->regionBufferPool.totalBuffers * 430 sizeof(caddr_t)); 431 return (ENOMEM); 432 } 433 printf("parityBufferPool.buffers[%d] = %lx\n", i, 434 (long) raidPtr->parityBufferPool.buffers[i]); 435 } 436 rc = rf_ShutdownCreate(listp, 437 rf_ShutdownParityLoggingParityBufferPool, 438 raidPtr); 439 if (rc) { 440 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 441 __LINE__, rc); 442 rf_ShutdownParityLoggingParityBufferPool(raidPtr); 443 return (rc); 444 } 445 /* initialize parityLogDiskQueue */ 446 rc = rf_create_managed_mutex(listp, 447 &raidPtr->parityLogDiskQueue.mutex); 448 if (rc) { 449 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 450 return (rc); 451 } 452 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond); 453 if (rc) { 454 rf_print_unable_to_init_cond(__FILE__, __LINE__, rc); 455 return (rc); 456 } 457 raidPtr->parityLogDiskQueue.flushQueue = NULL; 458 raidPtr->parityLogDiskQueue.reintQueue = NULL; 459 raidPtr->parityLogDiskQueue.bufHead = NULL; 460 raidPtr->parityLogDiskQueue.bufTail = NULL; 461 raidPtr->parityLogDiskQueue.reintHead = NULL; 462 raidPtr->parityLogDiskQueue.reintTail = NULL; 463 raidPtr->parityLogDiskQueue.logBlockHead = NULL; 464 raidPtr->parityLogDiskQueue.logBlockTail = NULL; 465 raidPtr->parityLogDiskQueue.reintBlockHead = NULL; 466 raidPtr->parityLogDiskQueue.reintBlockTail = NULL; 467 raidPtr->parityLogDiskQueue.freeDataList = NULL; 468 raidPtr->parityLogDiskQueue.freeCommonList = NULL; 469 470 rc = rf_ShutdownCreate(listp, 471 rf_ShutdownParityLoggingDiskQueue, 472 raidPtr); 473 if (rc) { 474 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 475 __LINE__, rc); 476 return (rc); 477 } 478 for (i = 0; i < rf_numParityRegions; i++) { 479 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex); 480 if (rc) { 481 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 482 for (j = 0; j < i; j++) 483 FreeRegionInfo(raidPtr, j); 484 RF_Free(raidPtr->regionInfo, 485 (rf_numParityRegions * 486 sizeof(RF_RegionInfo_t))); 487 return (ENOMEM); 488 } 489 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); 490 if (rc) { 491 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 492 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 493 for (j = 0; j < i; j++) 494 FreeRegionInfo(raidPtr, j); 495 RF_Free(raidPtr->regionInfo, 496 (rf_numParityRegions * 497 sizeof(RF_RegionInfo_t))); 498 return (ENOMEM); 499 } 500 raidPtr->regionInfo[i].reintInProgress = RF_FALSE; 501 raidPtr->regionInfo[i].regionStartAddr = 502 raidPtr->regionLogCapacity * i; 503 raidPtr->regionInfo[i].parityStartAddr = 504 raidPtr->regionParityRange * i; 505 if (i < rf_numParityRegions - 1) { 506 raidPtr->regionInfo[i].capacity = 507 raidPtr->regionLogCapacity; 508 raidPtr->regionInfo[i].numSectorsParity = 509 raidPtr->regionParityRange; 510 } else { 511 raidPtr->regionInfo[i].capacity = 512 lastRegionCapacity; 513 raidPtr->regionInfo[i].numSectorsParity = 514 raidPtr->sectorsPerDisk - 515 raidPtr->regionParityRange * i; 516 if (raidPtr->regionInfo[i].numSectorsParity > 517 maxRegionParityRange) 518 maxRegionParityRange = 519 raidPtr->regionInfo[i].numSectorsParity; 520 } 521 raidPtr->regionInfo[i].diskCount = 0; 522 RF_ASSERT(raidPtr->regionInfo[i].capacity + 523 raidPtr->regionInfo[i].regionStartAddr <= 524 totalLogCapacity); 525 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + 526 raidPtr->regionInfo[i].numSectorsParity <= 527 raidPtr->sectorsPerDisk); 528 printf("Allocating %d bytes for region %d\n", 529 (int) (raidPtr->regionInfo[i].capacity * 530 sizeof(RF_DiskMap_t)), i); 531 RF_Malloc(raidPtr->regionInfo[i].diskMap, 532 (raidPtr->regionInfo[i].capacity * 533 sizeof(RF_DiskMap_t)), 534 (RF_DiskMap_t *)); 535 if (raidPtr->regionInfo[i].diskMap == NULL) { 536 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 537 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex); 538 for (j = 0; j < i; j++) 539 FreeRegionInfo(raidPtr, j); 540 RF_Free(raidPtr->regionInfo, 541 (rf_numParityRegions * 542 sizeof(RF_RegionInfo_t))); 543 return (ENOMEM); 544 } 545 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; 546 raidPtr->regionInfo[i].coreLog = NULL; 547 } 548 rc = rf_ShutdownCreate(listp, 549 rf_ShutdownParityLoggingRegionInfo, 550 raidPtr); 551 if (rc) { 552 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 553 __LINE__, rc); 554 rf_ShutdownParityLoggingRegionInfo(raidPtr); 555 return (rc); 556 } 557 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); 558 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; 559 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, 560 rf_ParityLoggingDiskManager, raidPtr,"rf_log"); 561 if (rc) { 562 raidPtr->parityLogDiskQueue.threadState = 0; 563 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", 564 __FILE__, __LINE__, rc); 565 return (ENOMEM); 566 } 567 /* wait for thread to start */ 568 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 569 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { 570 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 571 raidPtr->parityLogDiskQueue.mutex); 572 } 573 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 574 575 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); 576 if (rc) { 577 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc); 578 rf_ShutdownParityLogging(raidPtr); 579 return (rc); 580 } 581 if (rf_parityLogDebug) { 582 printf(" size of disk log in sectors: %d\n", 583 (int) totalLogCapacity); 584 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions); 585 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity); 586 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation); 587 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); 588 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); 589 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); 590 } 591 rf_EnableParityLogging(raidPtr); 592 593 return (0); 594 } 595 596 static void 597 FreeRegionInfo( 598 RF_Raid_t * raidPtr, 599 RF_RegionId_t regionID) 600 { 601 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 602 RF_Free(raidPtr->regionInfo[regionID].diskMap, 603 (raidPtr->regionInfo[regionID].capacity * 604 sizeof(RF_DiskMap_t))); 605 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { 606 rf_ReleaseParityLogs(raidPtr, 607 raidPtr->regionInfo[regionID].coreLog); 608 raidPtr->regionInfo[regionID].coreLog = NULL; 609 } else { 610 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); 611 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); 612 } 613 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 614 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex); 615 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex); 616 } 617 618 619 static void 620 FreeParityLogQueue( 621 RF_Raid_t * raidPtr, 622 RF_ParityLogQueue_t * queue) 623 { 624 RF_ParityLog_t *l1, *l2; 625 626 RF_LOCK_MUTEX(queue->mutex); 627 l1 = queue->parityLogs; 628 while (l1) { 629 l2 = l1; 630 l1 = l2->next; 631 RF_Free(l2->records, (raidPtr->numSectorsPerLog * 632 sizeof(RF_ParityLogRecord_t))); 633 RF_Free(l2, sizeof(RF_ParityLog_t)); 634 } 635 RF_UNLOCK_MUTEX(queue->mutex); 636 rf_mutex_destroy(&queue->mutex); 637 } 638 639 640 static void 641 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) 642 { 643 int i; 644 645 RF_LOCK_MUTEX(queue->mutex); 646 if (queue->availableBuffers != queue->totalBuffers) { 647 printf("Attempt to free region queue which is still in use!\n"); 648 RF_ASSERT(0); 649 } 650 for (i = 0; i < queue->totalBuffers; i++) 651 RF_Free(queue->buffers[i], queue->bufferSize); 652 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t)); 653 RF_UNLOCK_MUTEX(queue->mutex); 654 rf_mutex_destroy(&queue->mutex); 655 } 656 657 static void 658 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) 659 { 660 RF_Raid_t *raidPtr; 661 RF_RegionId_t i; 662 663 raidPtr = (RF_Raid_t *) arg; 664 if (rf_parityLogDebug) { 665 printf("raid%d: ShutdownParityLoggingRegionInfo\n", 666 raidPtr->raidid); 667 } 668 /* free region information structs */ 669 for (i = 0; i < rf_numParityRegions; i++) 670 FreeRegionInfo(raidPtr, i); 671 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * 672 sizeof(raidPtr->regionInfo))); 673 raidPtr->regionInfo = NULL; 674 } 675 676 static void 677 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) 678 { 679 RF_Raid_t *raidPtr; 680 681 raidPtr = (RF_Raid_t *) arg; 682 if (rf_parityLogDebug) { 683 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid); 684 } 685 /* free contents of parityLogPool */ 686 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); 687 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 688 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 689 } 690 691 static void 692 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) 693 { 694 RF_Raid_t *raidPtr; 695 696 raidPtr = (RF_Raid_t *) arg; 697 if (rf_parityLogDebug) { 698 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n", 699 raidPtr->raidid); 700 } 701 FreeRegionBufferQueue(&raidPtr->regionBufferPool); 702 } 703 704 static void 705 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) 706 { 707 RF_Raid_t *raidPtr; 708 709 raidPtr = (RF_Raid_t *) arg; 710 if (rf_parityLogDebug) { 711 printf("raid%d: ShutdownParityLoggingParityBufferPool\n", 712 raidPtr->raidid); 713 } 714 FreeRegionBufferQueue(&raidPtr->parityBufferPool); 715 } 716 717 static void 718 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) 719 { 720 RF_ParityLogData_t *d; 721 RF_CommonLogData_t *c; 722 RF_Raid_t *raidPtr; 723 724 raidPtr = (RF_Raid_t *) arg; 725 if (rf_parityLogDebug) { 726 printf("raid%d: ShutdownParityLoggingDiskQueue\n", 727 raidPtr->raidid); 728 } 729 /* free disk manager stuff */ 730 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); 731 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); 732 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); 733 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); 734 while (raidPtr->parityLogDiskQueue.freeDataList) { 735 d = raidPtr->parityLogDiskQueue.freeDataList; 736 raidPtr->parityLogDiskQueue.freeDataList = 737 raidPtr->parityLogDiskQueue.freeDataList->next; 738 RF_Free(d, sizeof(RF_ParityLogData_t)); 739 } 740 while (raidPtr->parityLogDiskQueue.freeCommonList) { 741 c = raidPtr->parityLogDiskQueue.freeCommonList; 742 rf_mutex_destroy(&c->mutex); 743 raidPtr->parityLogDiskQueue.freeCommonList = 744 raidPtr->parityLogDiskQueue.freeCommonList->next; 745 RF_Free(c, sizeof(RF_CommonLogData_t)); 746 } 747 } 748 749 static void 750 rf_ShutdownParityLogging(RF_ThreadArg_t arg) 751 { 752 RF_Raid_t *raidPtr; 753 754 raidPtr = (RF_Raid_t *) arg; 755 if (rf_parityLogDebug) { 756 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid); 757 } 758 /* shutdown disk thread */ 759 /* This has the desirable side-effect of forcing all regions to be 760 * reintegrated. This is necessary since all parity log maps are 761 * currently held in volatile memory. */ 762 763 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 764 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; 765 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 766 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); 767 /* 768 * pLogDiskThread will now terminate when queues are cleared 769 * now wait for it to be done 770 */ 771 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 772 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) { 773 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 774 raidPtr->parityLogDiskQueue.mutex); 775 } 776 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 777 if (rf_parityLogDebug) { 778 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid); 779 } 780 } 781 782 int 783 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) 784 { 785 return (20); 786 } 787 788 RF_HeadSepLimit_t 789 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) 790 { 791 return (10); 792 } 793 /* return the region ID for a given RAID address */ 794 RF_RegionId_t 795 rf_MapRegionIDParityLogging( 796 RF_Raid_t * raidPtr, 797 RF_SectorNum_t address) 798 { 799 RF_RegionId_t regionID; 800 801 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ 802 regionID = address / raidPtr->regionParityRange; 803 if (regionID == rf_numParityRegions) { 804 /* last region may be larger than other regions */ 805 regionID--; 806 } 807 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); 808 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + 809 raidPtr->regionInfo[regionID].numSectorsParity); 810 RF_ASSERT(regionID < rf_numParityRegions); 811 return (regionID); 812 } 813 814 815 /* given a logical RAID sector, determine physical disk address of data */ 816 void 817 rf_MapSectorParityLogging( 818 RF_Raid_t * raidPtr, 819 RF_RaidAddr_t raidSector, 820 RF_RowCol_t * row, 821 RF_RowCol_t * col, 822 RF_SectorNum_t * diskSector, 823 int remap) 824 { 825 RF_StripeNum_t SUID = raidSector / 826 raidPtr->Layout.sectorsPerStripeUnit; 827 *row = 0; 828 /* *col = (SUID % (raidPtr->numCol - 829 * raidPtr->Layout.numParityLogCol)); */ 830 *col = SUID % raidPtr->Layout.numDataCol; 831 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 832 raidPtr->Layout.sectorsPerStripeUnit + 833 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 834 } 835 836 837 /* given a logical RAID sector, determine physical disk address of parity */ 838 void 839 rf_MapParityParityLogging( 840 RF_Raid_t * raidPtr, 841 RF_RaidAddr_t raidSector, 842 RF_RowCol_t * row, 843 RF_RowCol_t * col, 844 RF_SectorNum_t * diskSector, 845 int remap) 846 { 847 RF_StripeNum_t SUID = raidSector / 848 raidPtr->Layout.sectorsPerStripeUnit; 849 850 *row = 0; 851 /* *col = 852 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt 853 * r->numCol - raidPtr->Layout.numParityLogCol); */ 854 *col = raidPtr->Layout.numDataCol; 855 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 856 raidPtr->Layout.sectorsPerStripeUnit + 857 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 858 } 859 860 861 /* given a regionID and sector offset, determine the physical disk address of the parity log */ 862 void 863 rf_MapLogParityLogging( 864 RF_Raid_t * raidPtr, 865 RF_RegionId_t regionID, 866 RF_SectorNum_t regionOffset, 867 RF_RowCol_t * row, 868 RF_RowCol_t * col, 869 RF_SectorNum_t * startSector) 870 { 871 *row = 0; 872 *col = raidPtr->numCol - 1; 873 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; 874 } 875 876 877 /* given a regionID, determine the physical disk address of the logged 878 parity for that region */ 879 void 880 rf_MapRegionParity( 881 RF_Raid_t * raidPtr, 882 RF_RegionId_t regionID, 883 RF_RowCol_t * row, 884 RF_RowCol_t * col, 885 RF_SectorNum_t * startSector, 886 RF_SectorCount_t * numSector) 887 { 888 *row = 0; 889 *col = raidPtr->numCol - 2; 890 *startSector = raidPtr->regionInfo[regionID].parityStartAddr; 891 *numSector = raidPtr->regionInfo[regionID].numSectorsParity; 892 } 893 894 895 /* given a logical RAID address, determine the participating disks in 896 the stripe */ 897 void 898 rf_IdentifyStripeParityLogging( 899 RF_Raid_t * raidPtr, 900 RF_RaidAddr_t addr, 901 RF_RowCol_t ** diskids, 902 RF_RowCol_t * outRow) 903 { 904 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, 905 addr); 906 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) 907 raidPtr->Layout.layoutSpecificInfo; 908 *outRow = 0; 909 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; 910 } 911 912 913 void 914 rf_MapSIDToPSIDParityLogging( 915 RF_RaidLayout_t * layoutPtr, 916 RF_StripeNum_t stripeID, 917 RF_StripeNum_t * psID, 918 RF_ReconUnitNum_t * which_ru) 919 { 920 *which_ru = 0; 921 *psID = stripeID; 922 } 923 924 925 /* select an algorithm for performing an access. Returns two pointers, 926 * one to a function that will return information about the DAG, and 927 * another to a function that will create the dag. 928 */ 929 void 930 rf_ParityLoggingDagSelect( 931 RF_Raid_t * raidPtr, 932 RF_IoType_t type, 933 RF_AccessStripeMap_t * asmp, 934 RF_VoidFuncPtr * createFunc) 935 { 936 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 937 RF_PhysDiskAddr_t *failedPDA = NULL; 938 RF_RowCol_t frow, fcol; 939 RF_RowStatus_t rstat; 940 int prior_recon; 941 942 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 943 944 if (asmp->numDataFailed + asmp->numParityFailed > 1) { 945 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); 946 /* *infoFunc = */ *createFunc = NULL; 947 return; 948 } else 949 if (asmp->numDataFailed + asmp->numParityFailed == 1) { 950 951 /* if under recon & already reconstructed, redirect 952 * the access to the spare drive and eliminate the 953 * failure indication */ 954 failedPDA = asmp->failedPDAs[0]; 955 frow = failedPDA->row; 956 fcol = failedPDA->col; 957 rstat = raidPtr->status[failedPDA->row]; 958 prior_recon = (rstat == rf_rs_reconfigured) || ( 959 (rstat == rf_rs_reconstructing) ? 960 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0 961 ); 962 if (prior_recon) { 963 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col; 964 RF_SectorNum_t oo = failedPDA->startSector; 965 if (layoutPtr->map->flags & 966 RF_DISTRIBUTE_SPARE) { 967 /* redirect to dist spare space */ 968 969 if (failedPDA == asmp->parityInfo) { 970 971 /* parity has failed */ 972 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row, 973 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 974 975 if (asmp->parityInfo->next) { /* redir 2nd component, 976 * if any */ 977 RF_PhysDiskAddr_t *p = asmp->parityInfo->next; 978 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; 979 p->row = failedPDA->row; 980 p->col = failedPDA->col; 981 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + 982 SUoffs; /* cheating: 983 * startSector is not 984 * really a RAID address */ 985 } 986 } else 987 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) { 988 RF_ASSERT(0); /* should not ever 989 * happen */ 990 } else { 991 992 /* data has failed */ 993 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row, 994 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 995 996 } 997 998 } else { 999 /* redirect to dedicated spare space */ 1000 1001 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow; 1002 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol; 1003 1004 /* the parity may have two distinct 1005 * components, both of which may need 1006 * to be redirected */ 1007 if (asmp->parityInfo->next) { 1008 if (failedPDA == asmp->parityInfo) { 1009 failedPDA->next->row = failedPDA->row; 1010 failedPDA->next->col = failedPDA->col; 1011 } else 1012 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */ 1013 asmp->parityInfo->row = failedPDA->row; 1014 asmp->parityInfo->col = failedPDA->col; 1015 } 1016 } 1017 } 1018 1019 RF_ASSERT(failedPDA->col != -1); 1020 1021 if (rf_dagDebug || rf_mapDebug) { 1022 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n", 1023 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector); 1024 } 1025 asmp->numDataFailed = asmp->numParityFailed = 0; 1026 } 1027 } 1028 if (type == RF_IO_TYPE_READ) { 1029 1030 if (asmp->numDataFailed == 0) 1031 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; 1032 else 1033 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG; 1034 1035 } else { 1036 1037 1038 /* if mirroring, always use large writes. If the access 1039 * requires two distinct parity updates, always do a small 1040 * write. If the stripe contains a failure but the access 1041 * does not, do a small write. The first conditional 1042 * (numStripeUnitsAccessed <= numDataCol/2) uses a 1043 * less-than-or-equal rather than just a less-than because 1044 * when G is 3 or 4, numDataCol/2 is 1, and I want 1045 * single-stripe-unit updates to use just one disk. */ 1046 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) { 1047 if (((asmp->numStripeUnitsAccessed <= 1048 (layoutPtr->numDataCol / 2)) && 1049 (layoutPtr->numDataCol != 1)) || 1050 (asmp->parityInfo->next != NULL) || 1051 rf_CheckStripeForFailures(raidPtr, asmp)) { 1052 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG; 1053 } else 1054 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG; 1055 } else 1056 if (asmp->numParityFailed == 1) 1057 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 1058 else 1059 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) 1060 *createFunc = NULL; 1061 else 1062 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 1063 } 1064 } 1065 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 1066