1 /* $NetBSD: rf_copyback.c,v 1.44 2010/11/19 06:44:40 dholland Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /***************************************************************************** 30 * 31 * copyback.c -- code to copy reconstructed data back from spare space to 32 * the replaced disk. 33 * 34 * the code operates using callbacks on the I/Os to continue with the 35 * next unit to be copied back. We do this because a simple loop 36 * containing blocking I/Os will not work in the simulator. 37 * 38 ****************************************************************************/ 39 40 #include <sys/cdefs.h> 41 __KERNEL_RCSID(0, "$NetBSD: rf_copyback.c,v 1.44 2010/11/19 06:44:40 dholland Exp $"); 42 43 #include <dev/raidframe/raidframevar.h> 44 45 #include <sys/time.h> 46 #include <sys/buf.h> 47 #include "rf_raid.h" 48 #include "rf_mcpair.h" 49 #include "rf_acctrace.h" 50 #include "rf_etimer.h" 51 #include "rf_general.h" 52 #include "rf_utils.h" 53 #include "rf_copyback.h" 54 #include "rf_decluster.h" 55 #include "rf_driver.h" 56 #include "rf_shutdown.h" 57 #include "rf_kintf.h" 58 59 #define RF_COPYBACK_DATA 0 60 #define RF_COPYBACK_PARITY 1 61 62 int rf_copyback_in_progress; 63 64 static int rf_CopybackReadDoneProc(RF_CopybackDesc_t * desc, int status); 65 static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t * desc, int status); 66 static void rf_CopybackOne(RF_CopybackDesc_t * desc, int typ, 67 RF_RaidAddr_t addr, RF_RowCol_t testCol, 68 RF_SectorNum_t testOffs); 69 static void rf_CopybackComplete(RF_CopybackDesc_t * desc, int status); 70 71 int 72 rf_ConfigureCopyback(RF_ShutdownList_t **listp) 73 { 74 rf_copyback_in_progress = 0; 75 return (0); 76 } 77 78 #include <sys/param.h> 79 #include <sys/systm.h> 80 #include <sys/proc.h> 81 #include <sys/ioctl.h> 82 #include <sys/fcntl.h> 83 #include <sys/vnode.h> 84 #include <sys/namei.h> /* for pathbuf */ 85 86 /* do a complete copyback */ 87 void 88 rf_CopybackReconstructedData(RF_Raid_t *raidPtr) 89 { 90 RF_ComponentLabel_t *c_label; 91 int found, retcode; 92 RF_CopybackDesc_t *desc; 93 RF_RowCol_t fcol; 94 RF_RaidDisk_t *badDisk; 95 char *databuf; 96 97 struct pathbuf *dev_pb; 98 struct vnode *vp; 99 struct vattr va; 100 101 int ac; 102 103 fcol = 0; 104 found = 0; 105 for (fcol = 0; fcol < raidPtr->numCol; fcol++) { 106 if (raidPtr->Disks[fcol].status == rf_ds_dist_spared 107 || raidPtr->Disks[fcol].status == rf_ds_spared) { 108 found = 1; 109 break; 110 } 111 } 112 113 if (!found) { 114 printf("raid%d: no disks need copyback\n", raidPtr->raidid); 115 return; 116 } 117 118 badDisk = &raidPtr->Disks[fcol]; 119 120 /* This device may have been opened successfully the first time. Close 121 * it before trying to open it again.. */ 122 123 if (raidPtr->raid_cinfo[fcol].ci_vp != NULL) { 124 printf("Closed the open device: %s\n", 125 raidPtr->Disks[fcol].devname); 126 vp = raidPtr->raid_cinfo[fcol].ci_vp; 127 ac = raidPtr->Disks[fcol].auto_configured; 128 rf_close_component(raidPtr, vp, ac); 129 raidPtr->raid_cinfo[fcol].ci_vp = NULL; 130 131 } 132 /* note that this disk was *not* auto_configured (any longer) */ 133 raidPtr->Disks[fcol].auto_configured = 0; 134 135 printf("About to (re-)open the device: %s\n", 136 raidPtr->Disks[fcol].devname); 137 138 dev_pb = pathbuf_create(raidPtr->Disks[fcol].devname); 139 if (dev_pb == NULL) { 140 /* shouldn't happen unless maybe the system is OOMing */ 141 printf("raid%d: copyback: pathbuf_create on device: %s failed: %d!\n", 142 raidPtr->raidid, raidPtr->Disks[fcol].devname, 143 ENOMEM); 144 return; 145 } 146 retcode = dk_lookup(dev_pb, curlwp, &vp); 147 pathbuf_destroy(dev_pb); 148 149 if (retcode) { 150 printf("raid%d: copyback: dk_lookup on device: %s failed: %d!\n", 151 raidPtr->raidid, raidPtr->Disks[fcol].devname, 152 retcode); 153 154 /* XXX the component isn't responding properly... must be 155 * still dead :-( */ 156 return; 157 158 } else { 159 160 /* Ok, so we can at least do a lookup... How about actually 161 * getting a vp for it? */ 162 163 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) 164 return; 165 retcode = rf_getdisksize(vp, curlwp, &raidPtr->Disks[fcol]); 166 if (retcode) { 167 return; 168 } 169 170 raidPtr->raid_cinfo[fcol].ci_vp = vp; 171 raidPtr->raid_cinfo[fcol].ci_dev = va.va_rdev; 172 173 raidPtr->Disks[fcol].dev = va.va_rdev; /* XXX or the above? */ 174 175 /* we allow the user to specify that only a fraction of the 176 * disks should be used this is just for debug: it speeds up 177 * the parity scan */ 178 raidPtr->Disks[fcol].numBlocks = 179 raidPtr->Disks[fcol].numBlocks * 180 rf_sizePercentage / 100; 181 } 182 183 if (retcode) { 184 printf("raid%d: copyback: target disk failed TUR\n", 185 raidPtr->raidid); 186 return; 187 } 188 /* get a buffer to hold one SU */ 189 RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *)); 190 191 /* create a descriptor */ 192 RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *)); 193 desc->raidPtr = raidPtr; 194 desc->status = 0; 195 desc->fcol = fcol; 196 desc->spCol = badDisk->spareCol; 197 desc->stripeAddr = 0; 198 desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 199 desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol; 200 desc->databuf = databuf; 201 desc->mcpair = rf_AllocMCPair(); 202 203 /* quiesce the array, since we don't want to code support for user 204 * accs here */ 205 rf_SuspendNewRequestsAndWait(raidPtr); 206 207 /* adjust state of the array and of the disks */ 208 RF_LOCK_MUTEX(raidPtr->mutex); 209 raidPtr->Disks[desc->fcol].status = rf_ds_optimal; 210 raidPtr->status = rf_rs_optimal; 211 rf_copyback_in_progress = 1; /* debug only */ 212 RF_UNLOCK_MUTEX(raidPtr->mutex); 213 214 RF_GETTIME(desc->starttime); 215 rf_ContinueCopyback(desc); 216 217 /* Data has been restored. Fix up the component label. */ 218 /* Don't actually need the read here.. */ 219 220 c_label = raidget_component_label(raidPtr, fcol); 221 raid_init_component_label(raidPtr, c_label); 222 223 c_label->row = 0; 224 c_label->column = fcol; 225 c_label->partitionSize = raidPtr->Disks[fcol].partitionSize; 226 c_label->partitionSizeHi = raidPtr->Disks[fcol].partitionSize >> 32; 227 228 raidflush_component_label(raidPtr, fcol); 229 230 /* XXXjld why is this here? */ 231 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 232 } 233 234 235 /* 236 * invoked via callback after a copyback I/O has completed to 237 * continue on with the next one 238 */ 239 void 240 rf_ContinueCopyback(RF_CopybackDesc_t *desc) 241 { 242 RF_SectorNum_t testOffs, stripeAddr; 243 RF_Raid_t *raidPtr = desc->raidPtr; 244 RF_RaidAddr_t addr; 245 RF_RowCol_t testCol; 246 #if RF_DEBUG_RECON 247 int old_pctg, new_pctg; 248 struct timeval t, diff; 249 #endif 250 int done; 251 252 #if RF_DEBUG_RECON 253 old_pctg = (-1); 254 #endif 255 while (1) { 256 stripeAddr = desc->stripeAddr; 257 desc->raidPtr->copyback_stripes_done = stripeAddr 258 / desc->sectPerStripe; 259 #if RF_DEBUG_RECON 260 if (rf_prReconSched) { 261 old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors; 262 } 263 #endif 264 desc->stripeAddr += desc->sectPerStripe; 265 #if RF_DEBUG_RECON 266 if (rf_prReconSched) { 267 new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors; 268 if (new_pctg != old_pctg) { 269 RF_GETTIME(t); 270 RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff); 271 printf("%d %d.%06d\n", new_pctg, (int) diff.tv_sec, (int) diff.tv_usec); 272 } 273 } 274 #endif 275 if (stripeAddr >= raidPtr->totalSectors) { 276 rf_CopybackComplete(desc, 0); 277 return; 278 } 279 /* walk through the current stripe, su-by-su */ 280 for (done = 0, addr = stripeAddr; addr < stripeAddr + desc->sectPerStripe; addr += desc->sectPerSU) { 281 282 /* map the SU, disallowing remap to spare space */ 283 (raidPtr->Layout.map->MapSector) (raidPtr, addr, &testCol, &testOffs, RF_DONT_REMAP); 284 285 if (testCol == desc->fcol) { 286 rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testCol, testOffs); 287 done = 1; 288 break; 289 } 290 } 291 292 if (!done) { 293 /* we didn't find the failed disk in the data part. 294 * check parity. */ 295 296 /* map the parity for this stripe, disallowing remap 297 * to spare space */ 298 (raidPtr->Layout.map->MapParity) (raidPtr, stripeAddr, &testCol, &testOffs, RF_DONT_REMAP); 299 300 if (testCol == desc->fcol) { 301 rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testCol, testOffs); 302 } 303 } 304 /* check to see if the last read/write pair failed */ 305 if (desc->status) { 306 rf_CopybackComplete(desc, 1); 307 return; 308 } 309 /* we didn't find any units to copy back in this stripe. 310 * Continue with the next one */ 311 } 312 } 313 314 315 /* copyback one unit */ 316 static void 317 rf_CopybackOne(RF_CopybackDesc_t *desc, int typ, RF_RaidAddr_t addr, 318 RF_RowCol_t testCol, RF_SectorNum_t testOffs) 319 { 320 RF_SectorCount_t sectPerSU = desc->sectPerSU; 321 RF_Raid_t *raidPtr = desc->raidPtr; 322 RF_RowCol_t spCol = desc->spCol; 323 RF_SectorNum_t spOffs; 324 325 /* find the spare spare location for this SU */ 326 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 327 if (typ == RF_COPYBACK_DATA) 328 raidPtr->Layout.map->MapSector(raidPtr, addr, &spCol, &spOffs, RF_REMAP); 329 else 330 raidPtr->Layout.map->MapParity(raidPtr, addr, &spCol, &spOffs, RF_REMAP); 331 } else { 332 spOffs = testOffs; 333 } 334 335 /* create reqs to read the old location & write the new */ 336 desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs, 337 sectPerSU, desc->databuf, 0L, 0, 338 (int (*) (void *, int)) rf_CopybackReadDoneProc, desc, 339 NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL, 340 PR_WAITOK); 341 desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs, 342 sectPerSU, desc->databuf, 0L, 0, 343 (int (*) (void *, int)) rf_CopybackWriteDoneProc, desc, 344 NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL, 345 PR_WAITOK); 346 desc->fcol = testCol; 347 348 /* enqueue the read. the write will go out as part of the callback on 349 * the read. at user-level & in the kernel, wait for the read-write 350 * pair to complete. in the simulator, just return, since everything 351 * will happen as callbacks */ 352 353 RF_LOCK_MUTEX(desc->mcpair->mutex); 354 desc->mcpair->flag = 0; 355 RF_UNLOCK_MUTEX(desc->mcpair->mutex); 356 357 rf_DiskIOEnqueue(&raidPtr->Queues[spCol], desc->readreq, RF_IO_NORMAL_PRIORITY); 358 359 RF_LOCK_MUTEX(desc->mcpair->mutex); 360 while (!desc->mcpair->flag) { 361 RF_WAIT_MCPAIR(desc->mcpair); 362 } 363 RF_UNLOCK_MUTEX(desc->mcpair->mutex); 364 rf_FreeDiskQueueData(desc->readreq); 365 rf_FreeDiskQueueData(desc->writereq); 366 367 } 368 369 370 /* called at interrupt context when the read has completed. just send out the write */ 371 static int 372 rf_CopybackReadDoneProc(RF_CopybackDesc_t *desc, int status) 373 { 374 if (status) { /* invoke the callback with bad status */ 375 printf("raid%d: copyback read failed. Aborting.\n", 376 desc->raidPtr->raidid); 377 (desc->writereq->CompleteFunc) (desc, -100); 378 } else { 379 rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->fcol]), desc->writereq, RF_IO_NORMAL_PRIORITY); 380 } 381 return (0); 382 } 383 /* called at interrupt context when the write has completed. 384 * at user level & in the kernel, wake up the copyback thread. 385 * in the simulator, invoke the next copyback directly. 386 * can't free diskqueuedata structs in the kernel b/c we're at interrupt context. 387 */ 388 static int 389 rf_CopybackWriteDoneProc(RF_CopybackDesc_t *desc, int status) 390 { 391 if (status && status != -100) { 392 printf("raid%d: copyback write failed. Aborting.\n", 393 desc->raidPtr->raidid); 394 } 395 desc->status = status; 396 rf_MCPairWakeupFunc(desc->mcpair); 397 return (0); 398 } 399 /* invoked when the copyback has completed */ 400 static void 401 rf_CopybackComplete(RF_CopybackDesc_t *desc, int status) 402 { 403 RF_Raid_t *raidPtr = desc->raidPtr; 404 struct timeval t, diff; 405 406 if (!status) { 407 RF_LOCK_MUTEX(raidPtr->mutex); 408 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 409 RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D'); 410 rf_FreeSpareTable(raidPtr); 411 } else { 412 raidPtr->Disks[desc->spCol].status = rf_ds_spare; 413 } 414 RF_UNLOCK_MUTEX(raidPtr->mutex); 415 416 RF_GETTIME(t); 417 RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff); 418 #if 0 419 printf("Copyback time was %d.%06d seconds\n", 420 (int) diff.tv_sec, (int) diff.tv_usec); 421 #endif 422 } else 423 printf("raid%d: Copyback failure. Status: %d\n", 424 raidPtr->raidid, status); 425 426 RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU)); 427 rf_FreeMCPair(desc->mcpair); 428 RF_Free(desc, sizeof(*desc)); 429 430 rf_copyback_in_progress = 0; 431 rf_ResumeNewRequests(raidPtr); 432 } 433