1 /*- 2 * Copyright (c) 1997, 1998 3 * Cybernet Corporation and Nan Yang Computer Services Limited. 4 * All rights reserved. 5 * 6 * This software was developed as part of the NetMAX project. 7 * 8 * Written by Greg Lehey 9 * 10 * This software is distributed under the so-called ``Berkeley 11 * License'': 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed by Cybernet Corporation 24 * and Nan Yang Computer Services Limited 25 * 4. Neither the name of the Companies nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * This software is provided ``as is'', and any express or implied 30 * warranties, including, but not limited to, the implied warranties of 31 * merchantability and fitness for a particular purpose are disclaimed. 
32 * In no event shall the company or contributors be liable for any 33 * direct, indirect, incidental, special, exemplary, or consequential 34 * damages (including, but not limited to, procurement of substitute 35 * goods or services; loss of use, data, or profits; or business 36 * interruption) however caused and on any theory of liability, whether 37 * in contract, strict liability, or tort (including negligence or 38 * otherwise) arising in any way out of the use of this software, even if 39 * advised of the possibility of such damage. 40 * 41 * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $ 42 * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $ 43 * $DragonFly: src/sys/dev/raid/vinum/vinumraid5.c,v 1.2 2003/06/17 04:28:33 dillon Exp $ 44 */ 45 #include <dev/vinum/vinumhdr.h> 46 #include <dev/vinum/request.h> 47 #include <sys/resourcevar.h> 48 49 /* 50 * Parameters which describe the current transfer. 51 * These are only used for calculation, but they 52 * need to be passed to other functions, so it's 53 * tidier to put them in a struct 54 */ 55 struct metrics { 56 daddr_t stripebase; /* base address of stripe (1st subdisk) */ 57 int stripeoffset; /* offset in stripe */ 58 int stripesectors; /* total sectors to transfer in this stripe */ 59 daddr_t sdbase; /* offset in subdisk of stripe base */ 60 int sdcount; /* number of disks involved in this transfer */ 61 daddr_t diskstart; /* remember where this transfer starts */ 62 int psdno; /* number of parity subdisk */ 63 int badsdno; /* number of down subdisk, if there is one */ 64 int firstsdno; /* first data subdisk number */ 65 /* These correspond to the fields in rqelement, sort of */ 66 int useroffset; 67 /* 68 * Initial offset and length values for the first 69 * data block 70 */ 71 int initoffset; /* start address of block to transfer */ 72 short initlen; /* length in sectors of data transfer */ 73 /* Define a normal operation */ 74 int dataoffset; /* start 
address of block to transfer */ 75 int datalen; /* length in sectors of data transfer */ 76 /* Define a group operation */ 77 int groupoffset; /* subdisk offset of group operation */ 78 int grouplen; /* length in sectors of group operation */ 79 /* Define a normal write operation */ 80 int writeoffset; /* subdisk offset of normal write */ 81 int writelen; /* length in sectors of write operation */ 82 enum xferinfo flags; /* to check what we're doing */ 83 int rqcount; /* number of elements in request */ 84 }; 85 86 enum requeststatus bre5(struct request *rq, 87 int plexno, 88 daddr_t * diskstart, 89 daddr_t diskend); 90 void complete_raid5_write(struct rqelement *); 91 enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); 92 void setrqebounds(struct rqelement *rqe, struct metrics *mp); 93 94 /* 95 * define the low-level requests needed to perform 96 * a high-level I/O operation for a specific plex 97 * 'plexno'. 98 * 99 * Return 0 if all subdisks involved in the 100 * request are up, 1 if some subdisks are not up, 101 * and -1 if the request is at least partially 102 * outside the bounds of the subdisks. 103 * 104 * Modify the pointer *diskstart to point to the 105 * end address. On read, return on the first bad 106 * subdisk, so that the caller 107 * (build_read_request) can try alternatives. 108 * 109 * On entry to this routine, the prq structures 110 * are not assigned. The assignment is performed 111 * by expandrq(). Strictly speaking, the elements 112 * rqe->sdno of all entries should be set to -1, 113 * since 0 (from bzero) is a valid subdisk number. 114 * We avoid this problem by initializing the ones 115 * we use, and not looking at the others (index >= 116 * prq->requests). 
117 */ 118 enum requeststatus 119 bre5(struct request *rq, 120 int plexno, 121 daddr_t * diskaddr, 122 daddr_t diskend) 123 { 124 struct metrics m; /* most of the information */ 125 struct sd *sd; 126 struct plex *plex; 127 struct buf *bp; /* user's bp */ 128 struct rqgroup *rqg; /* the request group that we will create */ 129 struct rqelement *rqe; /* point to this request information */ 130 int rsectors; /* sectors remaining in this stripe */ 131 int mysdno; /* another sd index in loops */ 132 int rqno; /* request number */ 133 134 rqg = NULL; /* shut up, damn compiler */ 135 m.diskstart = *diskaddr; /* start of transfer */ 136 bp = rq->bp; /* buffer pointer */ 137 plex = &PLEX[plexno]; /* point to the plex */ 138 139 140 while (*diskaddr < diskend) { /* until we get it all sorted out */ 141 if (*diskaddr >= plex->length) /* beyond the end of the plex */ 142 return REQUEST_EOF; /* can't continue */ 143 144 m.badsdno = -1; /* no bad subdisk yet */ 145 146 /* Part A: Define the request */ 147 /* 148 * First, calculate some sizes: 149 * The offset of the start address from 150 * the start of the stripe. 151 */ 152 m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1)); 153 154 /* 155 * The plex-relative address of the 156 * start of the stripe. 157 */ 158 m.stripebase = *diskaddr - m.stripeoffset; 159 160 /* subdisk containing the parity stripe */ 161 if (plex->organization == plex_raid5) 162 m.psdno = plex->subdisks - 1 163 - (*diskaddr / (plex->stripesize * (plex->subdisks - 1))) 164 % plex->subdisks; 165 else /* RAID-4 */ 166 m.psdno = plex->subdisks - 1; 167 168 /* 169 * The number of the subdisk in which 170 * the start is located. 171 */ 172 m.firstsdno = m.stripeoffset / plex->stripesize; 173 if (m.firstsdno >= m.psdno) /* at or past parity sd */ 174 m.firstsdno++; /* increment it */ 175 176 /* 177 * The offset from the beginning of 178 * the stripe on this subdisk. 
179 */ 180 m.initoffset = m.stripeoffset % plex->stripesize; 181 182 /* The offset of the stripe start relative to this subdisk */ 183 m.sdbase = m.stripebase / (plex->subdisks - 1); 184 185 m.useroffset = *diskaddr - m.diskstart; /* The offset of the start in the user buffer */ 186 187 /* 188 * The number of sectors to transfer in the 189 * current (first) subdisk. 190 */ 191 m.initlen = min(diskend - *diskaddr, /* the amount remaining to transfer */ 192 plex->stripesize - m.initoffset); /* and the amount left in this block */ 193 194 /* 195 * The number of sectors to transfer in this stripe 196 * is the minumum of the amount remaining to transfer 197 * and the amount left in this stripe. 198 */ 199 m.stripesectors = min(diskend - *diskaddr, 200 plex->stripesize * (plex->subdisks - 1) - m.stripeoffset); 201 202 /* The number of data subdisks involved in this request */ 203 m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize; 204 205 /* Part B: decide what kind of transfer this will be. 206 207 * start and end addresses of the transfer in 208 * the current block. 209 * 210 * There are a number of different kinds of 211 * transfer, each of which relates to a 212 * specific subdisk: 213 * 214 * 1. Normal read. All participating subdisks 215 * are up, and the transfer can be made 216 * directly to the user buffer. The bounds 217 * of the transfer are described by 218 * m.dataoffset and m.datalen. We have 219 * already calculated m.initoffset and 220 * m.initlen, which define the parameters 221 * for the first data block. 222 * 223 * 2. Recovery read. One participating 224 * subdisk is down. To recover data, all 225 * the other subdisks, including the parity 226 * subdisk, must be read. The data is 227 * recovered by exclusive-oring all the 228 * other blocks. The bounds of the 229 * transfer are described by m.groupoffset 230 * and m.grouplen. 231 * 232 * 3. 
A read request may request reading both 233 * available data (normal read) and 234 * non-available data (recovery read). 235 * This can be a problem if the address 236 * ranges of the two reads do not coincide: 237 * in this case, the normal read needs to 238 * be extended to cover the address range 239 * of the recovery read, and must thus be 240 * performed out of malloced memory. 241 * 242 * 4. Normal write. All the participating 243 * subdisks are up. The bounds of the 244 * transfer are described by m.dataoffset 245 * and m.datalen. Since these values 246 * differ for each block, we calculate the 247 * bounds for the parity block 248 * independently as the maximum of the 249 * individual blocks and store these values 250 * in m.writeoffset and m.writelen. This 251 * write proceeds in four phases: 252 * 253 * i. Read the old contents of each block 254 * and the parity block. 255 * ii. ``Remove'' the old contents from 256 * the parity block with exclusive or. 257 * iii. ``Insert'' the new contents of the 258 * block in the parity block, again 259 * with exclusive or. 260 * 261 * iv. Write the new contents of the data 262 * blocks and the parity block. The data 263 * block transfers can be made directly from 264 * the user buffer. 265 * 266 * 5. Degraded write where the data block is 267 * not available. The bounds of the 268 * transfer are described by m.groupoffset 269 * and m.grouplen. This requires the 270 * following steps: 271 * 272 * i. Read in all the other data blocks, 273 * excluding the parity block. 274 * 275 * ii. Recreate the parity block from the 276 * other data blocks and the data to be 277 * written. 278 * 279 * iii. Write the parity block. 280 * 281 * 6. Parityless write, a write where the 282 * parity block is not available. This is 283 * in fact the simplest: just write the 284 * data blocks. This can proceed directly 285 * from the user buffer. The bounds of the 286 * transfer are described by m.dataoffset 287 * and m.datalen. 288 * 289 * 7. 
Combination of degraded data block write 290 * and normal write. In this case the 291 * address ranges of the reads may also 292 * need to be extended to cover all 293 * participating blocks. 294 * 295 * All requests in a group transfer transfer 296 * the same address range relative to their 297 * subdisk. The individual transfers may 298 * vary, but since our group of requests is 299 * all in a single slice, we can define a 300 * range in which they all fall. 301 * 302 * In the following code section, we determine 303 * which kind of transfer we will perform. If 304 * there is a group transfer, we also decide 305 * its bounds relative to the subdisks. At 306 * the end, we have the following values: 307 * 308 * m.flags indicates the kinds of transfers 309 * we will perform. 310 * m.initoffset indicates the offset of the 311 * beginning of any data operation relative 312 * to the beginning of the stripe base. 313 * m.initlen specifies the length of any data 314 * operation. 315 * m.dataoffset contains the same value as 316 * m.initoffset. 317 * m.datalen contains the same value as 318 * m.initlen. Initially dataoffset and 319 * datalen describe the parameters for the 320 * first data block; while building the data 321 * block requests, they are updated for each 322 * block. 323 * m.groupoffset indicates the offset of any 324 * group operation relative to the beginning 325 * of the stripe base. 326 * m.grouplen specifies the length of any 327 * group operation. 328 * m.writeoffset indicates the offset of a 329 * normal write relative to the beginning of 330 * the stripe base. This value differs from 331 * m.dataoffset in that it applies to the 332 * entire operation, and not just the first 333 * block. 334 * m.writelen specifies the total span of a 335 * normal write operation. writeoffset and 336 * writelen are used to define the parity 337 * block. 338 */ 339 m.groupoffset = 0; /* assume no group... 
*/ 340 m.grouplen = 0; /* until we know we have one */ 341 m.writeoffset = m.initoffset; /* start offset of transfer */ 342 m.writelen = 0; /* nothing to write yet */ 343 m.flags = 0; /* no flags yet */ 344 rsectors = m.stripesectors; /* remaining sectors to examine */ 345 m.dataoffset = m.initoffset; /* start at the beginning of the transfer */ 346 m.datalen = m.initlen; 347 348 if (m.sdcount > 1) { 349 plex->multiblock++; /* more than one block for the request */ 350 /* 351 * If we have two transfers that don't overlap, 352 * (one at the end of the first block, the other 353 * at the beginning of the second block), 354 * it's cheaper to split them. 355 */ 356 if (rsectors < plex->stripesize) { 357 m.sdcount = 1; /* just one subdisk */ 358 m.stripesectors = m.initlen; /* and just this many sectors */ 359 rsectors = m.initlen; /* and in the loop counter */ 360 } 361 } 362 if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? */ 363 m.badsdno = m.psdno; /* note that it's down */ 364 if (bp->b_flags & B_READ) { /* read operation */ 365 for (mysdno = m.firstsdno; rsectors > 0; mysdno++) { 366 if (mysdno == m.psdno) /* ignore parity on read */ 367 mysdno++; 368 if (mysdno == plex->subdisks) /* wraparound */ 369 mysdno = 0; 370 if (mysdno == m.psdno) /* parity, */ 371 mysdno++; /* we've given already */ 372 373 if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */ 374 if (m.badsdno >= 0) /* we had one already, */ 375 return REQUEST_DOWN; /* we can't take a second */ 376 m.badsdno = mysdno; /* got the first */ 377 m.groupoffset = m.dataoffset; /* define the bounds */ 378 m.grouplen = m.datalen; 379 m.flags |= XFR_RECOVERY_READ; /* we need recovery */ 380 plex->recovered_reads++; /* count another one */ 381 } else 382 m.flags |= XFR_NORMAL_READ; /* normal read */ 383 384 /* Update the pointers for the next block */ 385 m.dataoffset = 0; /* back to the start of the stripe */ 386 rsectors -= m.datalen; /* remaining sectors to 
examine */ 387 m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */ 388 } 389 } else { /* write operation */ 390 for (mysdno = m.firstsdno; rsectors > 0; mysdno++) { 391 if (mysdno == m.psdno) /* parity stripe, we've dealt with that */ 392 mysdno++; 393 if (mysdno == plex->subdisks) /* wraparound */ 394 mysdno = 0; 395 if (mysdno == m.psdno) /* parity, */ 396 mysdno++; /* we've given already */ 397 398 sd = &SD[plex->sdnos[mysdno]]; 399 if (sd->state != sd_up) { 400 enum requeststatus s; 401 402 s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ 403 if (s && (m.badsdno >= 0)) { /* second bad disk, */ 404 int sdno; 405 /* 406 * If the parity disk is down, there's 407 * no recovery. We make all involved 408 * subdisks stale. Otherwise, we 409 * should be able to recover, but it's 410 * like pulling teeth. Fix it later. 411 */ 412 for (sdno = 0; sdno < m.sdcount; sdno++) { 413 struct sd *sd = &SD[plex->sdnos[sdno]]; 414 if (sd->state >= sd_reborn) /* sort of up, */ 415 set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */ 416 } 417 return s; /* and crap out */ 418 } 419 m.badsdno = mysdno; /* note which one is bad */ 420 m.flags |= XFR_DEGRADED_WRITE; /* we need recovery */ 421 plex->degraded_writes++; /* count another one */ 422 m.groupoffset = m.dataoffset; /* define the bounds */ 423 m.grouplen = m.datalen; 424 } else { 425 m.flags |= XFR_NORMAL_WRITE; /* normal write operation */ 426 if (m.writeoffset > m.dataoffset) { /* move write operation lower */ 427 m.writelen = max(m.writeoffset + m.writelen, 428 m.dataoffset + m.datalen) 429 - m.dataoffset; 430 m.writeoffset = m.dataoffset; 431 } else 432 m.writelen = max(m.writeoffset + m.writelen, 433 m.dataoffset + m.datalen) 434 - m.writeoffset; 435 } 436 437 /* Update the pointers for the next block */ 438 m.dataoffset = 0; /* back to the start of the stripe */ 439 rsectors -= m.datalen; /* remaining sectors to examine */ 440 m.datalen = 
min(rsectors, plex->stripesize); /* amount that will fit in this block */ 441 } 442 if (m.badsdno == m.psdno) { /* got a bad parity block, */ 443 struct sd *psd = &SD[plex->sdnos[m.psdno]]; 444 445 if (psd->state == sd_down) 446 set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */ 447 else if (psd->state == sd_crashed) 448 set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */ 449 m.flags &= ~XFR_NORMAL_WRITE; /* this write isn't normal, */ 450 m.flags |= XFR_PARITYLESS_WRITE; /* it's parityless */ 451 plex->parityless_writes++; /* count another one */ 452 } 453 } 454 455 /* reset the initial transfer values */ 456 m.dataoffset = m.initoffset; /* start at the beginning of the transfer */ 457 m.datalen = m.initlen; 458 459 /* decide how many requests we need */ 460 if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE)) 461 /* doing a recovery read or degraded write, */ 462 m.rqcount = plex->subdisks; /* all subdisks */ 463 else if (m.flags & XFR_NORMAL_WRITE) /* normal write, */ 464 m.rqcount = m.sdcount + 1; /* all data blocks and the parity block */ 465 else /* parityless write or normal read */ 466 m.rqcount = m.sdcount; /* just the data blocks */ 467 468 /* Part C: build the requests */ 469 rqg = allocrqg(rq, m.rqcount); /* get a request group */ 470 if (rqg == NULL) { /* malloc failed */ 471 bp->b_error = ENOMEM; 472 bp->b_flags |= B_ERROR; 473 return REQUEST_ENOMEM; 474 } 475 rqg->plexno = plexno; 476 rqg->flags = m.flags; 477 rqno = 0; /* index in the request group */ 478 479 /* 1: PARITY BLOCK */ 480 /* 481 * Are we performing an operation which requires parity? In that case, 482 * work out the parameters and define the parity block. 
483 * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE 484 */ 485 if (m.flags & XFR_PARITYOP) { /* need parity */ 486 rqe = &rqg->rqe[rqno]; /* point to element */ 487 sd = &SD[plex->sdnos[m.psdno]]; /* the subdisk in question */ 488 rqe->rqg = rqg; /* point back to group */ 489 rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */ 490 &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuf */ 491 setrqebounds(rqe, &m); /* set up the bounds of the transfer */ 492 rqe->sdno = sd->sdno; /* subdisk number */ 493 rqe->driveno = sd->driveno; 494 if (build_rq_buffer(rqe, plex)) /* build the buffer */ 495 return REQUEST_ENOMEM; /* can't do it */ 496 rqe->b.b_flags |= B_READ; /* we must read first */ 497 m.sdcount++; /* adjust the subdisk count */ 498 rqno++; /* and point to the next request */ 499 } 500 /* 501 * 2: DATA BLOCKS 502 * Now build up requests for the blocks required 503 * for individual transfers 504 */ 505 for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) { 506 if (mysdno == m.psdno) /* parity, */ 507 mysdno++; /* we've given already */ 508 if (mysdno == plex->subdisks) /* got to the end, */ 509 mysdno = 0; /* wrap around */ 510 if (mysdno == m.psdno) /* parity, */ 511 mysdno++; /* we've given already */ 512 513 rqe = &rqg->rqe[rqno]; /* point to element */ 514 sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */ 515 rqe->rqg = rqg; /* point to group */ 516 if (m.flags & XFR_NEEDS_MALLOC) /* we need a malloced buffer first */ 517 rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */ 518 else 519 rqe->flags = m.flags | XFR_DATA_BLOCK; /* transfer flags */ 520 if (mysdno == m.badsdno) { /* this is the bad subdisk */ 521 rqg->badsdno = rqno; /* note which one */ 522 rqe->flags |= XFR_BAD_SUBDISK; /* note that it's dead */ 523 /* 524 * we can't read or write from/to it, 525 * but we don't need to malloc 526 */ 527 rqe->flags &= 
~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE); 528 } 529 setrqebounds(rqe, &m); /* set up the bounds of the transfer */ 530 rqe->useroffset = m.useroffset; /* offset in user buffer */ 531 rqe->sdno = sd->sdno; /* subdisk number */ 532 rqe->driveno = sd->driveno; 533 if (build_rq_buffer(rqe, plex)) /* build the buffer */ 534 return REQUEST_ENOMEM; /* can't do it */ 535 if ((m.flags & XFR_PARITYOP) /* parity operation, */ 536 &&((m.flags & XFR_BAD_SUBDISK) == 0)) /* and not the bad subdisk, */ 537 rqe->b.b_flags |= B_READ; /* we must read first */ 538 539 /* Now update pointers for the next block */ 540 *diskaddr += m.datalen; /* skip past what we've done */ 541 m.stripesectors -= m.datalen; /* deduct from what's left */ 542 m.useroffset += m.datalen; /* and move on in the user buffer */ 543 m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */ 544 m.dataoffset = 0; /* start at the beginning of next block */ 545 } 546 547 /* 548 * 3: REMAINING BLOCKS FOR RECOVERY 549 * Finally, if we have a recovery operation, build 550 * up transfers for the other subdisks. Follow the 551 * subdisks around until we get to where we started. 552 * These requests use only the group parameters. 
553 */ 554 if ((rqno < m.rqcount) /* haven't done them all already */ 555 &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) { 556 for (; rqno < m.rqcount; rqno++, mysdno++) { 557 if (mysdno == m.psdno) /* parity, */ 558 mysdno++; /* we've given already */ 559 if (mysdno == plex->subdisks) /* got to the end, */ 560 mysdno = 0; /* wrap around */ 561 if (mysdno == m.psdno) /* parity, */ 562 mysdno++; /* we've given already */ 563 564 rqe = &rqg->rqe[rqno]; /* point to element */ 565 sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */ 566 rqe->rqg = rqg; /* point to group */ 567 568 rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */ 569 rqe->dataoffset = 0; /* for tidiness' sake */ 570 rqe->groupoffset = 0; /* group starts at the beginining */ 571 rqe->datalen = 0; 572 rqe->grouplen = m.grouplen; 573 rqe->buflen = m.grouplen; 574 rqe->flags = (m.flags | XFR_MALLOCED) /* transfer flags without data op stuf */ 575 &~XFR_DATAOP; 576 rqe->sdno = sd->sdno; /* subdisk number */ 577 rqe->driveno = sd->driveno; 578 if (build_rq_buffer(rqe, plex)) /* build the buffer */ 579 return REQUEST_ENOMEM; /* can't do it */ 580 rqe->b.b_flags |= B_READ; /* we must read first */ 581 } 582 } 583 /* 584 * We need to lock the address range before 585 * doing anything. We don't have to be 586 * performing a recovery operation: somebody 587 * else could be doing so, and the results could 588 * influence us. Note the fact here, we'll perform 589 * the lock in launch_requests. 590 */ 591 rqg->lockbase = m.stripebase; 592 if (*diskaddr < diskend) /* didn't finish the request on this stripe */ 593 plex->multistripe++; /* count another one */ 594 } 595 return REQUEST_OK; 596 } 597 598 /* 599 * Helper function for rqe5: adjust the bounds of 600 * the transfers to minimize the buffer 601 * allocation. 602 * 603 * Each request can handle two of three different 604 * data ranges: 605 * 606 * 1. 
The range described by the parameters
 *    dataoffset and datalen, for normal read or
 *    parityless write.
 * 2. The range described by the parameters
 *    groupoffset and grouplen, for recovery read
 *    and degraded write.
 * 3. For normal write, the range depends on the
 *    kind of block.  For data blocks, the range
 *    is defined by dataoffset and datalen.  For
 *    parity blocks, it is defined by writeoffset
 *    and writelen.
 *
 * In order not to allocate more memory than
 * necessary, this function adjusts the bounds
 * parameter for each request to cover just the
 * minimum necessary for the function it performs.
 * This will normally vary from one request to the
 * next.
 *
 * Things are slightly different for the parity
 * block.  In this case, the bounds defined by
 * mp->writeoffset and mp->writelen also play a
 * role.  That case is recognized from the request
 * itself: both XFR_NORMAL_WRITE and
 * XFR_PARITY_BLOCK are set in rqe->flags.
 *
 * On return the following fields of *rqe are set:
 * sdoffset (mp->sdbase plus the lower of the
 * ranges in use), dataoffset and groupoffset
 * (each relative to sdoffset), datalen, grouplen,
 * and buflen (the larger of the two range ends).
 */
void
setrqebounds(struct rqelement *rqe, struct metrics *mp)
{
    const int paritywrite = XFR_NORMAL_WRITE | XFR_PARITY_BLOCK;
    int wantdata;                       /* request carries a data range */
    int wantgroup;                      /* request carries a group range */
    int datastart = 0;                  /* start of the data range in the stripe */
    int dataspan = 0;                   /* length of the data range */

    /* Step 1: decide which ranges this request uses */
    if ((rqe->flags & paritywrite) == paritywrite) {
        /* parity block of a normal write (case 3): use the write bounds */
        wantdata = 1;
        wantgroup = (rqe->flags & XFR_DEGRADED_WRITE) != 0;
        datastart = mp->writeoffset;
        dataspan = mp->writelen;
    } else if (rqe->flags & XFR_DATAOP) {
        /* data operation (case 1 or 3), possibly with a group operation */
        wantdata = 1;
        wantgroup = (rqe->flags & XFR_GROUPOP) != 0;
        datastart = mp->dataoffset;
        dataspan = mp->datalen;
    } else {
        /* just group operations (case 2) */
        wantdata = 0;
        wantgroup = 1;
    }

    /* Step 2: translate the chosen ranges into request bounds */
    if (!wantdata) {
        /* group range only */
        rqe->sdoffset = mp->sdbase + mp->groupoffset;
        rqe->dataoffset = 0;
        rqe->groupoffset = 0;
        rqe->datalen = 0;
        rqe->grouplen = mp->grouplen;
    } else if (!wantgroup) {
        /* data range only */
        rqe->sdoffset = mp->sdbase + datastart;
        rqe->dataoffset = 0;
        rqe->groupoffset = 0;
        rqe->datalen = dataspan;
        rqe->grouplen = 0;
    } else {
        /*
         * Both ranges: the transfer starts at whichever
         * begins lower, and the other one is expressed
         * relative to it.
         *
         * For the parity block of a combined normal and
         * degraded write, the area of the degraded write
         * is zeroed in the second phase and would not
         * need to be read.  The buffer size is currently
         * conveyed by the length of the read, however,
         * so everything is read, even what will be
         * overwritten.  FIXME XXX
         */
        if (mp->groupoffset < datastart) {
            rqe->sdoffset = mp->sdbase + mp->groupoffset;
            rqe->dataoffset = datastart - mp->groupoffset;
            rqe->groupoffset = 0;
        } else {
            rqe->sdoffset = mp->sdbase + datastart;
            rqe->dataoffset = 0;
            rqe->groupoffset = mp->groupoffset - datastart;
        }
        rqe->datalen = dataspan;
        rqe->grouplen = mp->grouplen;
    }

    /* total buffer length: far enough to hold whichever range ends last */
    rqe->buflen = max(rqe->dataoffset + rqe->datalen,
        rqe->groupoffset + rqe->grouplen);
}
/* Local Variables: */
/* fill-column: 50 */
/* End: */