/*-
 * Copyright (c) 1997, 1998
 *	Cybernet Corporation and Nan Yang Computer Services Limited.
 *	All rights reserved.
 *
 * This software was developed as part of the NetMAX project.
 *
 * Written by Greg Lehey
 *
 * This software is distributed under the so-called ``Berkeley
 * License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Cybernet Corporation
 *	and Nan Yang Computer Services Limited
 * 4. Neither the name of the Companies nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
 * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $
 */
#include "vinumhdr.h"
#include "request.h"
#include <sys/resourcevar.h>

/*
 * Parameters which describe the current transfer.
 * These are only used for calculation, but they
 * need to be passed to other functions, so it's
 * tidier to put them in a struct.
 */
struct metrics {
    vinum_off_t stripebase;                             /* base address of stripe (1st subdisk) */
    int stripeoffset;                                   /* offset in stripe */
    int stripesectors;                                  /* total sectors to transfer in this stripe */
    vinum_off_t sdbase;                                 /* offset in subdisk of stripe base */
    int sdcount;                                        /* number of disks involved in this transfer */
    vinum_off_t diskstart;                              /* remember where this transfer starts */
    int psdno;                                          /* number of parity subdisk */
    int badsdno;                                        /* number of down subdisk, if there is one */
    int firstsdno;                                      /* first data subdisk number */
    /* These correspond to the fields in rqelement, sort of */
    int useroffset;
    /*
     * Initial offset and length values for the first
     * data block
     */
    int initoffset;                                     /* start address of block to transfer */
    short initlen;                                      /* length in sectors of data transfer */
    /* Define a normal operation */
    int dataoffset;                                     /* start address of block to transfer */
    int datalen;                                        /* length in sectors of data transfer */
    /* Define a group operation */
    int groupoffset;                                    /* subdisk offset of group operation */
    int grouplen;                                       /* length in sectors of group operation */
    /* Define a normal write operation */
    int writeoffset;                                    /* subdisk offset of normal write */
    int writelen;                                       /* length in sectors of write operation */
    enum xferinfo flags;                                /* to check what we're doing */
    int rqcount;                                        /* number of elements in request */
};

enum requeststatus bre5(struct request *rq,
    int plexno,
    vinum_off_t * diskstart,
    vinum_off_t diskend);
void complete_raid5_write(struct rqelement *);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
void setrqebounds(struct rqelement *rqe, struct metrics *mp);

/*
 * Define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return 0 if all subdisks involved in the
 * request are up, 1 if some subdisks are not up,
 * and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the
 * end address.  On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned.  The assignment is performed
 * by expandrq().  Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    vinum_off_t * diskaddr,
    vinum_off_t diskend)
{
    struct metrics m;                                   /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct bio *bio;                                    /* user's bp */
    struct buf *bp;
    struct rqgroup *rqg;                                /* the request group that we will create */
    struct rqelement *rqe;                              /* point to this request information */
    int rsectors;                                       /* sectors remaining in this stripe */
    int mysdno;                                         /* another sd index in loops */
    int rqno;                                           /* request number */

    rqg = NULL;                                         /* shut up, damn compiler */
    m.diskstart = *diskaddr;                            /* start of transfer */
    bio = rq->bio;                                      /* buffer pointer */
    bp = bio->bio_buf;
    plex = &PLEX[plexno];                               /* point to the plex */

    while (*diskaddr < diskend) {                       /* until we get it all sorted out */
        if (*diskaddr >= plex->length)                  /* beyond the end of the plex */
            return REQUEST_EOF;                         /* can't continue */

        m.badsdno = -1;                                 /* no bad subdisk yet */

        /* Part A: Define the request */
        /*
         * First, calculate some sizes:
         * The offset of the start address from
         * the start of the stripe.
         */
        m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

        /*
         * The plex-relative address of the
         * start of the stripe.
         */
        m.stripebase = *diskaddr - m.stripeoffset;

        /* subdisk containing the parity stripe */
        if (plex->organization == plex_raid5)
            m.psdno = plex->subdisks - 1
                - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
                % plex->subdisks;
        else                                            /* RAID-4 */
            m.psdno = plex->subdisks - 1;

        /*
         * The number of the subdisk in which
         * the start is located.
         */
        m.firstsdno = m.stripeoffset / plex->stripesize;
        if (m.firstsdno >= m.psdno)                     /* at or past parity sd */
            m.firstsdno++;                              /* increment it */

        /*
         * The offset from the beginning of
         * the stripe on this subdisk.
         */
        m.initoffset = m.stripeoffset % plex->stripesize;

        /* The offset of the stripe start relative to this subdisk */
        m.sdbase = m.stripebase / (plex->subdisks - 1);

        m.useroffset = *diskaddr - m.diskstart;         /* the offset of the start in the user buffer */

        /*
         * The number of sectors to transfer in the
         * current (first) subdisk.
         */
        m.initlen = umin(diskend - *diskaddr,           /* the amount remaining to transfer */
            plex->stripesize - m.initoffset);           /* and the amount left in this block */

        /*
         * The number of sectors to transfer in this stripe
         * is the minimum of the amount remaining to transfer
         * and the amount left in this stripe.
         */
        m.stripesectors = umin(diskend - *diskaddr,
            plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

        /* The number of data subdisks involved in this request */
        m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
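
        /*
         * Worked example of the calculations above (values assumed
         * purely for illustration): with plex->subdisks = 5,
         * plex->stripesize = 128 sectors and *diskaddr = 700, the
         * data width of one stripe is 128 * 4 = 512 sectors, so
         *
         *   m.stripeoffset = 700 % 512 = 188
         *   m.stripebase   = 700 - 188 = 512
         *   m.psdno        = 4 - (700 / 512) % 5 = 3
         *   m.firstsdno    = 188 / 128 = 1   (below m.psdno, so not bumped)
         *   m.initoffset   = 188 % 128 = 60
         *   m.sdbase       = 512 / 4 = 128
         *
         * If 200 sectors remain to transfer, m.initlen is
         * min(200, 128 - 60) = 68, m.stripesectors is
         * min(200, 512 - 188) = 200 and m.sdcount is
         * (200 + 60 + 127) / 128 = 3: 68 sectors on subdisk 1,
         * 128 on subdisk 2 and 4 on subdisk 4 (subdisk 3 holds the
         * parity block for this stripe).
         */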

        /*
         * Part B: decide what kind of transfer this will be, and
         * determine the start and end addresses of the transfer in
         * the current block.
         *
         * There are a number of different kinds of
         * transfer, each of which relates to a
         * specific subdisk:
         *
         * 1.  Normal read.  All participating subdisks
         *     are up, and the transfer can be made
         *     directly to the user buffer.  The bounds
         *     of the transfer are described by
         *     m.dataoffset and m.datalen.  We have
         *     already calculated m.initoffset and
         *     m.initlen, which define the parameters
         *     for the first data block.
         *
         * 2.  Recovery read.  One participating
         *     subdisk is down.  To recover data, all
         *     the other subdisks, including the parity
         *     subdisk, must be read.  The data is
         *     recovered by exclusive-oring all the
         *     other blocks.  The bounds of the
         *     transfer are described by m.groupoffset
         *     and m.grouplen.
         *
         * 3.  A read request may request reading both
         *     available data (normal read) and
         *     non-available data (recovery read).
         *     This can be a problem if the address
         *     ranges of the two reads do not coincide:
         *     in this case, the normal read needs to
         *     be extended to cover the address range
         *     of the recovery read, and must thus be
         *     performed out of malloced memory.
         *
         * 4.  Normal write.  All the participating
         *     subdisks are up.  The bounds of the
         *     transfer are described by m.dataoffset
         *     and m.datalen.  Since these values
         *     differ for each block, we calculate the
         *     bounds for the parity block
         *     independently as the maximum of the
         *     individual blocks and store these values
         *     in m.writeoffset and m.writelen.  This
         *     write proceeds in four phases (a numeric
         *     illustration follows this comment):
         *
         *     i.   Read the old contents of each block
         *          and the parity block.
         *     ii.  ``Remove'' the old contents from
         *          the parity block with exclusive or.
         *     iii. ``Insert'' the new contents of the
         *          block in the parity block, again
         *          with exclusive or.
         *     iv.  Write the new contents of the data
         *          blocks and the parity block.  The data
         *          block transfers can be made directly from
         *          the user buffer.
         *
         * 5.  Degraded write where the data block is
         *     not available.  The bounds of the
         *     transfer are described by m.groupoffset
         *     and m.grouplen.  This requires the
         *     following steps:
         *
         *     i.   Read in all the other data blocks,
         *          excluding the parity block.
         *
         *     ii.  Recreate the parity block from the
         *          other data blocks and the data to be
         *          written.
         *
         *     iii. Write the parity block.
         *
         * 6.  Parityless write, a write where the
         *     parity block is not available.  This is
         *     in fact the simplest: just write the
         *     data blocks.  This can proceed directly
         *     from the user buffer.  The bounds of the
         *     transfer are described by m.dataoffset
         *     and m.datalen.
         *
         * 7.  Combination of degraded data block write
         *     and normal write.  In this case the
         *     address ranges of the reads may also
         *     need to be extended to cover all
         *     participating blocks.
         *
         * All requests in a group operation transfer
         * the same address range relative to their
         * subdisk.  The individual transfers may
         * vary, but since our group of requests is
         * all in a single slice, we can define a
         * range in which they all fall.
         *
         * In the following code section, we determine
         * which kind of transfer we will perform.  If
         * there is a group transfer, we also decide
         * its bounds relative to the subdisks.  At
         * the end, we have the following values:
         *
         *   m.flags indicates the kinds of transfers
         *     we will perform.
         *   m.initoffset indicates the offset of the
         *     beginning of any data operation relative
         *     to the beginning of the stripe base.
         *   m.initlen specifies the length of any data
         *     operation.
         *   m.dataoffset contains the same value as
         *     m.initoffset.
         *   m.datalen contains the same value as
         *     m.initlen.  Initially dataoffset and
         *     datalen describe the parameters for the
         *     first data block; while building the data
         *     block requests, they are updated for each
         *     block.
         *   m.groupoffset indicates the offset of any
         *     group operation relative to the beginning
         *     of the stripe base.
         *   m.grouplen specifies the length of any
         *     group operation.
         *   m.writeoffset indicates the offset of a
         *     normal write relative to the beginning of
         *     the stripe base.  This value differs from
         *     m.dataoffset in that it applies to the
         *     entire operation, and not just the first
         *     block.
         *   m.writelen specifies the total span of a
         *     normal write operation.  writeoffset and
         *     writelen are used to define the parity
         *     block.
         */
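        /*
         * For illustration, with assumed single-byte values (one
         * sector position of the blocks involved): if the old data
         * block holds 0x3c, the old parity block 0x5a and the new
         * data 0x0f, phases i to iii of a normal write give
         *
         *   P = 0x5a ^ 0x3c = 0x66    (``remove'' the old data)
         *   P = 0x66 ^ 0x0f = 0x69    (``insert'' the new data)
         *
         * and phase iv writes 0x0f and 0x69 back, so the parity
         * block again holds the exclusive or of all the data
         * blocks.  A recovery read works the same way in reverse:
         * the missing block is the exclusive or of the parity block
         * and the surviving data blocks.
         */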
        m.groupoffset = 0;                              /* assume no group... */
        m.grouplen = 0;                                 /* until we know we have one */
        m.writeoffset = m.initoffset;                   /* start offset of transfer */
        m.writelen = 0;                                 /* nothing to write yet */
        m.flags = 0;                                    /* no flags yet */
        rsectors = m.stripesectors;                     /* remaining sectors to examine */
        m.dataoffset = m.initoffset;                    /* start at the beginning of the transfer */
        m.datalen = m.initlen;

        if (m.sdcount > 1) {
            plex->multiblock++;                         /* more than one block for the request */
            /*
             * If we have two transfers that don't overlap
             * (one at the end of the first block, the other
             * at the beginning of the second block),
             * it's cheaper to split them.
             */
            if (rsectors < plex->stripesize) {
                m.sdcount = 1;                          /* just one subdisk */
                m.stripesectors = m.initlen;            /* and just this many sectors */
                rsectors = m.initlen;                   /* and in the loop counter */
            }
        }
        if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? */
            m.badsdno = m.psdno;                        /* note that it's down */
        if (bp->b_cmd == BUF_CMD_READ) {                /* read operation */
            for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
                if (mysdno == m.psdno)                  /* ignore parity on read */
                    mysdno++;
                if (mysdno == plex->subdisks)           /* wraparound */
                    mysdno = 0;
                if (mysdno == m.psdno)                  /* parity, */
                    mysdno++;                           /* we've given already */

                if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
                    if (m.badsdno >= 0)                 /* we had one already, */
                        return REQUEST_DOWN;            /* we can't take a second */
                    m.badsdno = mysdno;                 /* got the first */
                    m.groupoffset = m.dataoffset;       /* define the bounds */
                    m.grouplen = m.datalen;
                    m.flags |= XFR_RECOVERY_READ;       /* we need recovery */
                    plex->recovered_reads++;            /* count another one */
                } else
                    m.flags |= XFR_NORMAL_READ;         /* normal read */

                /* Update the pointers for the next block */
                m.dataoffset = 0;                       /* back to the start of the stripe */
                rsectors -= m.datalen;                  /* remaining sectors to examine */
                m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
            }
        } else {                                        /* write operation */
            for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
                if (mysdno == m.psdno)                  /* parity stripe, we've dealt with that */
                    mysdno++;
                if (mysdno == plex->subdisks)           /* wraparound */
                    mysdno = 0;
                if (mysdno == m.psdno)                  /* parity, */
                    mysdno++;                           /* we've given already */

                sd = &SD[plex->sdnos[mysdno]];
                if (sd->state != sd_up) {
                    enum requeststatus s;

                    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                    if (s && (m.badsdno >= 0)) {        /* second bad disk, */
                        int sdno;
                        /*
                         * If the parity disk is down, there's
                         * no recovery.  We make all involved
                         * subdisks stale.  Otherwise, we
                         * should be able to recover, but it's
                         * like pulling teeth.  Fix it later.
                         */
                        for (sdno = 0; sdno < m.sdcount; sdno++) {
                            struct sd *sd = &SD[plex->sdnos[sdno]];
                            if (sd->state >= sd_reborn) /* sort of up, */
                                set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
                        }
                        return s;                       /* and crap out */
                    }
                    m.badsdno = mysdno;                 /* note which one is bad */
                    m.flags |= XFR_DEGRADED_WRITE;      /* we need recovery */
                    plex->degraded_writes++;            /* count another one */
                    m.groupoffset = m.dataoffset;       /* define the bounds */
                    m.grouplen = m.datalen;
                } else {
                    m.flags |= XFR_NORMAL_WRITE;        /* normal write operation */
                    if (m.writeoffset > m.dataoffset) { /* move write operation lower */
                        m.writelen = umax(m.writeoffset + m.writelen,
                            m.dataoffset + m.datalen)
                            - m.dataoffset;
                        m.writeoffset = m.dataoffset;
                    } else
                        m.writelen = umax(m.writeoffset + m.writelen,
                            m.dataoffset + m.datalen)
                            - m.writeoffset;
                }

                /* Update the pointers for the next block */
                m.dataoffset = 0;                       /* back to the start of the stripe */
                rsectors -= m.datalen;                  /* remaining sectors to examine */
                m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
            }
            if (m.badsdno == m.psdno) {                 /* got a bad parity block, */
                struct sd *psd = &SD[plex->sdnos[m.psdno]];

                if (psd->state == sd_down)
                    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
                else if (psd->state == sd_crashed)
                    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
                m.flags &= ~XFR_NORMAL_WRITE;           /* this write isn't normal, */
                m.flags |= XFR_PARITYLESS_WRITE;        /* it's parityless */
                plex->parityless_writes++;              /* count another one */
            }
        }

        /* reset the initial transfer values */
        m.dataoffset = m.initoffset;                    /* start at the beginning of the transfer */
        m.datalen = m.initlen;

        /* decide how many requests we need */
        if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
            /* doing a recovery read or degraded write, */
            m.rqcount = plex->subdisks;                 /* all subdisks */
        else if (m.flags & XFR_NORMAL_WRITE)            /* normal write, */
            m.rqcount = m.sdcount + 1;                  /* all data blocks and the parity block */
        else                                            /* parityless write or normal read */
            m.rqcount = m.sdcount;                      /* just the data blocks */
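
        /*
         * For example, with the assumed 5-subdisk plex above: a
         * recovery read or degraded write needs requests for all 5
         * subdisks; a normal write touching 3 data blocks needs
         * m.sdcount + 1 = 4 requests (the data blocks plus the
         * parity block); a normal read or parityless write of the
         * same range needs only the 3 data block requests.
         */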
        /* Part C: build the requests */
        rqg = allocrqg(rq, m.rqcount);                  /* get a request group */
        if (rqg == NULL) {                              /* malloc failed */
            bp->b_error = ENOMEM;
            bp->b_flags |= B_ERROR;
            return REQUEST_ENOMEM;
        }
        rqg->plexno = plexno;
        rqg->flags = m.flags;
        rqno = 0;                                       /* index in the request group */

        /* 1: PARITY BLOCK */
        /*
         * Are we performing an operation which requires parity?  In that case,
         * work out the parameters and define the parity block.
         * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
         */
        if (m.flags & XFR_PARITYOP) {                   /* need parity */
            rqe = &rqg->rqe[rqno];                      /* point to element */
            sd = &SD[plex->sdnos[m.psdno]];             /* the subdisk in question */
            rqe->rqg = rqg;                             /* point back to group */
            rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
                &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuff */
            setrqebounds(rqe, &m);                      /* set up the bounds of the transfer */
            rqe->sdno = sd->sdno;                       /* subdisk number */
            rqe->driveno = sd->driveno;
            if (build_rq_buffer(rqe, plex))             /* build the buffer */
                return REQUEST_ENOMEM;                  /* can't do it */
            rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */
            m.sdcount++;                                /* adjust the subdisk count */
            rqno++;                                     /* and point to the next request */
        }
        /*
         * 2: DATA BLOCKS
         * Now build up requests for the blocks required
         * for individual transfers
         */
        for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
            if (mysdno == m.psdno)                      /* parity, */
                mysdno++;                               /* we've given already */
            if (mysdno == plex->subdisks)               /* got to the end, */
                mysdno = 0;                             /* wrap around */
            if (mysdno == m.psdno)                      /* parity, */
                mysdno++;                               /* we've given already */

            rqe = &rqg->rqe[rqno];                      /* point to element */
            sd = &SD[plex->sdnos[mysdno]];              /* the subdisk in question */
            rqe->rqg = rqg;                             /* point to group */
            if (m.flags & XFR_NEEDS_MALLOC)             /* we need a malloced buffer first */
                rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
            else
                rqe->flags = m.flags | XFR_DATA_BLOCK;  /* transfer flags */
            if (mysdno == m.badsdno) {                  /* this is the bad subdisk */
                rqg->badsdno = rqno;                    /* note which one */
                rqe->flags |= XFR_BAD_SUBDISK;          /* note that it's dead */
                /*
                 * we can't read or write from/to it,
                 * but we don't need to malloc
                 */
                rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
            }
            setrqebounds(rqe, &m);                      /* set up the bounds of the transfer */
            rqe->useroffset = m.useroffset;             /* offset in user buffer */
            rqe->sdno = sd->sdno;                       /* subdisk number */
            rqe->driveno = sd->driveno;
            if (build_rq_buffer(rqe, plex))             /* build the buffer */
                return REQUEST_ENOMEM;                  /* can't do it */
            if ((m.flags & XFR_PARITYOP)                /* parity operation, */
                && ((m.flags & XFR_BAD_SUBDISK) == 0))  /* and not the bad subdisk, */
                rqe->b.b_cmd = BUF_CMD_READ;            /* we must read first */

            /* Now update pointers for the next block */
            *diskaddr += m.datalen;                     /* skip past what we've done */
            m.stripesectors -= m.datalen;               /* deduct from what's left */
            m.useroffset += m.datalen;                  /* and move on in the user buffer */
            m.datalen = umin(m.stripesectors, plex->stripesize); /* and recalculate */
            m.dataoffset = 0;                           /* start at the beginning of next block */
        }

        /*
         * 3: REMAINING BLOCKS FOR RECOVERY
         * Finally, if we have a recovery operation, build
         * up transfers for the other subdisks.  Follow the
         * subdisks around until we get to where we started.
         * These requests use only the group parameters.
         */
        if ((rqno < m.rqcount)                          /* haven't done them all already */
            && (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
            for (; rqno < m.rqcount; rqno++, mysdno++) {
                if (mysdno == m.psdno)                  /* parity, */
                    mysdno++;                           /* we've given already */
                if (mysdno == plex->subdisks)           /* got to the end, */
                    mysdno = 0;                         /* wrap around */
                if (mysdno == m.psdno)                  /* parity, */
                    mysdno++;                           /* we've given already */

                rqe = &rqg->rqe[rqno];                  /* point to element */
                sd = &SD[plex->sdnos[mysdno]];          /* the subdisk in question */
                rqe->rqg = rqg;                         /* point to group */

                rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */
                rqe->dataoffset = 0;                    /* for tidiness' sake */
                rqe->groupoffset = 0;                   /* group starts at the beginning */
                rqe->datalen = 0;
                rqe->grouplen = m.grouplen;
                rqe->buflen = m.grouplen;
                rqe->flags = (m.flags | XFR_MALLOCED)   /* transfer flags without data op stuff */
                    &~XFR_DATAOP;
                rqe->sdno = sd->sdno;                   /* subdisk number */
                rqe->driveno = sd->driveno;
                if (build_rq_buffer(rqe, plex))         /* build the buffer */
                    return REQUEST_ENOMEM;              /* can't do it */
                rqe->b.b_cmd = BUF_CMD_READ;            /* we must read first */
            }
        }
        /*
         * We need to lock the address range before
         * doing anything.  We don't have to be
         * performing a recovery operation: somebody
         * else could be doing so, and the results could
         * influence us.  Note the fact here, we'll perform
         * the lock in launch_requests.
         */
        rqg->lockbase = m.stripebase;
        if (*diskaddr < diskend)                        /* didn't finish the request on this stripe */
            plex->multistripe++;                        /* count another one */
    }
    return REQUEST_OK;
}

/*
 * Helper function for bre5: adjust the bounds of
 * the transfers to minimize the buffer
 * allocation.
 *
 * Each request can handle two of three different
 * data ranges:
 *
 * 1.  The range described by the parameters
 *     dataoffset and datalen, for normal read or
 *     parityless write.
 * 2.  The range described by the parameters
 *     groupoffset and grouplen, for recovery read
 *     and degraded write.
 * 3.  For normal write, the range depends on the
 *     kind of block.  For data blocks, the range
 *     is defined by dataoffset and datalen.  For
 *     parity blocks, it is defined by writeoffset
 *     and writelen.
 *
 * In order not to allocate more memory than
 * necessary, this function adjusts the bounds
 * parameter for each request to cover just the
 * minimum necessary for the function it performs.
 * This will normally vary from one request to the
 * next.
 *
 * Things are slightly different for the parity
 * block.  In this case, the bounds defined by
 * mp->writeoffset and mp->writelen also play a
 * role.  This case is identified by the request
 * flags: both XFR_NORMAL_WRITE and
 * XFR_PARITY_BLOCK are set.
 */
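/*
 * For illustration (continuing the assumed example above, with a
 * recovery read whose group operation covers a whole 128-sector
 * block): mp->sdbase = 128, mp->dataoffset = 60, mp->datalen = 68,
 * mp->groupoffset = 0, mp->grouplen = 128.  Since the group
 * operation starts lower, the element for a surviving data subdisk
 * is set up as
 *
 *   rqe->sdoffset    = 128 + 0 = 128
 *   rqe->dataoffset  = 60 - 0  = 60
 *   rqe->groupoffset = 0
 *   rqe->datalen     = 68, rqe->grouplen = 128
 *   rqe->buflen      = max(60 + 68, 0 + 128) = 128
 *
 * so a single 128-sector buffer serves both the user data and the
 * recovery data for this subdisk.
 */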
void
setrqebounds(struct rqelement *rqe, struct metrics *mp)
{
    /* parity block of a normal write */
    if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
        == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) {     /* case 3 */
        if (rqe->flags & XFR_DEGRADED_WRITE) {          /* also degraded write */
            /*
             * With a combined normal and degraded write, we
             * will zero out the area of the degraded write
             * in the second phase, so we don't need to read
             * it in.  Unfortunately, we need a way to tell
             * build_request_buffer the size of the buffer,
             * and currently that's the length of the read.
             * As a result, we read everything, even the stuff
             * that we're going to nuke.
             * FIXME XXX
             */
            if (mp->groupoffset < mp->writeoffset) {    /* group operation starts lower */
                rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
                rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
                rqe->groupoffset = 0;                   /* and the group at the beginning */
            } else {                                    /* individual data starts first */
                rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
                rqe->dataoffset = 0;                    /* individual data starts at the beginning */
                rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
            }
            rqe->datalen = mp->writelen;
            rqe->grouplen = mp->grouplen;
        } else {                                        /* just normal write (case 3) */
            rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
            rqe->dataoffset = 0;                        /* degradation starts at the beginning */
            rqe->groupoffset = 0;                       /* for tidiness' sake */
            rqe->datalen = mp->writelen;
            rqe->grouplen = 0;
        }
    } else if (rqe->flags & XFR_DATAOP) {               /* data operation (case 1 or 3) */
        if (rqe->flags & XFR_GROUPOP) {                 /* also a group operation (case 2) */
            if (mp->groupoffset < mp->dataoffset) {     /* group operation starts lower */
                rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
                rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
                rqe->groupoffset = 0;                   /* and the group at the beginning */
            } else {                                    /* individual data starts first */
                rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
                rqe->dataoffset = 0;                    /* individual data starts at the beginning */
                rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
            }
            rqe->datalen = mp->datalen;
            rqe->grouplen = mp->grouplen;
        } else {                                        /* just data operation (case 1) */
            rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
            rqe->dataoffset = 0;                        /* data starts at the beginning */
            rqe->groupoffset = 0;                       /* for tidiness' sake */
            rqe->datalen = mp->datalen;
            rqe->grouplen = 0;
        }
    } else {                                            /* just group operations (case 2) */
        rqe->sdoffset = mp->sdbase + mp->groupoffset;   /* start of transfer */
        rqe->dataoffset = 0;                            /* for tidiness' sake */
        rqe->groupoffset = 0;                           /* group starts at the beginning */
        rqe->datalen = 0;
        rqe->grouplen = mp->grouplen;
    }
    rqe->buflen = umax(rqe->dataoffset + rqe->datalen,  /* total buffer length */
        rqe->groupoffset + rqe->grouplen);
}