1 /*- 2 * Copyright (c) 1997, 1998 3 * Cybernet Corporation and Nan Yang Computer Services Limited. 4 * All rights reserved. 5 * 6 * This software was developed as part of the NetMAX project. 7 * 8 * Written by Greg Lehey 9 * 10 * This software is distributed under the so-called ``Berkeley 11 * License'': 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed by Cybernet Corporation 24 * and Nan Yang Computer Services Limited 25 * 4. Neither the name of the Companies nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * This software is provided ``as is'', and any express or implied 30 * warranties, including, but not limited to, the implied warranties of 31 * merchantability and fitness for a particular purpose are disclaimed. 
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
 * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $
 * $DragonFly: src/sys/dev/raid/vinum/vinumraid5.c,v 1.5 2006/04/30 17:22:17 dillon Exp $
 */
#include "vinumhdr.h"
#include "request.h"
#include <sys/resourcevar.h>

/*
 * Parameters which describe the current transfer.
 * These are only used for calculation, but they
 * need to be passed to other functions, so it's
 * tidier to put them in a struct.
 *
 * All offsets and lengths are in sectors unless
 * noted otherwise.
 */
struct metrics {
    vinum_off_t stripebase;			    /* base address of stripe (1st subdisk) */
    int stripeoffset;				    /* offset in stripe */
    int stripesectors;				    /* total sectors to transfer in this stripe */
    vinum_off_t sdbase;				    /* offset in subdisk of stripe base */
    int sdcount;				    /* number of disks involved in this transfer */
    vinum_off_t diskstart;			    /* remember where this transfer starts */
    int psdno;					    /* number of parity subdisk */
    int badsdno;				    /* number of down subdisk, if there is one */
    int firstsdno;				    /* first data subdisk number */
    /* These correspond to the fields in rqelement, sort of */
    int useroffset;				    /* offset in the user buffer */
    /*
     * Initial offset and length values for the first
     * data block
     */
    int initoffset;				    /* start address of block to transfer */
    short initlen;				    /* length in sectors of data transfer */
    /* Define a normal operation */
    int dataoffset;				    /* start address of block to transfer */
    int datalen;				    /* length in sectors of data transfer */
    /* Define a group operation (recovery read / degraded write) */
    int groupoffset;				    /* subdisk offset of group operation */
    int grouplen;				    /* length in sectors of group operation */
    /* Define a normal write operation */
    int writeoffset;				    /* subdisk offset of normal write */
    int writelen;				    /* length in sectors of write operation */
    enum xferinfo flags;			    /* to check what we're doing */
    int rqcount;				    /* number of elements in request */
};

/* File-local prototypes */
enum requeststatus bre5(struct request *rq,
    int plexno,
    vinum_off_t * diskstart,
    vinum_off_t diskend);
void complete_raid5_write(struct rqelement *);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
void setrqebounds(struct rqelement *rqe, struct metrics *mp);

/*
 * define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return 0 if all subdisks involved in the
 * request are up, 1 if some subdisks are not up,
 * and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the
 * end address.  On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned.  The assignment is performed
 * by expandrq().  Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    vinum_off_t * diskaddr,
    vinum_off_t diskend)
{
    struct metrics m;					    /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct bio *bio;					    /* user's bp */
    struct buf *bp;
    struct rqgroup *rqg;				    /* the request group that we will create */
    struct rqelement *rqe;				    /* point to this request information */
    int rsectors;					    /* sectors remaining in this stripe */
    int mysdno;						    /* another sd index in loops */
    int rqno;						    /* request number */

    rqg = NULL;						    /* shut up, damn compiler */
    m.diskstart = *diskaddr;				    /* start of transfer */
    bio = rq->bio;					    /* buffer pointer */
    bp = bio->bio_buf;
    plex = &PLEX[plexno];				    /* point to the plex */


    while (*diskaddr < diskend) {			    /* until we get it all sorted out */
	if (*diskaddr >= plex->length)			    /* beyond the end of the plex */
	    return REQUEST_EOF;				    /* can't continue */

	m.badsdno = -1;					    /* no bad subdisk yet */

	/* Part A: Define the request */
	/*
	 * First, calculate some sizes:
	 * The offset of the start address from
	 * the start of the stripe.
	 */
	m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

	/*
	 * The plex-relative address of the
	 * start of the stripe.
	 */
	m.stripebase = *diskaddr - m.stripeoffset;

	/*
	 * Subdisk containing the parity stripe.
	 * RAID-5 rotates parity from the last subdisk
	 * downwards, one step per stripe; RAID-4 keeps
	 * it fixed on the last subdisk.
	 */
	if (plex->organization == plex_raid5)
	    m.psdno = plex->subdisks - 1
		- (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
		% plex->subdisks;
	else						    /* RAID-4 */
	    m.psdno = plex->subdisks - 1;

	/*
	 * The number of the subdisk in which
	 * the start is located.  Data subdisk
	 * numbering skips the parity subdisk.
	 */
	m.firstsdno = m.stripeoffset / plex->stripesize;
	if (m.firstsdno >= m.psdno)			    /* at or past parity sd */
	    m.firstsdno++;				    /* increment it */

	/*
	 * The offset from the beginning of
	 * the stripe on this subdisk.
	 */
	m.initoffset = m.stripeoffset % plex->stripesize;

	/* The offset of the stripe start relative to this subdisk */
	m.sdbase = m.stripebase / (plex->subdisks - 1);

	m.useroffset = *diskaddr - m.diskstart;		    /* The offset of the start in the user buffer */

	/*
	 * The number of sectors to transfer in the
	 * current (first) subdisk.
	 */
	m.initlen = umin(diskend - *diskaddr,		    /* the amount remaining to transfer */
	    plex->stripesize - m.initoffset);		    /* and the amount left in this block */

	/*
	 * The number of sectors to transfer in this stripe
	 * is the minimum of the amount remaining to transfer
	 * and the amount left in this stripe.
	 */
	m.stripesectors = umin(diskend - *diskaddr,
	    plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

	/* The number of data subdisks involved in this request */
	m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;

	/*
	 * Part B: decide what kind of transfer this will be.
	 *
	 * There are a number of different kinds of
	 * transfer, each of which relates to a
	 * specific subdisk:
	 *
	 * 1. Normal read: all participating subdisks up,
	 *    transfer directly to the user buffer; bounds
	 *    are m.dataoffset/m.datalen (initially
	 *    m.initoffset/m.initlen for the first block).
	 *
	 * 2. Recovery read: one subdisk down; read all
	 *    the others, including parity, and recover
	 *    by XOR.  Bounds are m.groupoffset/m.grouplen.
	 *
	 * 3. A single read may combine 1 and 2; if the
	 *    address ranges don't coincide, the normal
	 *    read is extended to cover the recovery range
	 *    and performed out of malloced memory.
	 *
	 * 4. Normal write: all subdisks up.  Per-block
	 *    bounds are m.dataoffset/m.datalen; the union
	 *    over all blocks, used for the parity block,
	 *    is kept in m.writeoffset/m.writelen.  The
	 *    write proceeds read-old, remove-old-from-
	 *    parity (XOR), insert-new (XOR), write-new.
	 *
	 * 5. Degraded write (data block down): bounds are
	 *    m.groupoffset/m.grouplen; read the other data
	 *    blocks, recreate parity, write parity only.
	 *
	 * 6. Parityless write (parity block down): just
	 *    write the data blocks directly from the user
	 *    buffer; bounds are m.dataoffset/m.datalen.
	 *
	 * 7. Combination of 4 and 5, in which case the
	 *    read ranges may also need extending.
	 *
	 * All requests in a group transfer use the same
	 * address range relative to their subdisk; since
	 * the group lies in a single stripe we can define
	 * one range covering them all.
	 *
	 * At the end of this section, m.flags indicates
	 * the kinds of transfer to perform, and the
	 * offset/length pairs above have their final
	 * values for the first block.
	 */
	m.groupoffset = 0;				    /* assume no group... */
	m.grouplen = 0;					    /* until we know we have one */
	m.writeoffset = m.initoffset;			    /* start offset of transfer */
	m.writelen = 0;					    /* nothing to write yet */
	m.flags = 0;					    /* no flags yet */
	rsectors = m.stripesectors;			    /* remaining sectors to examine */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	if (m.sdcount > 1) {
	    plex->multiblock++;				    /* more than one block for the request */
	    /*
	     * If we have two transfers that don't overlap,
	     * (one at the end of the first block, the other
	     * at the beginning of the second block),
	     * it's cheaper to split them.
	     */
	    if (rsectors < plex->stripesize) {
		m.sdcount = 1;				    /* just one subdisk */
		m.stripesectors = m.initlen;		    /* and just this many sectors */
		rsectors = m.initlen;			    /* and in the loop counter */
	    }
	}
	if (SD[plex->sdnos[m.psdno]].state < sd_reborn)	    /* is our parity subdisk down? */
	    m.badsdno = m.psdno;			    /* note that it's down */
	if (bp->b_cmd == BUF_CMD_READ) {		    /* read operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		/*
		 * Skip the parity subdisk, with wraparound;
		 * the double check handles parity landing
		 * on subdisk 0 after the wrap.
		 */
		if (mysdno == m.psdno)			    /* ignore parity on read */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
		    if (m.badsdno >= 0)			    /* we had one already, */
			return REQUEST_DOWN;		    /* we can't take a second */
		    m.badsdno = mysdno;			    /* got the first */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		    m.flags |= XFR_RECOVERY_READ;	    /* we need recovery */
		    plex->recovered_reads++;		    /* count another one */
		} else
		    m.flags |= XFR_NORMAL_READ;		    /* normal read */

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	} else {					    /* write operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		if (mysdno == m.psdno)			    /* parity stripe, we've dealt with that */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		sd = &SD[plex->sdnos[mysdno]];
		if (sd->state != sd_up) {
		    enum requeststatus s;

		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
		    if (s && (m.badsdno >= 0)) {	    /* second bad disk, */
			int sdno;
			/*
			 * If the parity disk is down, there's
			 * no recovery.  We make all involved
			 * subdisks stale.  Otherwise, we
			 * should be able to recover, but it's
			 * like pulling teeth.  Fix it later.
			 */
			for (sdno = 0; sdno < m.sdcount; sdno++) {
			    struct sd *sd = &SD[plex->sdnos[sdno]];
			    if (sd->state >= sd_reborn)	    /* sort of up, */
				set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
			}
			return s;			    /* and crap out */
		    }
		    m.badsdno = mysdno;			    /* note which one is bad */
		    m.flags |= XFR_DEGRADED_WRITE;	    /* we need recovery */
		    plex->degraded_writes++;		    /* count another one */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		} else {
		    m.flags |= XFR_NORMAL_WRITE;	    /* normal write operation */
		    /*
		     * Grow the write range (used for the
		     * parity block) to the union of all the
		     * per-block data ranges.
		     */
		    if (m.writeoffset > m.dataoffset) {	    /* move write operation lower */
			m.writelen = umax(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.dataoffset;
			m.writeoffset = m.dataoffset;
		    } else
			m.writelen = umax(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.writeoffset;
		}

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	    if (m.badsdno == m.psdno) {			    /* got a bad parity block, */
		struct sd *psd = &SD[plex->sdnos[m.psdno]];

		if (psd->state == sd_down)
		    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
		else if (psd->state == sd_crashed)
		    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
		m.flags &= ~XFR_NORMAL_WRITE;		    /* this write isn't normal, */
		m.flags |= XFR_PARITYLESS_WRITE;	    /* it's parityless */
		plex->parityless_writes++;		    /* count another one */
	    }
	}

	/* reset the initial transfer values */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	/* decide how many requests we need */
	if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
	    /* doing a recovery read or degraded write, */
	    m.rqcount = plex->subdisks;			    /* all subdisks */
	else if (m.flags & XFR_NORMAL_WRITE)		    /* normal write, */
	    m.rqcount = m.sdcount + 1;			    /* all data blocks and the parity block */
	else						    /* parityless write or normal read */
	    m.rqcount = m.sdcount;			    /* just the data blocks */

	/* Part C: build the requests */
	rqg = allocrqg(rq, m.rqcount);			    /* get a request group */
	if (rqg == NULL) {				    /* malloc failed */
	    bp->b_error = ENOMEM;
	    bp->b_flags |= B_ERROR;
	    return REQUEST_ENOMEM;
	}
	rqg->plexno = plexno;
	rqg->flags = m.flags;
	rqno = 0;					    /* index in the request group */

	/* 1: PARITY BLOCK */
	/*
	 * Are we performing an operation which requires parity?  In that case,
	 * work out the parameters and define the parity block.
	 * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
	 */
	if (m.flags & XFR_PARITYOP) {			    /* need parity */
	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[m.psdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point back to group */
	    rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
		&~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuff */
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    rqe->b.b_cmd = BUF_CMD_READ;		    /* we must read first */
	    m.sdcount++;				    /* adjust the subdisk count */
	    rqno++;					    /* and point to the next request */
	}
	/*
	 * 2: DATA BLOCKS
	 * Now build up requests for the blocks required
	 * for individual transfers
	 */
	for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */
	    if (mysdno == plex->subdisks)		    /* got to the end, */
		mysdno = 0;				    /* wrap around */
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */

	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point to group */
	    if (m.flags & XFR_NEEDS_MALLOC)		    /* we need a malloced buffer first */
		rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
	    else
		rqe->flags = m.flags | XFR_DATA_BLOCK;	    /* transfer flags */
	    if (mysdno == m.badsdno) {			    /* this is the bad subdisk */
		rqg->badsdno = rqno;			    /* note which one */
		rqe->flags |= XFR_BAD_SUBDISK;		    /* note that it's dead */
		/*
		 * we can't read or write from/to it,
		 * but we don't need to malloc
		 */
		rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
	    }
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->useroffset = m.useroffset;		    /* offset in user buffer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    if ((m.flags & XFR_PARITYOP)		    /* parity operation, */
		&&((m.flags & XFR_BAD_SUBDISK) == 0))	    /* and not the bad subdisk, */
		rqe->b.b_cmd = BUF_CMD_READ;		    /* we must read first */

	    /* Now update pointers for the next block */
	    *diskaddr += m.datalen;			    /* skip past what we've done */
	    m.stripesectors -= m.datalen;		    /* deduct from what's left */
	    m.useroffset += m.datalen;			    /* and move on in the user buffer */
	    m.datalen = umin(m.stripesectors, plex->stripesize); /* and recalculate */
	    m.dataoffset = 0;				    /* start at the beginning of next block */
	}

	/*
	 * 3: REMAINING BLOCKS FOR RECOVERY
	 * Finally, if we have a recovery operation, build
	 * up transfers for the other subdisks.  Follow the
	 * subdisks around until we get to where we started.
	 * These requests use only the group parameters.
	 */
	if ((rqno < m.rqcount)				    /* haven't done them all already */
	    &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
	    for (; rqno < m.rqcount; rqno++, mysdno++) {
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */
		if (mysdno == plex->subdisks)		    /* got to the end, */
		    mysdno = 0;				    /* wrap around */
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		rqe = &rqg->rqe[rqno];			    /* point to element */
		sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
		rqe->rqg = rqg;				    /* point to group */

		rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
		rqe->dataoffset = 0;			    /* for tidiness' sake */
		rqe->groupoffset = 0;			    /* group starts at the beginning */
		rqe->datalen = 0;
		rqe->grouplen = m.grouplen;
		rqe->buflen = m.grouplen;
		rqe->flags = (m.flags | XFR_MALLOCED)	    /* transfer flags without data op stuff */
		    &~XFR_DATAOP;
		rqe->sdno = sd->sdno;			    /* subdisk number */
		rqe->driveno = sd->driveno;
		if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		    return REQUEST_ENOMEM;		    /* can't do it */
		rqe->b.b_cmd = BUF_CMD_READ;		    /* we must read first */
	    }
	}
	/*
	 * We need to lock the address range before
	 * doing anything.  We don't have to be
	 * performing a recovery operation: somebody
	 * else could be doing so, and the results could
	 * influence us.  Note the fact here, we'll perform
	 * the lock in launch_requests.
	 */
	rqg->lockbase = m.stripebase;
	if (*diskaddr < diskend)			    /* didn't finish the request on this stripe */
	    plex->multistripe++;			    /* count another one */
    }
    return REQUEST_OK;
}

/*
 * Helper function for rqe5: adjust the bounds of
 * the transfers to minimize the buffer
 * allocation.
 *
 * Each request can handle two of three different
 * data ranges:
 *
 * 1.
The range described by the parameters 609 * dataoffset and datalen, for normal read or 610 * parityless write. 611 * 2. The range described by the parameters 612 * groupoffset and grouplen, for recovery read 613 * and degraded write. 614 * 3. For normal write, the range depends on the 615 * kind of block. For data blocks, the range 616 * is defined by dataoffset and datalen. For 617 * parity blocks, it is defined by writeoffset 618 * and writelen. 619 * 620 * In order not to allocate more memory than 621 * necessary, this function adjusts the bounds 622 * parameter for each request to cover just the 623 * minimum necessary for the function it performs. 624 * This will normally vary from one request to the 625 * next. 626 * 627 * Things are slightly different for the parity 628 * block. In this case, the bounds defined by 629 * mp->writeoffset and mp->writelen also play a 630 * r�le. Select this case by setting the 631 * parameter forparity != 0 632 */ 633 void 634 setrqebounds(struct rqelement *rqe, struct metrics *mp) 635 { 636 /* parity block of a normal write */ 637 if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) 638 == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) { /* case 3 */ 639 if (rqe->flags & XFR_DEGRADED_WRITE) { /* also degraded write */ 640 /* 641 * With a combined normal and degraded write, we 642 * will zero out the area of the degraded write 643 * in the second phase, so we don't need to read 644 * it in. Unfortunately, we need a way to tell 645 * build_request_buffer the size of the buffer, 646 * and currently that's the length of the read. 647 * As a result, we read everything, even the stuff 648 * that we're going to nuke. 
649 * FIXME XXX 650 */ 651 if (mp->groupoffset < mp->writeoffset) { /* group operation starts lower */ 652 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ 653 rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */ 654 rqe->groupoffset = 0; /* and the group at the beginning */ 655 } else { /* individual data starts first */ 656 rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */ 657 rqe->dataoffset = 0; /* individual data starts at the beginning */ 658 rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */ 659 } 660 rqe->datalen = mp->writelen; 661 rqe->grouplen = mp->grouplen; 662 } else { /* just normal write (case 3) */ 663 rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */ 664 rqe->dataoffset = 0; /* degradation starts at the beginning */ 665 rqe->groupoffset = 0; /* for tidiness' sake */ 666 rqe->datalen = mp->writelen; 667 rqe->grouplen = 0; 668 } 669 } else if (rqe->flags & XFR_DATAOP) { /* data operation (case 1 or 3) */ 670 if (rqe->flags & XFR_GROUPOP) { /* also a group operation (case 2) */ 671 if (mp->groupoffset < mp->dataoffset) { /* group operation starts lower */ 672 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ 673 rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */ 674 rqe->groupoffset = 0; /* and the group at the beginning */ 675 } else { /* individual data starts first */ 676 rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */ 677 rqe->dataoffset = 0; /* individual data starts at the beginning */ 678 rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */ 679 } 680 rqe->datalen = mp->datalen; 681 rqe->grouplen = mp->grouplen; 682 } else { /* just data operation (case 1) */ 683 rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */ 684 rqe->dataoffset = 0; /* degradation starts at the beginning */ 685 rqe->groupoffset = 0; /* for tidiness' sake */ 
686 rqe->datalen = mp->datalen; 687 rqe->grouplen = 0; 688 } 689 } else { /* just group operations (case 2) */ 690 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ 691 rqe->dataoffset = 0; /* for tidiness' sake */ 692 rqe->groupoffset = 0; /* group starts at the beginining */ 693 rqe->datalen = 0; 694 rqe->grouplen = mp->grouplen; 695 } 696 rqe->buflen = umax(rqe->dataoffset + rqe->datalen, /* total buffer length */ 697 rqe->groupoffset + rqe->grouplen); 698 } 699 /* Local Variables: */ 700 /* fill-column: 50 */ 701 /* End: */ 702