/*-
 * Copyright (c) 1997, 1998
 *	Cybernet Corporation and Nan Yang Computer Services Limited.
 *      All rights reserved.
 *
 *  This software was developed as part of the NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Cybernet Corporation
 *      and Nan Yang Computer Services Limited
 * 4. Neither the name of the Companies nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
 * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $
 * $DragonFly: src/sys/dev/raid/vinum/vinumraid5.c,v 1.2 2003/06/17 04:28:33 dillon Exp $
 */
#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

/*
 * Parameters which describe the current transfer.
 * These are only used for calculation, but they
 * need to be passed to other functions, so it's
 * tidier to put them in a struct
 */
struct metrics {
    daddr_t stripebase;					    /* base address of stripe (1st subdisk) */
    int stripeoffset;					    /* offset in stripe */
    int stripesectors;					    /* total sectors to transfer in this stripe */
    daddr_t sdbase;					    /* offset in subdisk of stripe base */
    int sdcount;					    /* number of disks involved in this transfer */
    daddr_t diskstart;					    /* remember where this transfer starts */
    int psdno;						    /* number of parity subdisk */
    int badsdno;					    /* number of down subdisk, if there is one */
    int firstsdno;					    /* first data subdisk number */
    /* These correspond to the fields in rqelement, sort of */
    int useroffset;
    /*
     * Initial offset and length values for the first
     * data block
     */
    int initoffset;					    /* start address of block to transfer */
    short initlen;					    /* length in sectors of data transfer */
    /* Define a normal operation */
    int dataoffset;					    /* start address of block to transfer */
    int datalen;					    /* length in sectors of data transfer */
    /* Define a group operation */
    int groupoffset;					    /* subdisk offset of group operation */
    int grouplen;					    /* length in sectors of group operation */
    /* Define a normal write operation */
    int writeoffset;					    /* subdisk offset of normal write */
    int writelen;					    /* length in sectors of write operation */
    enum xferinfo flags;				    /* to check what we're doing */
    int rqcount;					    /* number of elements in request */
};

enum requeststatus bre5(struct request *rq,
    int plexno,
    daddr_t * diskstart,
    daddr_t diskend);
void complete_raid5_write(struct rqelement *);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
void setrqebounds(struct rqelement *rqe, struct metrics *mp);

/*
 * Define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return REQUEST_OK if the request could be
 * built, REQUEST_EOF if it extends at least
 * partially beyond the end of the plex,
 * REQUEST_DOWN if too many subdisks are down to
 * satisfy it, and REQUEST_ENOMEM if memory for
 * the request structures cannot be allocated.
 *
 * Modify the pointer *diskaddr to point to the
 * end address.  On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned.  The assignment is performed
 * by expandrq().  Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    daddr_t * diskaddr,
    daddr_t diskend)
{
    struct metrics m;					    /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct buf *bp;					    /* user's bp */
    struct rqgroup *rqg;				    /* the request group that we will create */
    struct rqelement *rqe;				    /* point to this request information */
    int rsectors;					    /* sectors remaining in this stripe */
    int mysdno;						    /* another sd index in loops */
    int rqno;						    /* request number */

    rqg = NULL;						    /* shut up, damn compiler */
    m.diskstart = *diskaddr;				    /* start of transfer */
    bp = rq->bp;					    /* buffer pointer */
    plex = &PLEX[plexno];				    /* point to the plex */

    while (*diskaddr < diskend) {			    /* until we get it all sorted out */
	if (*diskaddr >= plex->length)			    /* beyond the end of the plex */
	    return REQUEST_EOF;				    /* can't continue */

	m.badsdno = -1;					    /* no bad subdisk yet */

	/* Part A: Define the request */
	/*
	 * First, calculate some sizes:
	 * The offset of the start address from
	 * the start of the stripe.
	 */
	m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

	/*
	 * The plex-relative address of the
	 * start of the stripe.
	 */
	m.stripebase = *diskaddr - m.stripeoffset;

	/* subdisk containing the parity stripe */
	if (plex->organization == plex_raid5)
	    m.psdno = plex->subdisks - 1
		- (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
		% plex->subdisks;
	else						    /* RAID-4 */
	    m.psdno = plex->subdisks - 1;
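
	/*
	 * Worked example (hypothetical geometry): a
	 * RAID-5 plex with 4 subdisks and a
	 * stripesize of 128 sectors has 3 * 128 =
	 * 384 data sectors per stripe.  For
	 * *diskaddr = 500, 500 / 384 = 1 and
	 * 1 % 4 = 1, so m.psdno = 4 - 1 - 1 = 2.
	 * Successive stripes place parity on
	 * subdisks 3, 2, 1, 0, 3, ..., spreading
	 * parity I/O over all subdisks; a RAID-4
	 * plex always uses the last subdisk.
	 */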

	/*
	 * The number of the subdisk in which
	 * the start is located.
	 */
	m.firstsdno = m.stripeoffset / plex->stripesize;
	if (m.firstsdno >= m.psdno)			    /* at or past parity sd */
	    m.firstsdno++;				    /* increment it */

	/*
	 * The offset from the beginning of
	 * the stripe on this subdisk.
	 */
	m.initoffset = m.stripeoffset % plex->stripesize;

	/* The offset of the stripe start relative to this subdisk */
	m.sdbase = m.stripebase / (plex->subdisks - 1);

	m.useroffset = *diskaddr - m.diskstart;		    /* The offset of the start in the user buffer */

	/*
	 * The number of sectors to transfer in the
	 * current (first) subdisk.
	 */
	m.initlen = min(diskend - *diskaddr,		    /* the amount remaining to transfer */
	    plex->stripesize - m.initoffset);		    /* and the amount left in this block */

	/*
	 * The number of sectors to transfer in this stripe
	 * is the minimum of the amount remaining to transfer
	 * and the amount left in this stripe.
	 */
	m.stripesectors = min(diskend - *diskaddr,
	    plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

	/* The number of data subdisks involved in this request */
	m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
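
	/*
	 * Continuing the hypothetical example
	 * above (4 subdisks, stripesize 128,
	 * *diskaddr = 500, diskend = 800):
	 * m.stripeoffset = 500 % 384 = 116,
	 * m.stripebase = 384, m.psdno = 2,
	 * m.firstsdno = 116 / 128 = 0 (below the
	 * parity subdisk, so no adjustment),
	 * m.initoffset = 116, m.sdbase = 128,
	 * m.initlen = min(300, 12) = 12,
	 * m.stripesectors = min(300, 268) = 268,
	 * m.sdcount = (268 + 116 + 127) / 128 = 3:
	 * the transfer touches data subdisks 0, 1
	 * and 3 in this stripe.
	 */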

	/* Part B: decide what kind of transfer this will be.

	 * start and end addresses of the transfer in
	 * the current block.
	 *
	 * There are a number of different kinds of
	 * transfer, each of which relates to a
	 * specific subdisk:
	 *
	 * 1. Normal read.  All participating subdisks
	 *    are up, and the transfer can be made
	 *    directly to the user buffer.  The bounds
	 *    of the transfer are described by
	 *    m.dataoffset and m.datalen.  We have
	 *    already calculated m.initoffset and
	 *    m.initlen, which define the parameters
	 *    for the first data block.
	 *
	 * 2. Recovery read.  One participating
	 *    subdisk is down.  To recover data, all
	 *    the other subdisks, including the parity
	 *    subdisk, must be read.  The data is
	 *    recovered by exclusive-oring all the
	 *    other blocks.  The bounds of the
	 *    transfer are described by m.groupoffset
	 *    and m.grouplen.
	 *
	 * 3. A read request may request reading both
	 *    available data (normal read) and
	 *    non-available data (recovery read).
	 *    This can be a problem if the address
	 *    ranges of the two reads do not coincide:
	 *    in this case, the normal read needs to
	 *    be extended to cover the address range
	 *    of the recovery read, and must thus be
	 *    performed out of malloced memory.
	 *
	 * 4. Normal write.  All the participating
	 *    subdisks are up.  The bounds of the
	 *    transfer are described by m.dataoffset
	 *    and m.datalen.  Since these values
	 *    differ for each block, we calculate the
	 *    bounds for the parity block
	 *    independently as the maximum of the
	 *    individual blocks and store these values
	 *    in m.writeoffset and m.writelen.  This
	 *    write proceeds in four phases:
	 *
	 *    i.  Read the old contents of each block
	 *        and the parity block.
	 *    ii.  ``Remove'' the old contents from
	 *         the parity block with exclusive or.
	 *    iii. ``Insert'' the new contents of the
	 *          block in the parity block, again
	 *          with exclusive or.
	 *
	 *    iv.  Write the new contents of the data
	 *         blocks and the parity block.  The data
	 *         block transfers can be made directly from
	 *         the user buffer.
	 *
	 * 5. Degraded write where the data block is
	 *    not available.  The bounds of the
	 *    transfer are described by m.groupoffset
	 *    and m.grouplen. This requires the
	 *    following steps:
	 *
	 *    i.  Read in all the other data blocks,
	 *        excluding the parity block.
	 *
	 *    ii.  Recreate the parity block from the
	 *         other data blocks and the data to be
	 *         written.
	 *
	 *    iii. Write the parity block.
	 *
	 * 6. Parityless write, a write where the
	 *    parity block is not available.  This is
	 *    in fact the simplest: just write the
	 *    data blocks.  This can proceed directly
	 *    from the user buffer.  The bounds of the
	 *    transfer are described by m.dataoffset
	 *    and m.datalen.
	 *
	 * 7. Combination of degraded data block write
	 *    and normal write.  In this case the
	 *    address ranges of the reads may also
	 *    need to be extended to cover all
	 *    participating blocks.
	 *
	 * All requests in a group transfer cover
	 * the same address range relative to their
	 * subdisk.  The individual transfers may
	 * vary, but since our group of requests is
	 * all in a single slice, we can define a
	 * range in which they all fall.
	 *
	 * In the following code section, we determine
	 * which kind of transfer we will perform.  If
	 * there is a group transfer, we also decide
	 * its bounds relative to the subdisks.  At
	 * the end, we have the following values:
	 *
	 *  m.flags indicates the kinds of transfers
	 *    we will perform.
	 *  m.initoffset indicates the offset of the
	 *    beginning of any data operation relative
	 *    to the beginning of the stripe base.
	 *  m.initlen specifies the length of any data
	 *    operation.
	 *  m.dataoffset contains the same value as
	 *    m.initoffset.
	 *  m.datalen contains the same value as
	 *    m.initlen.  Initially dataoffset and
	 *    datalen describe the parameters for the
	 *    first data block; while building the data
	 *    block requests, they are updated for each
	 *    block.
	 *  m.groupoffset indicates the offset of any
	 *    group operation relative to the beginning
	 *    of the stripe base.
	 *  m.grouplen specifies the length of any
	 *    group operation.
	 *  m.writeoffset indicates the offset of a
	 *    normal write relative to the beginning of
	 *    the stripe base.  This value differs from
	 *    m.dataoffset in that it applies to the
	 *    entire operation, and not just the first
	 *    block.
	 *  m.writelen specifies the total span of a
	 *    normal write operation.  writeoffset and
	 *    writelen are used to define the parity
	 *    block.
	 */
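	/*
	 * The four-phase normal write above is the
	 * standard RAID-5 read-modify-write cycle;
	 * it relies on the identity
	 *
	 *   new parity = old parity ^ old data ^ new data
	 *
	 * which is why the old contents of the data
	 * blocks and the parity block are read
	 * before anything is written.
	 */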
	m.groupoffset = 0;				    /* assume no group... */
	m.grouplen = 0;					    /* until we know we have one */
	m.writeoffset = m.initoffset;			    /* start offset of transfer */
	m.writelen = 0;					    /* nothing to write yet */
	m.flags = 0;					    /* no flags yet */
	rsectors = m.stripesectors;			    /* remaining sectors to examine */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	if (m.sdcount > 1) {
	    plex->multiblock++;				    /* more than one block for the request */
	    /*
	     * If we have two transfers that don't overlap,
	     * (one at the end of the first block, the other
	     * at the beginning of the second block),
	     * it's cheaper to split them.
	     */
	    if (rsectors < plex->stripesize) {
		m.sdcount = 1;				    /* just one subdisk */
		m.stripesectors = m.initlen;		    /* and just this many sectors */
		rsectors = m.initlen;			    /* and in the loop counter */
	    }
	}
	if (SD[plex->sdnos[m.psdno]].state < sd_reborn)	    /* is our parity subdisk down? */
	    m.badsdno = m.psdno;			    /* note that it's down */
	if (bp->b_flags & B_READ) {			    /* read operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		if (mysdno == m.psdno)			    /* ignore parity on read */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
		    if (m.badsdno >= 0)			    /* we had one already, */
			return REQUEST_DOWN;		    /* we can't take a second */
		    m.badsdno = mysdno;			    /* got the first */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		    m.flags |= XFR_RECOVERY_READ;	    /* we need recovery */
		    plex->recovered_reads++;		    /* count another one */
		} else
		    m.flags |= XFR_NORMAL_READ;		    /* normal read */

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	} else {					    /* write operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		if (mysdno == m.psdno)			    /* parity stripe, we've dealt with that */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		sd = &SD[plex->sdnos[mysdno]];
		if (sd->state != sd_up) {
		    enum requeststatus s;

		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
		    if (s && (m.badsdno >= 0)) {	    /* second bad disk, */
			int sdno;
			/*
			 * If the parity disk is down, there's
			 * no recovery.  We make all involved
			 * subdisks stale.  Otherwise, we
			 * should be able to recover, but it's
			 * like pulling teeth.  Fix it later.
			 */
			for (sdno = 0; sdno < m.sdcount; sdno++) {
			    struct sd *sd = &SD[plex->sdnos[sdno]];
			    if (sd->state >= sd_reborn)	    /* sort of up, */
				set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
			}
			return s;			    /* and crap out */
		    }
		    m.badsdno = mysdno;			    /* note which one is bad */
		    m.flags |= XFR_DEGRADED_WRITE;	    /* we need recovery */
		    plex->degraded_writes++;		    /* count another one */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		} else {
		    m.flags |= XFR_NORMAL_WRITE;	    /* normal write operation */
		    if (m.writeoffset > m.dataoffset) {	    /* move write operation lower */
			m.writelen = max(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.dataoffset;
			m.writeoffset = m.dataoffset;
		    } else
			m.writelen = max(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.writeoffset;
		}

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	    if (m.badsdno == m.psdno) {			    /* got a bad parity block, */
		struct sd *psd = &SD[plex->sdnos[m.psdno]];

		if (psd->state == sd_down)
		    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
		else if (psd->state == sd_crashed)
		    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
		m.flags &= ~XFR_NORMAL_WRITE;		    /* this write isn't normal, */
		m.flags |= XFR_PARITYLESS_WRITE;	    /* it's parityless */
		plex->parityless_writes++;		    /* count another one */
	    }
	}

	/* reset the initial transfer values */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	/* decide how many requests we need */
	if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
	    /* doing a recovery read or degraded write, */
	    m.rqcount = plex->subdisks;			    /* all subdisks */
	else if (m.flags & XFR_NORMAL_WRITE)		    /* normal write, */
	    m.rqcount = m.sdcount + 1;			    /* all data blocks and the parity block */
	else						    /* parityless write or normal read */
	    m.rqcount = m.sdcount;			    /* just the data blocks */
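
	/*
	 * For example (hypothetical): with 4
	 * subdisks, a recovery read or degraded
	 * write always needs 4 request elements; a
	 * normal write touching 2 data blocks needs
	 * 2 + 1 = 3 (the extra element is the
	 * parity block); a normal read or
	 * parityless write of the same 2 blocks
	 * needs just 2.
	 */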

	/* Part C: build the requests */
	rqg = allocrqg(rq, m.rqcount);			    /* get a request group */
	if (rqg == NULL) {				    /* malloc failed */
	    bp->b_error = ENOMEM;
	    bp->b_flags |= B_ERROR;
	    return REQUEST_ENOMEM;
	}
	rqg->plexno = plexno;
	rqg->flags = m.flags;
	rqno = 0;					    /* index in the request group */

	/* 1: PARITY BLOCK */
	/*
	 * Are we performing an operation which requires parity?  In that case,
	 * work out the parameters and define the parity block.
	 * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
	 */
	if (m.flags & XFR_PARITYOP) {			    /* need parity */
	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[m.psdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point back to group */
	    rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
	    &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);	    /* transfer flags without data op stuff */
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    rqe->b.b_flags |= B_READ;			    /* we must read first */
	    m.sdcount++;				    /* adjust the subdisk count */
	    rqno++;					    /* and point to the next request */
	}
	/*
	 * 2: DATA BLOCKS
	 * Now build up requests for the blocks required
	 * for individual transfers
	 */
	for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */
	    if (mysdno == plex->subdisks)		    /* got to the end, */
		mysdno = 0;				    /* wrap around */
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */

	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point to group */
	    if (m.flags & XFR_NEEDS_MALLOC)		    /* we need a malloced buffer first */
		rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
	    else
		rqe->flags = m.flags | XFR_DATA_BLOCK;	    /* transfer flags */
	    if (mysdno == m.badsdno) {			    /* this is the bad subdisk */
		rqg->badsdno = rqno;			    /* note which one */
		rqe->flags |= XFR_BAD_SUBDISK;		    /* note that it's dead */
		/*
		 * we can't read or write from/to it,
		 * but we don't need to malloc
		 */
		rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
	    }
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->useroffset = m.useroffset;		    /* offset in user buffer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    if ((m.flags & XFR_PARITYOP)		    /* parity operation, */
	    &&((rqe->flags & XFR_BAD_SUBDISK) == 0))	    /* and not the bad subdisk, */
		rqe->b.b_flags |= B_READ;		    /* we must read first */

	    /* Now update pointers for the next block */
	    *diskaddr += m.datalen;			    /* skip past what we've done */
	    m.stripesectors -= m.datalen;		    /* deduct from what's left */
	    m.useroffset += m.datalen;			    /* and move on in the user buffer */
	    m.datalen = min(m.stripesectors, plex->stripesize);	/* and recalculate */
	    m.dataoffset = 0;				    /* start at the beginning of next block */
	}

	/*
	 * 3: REMAINING BLOCKS FOR RECOVERY
	 * Finally, if we have a recovery operation, build
	 * up transfers for the other subdisks.  Follow the
	 * subdisks around until we get to where we started.
	 * These requests use only the group parameters.
	 */
	if ((rqno < m.rqcount)				    /* haven't done them all already */
	&&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
	    for (; rqno < m.rqcount; rqno++, mysdno++) {
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */
		if (mysdno == plex->subdisks)		    /* got to the end, */
		    mysdno = 0;				    /* wrap around */
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		rqe = &rqg->rqe[rqno];			    /* point to element */
		sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
		rqe->rqg = rqg;				    /* point to group */

		rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
		rqe->dataoffset = 0;			    /* for tidiness' sake */
		rqe->groupoffset = 0;			    /* group starts at the beginning */
		rqe->datalen = 0;
		rqe->grouplen = m.grouplen;
		rqe->buflen = m.grouplen;
		rqe->flags = (m.flags | XFR_MALLOCED)	    /* transfer flags without data op stuff */
		&~XFR_DATAOP;
		rqe->sdno = sd->sdno;			    /* subdisk number */
		rqe->driveno = sd->driveno;
		if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		    return REQUEST_ENOMEM;		    /* can't do it */
		rqe->b.b_flags |= B_READ;		    /* we must read first */
	    }
	}
	/*
	 * We need to lock the address range before
	 * doing anything.  We don't have to be
	 * performing a recovery operation: somebody
	 * else could be doing so, and the results could
	 * influence us.  Note the fact here, we'll perform
	 * the lock in launch_requests.
	 */
	rqg->lockbase = m.stripebase;
	if (*diskaddr < diskend)			    /* didn't finish the request on this stripe */
	    plex->multistripe++;			    /* count another one */
    }
    return REQUEST_OK;
}

/*
 * Helper function for bre5: adjust the bounds of
 * the transfers to minimize the buffer
 * allocation.
 *
 * Each request can handle two of three different
 * data ranges:
 *
 * 1.  The range described by the parameters
 *     dataoffset and datalen, for normal read or
 *     parityless write.
 * 2.  The range described by the parameters
 *     groupoffset and grouplen, for recovery read
 *     and degraded write.
 * 3.  For normal write, the range depends on the
 *     kind of block.  For data blocks, the range
 *     is defined by dataoffset and datalen.  For
 *     parity blocks, it is defined by writeoffset
 *     and writelen.
 *
 * In order not to allocate more memory than
 * necessary, this function adjusts the bounds
 * parameters for each request to cover just the
 * minimum necessary for the function it performs.
 * This will normally vary from one request to the
 * next.
 *
 * Things are slightly different for the parity
 * block.  In this case, the bounds defined by
 * mp->writeoffset and mp->writelen also play a
 * rôle.  This case is selected when rqe->flags
 * contains both XFR_NORMAL_WRITE and
 * XFR_PARITY_BLOCK.
 */
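/*
 * Worked example (hypothetical values): for a
 * recovery read with mp->sdbase = 1024,
 * mp->dataoffset = 64, mp->datalen = 32,
 * mp->groupoffset = 0 and mp->grouplen = 128,
 * the combined data and group branch below
 * (cases 1 and 2) gives rqe->sdoffset = 1024,
 * rqe->dataoffset = 64, rqe->groupoffset = 0,
 * rqe->datalen = 32, rqe->grouplen = 128 and
 * rqe->buflen = max(64 + 32, 0 + 128) = 128
 * sectors.
 */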
void
setrqebounds(struct rqelement *rqe, struct metrics *mp)
{
    /* parity block of a normal write */
    if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
	== (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) {	    /* case 3 */
	if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* also degraded write */
	    /*
	     * With a combined normal and degraded write, we
	     * will zero out the area of the degraded write
	     * in the second phase, so we don't need to read
	     * it in.  Unfortunately, we need a way to tell
	     * build_rq_buffer the size of the buffer,
	     * and currently that's the length of the read.
	     * As a result, we read everything, even the stuff
	     * that we're going to nuke.
	     * FIXME XXX
	     */
	    if (mp->groupoffset < mp->writeoffset) {	    /* group operation starts lower */
		rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
		rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
		rqe->groupoffset = 0;			    /* and the group at the beginning */
	    } else {					    /* individual data starts first */
		rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
		rqe->dataoffset = 0;			    /* individual data starts at the beginning */
		rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
	    }
	    rqe->datalen = mp->writelen;
	    rqe->grouplen = mp->grouplen;
	} else {					    /* just normal write (case 3) */
	    rqe->sdoffset = mp->sdbase + mp->writeoffset;   /* start of transfer */
	    rqe->dataoffset = 0;			    /* data starts at the beginning */
	    rqe->groupoffset = 0;			    /* for tidiness' sake */
	    rqe->datalen = mp->writelen;
	    rqe->grouplen = 0;
	}
    } else if (rqe->flags & XFR_DATAOP) {		    /* data operation (case 1 or 3) */
	if (rqe->flags & XFR_GROUPOP) {			    /* also a group operation (case 2) */
	    if (mp->groupoffset < mp->dataoffset) {	    /* group operation starts lower */
		rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
		rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
		rqe->groupoffset = 0;			    /* and the group at the beginning */
	    } else {					    /* individual data starts first */
		rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
		rqe->dataoffset = 0;			    /* individual data starts at the beginning */
		rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
	    }
	    rqe->datalen = mp->datalen;
	    rqe->grouplen = mp->grouplen;
	} else {					    /* just data operation (case 1) */
	    rqe->sdoffset = mp->sdbase + mp->dataoffset;    /* start of transfer */
	    rqe->dataoffset = 0;			    /* data starts at the beginning */
	    rqe->groupoffset = 0;			    /* for tidiness' sake */
	    rqe->datalen = mp->datalen;
	    rqe->grouplen = 0;
	}
    } else {						    /* just group operations (case 2) */
	rqe->sdoffset = mp->sdbase + mp->groupoffset;	    /* start of transfer */
	rqe->dataoffset = 0;				    /* for tidiness' sake */
	rqe->groupoffset = 0;				    /* group starts at the beginning */
	rqe->datalen = 0;
	rqe->grouplen = mp->grouplen;
    }
    rqe->buflen = max(rqe->dataoffset + rqe->datalen,	    /* total buffer length */
	rqe->groupoffset + rqe->grouplen);
}
/* Local Variables: */
/* fill-column: 50 */
/* End: */