xref: /openbsd/sys/dev/softraid_raid5.c (revision 9b7c3dbb)
1 /* $OpenBSD: softraid_raid5.c,v 1.26 2016/05/31 15:19:12 jsing Exp $ */
2 /*
3  * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
4  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
5  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include "bio.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/buf.h>
25 #include <sys/device.h>
26 #include <sys/ioctl.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/mount.h>
34 #include <sys/sensors.h>
35 #include <sys/stat.h>
36 #include <sys/task.h>
37 #include <sys/pool.h>
38 #include <sys/conf.h>
39 #include <sys/uio.h>
40 
41 #include <scsi/scsi_all.h>
42 #include <scsi/scsiconf.h>
43 #include <scsi/scsi_disk.h>
44 
45 #include <dev/softraidvar.h>
46 
47 /* RAID 5 functions. */
48 int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
49 	    int, int64_t);
50 int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
51 	    int, void *);
52 int	sr_raid5_init(struct sr_discipline *);
53 int	sr_raid5_rw(struct sr_workunit *);
54 int	sr_raid5_openings(struct sr_discipline *);
55 void	sr_raid5_intr(struct buf *);
56 int	sr_raid5_wu_done(struct sr_workunit *);
57 void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
58 void	sr_raid5_set_vol_state(struct sr_discipline *);
59 
60 int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
61 	    void *, int, int, void *);
62 int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
63 	    void *);
64 int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
65 	    daddr_t, long, void *, int, int);
66 void	sr_raid5_xor(void *, void *, int);
67 
68 void	sr_raid5_rebuild(struct sr_discipline *);
69 void	sr_raid5_scrub(struct sr_discipline *);
70 
71 /* discipline initialisation. */
72 void
73 sr_raid5_discipline_init(struct sr_discipline *sd)
74 {
75 	/* Fill out discipline members. */
76 	sd->sd_type = SR_MD_RAID5;
77 	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
78 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
79 	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
80 	sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */
81 	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */
82 
83 	/* Setup discipline specific function pointers. */
84 	sd->sd_assemble = sr_raid5_assemble;
85 	sd->sd_create = sr_raid5_create;
86 	sd->sd_openings = sr_raid5_openings;
87 	sd->sd_rebuild = sr_raid5_rebuild;
88 	sd->sd_scsi_rw = sr_raid5_rw;
89 	sd->sd_scsi_intr = sr_raid5_intr;
90 	sd->sd_scsi_wu_done = sr_raid5_wu_done;
91 	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
92 	sd->sd_set_vol_state = sr_raid5_set_vol_state;
93 }
94 
95 int
96 sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
97     int no_chunk, int64_t coerced_size)
98 {
99 	if (no_chunk < 3) {
100 		sr_error(sd->sd_sc, "%s requires three or more chunks",
101 		    sd->sd_name);
102 		return EINVAL;
103 	}
104 
105 	/*
106 	 * XXX add variable strip size later even though MAXPHYS is really
107 	 * the clever value, users like to tinker with that type of stuff.
108 	 */
109 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
110 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
111 	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
112 	    DEV_BSHIFT) - 1)) * (no_chunk - 1);
113 
114 	return sr_raid5_init(sd);
115 }
116 
/*
 * Assemble an existing RAID 5 volume.  The discipline-specific work is
 * the same as for creation: derive the runtime strip values from the
 * on-disk metadata.  bc and data are unused by this discipline.
 */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid5_init(sd);
}
123 
124 int
125 sr_raid5_init(struct sr_discipline *sd)
126 {
127 	/* Initialise runtime values. */
128 	sd->mds.mdd_raid5.sr5_strip_bits =
129 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
130 	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
131 		sr_error(sd->sd_sc, "invalid strip size");
132 		return EINVAL;
133 	}
134 
135 	return 0;
136 }
137 
138 int
139 sr_raid5_openings(struct sr_discipline *sd)
140 {
141 	/* Two work units per I/O, two for rebuild/scrub. */
142 	return ((sd->sd_max_wu - 2) >> 1);
143 }
144 
/*
 * Transition chunk c to new_state, validating the change against the
 * set of legal chunk state transitions.  On a valid change the volume
 * state is recomputed and a metadata save is scheduled; an invalid
 * transition panics.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk can only come back via a rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the state change; the save runs from the system taskq. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
219 
/*
 * Recompute the volume state from the states of all chunks, then
 * validate the old -> new volume state transition.  A RAID 5 volume
 * survives the loss of at most one chunk: all online is ONLINE,
 * exactly one non-online is DEGRADED (or SCRUB/REBUILD while such an
 * operation is in progress), more than one is OFFLINE.  An invalid
 * transition panics.
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Count chunks per state. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Derive the new volume state from the chunk state counts. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the volume state transition. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
344 
345 static inline int
346 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
347 {
348 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
349 	case BIOC_SDONLINE:
350 	case BIOC_SDSCRUB:
351 		return 1;
352 	default:
353 		return 0;
354 	}
355 }
356 
357 static inline int
358 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
359 {
360 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
361 	case BIOC_SDREBUILD:
362 		return 1;
363 	default:
364 		return 0;
365 	}
366 }
367 
/*
 * Main read/write entry point for a RAID 5 volume.  Splits the request
 * into per-strip segments, maps each to its data chunk and parity chunk
 * (left asymmetric layout), and issues reads directly or regenerates
 * data for offline chunks.  Writes are issued via sr_raid5_write(),
 * which may queue prerequisite reads on the separate work unit wu_r;
 * in that case the write work unit is deferred until the reads finish.
 */
int
sr_raid5_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	daddr_t			blkno, lba;
	int64_t			chunk_offs, lbaoffs, offset, strip_offs;
	int64_t			strip_bits, strip_no, strip_size;
	int64_t			chunk, no_chunk;
	int64_t			parity, row_size;
	long			length, datalen;
	void			*data;
	int			s;

	/* blkno and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
		goto bad;

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
	    (long long)blkno, xs->datalen);

	/* no_chunk counts data chunks only; one chunk per row holds parity. */
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blkno << DEV_BSHIFT;

	/* Writes need a second work unit to carry the prerequisite reads. */
	if (xs->flags & SCSI_DATA_OUT) {
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: %s failed to get read work unit",
			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		offset = chunk_offs + strip_offs;

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/*
		 * Map disk offset to data and parity chunks, using a left
		 * asymmetric algorithm for the parity assignment.
		 */
		chunk = strip_no % no_chunk;
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
		if (chunk >= parity)
			chunk++;

		lba = offset >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
		    (row_size - 1);

		scp = sd->sd_vol.sv_chunks[chunk];
		if (xs->flags & SCSI_DATA_IN) {
			switch (scp->src_meta.scm_status) {
			case BIOC_SDONLINE:
			case BIOC_SDSCRUB:
				/*
				 * Chunk is online, issue a single read
				 * request.
				 */
				if (sr_raid5_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL))
					goto bad;
				break;
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				/* Reconstruct from the remaining chunks. */
				if (sr_raid5_regenerate(wu, chunk, lba,
				    length, data))
					goto bad;
				break;
			default:
				printf("%s: is offline, can't read\n",
				    DEVNAME(sd->sd_sc));
				goto bad;
			}
		} else {
			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
			    length, data, xs->flags, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		if (wu_r->swu_io_count > 0) {
			/* collide write request with reads */
			wu_r->swu_blk_start = wu->swu_blk_start;
			wu_r->swu_blk_end = wu->swu_blk_end;

			/* Defer the write until the reads complete. */
			wu->swu_state = SR_WU_DEFERRED;
			wu_r->swu_collider = wu;
			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

			wu = wu_r;
		} else {
			/* No prerequisite reads were needed. */
			sr_scsi_wu_put(sd, wu_r);
		}
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);

bad:
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
504 
505 int
506 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
507     long len, void *data)
508 {
509 	struct sr_discipline	*sd = wu->swu_dis;
510 	int			i;
511 
512 	/*
513 	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
514 	 * from all of the remaining online chunks. This requires the parity
515 	 * to already be correct.
516 	 */
517 
518 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
519 	    "regenerating block %llu\n",
520 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);
521 
522 	memset(data, 0, len);
523 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
524 		if (i == chunk)
525 			continue;
526 		if (!sr_raid5_chunk_online(sd, i))
527 			goto bad;
528 		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
529 		    0, data))
530 			goto bad;
531 	}
532 	return (0);
533 
534 bad:
535 	return (1);
536 }
537 
/*
 * Queue the reads (on wu_r) and writes (on wu) needed to write one
 * segment of data plus updated parity.  See the case analysis below
 * for how offline/rebuilding chunks are handled.
 */
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, long len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	void			*xorbuf;
	int			chunk_online, chunk_rebuild;
	int			parity_online, parity_rebuild;
	int			other_offline = 0, other_rebuild = 0;
	int			i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	/* Classify the state of the remaining (non-target) chunks. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): no usable parity chunk, just write the data. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* New parity starts as a copy of the new data; reads XOR into it. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Read in existing data and existing parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Read in existing data from all other chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	/*
	 * NOTE(review): if an addio fails after xorbuf was allocated but
	 * before it is attached to a CCB with SR_CCBF_FREEBUF, xorbuf does
	 * not appear to be freed on this path - verify against the caller's
	 * work unit unwinding.
	 */
	return (1);
}
657 
/*
 * Per-CCB I/O completion.  Folds read data into the regeneration/parity
 * buffer (ccb_opaque) when one is attached, releases temporary buffers,
 * and notifies the work unit of the completed CCB.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
688 
/*
 * Work unit completion: decide whether the work unit succeeded, should
 * be retried (failed reads are re-issued, which regenerates from the
 * remaining chunks), or has permanently failed.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		/* Drop the failed CCBs and re-issue the whole request. */
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
723 
724 int
725 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
726     long len, void *data, int xsflags, int ccbflags, void *xorbuf)
727 {
728 	struct sr_discipline	*sd = wu->swu_dis;
729 	struct sr_ccb		*ccb;
730 
731 	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
732 	    "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
733 	    chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");
734 
735 	/* Allocate temporary buffer. */
736 	if (data == NULL) {
737 		data = sr_block_get(sd, len);
738 		if (data == NULL)
739 			return (-1);
740 		ccbflags |= SR_CCBF_FREEBUF;
741 	}
742 
743 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
744 	if (ccb == NULL) {
745 		if (ccbflags & SR_CCBF_FREEBUF)
746 			sr_block_put(sd, data, len);
747 		return (-1);
748 	}
749 	ccb->ccb_opaque = xorbuf;
750 	sr_wu_enqueue_ccb(wu, ccb);
751 
752 	return (0);
753 }
754 
/*
 * XOR the buffer at b into the buffer at a (a ^= b) over len bytes.
 * Whole 32-bit words are XORed first; any trailing bytes (len not a
 * multiple of four) are then handled individually, where the original
 * implementation silently dropped them.  Both buffers must be 32-bit
 * aligned, which holds for the block buffers used by this discipline.
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t		*xa = a, *xb = b;
	uint8_t			*ba, *bb;
	int			words = len >> 2;

	while (words--)
		*xa++ ^= *xb++;

	/* XOR any remaining bytes that do not fill a whole word. */
	ba = (uint8_t *)xa;
	bb = (uint8_t *)xb;
	len &= 3;
	while (len--)
		*ba++ ^= *bb++;
}
764 
765 void
766 sr_raid5_rebuild(struct sr_discipline *sd)
767 {
768 	int64_t strip_no, strip_size, strip_bits, i, restart;
769 	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
770 	struct sr_workunit *wu_r, *wu_w;
771 	int s, slept, percent = 0, old_percent = -1;
772 	int rebuild_chunk = -1;
773 	void *xorbuf;
774 
775 	/* Find the rebuild chunk. */
776 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
777 		if (sr_raid5_chunk_rebuild(sd, i)) {
778 			rebuild_chunk = i;
779 			break;
780 		}
781 	}
782 	if (rebuild_chunk == -1)
783 		goto bad;
784 
785 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
786 	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
787 	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
788 	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
789 	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
790 	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;
791 
792 	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
793 	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
794 	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
795 	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
796 	    row_size);
797 
798 	restart = sd->sd_meta->ssd_rebuild / row_size;
799 	if (restart > chunk_strips) {
800 		printf("%s: bogus rebuild restart offset, starting from 0\n",
801 		    DEVNAME(sd->sd_sc));
802 		restart = 0;
803 	}
804 	if (restart != 0) {
805 		percent = sr_rebuild_percent(sd);
806 		printf("%s: resuming rebuild on %s at %d%%\n",
807 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
808 	}
809 
810 	for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
811 		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;
812 
813 		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
814 		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
815 		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);
816 
817 		wu_w = sr_scsi_wu_get(sd, 0);
818 		wu_r = sr_scsi_wu_get(sd, 0);
819 
820 		xorbuf = sr_block_get(sd, strip_size);
821 		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
822 		    strip_size, xorbuf))
823 			goto bad;
824 		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
825 		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
826 			goto bad;
827 
828 		/* Collide write work unit with read work unit. */
829 		wu_r->swu_state = SR_WU_INPROGRESS;
830 		wu_r->swu_flags |= SR_WUF_REBUILD;
831 		wu_w->swu_state = SR_WU_DEFERRED;
832 		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
833 		wu_r->swu_collider = wu_w;
834 
835 		/* Block I/O to this strip while we rebuild it. */
836 		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
837 		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
838 		wu_w->swu_blk_start = wu_r->swu_blk_start;
839 		wu_w->swu_blk_end = wu_r->swu_blk_end;
840 
841 		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
842 		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
843 		    sd->sd_meta->ssd_devname,
844 		    wu_r->swu_blk_start, wu_r->swu_blk_end);
845 
846 		s = splbio();
847 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
848 		splx(s);
849 
850 		sr_schedule_wu(wu_r);
851 
852 		slept = 0;
853 		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
854 			tsleep(wu_w, PRIBIO, "sr_rebuild", 0);
855 			slept = 1;
856 		}
857 		if (!slept)
858 			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
859 
860 		sr_scsi_wu_put(sd, wu_r);
861 		sr_scsi_wu_put(sd, wu_w);
862 
863 		sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;
864 
865 		percent = sr_rebuild_percent(sd);
866 		if (percent != old_percent && strip_no != chunk_strips - 1) {
867 			if (sr_meta_save(sd, SR_META_DIRTY))
868 				printf("%s: could not save metadata to %s\n",
869 				    DEVNAME(sd->sd_sc),
870 				    sd->sd_meta->ssd_devname);
871 			old_percent = percent;
872 		}
873 
874 		if (sd->sd_reb_abort)
875 			goto abort;
876 	}
877 
878 	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
879 	    sd->sd_meta->ssd_devname);
880 
881 	/* all done */
882 	sd->sd_meta->ssd_rebuild = 0;
883 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
884 		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
885 		    BIOC_SDREBUILD) {
886 			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
887 			break;
888 		}
889 	}
890 
891 	return;
892 
893 abort:
894 	if (sr_meta_save(sd, SR_META_DIRTY))
895 		printf("%s: could not save metadata to %s\n",
896 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
897 bad:
898 	return;
899 }
900 
#if 0
/*
 * Disabled (unfinished) scrub implementation: for every strip it reads
 * all data chunks, XORs them into a buffer and rewrites the parity
 * chunk.  Note the placeholder block number 0xBADCAFE passed to
 * sr_raid5_addio() - the real per-strip LBA calculation was never
 * filled in, which is presumably why this is compiled out.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		/* Left asymmetric parity rotation, as in sr_raid5_rw(). */
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_scrub", 0);
			slept = 1;
		}
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
	}
done:
	return;
}
#endif
957