xref: /openbsd/sys/dev/softraid_raid6.c (revision 9b7c3dbb)
1 /* $OpenBSD: softraid_raid6.c,v 1.71 2016/04/12 16:26:54 krw Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/malloc.h>
27 #include <sys/kernel.h>
28 #include <sys/disk.h>
29 #include <sys/rwlock.h>
30 #include <sys/queue.h>
31 #include <sys/fcntl.h>
32 #include <sys/mount.h>
33 #include <sys/sensors.h>
34 #include <sys/stat.h>
35 #include <sys/task.h>
36 #include <sys/conf.h>
37 #include <sys/uio.h>
38 
39 #include <scsi/scsi_all.h>
40 #include <scsi/scsiconf.h>
41 #include <scsi/scsi_disk.h>
42 
43 #include <dev/softraidvar.h>
44 
/*
 * GF(2^8) lookup tables (polynomial 0x11D), shared by all RAID 6 volumes.
 * gf_map[n] is a lazily allocated 256-byte multiply-by-n table (built by
 * gf_premul()).  gf_pow stores the power table twice over so gf_mul() can
 * add two logarithms without reducing modulo 255; gf_init() points
 * gf_log[0] past both copies so products involving zero come out zero.
 */
uint8_t *gf_map[256];
uint8_t	gf_pow[768];
int	gf_log[256];

/* RAID 6 functions. */
int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
	    int, int64_t);
int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
	    int, void *);
int	sr_raid6_init(struct sr_discipline *);
int	sr_raid6_rw(struct sr_workunit *);
int	sr_raid6_openings(struct sr_discipline *);
void	sr_raid6_intr(struct buf *);
int	sr_raid6_wu_done(struct sr_workunit *);
void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid6_set_vol_state(struct sr_discipline *);

void	sr_raid6_xorp(void *, void *, int);
void	sr_raid6_xorq(void *, void *, int, int);
int	sr_raid6_addio(struct sr_workunit *wu, int, daddr_t, long,
	    void *, int, int, void *, void *, int);
void	sr_raid6_scrub(struct sr_discipline *);
int	sr_failio(struct sr_workunit *);

void	gf_init(void);
uint8_t gf_inv(uint8_t);
int	gf_premul(uint8_t);
uint8_t gf_mul(uint8_t, uint8_t);

/* Per-stripe disk failure flags, computed in sr_raid6_rw(). */
#define SR_NOFAIL		0x00
#define SR_FAILX		(1L << 0)	/* data chunk for this I/O failed */
#define SR_FAILY		(1L << 1)	/* second data chunk failed */
#define SR_FAILP		(1L << 2)	/* xor-parity (P) chunk failed */
#define SR_FAILQ		(1L << 3)	/* GF-parity (Q) chunk failed */

/* Per-ccb context: parity buffers to fold completed reads into. */
struct sr_raid6_opaque {
	int	gn;	/* GF multiplier applied before xor into qbuf */
	void	*pbuf;	/* xor-parity accumulator, may be NULL */
	void	*qbuf;	/* q-parity accumulator, may be NULL */
};
85 
86 /* discipline initialisation. */
87 void
88 sr_raid6_discipline_init(struct sr_discipline *sd)
89 {
90 	/* Initialize GF256 tables. */
91 	gf_init();
92 
93 	/* Fill out discipline members. */
94 	sd->sd_type = SR_MD_RAID6;
95 	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
96 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
97 	    SR_CAP_REDUNDANT;
98 	sd->sd_max_wu = SR_RAID6_NOWU;
99 
100 	/* Setup discipline specific function pointers. */
101 	sd->sd_assemble = sr_raid6_assemble;
102 	sd->sd_create = sr_raid6_create;
103 	sd->sd_openings = sr_raid6_openings;
104 	sd->sd_scsi_rw = sr_raid6_rw;
105 	sd->sd_scsi_intr = sr_raid6_intr;
106 	sd->sd_scsi_wu_done = sr_raid6_wu_done;
107 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
108 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
109 }
110 
111 int
112 sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
113     int no_chunk, int64_t coerced_size)
114 {
115 	if (no_chunk < 4) {
116 		sr_error(sd->sd_sc, "%s requires four or more chunks",
117 		    sd->sd_name);
118 		return EINVAL;
119 	}
120 
121 	/*
122 	 * XXX add variable strip size later even though MAXPHYS is really
123 	 * the clever value, users like * to tinker with that type of stuff.
124 	 */
125 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
126 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
127 	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
128 	    DEV_BSHIFT) - 1)) * (no_chunk - 2);
129 
130 	return sr_raid6_init(sd);
131 }
132 
/*
 * Assemble an existing RAID 6 volume; all the work is in the common
 * runtime initialisation.
 */
int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid6_init(sd);
}
139 
140 int
141 sr_raid6_init(struct sr_discipline *sd)
142 {
143 	/* Initialise runtime values. */
144 	sd->mds.mdd_raid6.sr6_strip_bits =
145 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
146 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1) {
147 		sr_error(sd->sd_sc, "invalid strip size");
148 		return EINVAL;
149 	}
150 
151 	/* only if stripsize <= MAXPHYS */
152 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);
153 
154 	return 0;
155 }
156 
157 int
158 sr_raid6_openings(struct sr_discipline *sd)
159 {
160 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
161 }
162 
/*
 * Validate and commit a chunk state transition (e.g. online -> offline
 * after an I/O error), then recompute the volume state and schedule a
 * metadata save.  An invalid transition panics.
 */
void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/* Each case lists the states legally reachable from old_state. */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the new chunk state: mark metadata dirty and save it. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
238 
/*
 * Derive the volume state from the per-chunk states and validate the
 * transition from the current volume state.  RAID 6 survives up to two
 * missing chunks; fewer than nd - 2 online chunks takes the volume
 * offline.  An invalid transition panics.
 */
void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Histogram the chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Pick the new volume state, most severe condition first. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Each case lists the states legally reachable from old_state. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
361 
362 /*  modes:
363  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
364  *		0, qbuf, NULL, 0);
365  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
366  *		0, pbuf, NULL, 0);
367  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
368  *		0, pbuf, qbuf, gf_pow[i]);
369  */
370 
/*
 * Map a READ or WRITE request onto the data/P/Q chunks stripe by stripe
 * and queue the component I/Os.  Reads from failed data chunks are
 * regenerated in-flight from the surviving disks using P and/or Q;
 * writes go through a separate read work unit (wu_r) that performs the
 * read phase of the read-modify-write for both parities.
 */
int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	int			s, fail, i, gxinv, pxinv;
	daddr_t			blkno, lba;
	int64_t			chunk_offs, lbaoffs, offset, strip_offs;
	int64_t			strip_no, strip_size, strip_bits, row_size;
	int64_t			fchunk, no_chunk, chunk, qchunk, pchunk;
	long			length, datalen;
	void			*pbuf, *data, *qbuf;

	/* blkno and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blkno, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	/* Two of the chunks per row hold parity (P and Q). */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blkno << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* Writes need a companion wu for the read phase. */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: can't get wu_r", DEVNAME(sd->sd_sc));
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		offset = chunk_offs + strip_offs;

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		/* P and Q rotate one chunk to the left each row. */
		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		/* Data chunk index must skip over the two parity chunks. */
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = offset >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i=0; i< no_chunk+2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good. issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				gxinv = gf_inv(gf_pow[chunk]);

				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* Read Dz * gz * inv(gx) */
				for (i = 0; i < no_chunk+2; i++) {
					if  (i == qchunk || i == pchunk || i == chunk)
						continue;

					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}

				/* data will contain correct value on completion */
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
				pxinv = gf_mul(gf_pow[fchunk], gxinv);

				/* read Q * inv(gx + gy) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* read P * gy * inv(gx + gy) */
				if (sr_raid6_addio(wu, pchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, pxinv))
					goto bad;

				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
				 *   Q:  sr_raid6_xorp(qbuf, --, length);
				 *   P:  sr_raid6_xorp(pbuf, --, length);
				 *   Dz: sr_raid6_xorp(pbuf, --, length);
				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
				 */
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk || i == pchunk ||
					    i == chunk || i == fchunk)
						continue;

					/* read Dz * (gz + gy) * inv(gx + gy) */
					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}
			} else {
				/* Two cases: single disk (Dx) or (Dx+Q)
				 *   Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/* Calculate: Dx = P^Dz
				 *   P:  sr_raid6_xorp(data, ---, length);
				 *   Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    0, data, NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on completion */
			}
		} else {
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * initialize pbuf with contents of new data to be
			 * written. This will be XORed with old data and old
			 * parity in the intr routine. The result in pbuf
			 * is the new parity data.
			 */
			qbuf = sr_block_get(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_block_get(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, qbuf, gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		/* collide write request with reads */
		wu_r->swu_blk_start = wu->swu_blk_start;
		wu_r->swu_blk_end = wu->swu_blk_end;

		/* Defer the write until the read phase (wu_r) completes. */
		wu->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

		wu = wu_r;
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);
bad:
	/* XXX - can leak pbuf/qbuf on error. */
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
640 
641 /* Handle failure I/O completion */
642 int
643 sr_failio(struct sr_workunit *wu)
644 {
645 	struct sr_discipline	*sd = wu->swu_dis;
646 	struct sr_ccb		*ccb;
647 
648 	if (!(wu->swu_flags & SR_WUF_FAIL))
649 		return (0);
650 
651 	/* Wu is a 'fake'.. don't do real I/O just intr */
652 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
653 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
654 		sr_raid6_intr(&ccb->ccb_buf);
655 	return (1);
656 }
657 
658 void
659 sr_raid6_intr(struct buf *bp)
660 {
661 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
662 	struct sr_workunit	*wu = ccb->ccb_wu;
663 	struct sr_discipline	*sd = wu->swu_dis;
664 	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
665 	int			s;
666 
667 	DNPRINTF(SR_D_INTR, "%s: sr_raid6_intr bp %p xs %p\n",
668 	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);
669 
670 	s = splbio();
671 	sr_ccb_done(ccb);
672 
673 	/* XOR data to result. */
674 	if (ccb->ccb_state == SR_CCB_OK && pq) {
675 		if (pq->pbuf)
676 			/* Calculate xor-parity */
677 			sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
678 			    ccb->ccb_buf.b_bcount);
679 		if (pq->qbuf)
680 			/* Calculate q-parity */
681 			sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
682 			    ccb->ccb_buf.b_bcount, pq->gn);
683 		free(pq, M_DEVBUF, 0);
684 		ccb->ccb_opaque = NULL;
685 	}
686 
687 	/* Free allocated data buffer. */
688 	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
689 		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
690 		ccb->ccb_buf.b_data = NULL;
691 	}
692 
693 	sr_wu_done(wu);
694 	splx(s);
695 }
696 
697 int
698 sr_raid6_wu_done(struct sr_workunit *wu)
699 {
700 	struct sr_discipline	*sd = wu->swu_dis;
701 	struct scsi_xfer	*xs = wu->swu_xs;
702 
703 	/* XXX - we have no way of propagating errors... */
704 	if (wu->swu_flags & SR_WUF_DISCIPLINE)
705 		return SR_WU_OK;
706 
707 	/* XXX - This is insufficient for RAID 6. */
708 	if (wu->swu_ios_succeeded > 0) {
709 		xs->error = XS_NOERROR;
710 		return SR_WU_OK;
711 	}
712 
713 	if (xs->flags & SCSI_DATA_IN) {
714 		printf("%s: retrying read on block %lld\n",
715 		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
716 		sr_wu_release_ccbs(wu);
717 		wu->swu_state = SR_WU_RESTART;
718 		if (sd->sd_scsi_rw(wu) == 0)
719 			return SR_WU_RESTART;
720 	} else {
721 		printf("%s: permanently fail write on block %lld\n",
722 		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
723 	}
724 
725 	wu->swu_state = SR_WU_FAILED;
726 	xs->error = XS_DRIVER_STUFFUP;
727 
728 	return SR_WU_FAILED;
729 }
730 
731 int
732 sr_raid6_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
733     long len, void *data, int xsflags, int ccbflags, void *pbuf,
734     void *qbuf, int gn)
735 {
736 	struct sr_discipline	*sd = wu->swu_dis;
737 	struct sr_ccb		*ccb;
738 	struct sr_raid6_opaque  *pqbuf;
739 
740 	DNPRINTF(SR_D_DIS, "sr_raid6_addio: %s %d.%lld %ld %p:%p\n",
741 	    (xsflags & SCSI_DATA_IN) ? "read" : "write", chunk,
742 	    (long long)blkno, len, pbuf, qbuf);
743 
744 	/* Allocate temporary buffer. */
745 	if (data == NULL) {
746 		data = sr_block_get(sd, len);
747 		if (data == NULL)
748 			return (-1);
749 		ccbflags |= SR_CCBF_FREEBUF;
750 	}
751 
752 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
753 	if (ccb == NULL) {
754 		if (ccbflags & SR_CCBF_FREEBUF)
755 			sr_block_put(sd, data, len);
756 		return (-1);
757 	}
758 	if (pbuf || qbuf) {
759 		/* XXX - can leak data and ccb on failure. */
760 		if (qbuf && gf_premul(gn))
761 			return (-1);
762 
763 		/* XXX - should be preallocated? */
764 		pqbuf = malloc(sizeof(struct sr_raid6_opaque),
765 		    M_DEVBUF, M_ZERO | M_NOWAIT);
766 		if (pqbuf == NULL) {
767 			sr_ccb_put(ccb);
768 			return (-1);
769 		}
770 		pqbuf->pbuf = pbuf;
771 		pqbuf->qbuf = qbuf;
772 		pqbuf->gn = gn;
773 		ccb->ccb_opaque = pqbuf;
774 	}
775 	sr_wu_enqueue_ccb(wu, ccb);
776 
777 	return (0);
778 }
779 
/* RAID 6 parity helpers: P is plain xor parity, Q is GF(2^8) parity.
 *   P=xor parity, Q=GF256 parity, D=data, gn=disk# */
void
sr_raid6_xorp(void *p, void *d, int len)
{
	uint32_t	*dst = p, *src = d;
	int		 i, words = len >> 2;

	/* Fold the data into the parity buffer one 32-bit word at a time. */
	for (i = 0; i < words; i++)
		dst[i] ^= src[i];
}
791 
792 void
793 sr_raid6_xorq(void *q, void *d, int len, int gn)
794 {
795 	uint32_t	*qbuf = q, *data = d, x;
796 	uint8_t		*gn_map = gf_map[gn];
797 
798 	len >>= 2;
799 	while (len--) {
800 		x = *data++;
801 		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
802 			    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
803 			    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
804 			    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
805 	}
806 }
807 
808 /* Create GF256 log/pow tables: polynomial = 0x11D */
809 void
810 gf_init(void)
811 {
812 	int i;
813 	uint8_t p = 1;
814 
815 	/* use 2N pow table to avoid using % in multiply */
816 	for (i=0; i<256; i++) {
817 		gf_log[p] = i;
818 		gf_pow[i] = gf_pow[i+255] = p;
819 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
820 	}
821 	gf_log[0] = 512;
822 }
823 
824 uint8_t
825 gf_inv(uint8_t a)
826 {
827 	return gf_pow[255 - gf_log[a]];
828 }
829 
830 uint8_t
831 gf_mul(uint8_t a, uint8_t b)
832 {
833 	return gf_pow[gf_log[a] + gf_log[b]];
834 }
835 
836 /* Precalculate multiplication tables for drive gn */
837 int
838 gf_premul(uint8_t gn)
839 {
840 	int i;
841 
842 	if (gf_map[gn] != NULL)
843 		return (0);
844 
845 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
846 		return (-1);
847 
848 	for (i=0; i<256; i++)
849 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
850 	return (0);
851 }
852