xref: /openbsd/sys/dev/softraid_raid6.c (revision a6445c1d)
1 /* $OpenBSD: softraid_raid6.c,v 1.63 2014/09/14 14:17:24 jsg Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/malloc.h>
27 #include <sys/kernel.h>
28 #include <sys/disk.h>
29 #include <sys/rwlock.h>
30 #include <sys/queue.h>
31 #include <sys/fcntl.h>
32 #include <sys/disklabel.h>
33 #include <sys/mount.h>
34 #include <sys/sensors.h>
35 #include <sys/stat.h>
36 #include <sys/task.h>
37 #include <sys/conf.h>
38 #include <sys/uio.h>
39 
40 #include <scsi/scsi_all.h>
41 #include <scsi/scsiconf.h>
42 #include <scsi/scsi_disk.h>
43 
44 #include <dev/softraidvar.h>
45 #include <dev/rndvar.h>
46 
47 uint8_t *gf_map[256];
48 uint8_t	gf_pow[768];
49 int	gf_log[256];
50 
51 /* RAID 6 functions. */
52 int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
53 	    int, int64_t);
54 int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
55 	    int, void *);
56 int	sr_raid6_init(struct sr_discipline *);
57 int	sr_raid6_rw(struct sr_workunit *);
58 int	sr_raid6_openings(struct sr_discipline *);
59 void	sr_raid6_intr(struct buf *);
60 int	sr_raid6_wu_done(struct sr_workunit *);
61 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
62 void	sr_raid6_set_vol_state(struct sr_discipline *);
63 
64 void	sr_raid6_xorp(void *, void *, int);
65 void	sr_raid6_xorq(void *, void *, int, int);
66 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr_t, daddr_t,
67 	    void *, int, int, void *, void *, int);
68 void	sr_raid6_scrub(struct sr_discipline *);
69 int	sr_failio(struct sr_workunit *);
70 
71 void	gf_init(void);
72 uint8_t gf_inv(uint8_t);
73 int	gf_premul(uint8_t);
74 uint8_t gf_mul(uint8_t, uint8_t);
75 
76 #define SR_NOFAIL		0x00
77 #define SR_FAILX		(1L << 0)
78 #define SR_FAILY		(1L << 1)
79 #define SR_FAILP		(1L << 2)
80 #define SR_FAILQ		(1L << 3)
81 
82 struct sr_raid6_opaque {
83 	int	gn;
84 	void	*pbuf;
85 	void	*qbuf;
86 };
87 
88 /* discipline initialisation. */
89 void
90 sr_raid6_discipline_init(struct sr_discipline *sd)
91 {
92 	/* Initialize GF256 tables. */
93 	gf_init();
94 
95 	/* Fill out discipline members. */
96 	sd->sd_type = SR_MD_RAID6;
97 	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
98 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
99 	    SR_CAP_REDUNDANT;
100 	sd->sd_max_wu = SR_RAID6_NOWU;
101 
102 	/* Setup discipline specific function pointers. */
103 	sd->sd_assemble = sr_raid6_assemble;
104 	sd->sd_create = sr_raid6_create;
105 	sd->sd_openings = sr_raid6_openings;
106 	sd->sd_scsi_rw = sr_raid6_rw;
107 	sd->sd_scsi_intr = sr_raid6_intr;
108 	sd->sd_scsi_wu_done = sr_raid6_wu_done;
109 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
110 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
111 }
112 
113 int
114 sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
115     int no_chunk, int64_t coerced_size)
116 {
117 	if (no_chunk < 4) {
118 		sr_error(sd->sd_sc, "%s requires four or more chunks",
119 		    sd->sd_name);
120 		return EINVAL;
121 	}
122 
123 	/*
124 	 * XXX add variable strip size later even though MAXPHYS is really
125 	 * the clever value, users like * to tinker with that type of stuff.
126 	 */
127 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
128 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
129 	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
130 	    DEV_BSHIFT) - 1)) * (no_chunk - 2);
131 
132 	return sr_raid6_init(sd);
133 }
134 
/*
 * Assemble an existing RAID 6 volume.  The metadata has already been
 * read and validated by the caller, so only the runtime state needs
 * to be initialised.
 */
int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid6_init(sd);
}
141 
142 int
143 sr_raid6_init(struct sr_discipline *sd)
144 {
145 	/* Initialise runtime values. */
146 	sd->mds.mdd_raid6.sr6_strip_bits =
147 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
148 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1) {
149 		sr_error(sd->sd_sc, "invalid strip size");
150 		return EINVAL;
151 	}
152 
153 	/* only if stripsize <= MAXPHYS */
154 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);
155 
156 	return 0;
157 }
158 
159 int
160 sr_raid6_openings(struct sr_discipline *sd)
161 {
162 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
163 }
164 
/*
 * Record a chunk state change, validating the old -> new transition.
 * An invalid transition panics.  On a valid change the volume state is
 * re-derived and a metadata save is scheduled.
 */
void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/* Validate the transition; anything not listed below is fatal. */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk may only come back via a rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Schedule a metadata flush so the new state is persisted. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
240 
/*
 * Derive the volume state from the aggregate chunk states and record
 * it, validating the transition.  RAID 6 tolerates up to two missing
 * chunks; the volume goes offline only when more than two chunks are
 * not online.  Invalid transitions panic.
 */
void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Count how many chunks are in each state. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Pick the new volume state, most severe condition first. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		/* Counts are inconsistent - dump them before panicking. */
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the old -> new volume state transition. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
363 
364 /*  modes:
365  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
366  *		0, qbuf, NULL, 0);
367  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
368  *		0, pbuf, NULL, 0);
369  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
370  *		0, pbuf, qbuf, gf_pow[i]);
371  */
372 
/*
 * Translate a SCSI read/write on the virtual volume into per-chunk I/O.
 *
 * Reads from healthy chunks go straight to disk; reads that hit a
 * failed chunk are reconstructed from the surviving data plus P and/or
 * Q parity using the GF(256) identities noted inline.  Writes use
 * read-modify-write: a companion work unit (wu_r) reads the old data
 * and parity, and its completion releases the deferred write wu.
 *
 * Returns 0 on success, 1 on failure (the xs error is handled by the
 * caller via sr_wu_put unwinding).
 */
int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	int			s, fail, i, gxinv, pxinv;
	daddr_t			blk, lba;
	int64_t			chunk_offs, lbaoffs, phys_offs, strip_offs;
	int64_t			strip_no, strip_size, strip_bits;
	int64_t			fchunk, no_chunk, chunk, qchunk, pchunk;
	int64_t			length, datalen, row_size;
	void			*pbuf, *data, *qbuf;

	/* blk and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	/* no_chunk counts only the data chunks; two chunks hold P and Q. */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blk << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* Writes need a second wu to read old data and parity. */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: can't get wu_r", DEVNAME(sd->sd_sc));
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		/* Locate this fragment within its strip and stripe row. */
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		phys_offs = chunk_offs + strip_offs +
		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		/*
		 * Q rotates right one chunk per stripe row; P sits just
		 * before Q (wrapping).  Data chunk indices are then
		 * shifted up past the parity positions.
		 */
		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = phys_offs >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i=0; i< no_chunk+2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good. issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				gxinv = gf_inv(gf_pow[chunk]);

				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
				memset(data, 0, length);
				/* Fold Q * inv(gx) into the result buffer. */
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* Read Dz * gz * inv(gx) */
				for (i = 0; i < no_chunk+2; i++) {
					if  (i == qchunk || i == pchunk || i == chunk)
						continue;

					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}

				/* data will contain correct value on completion */
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
				pxinv = gf_mul(gf_pow[fchunk], gxinv);

				/* read Q * inv(gx + gy) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* read P * gy * inv(gx + gy) */
				if (sr_raid6_addio(wu, pchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, pxinv))
					goto bad;

				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
				 *   Q:  sr_raid6_xorp(qbuf, --, length);
				 *   P:  sr_raid6_xorp(pbuf, --, length);
				 *   Dz: sr_raid6_xorp(pbuf, --, length);
				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
				 */
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk || i == pchunk ||
					    i == chunk || i == fchunk)
						continue;

					/* read Dz * (gz + gy) * inv(gx + gy) */
					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}
			} else {
				/* Two cases: single disk (Dx) or (Dx+Q)
				 *   Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/* Calculate: Dx = P^Dz
				 *   P:  sr_raid6_xorp(data, ---, length);
				 *   Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    0, data, NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on completion */
			}
		} else {
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * initialize pbuf with contents of new data to be
			 * written. This will be XORed with old data and old
			 * parity in the intr routine. The result in pbuf
			 * is the new parity data.
			 */
			qbuf = sr_block_get(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_block_get(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, qbuf, gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		/* collide write request with reads */
		wu_r->swu_blk_start = wu->swu_blk_start;
		wu_r->swu_blk_end = wu->swu_blk_end;

		/* Defer the write until the read-old wu completes. */
		wu->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

		wu = wu_r;
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);
bad:
	/* XXX - can leak pbuf/qbuf on error. */
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
643 
644 /* Handle failure I/O completion */
645 int
646 sr_failio(struct sr_workunit *wu)
647 {
648 	struct sr_discipline	*sd = wu->swu_dis;
649 	struct sr_ccb		*ccb;
650 
651 	if (!(wu->swu_flags & SR_WUF_FAIL))
652 		return (0);
653 
654 	/* Wu is a 'fake'.. don't do real I/O just intr */
655 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
656 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
657 		sr_raid6_intr(&ccb->ccb_buf);
658 	return (1);
659 }
660 
/*
 * Per-ccb I/O completion.  For reads that feed a parity or
 * reconstruction computation (ccb_opaque set by sr_raid6_addio()),
 * fold the returned data into the P and/or Q accumulation buffers
 * before releasing the ccb's resources.
 */
void
sr_raid6_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid6_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XOR data to result. */
	if (ccb->ccb_state == SR_CCB_OK && pq) {
		if (pq->pbuf)
			/* Calculate xor-parity */
			sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount);
		if (pq->qbuf)
			/* Calculate q-parity */
			sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount, pq->gn);
		free(pq, M_DEVBUF, 0);
		ccb->ccb_opaque = NULL;
	}

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	/* May complete the wu and wake a deferred collider. */
	sr_wu_done(wu);
	splx(s);
}
699 
/*
 * Work unit completion policy: decide whether the wu succeeded,
 * should be retried (reads), or has failed permanently (writes).
 */
int
sr_raid6_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & SR_WUF_DISCIPLINE)
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 6. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		/* Drop the old ccbs and resubmit the same request. */
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
733 
734 int
735 sr_raid6_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
736     daddr_t len, void *data, int xsflags, int ccbflags, void *pbuf,
737     void *qbuf, int gn)
738 {
739 	struct sr_discipline	*sd = wu->swu_dis;
740 	struct sr_ccb		*ccb;
741 	struct sr_raid6_opaque  *pqbuf;
742 
743 	DNPRINTF(SR_D_DIS, "sr_raid6_addio: %s %d.%llx %llx %p:%p\n",
744 	    (xsflags & SCSI_DATA_IN) ? "read" : "write", chunk,
745 	    (long long)blkno, (long long)len,
746 	    pbuf, qbuf);
747 
748 	/* Allocate temporary buffer. */
749 	if (data == NULL) {
750 		data = sr_block_get(sd, len);
751 		if (data == NULL)
752 			return (-1);
753 		ccbflags |= SR_CCBF_FREEBUF;
754 	}
755 
756 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
757 	if (ccb == NULL) {
758 		if (ccbflags & SR_CCBF_FREEBUF)
759 			sr_block_put(sd, data, len);
760 		return (-1);
761 	}
762 	if (pbuf || qbuf) {
763 		/* XXX - can leak data and ccb on failure. */
764 		if (qbuf && gf_premul(gn))
765 			return (-1);
766 
767 		/* XXX - should be preallocated? */
768 		pqbuf = malloc(sizeof(struct sr_raid6_opaque),
769 		    M_DEVBUF, M_ZERO | M_NOWAIT);
770 		if (pqbuf == NULL) {
771 			sr_ccb_put(ccb);
772 			return (-1);
773 		}
774 		pqbuf->pbuf = pbuf;
775 		pqbuf->qbuf = qbuf;
776 		pqbuf->gn = gn;
777 		ccb->ccb_opaque = pqbuf;
778 	}
779 	sr_wu_enqueue_ccb(wu, ccb);
780 
781 	return (0);
782 }
783 
/* Perform RAID6 parity calculation.
 *   P=xor parity, Q=GF256 parity, D=data, gn=disk#
 *
 * XOR a data strip into the running P (xor parity) buffer, one
 * 32-bit word at a time.  len is in bytes; only whole words are
 * processed (len is expected to be a multiple of four).
 */
void
sr_raid6_xorp(void *p, void *d, int len)
{
	uint32_t	*dst = p;
	uint32_t	*src = d;
	int		 i, words = len >> 2;

	for (i = 0; i < words; i++)
		dst[i] ^= src[i];
}
795 
796 void
797 sr_raid6_xorq(void *q, void *d, int len, int gn)
798 {
799 	uint32_t 	*qbuf = q, *data = d, x;
800 	uint8_t	 	*gn_map = gf_map[gn];
801 
802 	len >>= 2;
803 	while (len--) {
804 		x = *data++;
805 		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
806 		  	    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
807 			    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
808 			    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
809 	}
810 }
811 
812 /* Create GF256 log/pow tables: polynomial = 0x11D */
813 void
814 gf_init(void)
815 {
816 	int i;
817 	uint8_t p = 1;
818 
819 	/* use 2N pow table to avoid using % in multiply */
820 	for (i=0; i<256; i++) {
821 		gf_log[p] = i;
822 		gf_pow[i] = gf_pow[i+255] = p;
823 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
824 	}
825 	gf_log[0] = 512;
826 }
827 
/* Multiplicative inverse in GF(256): inv(a) = g^(255 - log(a)).
 * Not valid for a == 0 (gf_log[0] is a sentinel, see gf_init());
 * callers only pass non-zero generator values/sums. */
uint8_t
gf_inv(uint8_t a)
{
	return gf_pow[255 - gf_log[a]];
}
833 
/* GF(256) multiply via the log/pow tables.  The doubled pow table
 * absorbs log sums up to 2*254 without a modulo; a zero operand
 * (gf_log[0] == 512) indexes past the initialised entries and so
 * yields 0, as required. */
uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	return gf_pow[gf_log[a] + gf_log[b]];
}
839 
/* Precalculate multiplication tables for drive gn */
int
gf_premul(uint8_t gn)
{
	int i;

	/* Table already built by an earlier I/O - nothing to do. */
	if (gf_map[gn] != NULL)
		return (0);

	/* M_NOWAIT: called from the I/O path where sleeping is not ok. */
	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
		return (-1);

	/* map[i] = gn * i in GF(256); i == 0 yields 0 via the
	 * gf_log[0] sentinel (see gf_init()). */
	for (i=0; i<256; i++)
		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
	return (0);
}
856