xref: /openbsd/sys/dev/softraid_raid6.c (revision 404b540a)
1 /* $OpenBSD: softraid_raid6.c,v 1.6 2009/08/26 20:14:44 jordan Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/proc.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/conf.h>
38 #include <sys/uio.h>
39 
40 #include <scsi/scsi_all.h>
41 #include <scsi/scsiconf.h>
42 #include <scsi/scsi_disk.h>
43 
44 #include <dev/softraidvar.h>
45 #include <dev/rndvar.h>
46 
47 uint8_t *gf_map[256];
48 uint8_t	gf_pow[768];
49 int	gf_log[256];
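/*
 * GF(2^8) helper tables (polynomial 0x11D, generator 2), filled in by
 * gf_init() and gf_premul() below:
 *   gf_pow[]  exponent table, doubled (gf_pow[i] == gf_pow[i + 255]) so
 *             that gf_pow[gf_log[a] + gf_log[b]] needs no "% 255"
 *   gf_log[]  discrete log table; gf_log[0] == 512 steers a zero operand
 *             into the never-written (zero) tail of gf_pow[]
 *   gf_map[]  per-coefficient 256-byte multiplication tables, allocated
 *             lazily by gf_premul() and used by sr_raid6_xorq()
 */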
50 
51 /* RAID 6 functions. */
52 int	sr_raid6_alloc_resources(struct sr_discipline *);
53 int	sr_raid6_free_resources(struct sr_discipline *);
54 int	sr_raid6_rw(struct sr_workunit *);
55 int	sr_raid6_openings(struct sr_discipline *);
56 void	sr_raid6_intr(struct buf *);
57 void	sr_raid6_recreate_wu(struct sr_workunit *);
58 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
59 void	sr_raid6_set_vol_state(struct sr_discipline *);
60 
61 void	sr_raid6_xorp(void *, void *, int);
62 void	sr_raid6_xorq(void *, void *, int, int);
63 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
64 	    void *, int, int, void *, void *, int);
65 void 	sr_dump(void *, int);
66 void	sr_raid6_scrub(struct sr_discipline *);
67 int	sr_failio(struct sr_workunit *);
68 
69 void	*sr_get_block(struct sr_discipline *, int);
70 void	sr_put_block(struct sr_discipline *, void *);
71 
72 void	gf_init(void);
73 uint8_t gf_inv(uint8_t);
74 int	gf_premul(uint8_t);
75 
76 #define SR_NOFAIL		0x00
77 #define SR_FAILX		(1L << 0)
78 #define SR_FAILY		(1L << 1)
79 #define SR_FAILP		(1L << 2)
80 #define SR_FAILQ		(1L << 3)
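/*
 * Per-stripe failure summary collected in sr_raid6_rw():
 *   SR_FAILX  the data chunk addressed by this I/O is offline
 *   SR_FAILY  a second data chunk in the same stripe is offline (fchunk)
 *   SR_FAILP  the stripe's XOR parity chunk (P) is offline
 *   SR_FAILQ  the stripe's GF(2^8) parity chunk (Q) is offline
 */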
81 
82 struct sr_raid6_opaque {
83 	int      gn;
84 	void	*pbuf;
85 	void	*qbuf;
86 };
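/*
 * Completion context hung off a ccb by sr_raid6_addio(): when the I/O
 * finishes, sr_raid6_intr() XORs the returned data into pbuf (P
 * accumulation) and XORs gn * data into qbuf (Q accumulation), using
 * the gf_map[gn] multiplication table set up by gf_premul().
 */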
87 
88 /* discipline initialisation. */
89 void
90 sr_raid6_discipline_init(struct sr_discipline *sd)
91 {
92 	/* Initialize GF256 tables */
93 	gf_init();
94 
95 	/* fill out discipline members. */
96 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */
97 	sd->sd_max_wu = SR_RAID6_NOWU;
98 	sd->sd_rebuild = 0;
99 
100 	/* setup discipline pointers. */
101 	sd->sd_alloc_resources = sr_raid6_alloc_resources;
102 	sd->sd_free_resources = sr_raid6_free_resources;
103 	sd->sd_start_discipline = NULL;
104 	sd->sd_scsi_inquiry = sr_raid_inquiry;
105 	sd->sd_scsi_read_cap = sr_raid_read_cap;
106 	sd->sd_scsi_tur = sr_raid_tur;
107 	sd->sd_scsi_req_sense = sr_raid_request_sense;
108 	sd->sd_scsi_start_stop = sr_raid_start_stop;
109 	sd->sd_scsi_sync = sr_raid_sync;
110 	sd->sd_scsi_rw = sr_raid6_rw;
111 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
112 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
113 	sd->sd_openings = sr_raid6_openings;
114 }
115 
116 int
117 sr_raid6_openings(struct sr_discipline *sd)
118 {
119 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
120 }
121 
122 int
123 sr_raid6_alloc_resources(struct sr_discipline *sd)
124 {
125 	int			rv = EINVAL;
126 
127 	if (!sd)
128 		return (rv);
129 
130 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
131 	    DEVNAME(sd->sd_sc));
132 
133 	if (sr_wu_alloc(sd))
134 		goto bad;
135 	if (sr_ccb_alloc(sd))
136 		goto bad;
137 
138 	/* setup runtime values */
139 	sd->mds.mdd_raid6.sr6_strip_bits =
140 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
141 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
142 		goto bad;
143 
144 	rv = 0;
145 bad:
146 	return (rv);
147 }
148 
149 int
150 sr_raid6_free_resources(struct sr_discipline *sd)
151 {
152 	int			rv = EINVAL;
153 
154 	if (!sd)
155 		return (rv);
156 
157 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
158 	    DEVNAME(sd->sd_sc));
159 
160 	sr_wu_free(sd);
161 	sr_ccb_free(sd);
162 
163 	rv = 0;
164 	return (rv);
165 }
166 
167 void
168 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
169 {
170 	int			old_state, s;
171 
172 	/* XXX this is for RAID 0 */
173 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
174 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
175 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
176 
177 	/* ok to go to splbio since this only happens in error path */
178 	s = splbio();
179 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
180 
181 	/* multiple IOs to the same chunk that fail will come through here */
182 	if (old_state == new_state)
183 		goto done;
184 
185 	switch (old_state) {
186 	case BIOC_SDONLINE:
187 		switch (new_state) {
188 		case BIOC_SDOFFLINE:
189 		case BIOC_SDSCRUB:
190 			break;
191 		default:
192 			goto die;
193 		}
194 		break;
195 
196 	case BIOC_SDOFFLINE:
197 		if (new_state == BIOC_SDREBUILD) {
198 			;
199 		} else
200 			goto die;
201 		break;
202 
203 	case BIOC_SDSCRUB:
204 		switch (new_state) {
205 		case BIOC_SDONLINE:
206 		case BIOC_SDOFFLINE:
207 			break;
208 		default:
209 			goto die;
210 		}
211 		break;
212 
213 	case BIOC_SDREBUILD:
214 		switch (new_state) {
215 		case BIOC_SDONLINE:
216 		case BIOC_SDOFFLINE:
217 			break;
218 		default:
219 			goto die;
220 		}
221 		break;
222 
223 	default:
224 die:
225 		splx(s); /* XXX */
226 		panic("%s: %s: %s: invalid chunk state transition "
227 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
228 		    sd->sd_meta->ssd_devname,
229 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
230 		    old_state, new_state);
231 		/* NOTREACHED */
232 	}
233 
234 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
235 	sd->sd_set_vol_state(sd);
236 
237 	sd->sd_must_flush = 1;
238 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
239 done:
240 	splx(s);
241 }
242 
243 void
244 sr_raid6_set_vol_state(struct sr_discipline *sd)
245 {
246 	int			states[SR_MAX_STATES];
247 	int			new_state, i, s, nd;
248 	int			old_state = sd->sd_vol_status;
249 
250 	/* XXX this is for RAID 0 */
251 
252 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
253 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
254 
255 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
256 
257 	for (i = 0; i < SR_MAX_STATES; i++)
258 		states[i] = 0;
259 
260 	for (i = 0; i < nd; i++) {
261 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
262 		if (s >= SR_MAX_STATES)
263 			panic("%s: %s: %s: invalid chunk state",
264 			    DEVNAME(sd->sd_sc),
265 			    sd->sd_meta->ssd_devname,
266 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
267 		states[s]++;
268 	}
269 
270 	if (states[BIOC_SDONLINE] == nd)
271 		new_state = BIOC_SVONLINE;
272 	else if (states[BIOC_SDONLINE] < nd - 2)
273 		new_state = BIOC_SVOFFLINE;
274 	else if (states[BIOC_SDSCRUB] != 0)
275 		new_state = BIOC_SVSCRUB;
276 	else if (states[BIOC_SDREBUILD] != 0)
277 		new_state = BIOC_SVREBUILD;
278 	else if (states[BIOC_SDONLINE] < nd)
279 		new_state = BIOC_SVDEGRADED;
280 	else {
281 		printf("old_state = %d, ", old_state);
282 		for (i = 0; i < nd; i++)
283 			printf("%d = %d, ", i,
284 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
285 		panic("invalid new_state");
286 	}
287 
288 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
289 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
290 	    old_state, new_state);
291 
292 	switch (old_state) {
293 	case BIOC_SVONLINE:
294 		switch (new_state) {
295 		case BIOC_SVONLINE: /* can go to same state */
296 		case BIOC_SVOFFLINE:
297 		case BIOC_SVDEGRADED:
298 		case BIOC_SVREBUILD: /* happens on boot */
299 			break;
300 		default:
301 			goto die;
302 		}
303 		break;
304 
305 	case BIOC_SVOFFLINE:
306 		/* XXX this might be a little too much */
307 		goto die;
308 
309 	case BIOC_SVSCRUB:
310 		switch (new_state) {
311 		case BIOC_SVONLINE:
312 		case BIOC_SVOFFLINE:
313 		case BIOC_SVDEGRADED:
314 		case BIOC_SVSCRUB: /* can go to same state */
315 			break;
316 		default:
317 			goto die;
318 		}
319 		break;
320 
321 	case BIOC_SVBUILDING:
322 		switch (new_state) {
323 		case BIOC_SVONLINE:
324 		case BIOC_SVOFFLINE:
325 		case BIOC_SVBUILDING: /* can go to the same state */
326 			break;
327 		default:
328 			goto die;
329 		}
330 		break;
331 
332 	case BIOC_SVREBUILD:
333 		switch (new_state) {
334 		case BIOC_SVONLINE:
335 		case BIOC_SVOFFLINE:
336 		case BIOC_SVDEGRADED:
337 		case BIOC_SVREBUILD: /* can go to the same state */
338 			break;
339 		default:
340 			goto die;
341 		}
342 		break;
343 
344 	case BIOC_SVDEGRADED:
345 		switch (new_state) {
346 		case BIOC_SVOFFLINE:
347 		case BIOC_SVREBUILD:
348 		case BIOC_SVDEGRADED: /* can go to the same state */
349 			break;
350 		default:
351 			goto die;
352 		}
353 		break;
354 
355 	default:
356 die:
357 		panic("%s: %s: invalid volume state transition %d -> %d\n",
358 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
359 		    old_state, new_state);
360 		/* NOTREACHED */
361 	}
362 
363 	sd->sd_vol_status = new_state;
364 }
365 
366 /*  modes:
367  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
368  *	        SR_CCBF_FREEBUF, qbuf, NULL, 0);
369  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
370  *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
371  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
372  *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
373  */
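/*
 * In all three read modes the transfer buffer is allocated inside
 * sr_raid6_addio() (data == NULL) and released on completion because of
 * SR_CCBF_FREEBUF; pbuf, qbuf and the coefficient gn travel with the
 * ccb in a struct sr_raid6_opaque and are applied by sr_raid6_intr().
 */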
374 
375 int
376 sr_raid6_rw(struct sr_workunit *wu)
377 {
378 	struct sr_workunit	*wu_w = NULL;
379 	struct sr_discipline	*sd = wu->swu_dis;
380 	struct scsi_xfer	*xs = wu->swu_xs;
381 	struct sr_chunk		*scp;
382 	int			s, fail, i;
383 	daddr64_t		blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
384 	daddr64_t		strip_size, no_chunk, lba, chunk_offs, phys_offs;
385 	daddr64_t		strip_bits, length, strip_offs, datalen;
386 	void		        *pbuf, *data, *qbuf;
387 
388 	/* blk and scsi error will be handled by sr_validate_io */
389 	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
390 		goto bad;
391 
392 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
393 	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
394 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
395 
396 	data = xs->data;
397 	datalen = xs->datalen;
398 	lbaoffs	= blk << DEV_BSHIFT;
399 
400 	if (xs->flags & SCSI_DATA_OUT)
401 		/* create write workunit */
402 		if ((wu_w = sr_wu_get(sd, 0)) == NULL) {
403 			printf("%s: can't get wu_w\n", DEVNAME(sd->sd_sc));
404 			goto bad;
405 		}
406 
407 	wu->swu_blk_start = 0;
408 	while (datalen != 0) {
409 		strip_no = lbaoffs >> strip_bits;
410 		strip_offs = lbaoffs & (strip_size - 1);
411 		chunk_offs = (strip_no / no_chunk) << strip_bits;
412 		phys_offs = chunk_offs + strip_offs +
413 		    ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT);
414 
415 		/* get size remaining in this stripe */
416 		length = MIN(strip_size - strip_offs, datalen);
417 
418 		/* map disk offset to parity/data drive */
419 		chunk = strip_no % no_chunk;
420 
421 		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
422 		if (qchunk == 0)
423 			pchunk = no_chunk + 1;
424 		else
425 			pchunk = qchunk - 1;
426 		if (chunk >= pchunk)
427 			chunk++;
428 		if (chunk >= qchunk)
429 			chunk++;
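		/*
		 * The parity chunks rotate across the set; e.g. with five
		 * chunks (no_chunk == 3) successive stripe rows
		 * (row = strip_no / no_chunk) map to:
		 *
		 *   row % 5:   c0  c1  c2  c3  c4
		 *      0        D   D   D   P   Q
		 *      1        D   D   P   Q   D
		 *      2        D   P   Q   D   D
		 *      3        P   Q   D   D   D
		 *      4        Q   D   D   D   P
		 */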
430 
431 		lba = phys_offs >> DEV_BSHIFT;
432 
433 		/* XXX big hammer.. exclude I/O from entire stripe */
434 		if (wu->swu_blk_start == 0)
435 			wu->swu_blk_start = chunk_offs >> DEV_BSHIFT;
436 		wu->swu_blk_end = ((chunk_offs + (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1;
437 
438 		fail = 0;
439 		fchunk = -1;
440 
441 		/* Get disk-fail flags */
442 		for (i=0; i< no_chunk+2; i++) {
443 			scp = sd->sd_vol.sv_chunks[i];
444 			switch (scp->src_meta.scm_status) {
445 			case BIOC_SDOFFLINE:
446 			case BIOC_SDREBUILD:
447 			case BIOC_SDHOTSPARE:
448 				if (i == qchunk)
449 					fail |= SR_FAILQ;
450 				else if (i == pchunk)
451 					fail |= SR_FAILP;
452 				else if (i == chunk)
453 					fail |= SR_FAILX;
454 				else {
455 					/* dual data-disk failure */
456 					fail |= SR_FAILY;
457 					fchunk = i;
458 				}
459 				break;
460 			}
461 		}
462 		if (xs->flags & SCSI_DATA_IN) {
463 			if (!(fail & SR_FAILX)) {
464 				/* drive is good. issue single read request */
465 				if (sr_raid6_addio(wu, chunk, lba, length,
466 				    data, xs->flags, 0, NULL, NULL, 0))
467 					goto bad;
468 			} else if (fail & SR_FAILP) {
469 				/* Dx, P failed */
470 				printf("Disk %llx offline, "
471 				    "regenerating Dx+P\n", chunk);
472 
473 				qbuf = sr_get_block(sd, length);
474 				if (qbuf == NULL)
475 					goto bad;
476 
477 				/* Calculate: Dx*gx = Q^(Dz*gz)
478 				 *   Q:  sr_raid6_xorp(data, --, length);
479 				 *   Dz: sr_raid6_xorq(data, --, length, gf_pow[i]);
480 				 */
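				/*
				 * Q is the XOR over all data chunks z of
				 * g^z * Dz, so the missing strip is
				 *   Dx = (Q ^ XOR{z != x} g^z * Dz) * (g^x)^-1
				 * The reads queued below accumulate the
				 * parenthesised sum in qbuf; the final
				 * multiply by gf_inv(gf_pow[chunk]) happens
				 * in the fake work unit wu_w, which leaves
				 * the result in the caller's data buffer.
				 */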
481 				memset(data, 0, length);
482 				for (i = 0; i < no_chunk+2; i++) {
483 					if  (i == qchunk) {
484 						/* Read Q */
485 						if (sr_raid6_addio(wu, i, lba,
486 						    length, NULL, SCSI_DATA_IN,
487 						    SR_CCBF_FREEBUF, qbuf,
488 						    NULL, 0))
489 						    	goto bad;
490 					} else if (i != chunk && i != pchunk) {
491 						/* Read Dz * gz */
492 						if (sr_raid6_addio(wu, i, lba,
493 						   length, NULL, SCSI_DATA_IN,
494 						   SR_CCBF_FREEBUF, NULL,
495 						   qbuf, gf_pow[i]))
496 						   	goto bad;
497 					}
498 				}
499 
500 				/* run fake wu when read i/o is complete */
501 				if (wu_w == NULL &&
502 				    (wu_w = sr_wu_get(sd, 0)) == NULL)
503 					goto bad;
504 
505 				wu_w->swu_flags |= SR_WUF_FAIL;
506 				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
507 				    SR_CCBF_FREEBUF, NULL, data,
508 				    gf_inv(gf_pow[chunk])))
509 					goto bad;
510 			} else if (fail & SR_FAILY) {
511 				/* Dx, Dy failed */
512 				printf("Disk %llx & %llx offline, "
513 				    "regenerating Dx+Dy\n", chunk, fchunk);
514 				qbuf = sr_get_block(sd, length);
515 				if (qbuf == NULL)
516 					goto bad;
517 				pbuf = sr_get_block(sd, length);
518 				if (pbuf == NULL)
519 					goto bad;
520 
521 				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
522 				 *   Q:  sr_raid6_xorp(qbuf, --, length);
523 				 *   P:  sr_raid6_xorp(pbuf, --, length);
524 				 *   Dz: sr_raid6_xorp(pbuf, --, length);
525 				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
526 				 */
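				/*
				 * Accumulating P and the surviving data
				 * strips in pbuf gives P' = Dx ^ Dy; qbuf
				 * likewise collects Q' = g^x*Dx ^ g^y*Dy.
				 * Eliminating Dy:
				 *   Dx = P' * g^y/(g^x ^ g^y)  ^  Q' * 1/(g^x ^ g^y)
				 * which is what the two gf_inv() coefficients
				 * on the fake work unit below compute
				 * (gf_pow[255 + chunk - fchunk] == g^(x-y)).
				 */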
527 				memset(data, 0, length);
528 				for (i = 0; i < no_chunk+2; i++) {
529 					if (i == qchunk) {
530 						/* read Q */
531 						if (sr_raid6_addio(wu, i, lba,
532 						    length,  NULL, SCSI_DATA_IN,
533 						    SR_CCBF_FREEBUF, qbuf,
534 						    NULL, 0))
535 						    	goto bad;
536 					} else if (i == pchunk) {
537 						/* read P */
538 						if (sr_raid6_addio(wu, i, lba,
539 						    length,  NULL, SCSI_DATA_IN,
540 						    SR_CCBF_FREEBUF, pbuf,
541 						    NULL, 0))
542 						    	goto bad;
543 					} else if (i != chunk && i != fchunk) {
544 						/* read Dz * gz */
545 						if (sr_raid6_addio(wu, i, lba,
546 						    length, NULL, SCSI_DATA_IN,
547 						    SR_CCBF_FREEBUF, pbuf,
548 						    qbuf, gf_pow[i]))
549 						    	goto bad;
550 					}
551 				}
552 
553 				/* run fake wu when read i/o is complete */
554 				if (wu_w == NULL &&
555 				    (wu_w = sr_wu_get(sd, 0)) == NULL)
556 					goto bad;
557 
558 				wu_w->swu_flags |= SR_WUF_FAIL;
559 				if (sr_raid6_addio(wu_w, 0, 0, length, pbuf, 0,
560 				    SR_CCBF_FREEBUF, NULL, data,
561 				    gf_inv(gf_pow[255+chunk-fchunk] ^ 1)))
562 					goto bad;
563 				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
564 				    SR_CCBF_FREEBUF, NULL, data,
565 				    gf_inv(gf_pow[chunk] ^ gf_pow[fchunk])))
566 					goto bad;
567 			} else {
568 				/* Two cases: single disk (Dx) or (Dx+Q)
569 				 *   Dx = Dz ^ P (same as RAID5)
570 				 */
571 				printf("Disk %llx offline, "
572 				    "regenerating Dx%s\n", chunk,
573 				    fail & SR_FAILQ ? "+Q" : " single");
574 
575 				/* Calculate: Dx = P^Dz
576  				 *   P:  sr_raid6_xorp(data, ---, length);
577  				 *   Dz: sr_raid6_xorp(data, ---, length);
578 				 */
579 				memset(data, 0, length);
580 				for (i = 0; i < no_chunk+2; i++) {
581 					if (i != chunk && i != qchunk) {
582 						/* Read Dz */
583 						if (sr_raid6_addio(wu, i, lba,
584 						    length, NULL, SCSI_DATA_IN,
585 						    SR_CCBF_FREEBUF, data,
586 						    NULL, 0))
587 	 				    	    	goto bad;
588 					}
589 				}
590 
591 				/* data will contain correct value on completion */
592 			}
593 		} else {
594 			/* XXX handle writes to failed/offline disk? */
595 			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
596 				goto bad;
597 
598 			/*
599 			 * initialize pbuf with contents of new data to be
600 			 * written. This will be XORed with old data and old
601 			 * parity in the intr routine. The result in pbuf
602 			 * is the new parity data.
603 			 */
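			/*
			 * Read-modify-write of a single data strip Dn:
			 *   Pnew = Pold ^ Dold ^ Dnew
			 *   Qnew = Qold ^ gn*Dold ^ gn*Dnew,  gn = gf_pow[chunk]
			 * pbuf/qbuf are primed with Dnew and gn*Dnew below;
			 * the three reads then fold in Dold, Pold and Qold
			 * before the deferred write work unit runs.
			 */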
604 			qbuf = sr_get_block(sd, length);
605 			if (qbuf == NULL)
606 				goto bad;
607 
608 			pbuf = sr_get_block(sd, length);
609 			if (pbuf == NULL)
610 				goto bad;
611 
612 			/* Calculate P = Dn; Q = gn * Dn */
613 			if (gf_premul(gf_pow[chunk]))
614 				goto bad;
615 			sr_raid6_xorp(pbuf, data, length);
616 			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);
617 
618 			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
619 			if (sr_raid6_addio(wu, chunk, lba, length, NULL,
620 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
621 				gf_pow[chunk]))
622 				goto bad;
623 
624 			/* Read old xor-parity: P ^= P' */
625 			if (sr_raid6_addio(wu, pchunk, lba, length, NULL,
626 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
627 				goto bad;
628 
629 			/* Read old q-parity: Q ^= Q' */
630 			if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
631 				SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
632 				goto bad;
633 
634 			/* write new data */
635 			if (sr_raid6_addio(wu_w, chunk, lba, length, data,
636 			    xs->flags, 0, NULL, NULL, 0))
637 				goto bad;
638 
639 			/* write new xor-parity */
640 			if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf,
641 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
642 				goto bad;
643 
644 			/* write new q-parity */
645 			if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf,
646 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
647 				goto bad;
648 		}
649 
650 		/* advance to next block */
651 		lbaoffs += length;
652 		datalen -= length;
653 		data += length;
654 	}
655 
656 	s = splbio();
657 	if (wu_w) {
658 		/* collide write request with reads */
659 		wu_w->swu_blk_start = wu->swu_blk_start;
660 		wu_w->swu_blk_end = wu->swu_blk_end;
661 
662 		/*
663 		 * put xs block in write request (scsi_done not called till
664 		 * write completes)
665 		 */
666 		wu_w->swu_xs = wu->swu_xs;
667 		wu->swu_xs = NULL;
668 
669 		wu_w->swu_state = SR_WU_DEFERRED;
670 		wu->swu_collider = wu_w;
671 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
672 	}
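	/*
	 * The deferred write work unit is not started here; when the read
	 * work unit completes, sr_raid6_intr() follows swu_collider, takes
	 * wu_w off sd_wu_defq and starts it (or hands it to sr_failio() if
	 * it is a fake SR_WUF_FAIL unit).
	 */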
673 
674 	/* rebuild io, let rebuild routine deal with it */
675 	if (wu->swu_flags & SR_WUF_REBUILD)
676 		goto queued;
677 
678 	/* current io failed, restart */
679 	if (wu->swu_state == SR_WU_RESTART)
680 		goto start;
681 
682 	/* deferred io failed, don't restart */
683 	if (wu->swu_state == SR_WU_REQUEUE)
684 		goto queued;
685 
686 	if (sr_check_io_collision(wu))
687 		goto queued;
688 
689 start:
690 	sr_raid_startwu(wu);
691 queued:
692 	splx(s);
693 	return (0);
694 bad:
695 	/* wu is unwound by sr_wu_put */
696 	if (wu_w)
697 		sr_wu_put(wu_w);
698 	return (1);
699 }
700 
701 /* Handle failure I/O completion */
702 int
703 sr_failio(struct sr_workunit *wu)
704 {
705 	struct sr_discipline	*sd = wu->swu_dis;
706 	struct sr_ccb		*ccb;
707 
708 	if (!(wu->swu_flags & SR_WUF_FAIL))
709 		return (0);
710 
711 	/* Wu is a 'fake'.. don't do real I/O just intr */
712 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
713 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
714 		sr_raid6_intr(&ccb->ccb_buf);
715 	return (1);
716 }
717 
718 void
719 sr_raid6_intr(struct buf *bp)
720 {
721 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
722 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
723 	struct sr_discipline	*sd = wu->swu_dis;
724 	struct scsi_xfer	*xs = wu->swu_xs;
725 	struct sr_softc		*sc = sd->sd_sc;
726 	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
727 	int			s, pend;
728 
729 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
730 	    DEVNAME(sc), bp, xs);
731 
732 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
733 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
734 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
735 	    ccb->ccb_buf.b_blkno, ccb->ccb_target);
736 
737 	s = splbio();
738 
739 	if (ccb->ccb_buf.b_flags & B_ERROR) {
740 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
741 		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
742 		printf("io error: disk %x\n", ccb->ccb_target);
743 		wu->swu_ios_failed++;
744 		ccb->ccb_state = SR_CCB_FAILED;
745 		if (ccb->ccb_target != -1)
746 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
747 			    BIOC_SDOFFLINE);
748 		else
749 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
750 	} else {
751 		ccb->ccb_state = SR_CCB_OK;
752 		wu->swu_ios_succeeded++;
753 
754 		/* XOR data to result */
755 		if (pq) {
756 			if (pq->pbuf)
757 				/* Calculate xor-parity */
758 				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
759 				    ccb->ccb_buf.b_bcount);
760 			if (pq->qbuf)
761 				/* Calculate q-parity */
762 				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
763 				    ccb->ccb_buf.b_bcount, pq->gn);
764 			free(pq, M_DEVBUF);
765 			ccb->ccb_opaque = NULL;
766 		}
767 	}
768 
769 	/* free allocated data buffer */
770 	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
771 		sr_put_block(sd, ccb->ccb_buf.b_data);
772 		ccb->ccb_buf.b_data = NULL;
773 	}
774 	wu->swu_ios_complete++;
775 
776 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
777 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
778 	    wu->swu_ios_failed);
779 
780 	if (wu->swu_ios_complete >= wu->swu_io_count) {
781 
782 		/* if all ios failed, retry reads and give up on writes */
783 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
784 			if (xs->flags & SCSI_DATA_IN) {
785 				printf("%s: retrying read on block %lld\n",
786 				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
787 				sr_ccb_put(ccb);
788 				TAILQ_INIT(&wu->swu_ccb);
789 				wu->swu_state = SR_WU_RESTART;
790 				if (sd->sd_scsi_rw(wu))
791 					goto bad;
792 				else
793 					goto retry;
794 			} else {
795 				printf("%s: permanently fail write on block "
796 				    "%lld\n", DEVNAME(sc),
797 				    ccb->ccb_buf.b_blkno);
798 				xs->error = XS_DRIVER_STUFFUP;
799 				goto bad;
800 			}
801 		}
802 
803 		if (xs != NULL) {
804 			xs->error = XS_NOERROR;
805 			xs->resid = 0;
806 			xs->flags |= ITSDONE;
807 		}
808 
809 		pend = 0;
810 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
811 			if (wu == wup) {
812 				/* wu on pendq, remove */
813 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
814 				pend = 1;
815 
816 				if (wu->swu_collider) {
817 					if (wu->swu_ios_failed)
818 						/* toss all ccbs and recreate */
819 						sr_raid6_recreate_wu(wu->swu_collider);
820 
821 					/* restart deferred wu */
822 					wu->swu_collider->swu_state =
823 					    SR_WU_INPROGRESS;
824 					TAILQ_REMOVE(&sd->sd_wu_defq,
825 					    wu->swu_collider, swu_link);
826 					if (sr_failio(wu->swu_collider) == 0)
827 						sr_raid_startwu(wu->swu_collider);
828 				}
829 				break;
830 			}
831 		}
832 
833 		if (!pend)
834 			printf("%s: wu: %p not on pending queue\n",
835 			    DEVNAME(sc), wu);
836 
837 		if (wu->swu_flags & SR_WUF_REBUILD) {
838 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
839 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
840 				wakeup(wu);
841 			}
842 		} else {
843 			/* do not change the order of these 2 functions */
844 			sr_wu_put(wu);
845 			if (xs != NULL)
846 				scsi_done(xs);
847 		}
848 
849 		if (sd->sd_sync && sd->sd_wu_pending == 0)
850 			wakeup(sd);
851 	}
852 
853 retry:
854 	splx(s);
855 	return;
856 bad:
857 	xs->error = XS_DRIVER_STUFFUP;
858 	xs->flags |= ITSDONE;
859 	if (wu->swu_flags & SR_WUF_REBUILD) {
860 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
861 		wakeup(wu);
862 	} else {
863 		/* do not change the order of these 2 functions */
864 		sr_wu_put(wu);
865 		scsi_done(xs);
866 	}
867 
868 	splx(s);
869 }
870 
871 void
872 sr_raid6_recreate_wu(struct sr_workunit *wu)
873 {
874 	struct sr_discipline	*sd = wu->swu_dis;
875 	struct sr_workunit	*wup = wu;
876 	struct sr_ccb		*ccb;
877 
878 	do {
879 		DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", DEVNAME(sd->sd_sc), wup);
880 
881 		/* toss all ccbs */
882 		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
883 			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
884 			sr_ccb_put(ccb);
885 		}
886 		TAILQ_INIT(&wup->swu_ccb);
887 
888 		/* recreate ccbs */
889 		wup->swu_state = SR_WU_REQUEUE;
890 		if (sd->sd_scsi_rw(wup))
891 			panic("could not requeue io");
892 
893 		wup = wup->swu_collider;
894 	} while (wup);
895 }
896 
897 int
898 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len,
899     void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn)
900 {
901 	struct sr_discipline 	*sd = wu->swu_dis;
902 	struct sr_ccb		*ccb;
903 	struct sr_raid6_opaque  *pqbuf;
904 
905 	ccb = sr_ccb_get(sd);
906 	if (!ccb)
907 		return (-1);
908 
909 	/* allocate temporary buffer */
910 	if (data == NULL) {
911 		data = sr_get_block(sd, len);
912 		if (data == NULL)
913 			return (-1);
914 	}
915 
916 	DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n",
917 	    flag & SCSI_DATA_IN ? "read" : "write",
918 	    dsk, blk, len, pbuf, qbuf);
919 
920 	ccb->ccb_flag = ccbflag;
921 	if (flag & SCSI_POLL) {
922 		ccb->ccb_buf.b_flags = 0;
923 		ccb->ccb_buf.b_iodone = NULL;
924 	} else {
925 		ccb->ccb_buf.b_flags = B_CALL;
926 		ccb->ccb_buf.b_iodone = sr_raid6_intr;
927 	}
928 	if (flag & SCSI_DATA_IN)
929 		ccb->ccb_buf.b_flags |= B_READ;
930 	else
931 		ccb->ccb_buf.b_flags |= B_WRITE;
932 
933 	/* for real I/O the metadata area offset is already included in blk */
934 	ccb->ccb_buf.b_flags |= B_PHYS;
935 	ccb->ccb_buf.b_blkno = blk;
936 	ccb->ccb_buf.b_bcount = len;
937 	ccb->ccb_buf.b_bufsize = len;
938 	ccb->ccb_buf.b_resid = len;
939 	ccb->ccb_buf.b_data = data;
940 	ccb->ccb_buf.b_error = 0;
941 	ccb->ccb_buf.b_proc = curproc;
942 	ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm;
943 	ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn;
944 	if ((ccb->ccb_buf.b_flags & B_READ) == 0)
945 		ccb->ccb_buf.b_vp->v_numoutput++;
946 
947 	ccb->ccb_wu = wu;
948 	ccb->ccb_target = dsk;
949 	if (pbuf || qbuf) {
950 		if (qbuf && gf_premul(gn))
951 			return (-1);
952 
953 		pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL);
954 		if (pqbuf == NULL) {
955 			sr_ccb_put(ccb);
956 			return (-1);
957 		}
958 		pqbuf->pbuf = pbuf;
959 		pqbuf->qbuf = qbuf;
960 		pqbuf->gn = gn;
961 		ccb->ccb_opaque = pqbuf;
962 	}
963 
964 	LIST_INIT(&ccb->ccb_buf.b_dep);
965 	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
966 
967 	DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d "
968 	    "b_blkno: %lld b_flags 0x%0x b_data %p\n",
969 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
970 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno,
971 	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);
972 
973 	wu->swu_io_count++;
974 
975 	return (0);
976 }
977 
978 /* Perform RAID6 parity calculation.
979  *   P=xor parity, Q=GF256 parity, D=data, gn=disk# */
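/*
 * For each stripe, with D_z the data strip on physical chunk z:
 *   P = XOR over all data chunks of D_z
 *   Q = XOR over all data chunks of g^z * D_z
 * where g = 2 and multiplication is in GF(2^8) modulo 0x11D.
 * sr_raid6_xorp() accumulates terms of P, sr_raid6_xorq() terms of Q.
 */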
980 void
981 sr_raid6_xorp(void *p, void *d, int len)
982 {
983 	uint8_t *pbuf = p, *data = d;
984 
985 	while (len--)
986 		pbuf[len] ^= data[len];
987 }
988 
989 void
990 sr_raid6_xorq(void *q, void *d, int len, int gn)
991 {
992 	uint8_t		*qbuf = q, *data = d;
993 	uint8_t		*gn_map = gf_map[gn];
994 
995 	/* Have to do this a byte at a time */
996 	/* Faster multiply.. gn is always constant */
997 	while (len--)
998 		qbuf[len] ^= gn_map[data[len]];
999 }
1000 
1001 /* Create GF256 log/pow tables: polynomial = 0x11D */
1002 void
1003 gf_init(void)
1004 {
1005 	int i;
1006 	uint8_t p = 1;
1007 
1008 	/* use 2N pow table to avoid using % in multiply */
1009 	for (i=0; i<256; i++) {
1010 		gf_log[p] = i;
1011 		gf_pow[i] = gf_pow[i+255] = p;
1012 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
1013 	}
1014 	gf_log[0] = 512;
1015 }
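/*
 * With these tables the product of two field elements a and b is simply
 * gf_pow[gf_log[a] + gf_log[b]]; e.g.
 *   0x02 * 0x80 = gf_pow[1 + 7] = gf_pow[8] = 0x1d
 * (0x100 reduced modulo the polynomial 0x11d).  The gf_log[0] = 512
 * sentinel makes a zero data byte index the never-written (zero) tail of
 * gf_pow[], so its product comes out 0; the other operand (gn) is never
 * zero here, which keeps the index inside the table.
 */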
1016 
1017 uint8_t
1018 gf_inv(uint8_t a)
1019 {
1020 	return (gf_pow[255 - gf_log[a]]);
1021 }
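/*
 * The multiplicative group of GF(2^8) has order 255, so g^255 == 1 and
 * the inverse of a == g^gf_log[a] is g^(255 - gf_log[a]); e.g.
 * gf_inv(0x02) == 0x8e, and indeed 0x02 * 0x8e == 0x11c ^ 0x11d == 1.
 */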
1022 
1023 /* Precalculate multiplication tables for drive gn */
1024 int
1025 gf_premul(uint8_t gn)
1026 {
1027 	int i;
1028 
1029 	if (gf_map[gn] != NULL)
1030 		return (0);
1031 
1032 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL)
1033 		return (-1);
1034 
1035 	for (i=0; i<256; i++)
1036 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
1037 	return (0);
1038 }
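/*
 * The per-coefficient tables are shared by all volumes and never freed;
 * each distinct gn costs 256 bytes, so at most about 64KB in total.
 */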
1039