1 /* $OpenBSD: softraid_raid6.c,v 1.35 2013/03/25 16:01:49 jsing Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/proc.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/conf.h>
38 #include <sys/uio.h>
39 
40 #include <scsi/scsi_all.h>
41 #include <scsi/scsiconf.h>
42 #include <scsi/scsi_disk.h>
43 
44 #include <dev/softraidvar.h>
45 #include <dev/rndvar.h>
46 
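/*
 * GF(256) lookup tables shared by all RAID 6 volumes:
 *   gf_pow[] - generator powers, stored twice over so gf_mul() can index
 *		the sum of two logs without reducing it modulo 255;
 *   gf_log[] - discrete logarithms, with gf_log[0] pushed past the valid
 *		range so products involving zero land in the zero tail of
 *		gf_pow[];
 *   gf_map[] - per-coefficient 256-byte multiplication tables built on
 *		demand by gf_premul() for the q-parity hot path.
 */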
47 uint8_t *gf_map[256];
48 uint8_t	gf_pow[768];
49 int	gf_log[256];
50 
51 /* RAID 6 functions. */
52 int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
53 	    int, int64_t);
54 int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
55 	    int, void *);
56 int	sr_raid6_alloc_resources(struct sr_discipline *);
57 int	sr_raid6_free_resources(struct sr_discipline *);
58 int	sr_raid6_rw(struct sr_workunit *);
59 int	sr_raid6_openings(struct sr_discipline *);
60 void	sr_raid6_intr(struct buf *);
61 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
62 void	sr_raid6_set_vol_state(struct sr_discipline *);
63 
64 void	sr_raid6_xorp(void *, void *, int);
65 void	sr_raid6_xorq(void *, void *, int, int);
66 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
67 	    void *, int, int, void *, void *, int);
68 void	sr_dump(void *, int);
69 void	sr_raid6_scrub(struct sr_discipline *);
70 int	sr_failio(struct sr_workunit *);
71 
72 void	*sr_get_block(struct sr_discipline *, int);
73 void	sr_put_block(struct sr_discipline *, void *, int);
74 
75 void	gf_init(void);
76 uint8_t gf_inv(uint8_t);
77 int	gf_premul(uint8_t);
78 uint8_t gf_mul(uint8_t, uint8_t);
79 
80 #define SR_NOFAIL		0x00
81 #define SR_FAILX		(1L << 0)
82 #define SR_FAILY		(1L << 1)
83 #define SR_FAILP		(1L << 2)
84 #define SR_FAILQ		(1L << 3)
85 
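/*
 * Per-ccb parity context: when a read completes, sr_raid6_intr() XORs the
 * returned data into pbuf (if set) and XORs gn * data into qbuf (if set),
 * so a work unit can accumulate P, Q or a reconstructed data block as its
 * member I/Os finish in any order.
 */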
86 struct sr_raid6_opaque {
87 	int      gn;
88 	void	*pbuf;
89 	void	*qbuf;
90 };
91 
92 /* discipline initialisation. */
93 void
94 sr_raid6_discipline_init(struct sr_discipline *sd)
95 {
96 	/* Initialize GF256 tables. */
97 	gf_init();
98 
99 	/* Fill out discipline members. */
100 	sd->sd_type = SR_MD_RAID6;
101 	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
102 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
103 	    SR_CAP_REDUNDANT;
104 	sd->sd_max_wu = SR_RAID6_NOWU;
105 
106 	/* Setup discipline specific function pointers. */
107 	sd->sd_alloc_resources = sr_raid6_alloc_resources;
108 	sd->sd_assemble = sr_raid6_assemble;
109 	sd->sd_create = sr_raid6_create;
110 	sd->sd_free_resources = sr_raid6_free_resources;
111 	sd->sd_openings = sr_raid6_openings;
112 	sd->sd_scsi_rw = sr_raid6_rw;
113 	sd->sd_scsi_intr = sr_raid6_intr;
114 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
115 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
116 }
117 
118 int
119 sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
120     int no_chunk, int64_t coerced_size)
121 {
122 
123 	if (no_chunk < 4)
124 		return EINVAL;
125 
126 	/*
127 	 * XXX add variable strip size later even though MAXPHYS is really
128 	 * the clever value; users like to tinker with that type of stuff.
129 	 */
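	/*
	 * Usable volume size: round the coerced per-chunk size down to a
	 * whole number of strips, then multiply by the number of data
	 * chunks; two chunks in every row are consumed by P and Q parity.
	 */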
130 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
131 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
132 	    ~((sd->sd_meta->ssdi.ssd_strip_size >> DEV_BSHIFT) - 1)) *
133 	    (no_chunk - 2);
134 
135 	/* valid only while stripsize <= MAXPHYS */
136 	sd->sd_max_ccb_per_wu = max(6, 2 * no_chunk);
137 
138 	return 0;
139 }
140 
141 int
142 sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
143     int no_chunk, void *data)
144 {
145 
146 	/* valid only while stripsize <= MAXPHYS */
147 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);
148 
149 	return 0;
150 }
151 
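/*
 * Every write is issued as two work units (a read of the old data and
 * parity paired with the write itself), so only advertise half of the
 * wu pool as openings.
 */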
152 int
153 sr_raid6_openings(struct sr_discipline *sd)
154 {
155 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
156 }
157 
158 int
159 sr_raid6_alloc_resources(struct sr_discipline *sd)
160 {
161 	int			rv = EINVAL;
162 
163 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
164 	    DEVNAME(sd->sd_sc));
165 
166 	if (sr_wu_alloc(sd))
167 		goto bad;
168 	if (sr_ccb_alloc(sd))
169 		goto bad;
170 
171 	/* setup runtime values */
172 	sd->mds.mdd_raid6.sr6_strip_bits =
173 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
174 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
175 		goto bad;
176 
177 	rv = 0;
178 bad:
179 	return (rv);
180 }
181 
182 int
183 sr_raid6_free_resources(struct sr_discipline *sd)
184 {
185 	int			rv = EINVAL;
186 
187 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
188 	    DEVNAME(sd->sd_sc));
189 
190 	sr_wu_free(sd);
191 	sr_ccb_free(sd);
192 
193 	rv = 0;
194 	return (rv);
195 }
196 
197 void
198 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
199 {
200 	int			old_state, s;
201 
202 	/* XXX this is for RAID 0 */
203 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
204 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
205 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
206 
207 	/* ok to go to splbio since this only happens in error path */
208 	s = splbio();
209 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
210 
211 	/* multiple IOs to the same chunk that fail will come through here */
212 	if (old_state == new_state)
213 		goto done;
214 
215 	switch (old_state) {
216 	case BIOC_SDONLINE:
217 		switch (new_state) {
218 		case BIOC_SDOFFLINE:
219 		case BIOC_SDSCRUB:
220 			break;
221 		default:
222 			goto die;
223 		}
224 		break;
225 
226 	case BIOC_SDOFFLINE:
227 		if (new_state == BIOC_SDREBUILD) {
228 			;
229 		} else
230 			goto die;
231 		break;
232 
233 	case BIOC_SDSCRUB:
234 		switch (new_state) {
235 		case BIOC_SDONLINE:
236 		case BIOC_SDOFFLINE:
237 			break;
238 		default:
239 			goto die;
240 		}
241 		break;
242 
243 	case BIOC_SDREBUILD:
244 		switch (new_state) {
245 		case BIOC_SDONLINE:
246 		case BIOC_SDOFFLINE:
247 			break;
248 		default:
249 			goto die;
250 		}
251 		break;
252 
253 	default:
254 die:
255 		splx(s); /* XXX */
256 		panic("%s: %s: %s: invalid chunk state transition "
257 		    "%d -> %d", DEVNAME(sd->sd_sc),
258 		    sd->sd_meta->ssd_devname,
259 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
260 		    old_state, new_state);
261 		/* NOTREACHED */
262 	}
263 
264 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
265 	sd->sd_set_vol_state(sd);
266 
267 	sd->sd_must_flush = 1;
268 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
269 done:
270 	splx(s);
271 }
272 
273 void
274 sr_raid6_set_vol_state(struct sr_discipline *sd)
275 {
276 	int			states[SR_MAX_STATES];
277 	int			new_state, i, s, nd;
278 	int			old_state = sd->sd_vol_status;
279 
280 	/* XXX this is for RAID 0 */
281 
282 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
283 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
284 
285 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
286 
287 	for (i = 0; i < SR_MAX_STATES; i++)
288 		states[i] = 0;
289 
290 	for (i = 0; i < nd; i++) {
291 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
292 		if (s >= SR_MAX_STATES)
293 			panic("%s: %s: %s: invalid chunk state",
294 			    DEVNAME(sd->sd_sc),
295 			    sd->sd_meta->ssd_devname,
296 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
297 		states[s]++;
298 	}
299 
300 	if (states[BIOC_SDONLINE] == nd)
301 		new_state = BIOC_SVONLINE;
302 	else if (states[BIOC_SDONLINE] < nd - 2)
303 		new_state = BIOC_SVOFFLINE;
304 	else if (states[BIOC_SDSCRUB] != 0)
305 		new_state = BIOC_SVSCRUB;
306 	else if (states[BIOC_SDREBUILD] != 0)
307 		new_state = BIOC_SVREBUILD;
308 	else if (states[BIOC_SDONLINE] < nd)
309 		new_state = BIOC_SVDEGRADED;
310 	else {
311 		printf("old_state = %d, ", old_state);
312 		for (i = 0; i < nd; i++)
313 			printf("%d = %d, ", i,
314 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
315 		panic("invalid new_state");
316 	}
317 
318 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
319 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
320 	    old_state, new_state);
321 
322 	switch (old_state) {
323 	case BIOC_SVONLINE:
324 		switch (new_state) {
325 		case BIOC_SVONLINE: /* can go to same state */
326 		case BIOC_SVOFFLINE:
327 		case BIOC_SVDEGRADED:
328 		case BIOC_SVREBUILD: /* happens on boot */
329 			break;
330 		default:
331 			goto die;
332 		}
333 		break;
334 
335 	case BIOC_SVOFFLINE:
336 		/* XXX this might be a little too much */
337 		goto die;
338 
339 	case BIOC_SVSCRUB:
340 		switch (new_state) {
341 		case BIOC_SVONLINE:
342 		case BIOC_SVOFFLINE:
343 		case BIOC_SVDEGRADED:
344 		case BIOC_SVSCRUB: /* can go to same state */
345 			break;
346 		default:
347 			goto die;
348 		}
349 		break;
350 
351 	case BIOC_SVBUILDING:
352 		switch (new_state) {
353 		case BIOC_SVONLINE:
354 		case BIOC_SVOFFLINE:
355 		case BIOC_SVBUILDING: /* can go to the same state */
356 			break;
357 		default:
358 			goto die;
359 		}
360 		break;
361 
362 	case BIOC_SVREBUILD:
363 		switch (new_state) {
364 		case BIOC_SVONLINE:
365 		case BIOC_SVOFFLINE:
366 		case BIOC_SVDEGRADED:
367 		case BIOC_SVREBUILD: /* can go to the same state */
368 			break;
369 		default:
370 			goto die;
371 		}
372 		break;
373 
374 	case BIOC_SVDEGRADED:
375 		switch (new_state) {
376 		case BIOC_SVOFFLINE:
377 		case BIOC_SVREBUILD:
378 		case BIOC_SVDEGRADED: /* can go to the same state */
379 			break;
380 		default:
381 			goto die;
382 		}
383 		break;
384 
385 	default:
386 die:
387 		panic("%s: %s: invalid volume state transition %d -> %d",
388 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
389 		    old_state, new_state);
390 		/* NOTREACHED */
391 	}
392 
393 	sd->sd_vol_status = new_state;
394 }
395 
396 /*  modes:
397  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
398  *	        SR_CCBF_FREEBUF, qbuf, NULL, 0);
399  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
400  *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
401  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
402  *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
403  */
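/*
 * Parity relations used below; all arithmetic is in GF(256) and gi is the
 * generator coefficient gf_pow[i] of data disk i:
 *
 *	P = D0 ^ D1 ^ ... ^ Dn-1
 *	Q = g0*D0 ^ g1*D1 ^ ... ^ gn-1*Dn-1
 *
 * Reconstruction cases handled on the read path:
 *	Dx lost:	 Dx = P ^ sum(Dz)			(RAID 5 style)
 *	Dx and P lost:	 Dx = (Q ^ sum(gz*Dz)) * inv(gx)
 *	Dx and Dy lost:	 Dx = (Q ^ sum(gz*Dz)) * inv(gx ^ gy) ^
 *			      (P ^ sum(Dz)) * gy * inv(gx ^ gy)
 * where the sums run over the surviving data disks z.  These are exactly
 * the gxinv/pxinv coefficients handed to sr_raid6_addio() below.
 */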
404 
405 int
406 sr_raid6_rw(struct sr_workunit *wu)
407 {
408 	struct sr_workunit	*wu_r = NULL;
409 	struct sr_discipline	*sd = wu->swu_dis;
410 	struct scsi_xfer	*xs = wu->swu_xs;
411 	struct sr_chunk		*scp;
412 	int			s, fail, i, gxinv, pxinv;
413 	daddr64_t		blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
414 	daddr64_t		strip_size, no_chunk, lba, chunk_offs, phys_offs;
415 	daddr64_t		strip_bits, length, strip_offs, datalen, row_size;
416 	void		        *pbuf, *data, *qbuf;
417 
418 	/* blk and scsi error will be handled by sr_validate_io */
419 	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
420 		goto bad;
421 
422 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
423 	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
424 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
425 	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
426 
427 	data = xs->data;
428 	datalen = xs->datalen;
429 	lbaoffs	= blk << DEV_BSHIFT;
430 
431 	if (xs->flags & SCSI_DATA_OUT)
432 		/* create write workunit */
433 		if ((wu_r = scsi_io_get(&sd->sd_iopool, SCSI_NOSLEEP)) == NULL) {
434 			printf("%s: can't get wu_r\n", DEVNAME(sd->sd_sc));
435 			goto bad;
436 		}
437 
438 	wu->swu_blk_start = 0;
439 	while (datalen != 0) {
440 		strip_no = lbaoffs >> strip_bits;
441 		strip_offs = lbaoffs & (strip_size - 1);
442 		chunk_offs = (strip_no / no_chunk) << strip_bits;
443 		phys_offs = chunk_offs + strip_offs +
444 		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);
445 
446 		/* get size remaining in this stripe */
447 		length = MIN(strip_size - strip_offs, datalen);
448 
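		/*
		 * Parity rotates by one chunk per row: qchunk counts down
		 * from no_chunk+1 to 0 as the row number increases, pchunk
		 * sits directly before it (wrapping to the last chunk when
		 * qchunk is 0), and the data chunk index is bumped past
		 * whichever parity chunks it would otherwise land on.
		 */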
449 		/* map disk offset to parity/data drive */
450 		chunk = strip_no % no_chunk;
451 
452 		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
453 		if (qchunk == 0)
454 			pchunk = no_chunk + 1;
455 		else
456 			pchunk = qchunk - 1;
457 		if (chunk >= pchunk)
458 			chunk++;
459 		if (chunk >= qchunk)
460 			chunk++;
461 
462 		lba = phys_offs >> DEV_BSHIFT;
463 
464 		/* XXX big hammer: exclude I/O from the entire stripe */
465 		if (wu->swu_blk_start == 0)
466 			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
467 		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);
468 
469 		fail = 0;
470 		fchunk = -1;
471 
472 		/* Get disk-fail flags */
473 		for (i=0; i< no_chunk+2; i++) {
474 			scp = sd->sd_vol.sv_chunks[i];
475 			switch (scp->src_meta.scm_status) {
476 			case BIOC_SDOFFLINE:
477 			case BIOC_SDREBUILD:
478 			case BIOC_SDHOTSPARE:
479 				if (i == qchunk)
480 					fail |= SR_FAILQ;
481 				else if (i == pchunk)
482 					fail |= SR_FAILP;
483 				else if (i == chunk)
484 					fail |= SR_FAILX;
485 				else {
486 					/* dual data-disk failure */
487 					fail |= SR_FAILY;
488 					fchunk = i;
489 				}
490 				break;
491 			}
492 		}
493 		if (xs->flags & SCSI_DATA_IN) {
494 			if (!(fail & SR_FAILX)) {
495 				/* drive is good. issue single read request */
496 				if (sr_raid6_addio(wu, chunk, lba, length,
497 				    data, xs->flags, 0, NULL, NULL, 0))
498 					goto bad;
499 			} else if (fail & SR_FAILP) {
500 				/* Dx, P failed */
501 				printf("Disk %llx offline, "
502 				    "regenerating Dx+P\n", chunk);
503 
504 				gxinv = gf_inv(gf_pow[chunk]);
505 
506 				/* Calculate: Dx = (Q ^ sum(Dz*gz)) * inv(gx) */
507 				memset(data, 0, length);
508 				if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
509 				    SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data,
510 				    gxinv))
511 					goto bad;
512 
513 				/* Read Dz * gz * inv(gx) */
514 				for (i = 0; i < no_chunk+2; i++) {
515 					if (i == qchunk || i == pchunk || i == chunk)
516 						continue;
517 
518 					if (sr_raid6_addio(wu, i, lba,
519 					   length, NULL, SCSI_DATA_IN,
520 					   SR_CCBF_FREEBUF, NULL,
521 					   data, gf_mul(gf_pow[i], gxinv)))
522 						goto bad;
523 				}
524 
525 				/* data will contain correct value on completion */
526 			} else if (fail & SR_FAILY) {
527 				/* Dx, Dy failed */
528 				printf("Disk %llx & %llx offline, "
529 				    "regenerating Dx+Dy\n", chunk, fchunk);
530 
531 				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
532 				pxinv = gf_mul(gf_pow[fchunk], gxinv);
533 
534 				/* read Q * inv(gx + gy) */
535 				memset(data, 0, length);
536 				if (sr_raid6_addio(wu, qchunk, lba,
537 				    length,  NULL, SCSI_DATA_IN,
538 				    SR_CCBF_FREEBUF, NULL,
539 				    data, gxinv))
540 					goto bad;
541 
542 				/* read P * gy * inv(gx + gy) */
543 				if (sr_raid6_addio(wu, pchunk, lba,
544 				    length,  NULL, SCSI_DATA_IN,
545 				    SR_CCBF_FREEBUF, NULL,
546 				    data, pxinv))
547 					goto bad;
548 
549 				/* Calculate: Dx*gx ^ Dy*gy = Q ^ sum(Dz*gz) ; Dx ^ Dy = P ^ sum(Dz)
550 				 *   Q:  sr_raid6_xorp(qbuf, --, length);
551 				 *   P:  sr_raid6_xorp(pbuf, --, length);
552 				 *   Dz: sr_raid6_xorp(pbuf, --, length);
553 				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
554 				 */
555 				for (i = 0; i < no_chunk+2; i++) {
556 					if (i == qchunk || i == pchunk ||
557 					    i == chunk || i == fchunk)
558 						continue;
559 
560 					/* read Dz * (gz + gy) * inv(gx + gy) */
561 					if (sr_raid6_addio(wu, i, lba,
562 					    length, NULL, SCSI_DATA_IN,
563 					    SR_CCBF_FREEBUF, NULL, data,
564 					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
565 						goto bad;
566 				}
567 			} else {
568 				/* Two cases: single data disk (Dx) or (Dx+Q)
569 				 *   Dx = P ^ sum(Dz) (same as RAID 5)
570 				 */
571 				printf("Disk %llx offline, "
572 				    "regenerating Dx%s\n", chunk,
573 				    fail & SR_FAILQ ? "+Q" : " single");
574 
575 				/* Calculate: Dx = P ^ sum(Dz)
576 				 *   P:  sr_raid6_xorp(data, ---, length);
577 				 *   Dz: sr_raid6_xorp(data, ---, length);
578 				 */
579 				memset(data, 0, length);
580 				for (i = 0; i < no_chunk+2; i++) {
581 					if (i != chunk && i != qchunk) {
582 						/* Read Dz */
583 						if (sr_raid6_addio(wu, i, lba,
584 						    length, NULL, SCSI_DATA_IN,
585 						    SR_CCBF_FREEBUF, data,
586 						    NULL, 0))
587 							goto bad;
588 					}
589 				}
590 
591 				/* data will contain correct value on completion */
592 			}
593 		} else {
594 			/* XXX handle writes to failed/offline disk? */
595 			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
596 				goto bad;
597 
598 			/*
599 			 * Initialize pbuf and qbuf with the new data to be
600 			 * written.  These are XORed with the old data and old
601 			 * parity in the intr routine; on completion pbuf holds
602 			 * the new P parity and qbuf the new Q parity.
603 			 */
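			/*
			 * Summarised, the read-modify-write update is:
			 *	P = Dn ^ Dn' ^ P'
			 *	Q = gn*Dn ^ gn*Dn' ^ Q'
			 * where Dn is the new data and the primed values
			 * are the old data and parity read back below.
			 */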
604 			qbuf = sr_get_block(sd, length);
605 			if (qbuf == NULL)
606 				goto bad;
607 
608 			pbuf = sr_get_block(sd, length);
609 			if (pbuf == NULL)
610 				goto bad;
611 
612 			/* Calculate P = Dn; Q = gn * Dn */
613 			if (gf_premul(gf_pow[chunk]))
614 				goto bad;
615 			sr_raid6_xorp(pbuf, data, length);
616 			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);
617 
618 			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
619 			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
620 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
621 				gf_pow[chunk]))
622 				goto bad;
623 
624 			/* Read old xor-parity: P ^= P' */
625 			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
626 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
627 				goto bad;
628 
629 			/* Read old q-parity: Q ^= Q' */
630 			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
631 				SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
632 				goto bad;
633 
634 			/* write new data */
635 			if (sr_raid6_addio(wu, chunk, lba, length, data,
636 			    xs->flags, 0, NULL, NULL, 0))
637 				goto bad;
638 
639 			/* write new xor-parity */
640 			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
641 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
642 				goto bad;
643 
644 			/* write new q-parity */
645 			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
646 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
647 				goto bad;
648 		}
649 
650 		/* advance to next block */
651 		lbaoffs += length;
652 		datalen -= length;
653 		data += length;
654 	}
655 
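	/*
	 * For writes, start the read work unit (wu_r) first and defer the
	 * write wu as its collider; sr_raid6_intr() restarts the deferred
	 * wu once the old data and parity have been folded into pbuf/qbuf.
	 */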
656 	s = splbio();
657 	if (wu_r) {
658 		/* collide write request with reads */
659 		wu_r->swu_blk_start = wu->swu_blk_start;
660 		wu_r->swu_blk_end = wu->swu_blk_end;
661 
662 		wu->swu_state = SR_WU_DEFERRED;
663 		wu_r->swu_collider = wu;
664 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
665 
666 		wu = wu_r;
667 	}
668 
669 	/* rebuild io, let rebuild routine deal with it */
670 	if (wu->swu_flags & SR_WUF_REBUILD)
671 		goto queued;
672 
673 	/* current io failed, restart */
674 	if (wu->swu_state == SR_WU_RESTART)
675 		goto start;
676 
677 	/* deferred io failed, don't restart */
678 	if (wu->swu_state == SR_WU_REQUEUE)
679 		goto queued;
680 
681 	if (sr_check_io_collision(wu))
682 		goto queued;
683 
684 start:
685 	sr_raid_startwu(wu);
686 queued:
687 	splx(s);
688 	return (0);
689 bad:
690 	/* wu is unwound by sr_wu_put */
691 	if (wu_r)
692 		scsi_io_put(&sd->sd_iopool, wu_r);
693 	return (1);
694 }
695 
696 /* Handle failure I/O completion */
697 int
698 sr_failio(struct sr_workunit *wu)
699 {
700 	struct sr_discipline	*sd = wu->swu_dis;
701 	struct sr_ccb		*ccb;
702 
703 	if (!(wu->swu_flags & SR_WUF_FAIL))
704 		return (0);
705 
706 	/* The wu is a 'fake': don't do real I/O, just run the intr routine */
707 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
708 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
709 		sr_raid6_intr(&ccb->ccb_buf);
710 	return (1);
711 }
712 
713 void
714 sr_raid6_intr(struct buf *bp)
715 {
716 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
717 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
718 	struct sr_discipline	*sd = wu->swu_dis;
719 	struct scsi_xfer	*xs = wu->swu_xs;
720 	struct sr_softc		*sc = sd->sd_sc;
721 	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
722 	int			s, pend;
723 
724 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
725 	    DEVNAME(sc), bp, xs);
726 
727 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
728 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
729 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
730 	    ccb->ccb_buf.b_blkno, ccb->ccb_target);
731 
732 	s = splbio();
733 
734 	if (ccb->ccb_buf.b_flags & B_ERROR) {
735 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
736 		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
737 		printf("io error: disk %x\n", ccb->ccb_target);
738 		wu->swu_ios_failed++;
739 		ccb->ccb_state = SR_CCB_FAILED;
740 		if (ccb->ccb_target != -1)
741 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
742 			    BIOC_SDOFFLINE);
743 		else
744 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
745 	} else {
746 		ccb->ccb_state = SR_CCB_OK;
747 		wu->swu_ios_succeeded++;
748 
749 		/* XOR data to result */
750 		if (pq) {
751 			if (pq->pbuf)
752 				/* Calculate xor-parity */
753 				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
754 				    ccb->ccb_buf.b_bcount);
755 			if (pq->qbuf)
756 				/* Calculate q-parity */
757 				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
758 				    ccb->ccb_buf.b_bcount, pq->gn);
759 			free(pq, M_DEVBUF);
760 			ccb->ccb_opaque = NULL;
761 		}
762 	}
763 
764 	/* free allocated data buffer */
765 	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
766 		sr_put_block(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
767 		ccb->ccb_buf.b_data = NULL;
768 	}
769 	wu->swu_ios_complete++;
770 
771 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
772 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
773 	    wu->swu_ios_failed);
774 
775 	if (wu->swu_ios_complete >= wu->swu_io_count) {
776 
777 		/* if all ios failed, retry reads and give up on writes */
778 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
779 			if (xs->flags & SCSI_DATA_IN) {
780 				printf("%s: retrying read on block %lld\n",
781 				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
782 				sr_ccb_put(ccb);
783 				TAILQ_INIT(&wu->swu_ccb);
784 				wu->swu_state = SR_WU_RESTART;
785 				if (sd->sd_scsi_rw(wu))
786 					goto bad;
787 				else
788 					goto retry;
789 			} else {
790 				printf("%s: permanently fail write on block "
791 				    "%lld\n", DEVNAME(sc),
792 				    ccb->ccb_buf.b_blkno);
793 				xs->error = XS_DRIVER_STUFFUP;
794 				goto bad;
795 			}
796 		}
797 
798 		if (xs != NULL)
799 			xs->error = XS_NOERROR;
800 
801 		pend = 0;
802 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
803 			if (wu == wup) {
804 				/* wu on pendq, remove */
805 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
806 				pend = 1;
807 
808 				if (wu->swu_collider) {
809 					if (wu->swu_ios_failed)
810 						/* toss all ccbs and recreate */
811 						sr_raid_recreate_wu(wu->swu_collider);
812 
813 					/* restart deferred wu */
814 					wu->swu_collider->swu_state =
815 					    SR_WU_INPROGRESS;
816 					TAILQ_REMOVE(&sd->sd_wu_defq,
817 					    wu->swu_collider, swu_link);
818 					if (sr_failio(wu->swu_collider) == 0)
819 						sr_raid_startwu(wu->swu_collider);
820 				}
821 				break;
822 			}
823 		}
824 
825 		if (!pend)
826 			printf("%s: wu: %p not on pending queue\n",
827 			    DEVNAME(sc), wu);
828 
829 		if (wu->swu_flags & SR_WUF_REBUILD) {
830 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
831 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
832 				wakeup(wu);
833 			}
834 		} else {
835 			if (xs != NULL)
836 				sr_scsi_done(sd, xs);
837 			else
838 				scsi_io_put(&sd->sd_iopool, wu);
839 		}
840 
841 		if (sd->sd_sync && sd->sd_wu_pending == 0)
842 			wakeup(sd);
843 	}
844 
845 retry:
846 	splx(s);
847 	return;
848 bad:
849 	xs->error = XS_DRIVER_STUFFUP;
850 	if (wu->swu_flags & SR_WUF_REBUILD) {
851 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
852 		wakeup(wu);
853 	} else {
854 		sr_scsi_done(sd, xs);
855 	}
856 
857 	splx(s);
858 }
859 
860 int
861 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len,
862     void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn)
863 {
864 	struct sr_discipline	*sd = wu->swu_dis;
865 	struct sr_ccb		*ccb;
866 	struct sr_raid6_opaque  *pqbuf;
867 
868 	ccb = sr_ccb_get(sd);
869 	if (!ccb)
870 		return (-1);
871 
872 	/* allocate temporary buffer */
873 	if (data == NULL) {
874 		data = sr_get_block(sd, len);
875 		if (data == NULL) {
			sr_ccb_put(ccb);	/* don't leak the ccb */
876 			return (-1);
		}
877 	}
878 
879 	DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n",
880 	    flag & SCSI_DATA_IN ? "read" : "write",
881 	    dsk, blk, len, pbuf, qbuf);
882 
883 	ccb->ccb_flag = ccbflag;
884 	if (flag & SCSI_POLL) {
885 		ccb->ccb_buf.b_flags = 0;
886 		ccb->ccb_buf.b_iodone = NULL;
887 	} else {
888 		ccb->ccb_buf.b_flags = B_CALL;
889 		ccb->ccb_buf.b_iodone = sr_raid6_intr;
890 	}
891 	if (flag & SCSI_DATA_IN)
892 		ccb->ccb_buf.b_flags |= B_READ;
893 	else
894 		ccb->ccb_buf.b_flags |= B_WRITE;
895 
896 	/* add offset for metadata */
897 	ccb->ccb_buf.b_flags |= B_PHYS;
898 	ccb->ccb_buf.b_blkno = blk;
899 	ccb->ccb_buf.b_bcount = len;
900 	ccb->ccb_buf.b_bufsize = len;
901 	ccb->ccb_buf.b_resid = len;
902 	ccb->ccb_buf.b_data = data;
903 	ccb->ccb_buf.b_error = 0;
904 	ccb->ccb_buf.b_proc = curproc;
905 	ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm;
906 	ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn;
907 	ccb->ccb_buf.b_bq = NULL;
908 	if ((ccb->ccb_buf.b_flags & B_READ) == 0)
909 		ccb->ccb_buf.b_vp->v_numoutput++;
910 
911 	ccb->ccb_wu = wu;
912 	ccb->ccb_target = dsk;
913 	if (pbuf || qbuf) {
914 		if (qbuf && gf_premul(gn)) {
			sr_ccb_put(ccb);	/* don't leak the ccb */
915 			return (-1);
		}
916 
917 		pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_ZERO | M_NOWAIT);
918 		if (pqbuf == NULL) {
919 			sr_ccb_put(ccb);
920 			return (-1);
921 		}
922 		pqbuf->pbuf = pbuf;
923 		pqbuf->qbuf = qbuf;
924 		pqbuf->gn = gn;
925 		ccb->ccb_opaque = pqbuf;
926 	}
927 
928 	LIST_INIT(&ccb->ccb_buf.b_dep);
929 	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
930 
931 	DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d "
932 	    "b_blkno: %x b_flags 0x%0x b_data %p\n",
933 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
934 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno,
935 	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);
936 
937 	wu->swu_io_count++;
938 
939 	return (0);
940 }
941 
942 /* Perform RAID 6 parity calculations.
943  *   P = xor parity, Q = GF(256) parity, D = data, gn = GF(256) multiplier */
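/*
 * sr_raid6_xorp() folds 'd' into the running P buffer one 32-bit word at a
 * time; sr_raid6_xorq() folds gn*d into the Q buffer by pushing each byte
 * lane through the precomputed gf_map[gn] table.  Both assume len is a
 * multiple of four, which holds for sector-sized transfers.
 */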
944 void
945 sr_raid6_xorp(void *p, void *d, int len)
946 {
947 	uint32_t *pbuf = p, *data = d;
948 
949 	len >>= 2;
950 	while (len--)
951 		*pbuf++ ^= *data++;
952 }
953 
954 void
955 sr_raid6_xorq(void *q, void *d, int len, int gn)
956 {
957 	uint32_t 	*qbuf = q, *data = d, x;
958 	uint8_t	 	*gn_map = gf_map[gn];
959 
960 	len >>= 2;
961 	while (len--) {
962 		x = *data++;
963 		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
964 		  	    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
965 			    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
966 			    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
967 	}
968 }
969 
970 /* Create GF256 log/pow tables: polynomial = 0x11D */
971 void
972 gf_init(void)
973 {
974 	int i;
975 	uint8_t p = 1;
976 
977 	/* use 2N pow table to avoid using % in multiply */
978 	for (i=0; i<256; i++) {
979 		gf_log[p] = i;
980 		gf_pow[i] = gf_pow[i+255] = p;
981 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
982 	}
983 	gf_log[0] = 512;
984 }
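/*
 * Table layout notes: gf_init() stores each power twice (gf_pow[i] and
 * gf_pow[i + 255]), so gf_mul() can index gf_pow[] with the raw sum of two
 * logarithms instead of reducing it modulo 255.  gf_log[0] is set to 512 so
 * that a zero operand lands in the zero-filled tail of the 768-entry
 * gf_pow[] array and the product correctly comes out as zero; this relies
 * on at most one of the two operands being zero, which holds for every
 * caller in this file.
 */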
985 
986 uint8_t
987 gf_inv(uint8_t a)
988 {
989 	return gf_pow[255 - gf_log[a]];
990 }
991 
992 uint8_t
993 gf_mul(uint8_t a, uint8_t b)
994 {
995 	return gf_pow[gf_log[a] + gf_log[b]];
996 }
997 
998 /* Precalculate multiplication tables for drive gn */
999 int
1000 gf_premul(uint8_t gn)
1001 {
1002 	int i;
1003 
1004 	if (gf_map[gn] != NULL)
1005 		return (0);
1006 
1007 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
1008 		return (-1);
1009 
1010 	for (i=0; i<256; i++)
1011 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
1012 	return (0);
1013 }
1014