xref: /openbsd/sys/dev/softraid_raid6.c (revision 0f9e9ec2)
1 /* $OpenBSD: softraid_raid6.c,v 1.73 2024/05/13 01:15:50 jsg Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/malloc.h>
27 #include <sys/kernel.h>
28 #include <sys/disk.h>
29 #include <sys/rwlock.h>
30 #include <sys/queue.h>
31 #include <sys/fcntl.h>
32 #include <sys/mount.h>
33 #include <sys/sensors.h>
34 #include <sys/stat.h>
35 #include <sys/task.h>
36 #include <sys/conf.h>
37 #include <sys/uio.h>
38 
39 #include <scsi/scsi_all.h>
40 #include <scsi/scsiconf.h>
41 #include <scsi/scsi_disk.h>
42 
43 #include <dev/softraidvar.h>
44 
/*
 * GF(2^8) lookup tables (polynomial 0x11D), built by gf_init().
 * gf_map[n] is a lazily allocated 256-byte multiply-by-n table (see
 * gf_premul()).  gf_pow stores the power table twice back to back so
 * gf_mul() can index with the plain sum of two logs, no modulo needed.
 * gf_log[0] is set to 512 so that a zero operand indexes the zeroed
 * tail of gf_pow and the product comes out zero.
 */
uint8_t *gf_map[256];
uint8_t	gf_pow[768];
int	gf_log[256];
48 
49 /* RAID 6 functions. */
50 int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
51 	    int, int64_t);
52 int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
53 	    int, void *);
54 int	sr_raid6_init(struct sr_discipline *);
55 int	sr_raid6_rw(struct sr_workunit *);
56 int	sr_raid6_openings(struct sr_discipline *);
57 void	sr_raid6_intr(struct buf *);
58 int	sr_raid6_wu_done(struct sr_workunit *);
59 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
60 void	sr_raid6_set_vol_state(struct sr_discipline *);
61 
62 void	sr_raid6_xorp(void *, void *, int);
63 void	sr_raid6_xorq(void *, void *, int, int);
64 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr_t, long,
65 	    void *, int, int, void *, void *, int);
66 int	sr_failio(struct sr_workunit *);
67 
68 void	gf_init(void);
69 uint8_t gf_inv(uint8_t);
70 int	gf_premul(uint8_t);
71 uint8_t gf_mul(uint8_t, uint8_t);
72 
73 #define SR_NOFAIL		0x00
74 #define SR_FAILX		(1L << 0)
75 #define SR_FAILY		(1L << 1)
76 #define SR_FAILP		(1L << 2)
77 #define SR_FAILQ		(1L << 3)
78 
/*
 * Per-ccb parity context: on I/O completion sr_raid6_intr() XORs the
 * transferred data into pbuf (P parity) and/or multiplies it by gn in
 * GF(2^8) before XORing it into qbuf (Q parity).
 */
struct sr_raid6_opaque {
	int	gn;	/* GF(2^8) multiplier for the Q contribution */
	void	*pbuf;	/* XOR (P) accumulation buffer, or NULL */
	void	*qbuf;	/* Q accumulation buffer, or NULL */
};
84 
85 /* discipline initialisation. */
86 void
sr_raid6_discipline_init(struct sr_discipline * sd)87 sr_raid6_discipline_init(struct sr_discipline *sd)
88 {
89 	/* Initialize GF256 tables. */
90 	gf_init();
91 
92 	/* Fill out discipline members. */
93 	sd->sd_type = SR_MD_RAID6;
94 	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
95 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
96 	    SR_CAP_REDUNDANT;
97 	sd->sd_max_wu = SR_RAID6_NOWU;
98 
99 	/* Setup discipline specific function pointers. */
100 	sd->sd_assemble = sr_raid6_assemble;
101 	sd->sd_create = sr_raid6_create;
102 	sd->sd_openings = sr_raid6_openings;
103 	sd->sd_scsi_rw = sr_raid6_rw;
104 	sd->sd_scsi_intr = sr_raid6_intr;
105 	sd->sd_scsi_wu_done = sr_raid6_wu_done;
106 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
107 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
108 }
109 
/*
 * Create a new RAID 6 volume: validate the chunk count, fix the strip
 * size and compute the usable volume size, then set up runtime state.
 * Returns 0 on success or EINVAL.
 */
int
sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{
	/* Two chunks hold P and Q parity, so four is the minimum that
	 * leaves any redundant data capacity. */
	if (no_chunk < 4) {
		sr_error(sd->sd_sc, "%s requires four or more chunks",
		    sd->sd_name);
		return EINVAL;
	}

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like * to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
	/* Round the per-chunk size (in DEV_BSIZE blocks) down to a whole
	 * number of strips and scale by the data chunks only. */
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
	    DEV_BSHIFT) - 1)) * (no_chunk - 2);

	return sr_raid6_init(sd);
}
131 
/*
 * Assemble an existing RAID 6 volume from its on-disk metadata; the
 * metadata has already been read, so only runtime state needs setup.
 */
int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid6_init(sd);
}
138 
/*
 * Common create/assemble runtime initialisation: derive the strip
 * shift from the strip size and size the per-work-unit ccb pool.
 * Returns 0 on success or EINVAL on a bad strip size.
 */
int
sr_raid6_init(struct sr_discipline *sd)
{
	/* Initialise runtime values. */
	sd->mds.mdd_raid6.sr6_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid6.sr6_strip_bits == -1) {
		sr_error(sd->sd_sc, "invalid strip size");
		return EINVAL;
	}

	/* only if stripsize <= MAXPHYS */
	/* A write issues up to six I/Os per stripe (read old data/P/Q
	 * plus write new); allow two ccbs per chunk, never fewer than 6. */
	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);

	return 0;
}
155 
156 int
sr_raid6_openings(struct sr_discipline * sd)157 sr_raid6_openings(struct sr_discipline *sd)
158 {
159 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
160 }
161 
/*
 * Record a chunk state change (online/offline/scrub/rebuild) for chunk
 * c, validating that the transition is legal, then recompute the
 * volume state and schedule a metadata save.  Illegal transitions
 * panic.
 */
void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/* Validate the transition; anything not listed is a bug. */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk can only come back via rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition %d -> %d",
		    DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the new chunk state to the on-disk metadata. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
237 
/*
 * Recompute the volume state from the states of all chunks and
 * validate the resulting volume state transition.  Illegal
 * transitions panic.
 */
void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Histogram the chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/*
	 * Derive the volume state: all chunks online -> online; more
	 * than two chunks not online -> offline (RAID 6 tolerates at
	 * most two failures); otherwise scrub/rebuild if any chunk is
	 * in that state, else degraded.
	 */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the volume state transition. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
360 
361 /*  modes:
362  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
363  *		0, qbuf, NULL, 0);
364  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
365  *		0, pbuf, NULL, 0);
366  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
367  *		0, pbuf, qbuf, gf_pow[i]);
368  */
369 
/*
 * Translate a SCSI read/write into per-chunk I/Os, stripe by stripe.
 *
 * Reads from a healthy data chunk become a single read; reads hitting
 * a failed chunk are reconstructed from the survivors using P and/or Q
 * parity (GF(2^8) arithmetic, accumulated in sr_raid6_intr()).  Writes
 * are split into a read work unit (old data, old P, old Q) that
 * collides with the write work unit carrying the new data and parity.
 * Returns 0 on success, 1 on error (caller unwinds wu).
 */
int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	int			s, fail, i, gxinv, pxinv;
	daddr_t			blkno, lba;
	int64_t			chunk_offs, lbaoffs, offset, strip_offs;
	int64_t			strip_no, strip_size, strip_bits, row_size;
	int64_t			fchunk, no_chunk, chunk, qchunk, pchunk;
	long			length, datalen;
	void			*pbuf, *data, *qbuf;

	/* blkno and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blkno, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	/* no_chunk counts data chunks only; two chunks hold P and Q. */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blkno << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* Writes need a second work unit for the old-data reads. */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: can't get wu_r", DEVNAME(sd->sd_sc));
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		offset = chunk_offs + strip_offs;

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		/* Q rotates right each row; P sits just before Q, and the
		 * data chunk index skips over both parity positions. */
		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = offset >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i=0; i< no_chunk+2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good. issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				gxinv = gf_inv(gf_pow[chunk]);

				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* Read Dz * gz * inv(gx) */
				for (i = 0; i < no_chunk+2; i++) {
					if  (i == qchunk || i == pchunk || i == chunk)
						continue;

					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}

				/* data will contain correct value on completion */
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
				pxinv = gf_mul(gf_pow[fchunk], gxinv);

				/* read Q * inv(gx + gy) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* read P * gy * inv(gx + gy) */
				if (sr_raid6_addio(wu, pchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, pxinv))
					goto bad;

				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
				 *   Q:  sr_raid6_xorp(qbuf, --, length);
				 *   P:  sr_raid6_xorp(pbuf, --, length);
				 *   Dz: sr_raid6_xorp(pbuf, --, length);
				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
				 */
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk || i == pchunk ||
					    i == chunk || i == fchunk)
						continue;

					/* read Dz * (gz + gy) * inv(gx + gy) */
					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}
			} else {
				/* Two cases: single disk (Dx) or (Dx+Q)
				 *   Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/* Calculate: Dx = P^Dz
				 *   P:  sr_raid6_xorp(data, ---, length);
				 *   Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    0, data, NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on completion */
			}
		} else {
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * initialize pbuf with contents of new data to be
			 * written. This will be XORed with old data and old
			 * parity in the intr routine. The result in pbuf
			 * is the new parity data.
			 */
			qbuf = sr_block_get(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_block_get(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, qbuf, gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		/* collide write request with reads */
		wu_r->swu_blk_start = wu->swu_blk_start;
		wu_r->swu_blk_end = wu->swu_blk_end;

		/* Defer the write until the read work unit completes. */
		wu->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

		wu = wu_r;
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);
bad:
	/* XXX - can leak pbuf/qbuf on error. */
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
639 
/* Handle failure I/O completion */
/*
 * For a work unit already marked failed, skip real I/O entirely: put
 * it on the pending queue and run the completion handler over every
 * ccb.  Returns 1 if the wu was consumed here, 0 otherwise.
 */
int
sr_failio(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_ccb		*ccb;

	if (!(wu->swu_flags & SR_WUF_FAIL))
		return (0);

	/* Wu is a 'fake'.. don't do real I/O just intr */
	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
		sr_raid6_intr(&ccb->ccb_buf);
	return (1);
}
656 
/*
 * Per-ccb I/O completion: fold the transferred data into the P and/or
 * Q accumulation buffers recorded in ccb_opaque (reads issued for
 * reconstruction or parity updates), release scratch buffers, and
 * hand the ccb back to the work unit.
 */
void
sr_raid6_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid6_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XOR data to result. */
	/* Only accumulate parity from I/Os that actually succeeded. */
	if (ccb->ccb_state == SR_CCB_OK && pq) {
		if (pq->pbuf)
			/* Calculate xor-parity */
			sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount);
		if (pq->qbuf)
			/* Calculate q-parity */
			sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount, pq->gn);
		free(pq, M_DEVBUF, 0);
		ccb->ccb_opaque = NULL;
	}

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
695 
/*
 * Decide the fate of a completed work unit: OK if any I/O succeeded
 * (XXX noted as insufficient for RAID 6), retry failed reads once via
 * a fresh sd_scsi_rw pass, permanently fail writes.
 */
int
sr_raid6_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & SR_WUF_DISCIPLINE)
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 6. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
729 
730 int
sr_raid6_addio(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data,int xsflags,int ccbflags,void * pbuf,void * qbuf,int gn)731 sr_raid6_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
732     long len, void *data, int xsflags, int ccbflags, void *pbuf,
733     void *qbuf, int gn)
734 {
735 	struct sr_discipline	*sd = wu->swu_dis;
736 	struct sr_ccb		*ccb;
737 	struct sr_raid6_opaque  *pqbuf;
738 
739 	DNPRINTF(SR_D_DIS, "sr_raid6_addio: %s %d.%lld %ld %p:%p\n",
740 	    (xsflags & SCSI_DATA_IN) ? "read" : "write", chunk,
741 	    (long long)blkno, len, pbuf, qbuf);
742 
743 	/* Allocate temporary buffer. */
744 	if (data == NULL) {
745 		data = sr_block_get(sd, len);
746 		if (data == NULL)
747 			return (-1);
748 		ccbflags |= SR_CCBF_FREEBUF;
749 	}
750 
751 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
752 	if (ccb == NULL) {
753 		if (ccbflags & SR_CCBF_FREEBUF)
754 			sr_block_put(sd, data, len);
755 		return (-1);
756 	}
757 	if (pbuf || qbuf) {
758 		/* XXX - can leak data and ccb on failure. */
759 		if (qbuf && gf_premul(gn))
760 			return (-1);
761 
762 		/* XXX - should be preallocated? */
763 		pqbuf = malloc(sizeof(struct sr_raid6_opaque),
764 		    M_DEVBUF, M_ZERO | M_NOWAIT);
765 		if (pqbuf == NULL) {
766 			sr_ccb_put(ccb);
767 			return (-1);
768 		}
769 		pqbuf->pbuf = pbuf;
770 		pqbuf->qbuf = qbuf;
771 		pqbuf->gn = gn;
772 		ccb->ccb_opaque = pqbuf;
773 	}
774 	sr_wu_enqueue_ccb(wu, ccb);
775 
776 	return (0);
777 }
778 
/* Perform RAID6 parity calculation.
 *   P=xor parity, Q=GF256 parity, D=data, gn=disk# */
void
sr_raid6_xorp(void *p, void *d, int len)
{
	uint32_t	*dst = p, *src = d;
	int		 n;

	/* XOR the source into the P-parity buffer one word at a time. */
	for (n = len >> 2; n > 0; n--)
		*dst++ ^= *src++;
}
790 
791 void
sr_raid6_xorq(void * q,void * d,int len,int gn)792 sr_raid6_xorq(void *q, void *d, int len, int gn)
793 {
794 	uint32_t	*qbuf = q, *data = d, x;
795 	uint8_t		*gn_map = gf_map[gn];
796 
797 	len >>= 2;
798 	while (len--) {
799 		x = *data++;
800 		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
801 			    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
802 			    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
803 			    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
804 	}
805 }
806 
807 /* Create GF256 log/pow tables: polynomial = 0x11D */
808 void
gf_init(void)809 gf_init(void)
810 {
811 	int i;
812 	uint8_t p = 1;
813 
814 	/* use 2N pow table to avoid using % in multiply */
815 	for (i=0; i<256; i++) {
816 		gf_log[p] = i;
817 		gf_pow[i] = gf_pow[i+255] = p;
818 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
819 	}
820 	gf_log[0] = 512;
821 }
822 
/*
 * Multiplicative inverse in GF(2^8): 1/a = g^(255 - log a).
 * Caller must pass a != 0: gf_log[0] is 512, which would make the
 * index here negative and read out of bounds.
 */
uint8_t
gf_inv(uint8_t a)
{
	return gf_pow[255 - gf_log[a]];
}
828 
/*
 * GF(2^8) multiply via the log/pow tables: a*b = g^(log a + log b).
 * The doubled gf_pow table avoids a modulo, and gf_log[0] == 512
 * sends a single zero operand into the zeroed tail so the result is
 * zero.  NOTE(review): both operands zero would index gf_pow[1024],
 * out of bounds - callers here never pass two zeros, but verify
 * before reusing.
 */
uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	return gf_pow[gf_log[a] + gf_log[b]];
}
834 
835 /* Precalculate multiplication tables for drive gn */
836 int
gf_premul(uint8_t gn)837 gf_premul(uint8_t gn)
838 {
839 	int i;
840 
841 	if (gf_map[gn] != NULL)
842 		return (0);
843 
844 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
845 		return (-1);
846 
847 	for (i=0; i<256; i++)
848 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
849 	return (0);
850 }
851