1 /* $OpenBSD: softraid_raid6.c,v 1.73 2024/05/13 01:15:50 jsg Exp $ */
2 /*
3 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 #include "bio.h"
20
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/malloc.h>
27 #include <sys/kernel.h>
28 #include <sys/disk.h>
29 #include <sys/rwlock.h>
30 #include <sys/queue.h>
31 #include <sys/fcntl.h>
32 #include <sys/mount.h>
33 #include <sys/sensors.h>
34 #include <sys/stat.h>
35 #include <sys/task.h>
36 #include <sys/conf.h>
37 #include <sys/uio.h>
38
39 #include <scsi/scsi_all.h>
40 #include <scsi/scsiconf.h>
41 #include <scsi/scsi_disk.h>
42
43 #include <dev/softraidvar.h>
44
/*
 * GF(2^8) arithmetic tables, shared by all RAID 6 volumes.
 * gf_init() builds the log/pow tables once at discipline init;
 * gf_premul() lazily builds a per-coefficient multiplication table.
 */
uint8_t *gf_map[256];	/* per-coefficient 256-entry mul tables, built lazily by gf_premul() */
uint8_t gf_pow[768];	/* doubled pow table (entries 0..510 written) so gf_mul() needs no % 255 */
int gf_log[256];	/* discrete log table; gf_log[0] parked at 512 (log of 0 is undefined) */

/* RAID 6 functions. */
int sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
    int, int64_t);
int sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
    int, void *);
int sr_raid6_init(struct sr_discipline *);
int sr_raid6_rw(struct sr_workunit *);
int sr_raid6_openings(struct sr_discipline *);
void sr_raid6_intr(struct buf *);
int sr_raid6_wu_done(struct sr_workunit *);
void sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
void sr_raid6_set_vol_state(struct sr_discipline *);

void sr_raid6_xorp(void *, void *, int);
void sr_raid6_xorq(void *, void *, int, int);
int sr_raid6_addio(struct sr_workunit *wu, int, daddr_t, long,
    void *, int, int, void *, void *, int);
int sr_failio(struct sr_workunit *);

void gf_init(void);
uint8_t gf_inv(uint8_t);
int gf_premul(uint8_t);
uint8_t gf_mul(uint8_t, uint8_t);

/*
 * Per-stripe failure flags, accumulated in sr_raid6_rw() while scanning
 * chunk states; they select which recovery equation is used for reads.
 */
#define SR_NOFAIL	0x00		/* all chunks for this stripe online */
#define SR_FAILX	(1L << 0)	/* the addressed data chunk (Dx) failed */
#define SR_FAILY	(1L << 1)	/* a second data chunk (Dy) failed */
#define SR_FAILP	(1L << 2)	/* the xor-parity (P) chunk failed */
#define SR_FAILQ	(1L << 3)	/* the GF q-parity (Q) chunk failed */

/*
 * Per-ccb completion context: sr_raid6_intr() XORs the transferred data
 * into pbuf (plain parity) and/or qbuf (q-parity, scaled by GF
 * coefficient gn) when these are set by sr_raid6_addio().
 */
struct sr_raid6_opaque {
	int gn;		/* GF(256) coefficient for the q accumulation */
	void *pbuf;	/* xor-parity accumulator, or NULL */
	void *qbuf;	/* q-parity accumulator, or NULL */
};
84
85 /* discipline initialisation. */
86 void
sr_raid6_discipline_init(struct sr_discipline * sd)87 sr_raid6_discipline_init(struct sr_discipline *sd)
88 {
89 /* Initialize GF256 tables. */
90 gf_init();
91
92 /* Fill out discipline members. */
93 sd->sd_type = SR_MD_RAID6;
94 strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
95 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
96 SR_CAP_REDUNDANT;
97 sd->sd_max_wu = SR_RAID6_NOWU;
98
99 /* Setup discipline specific function pointers. */
100 sd->sd_assemble = sr_raid6_assemble;
101 sd->sd_create = sr_raid6_create;
102 sd->sd_openings = sr_raid6_openings;
103 sd->sd_scsi_rw = sr_raid6_rw;
104 sd->sd_scsi_intr = sr_raid6_intr;
105 sd->sd_scsi_wu_done = sr_raid6_wu_done;
106 sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
107 sd->sd_set_vol_state = sr_raid6_set_vol_state;
108 }
109
110 int
sr_raid6_create(struct sr_discipline * sd,struct bioc_createraid * bc,int no_chunk,int64_t coerced_size)111 sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
112 int no_chunk, int64_t coerced_size)
113 {
114 if (no_chunk < 4) {
115 sr_error(sd->sd_sc, "%s requires four or more chunks",
116 sd->sd_name);
117 return EINVAL;
118 }
119
120 /*
121 * XXX add variable strip size later even though MAXPHYS is really
122 * the clever value, users like * to tinker with that type of stuff.
123 */
124 sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
125 sd->sd_meta->ssdi.ssd_size = (coerced_size &
126 ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
127 DEV_BSHIFT) - 1)) * (no_chunk - 2);
128
129 return sr_raid6_init(sd);
130 }
131
/*
 * Assemble an existing RAID 6 volume.  The metadata has already been
 * read by the framework, so only runtime state needs initialising;
 * bc and data are unused here.
 */
int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid6_init(sd);
}
138
139 int
sr_raid6_init(struct sr_discipline * sd)140 sr_raid6_init(struct sr_discipline *sd)
141 {
142 /* Initialise runtime values. */
143 sd->mds.mdd_raid6.sr6_strip_bits =
144 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
145 if (sd->mds.mdd_raid6.sr6_strip_bits == -1) {
146 sr_error(sd->sd_sc, "invalid strip size");
147 return EINVAL;
148 }
149
150 /* only if stripsize <= MAXPHYS */
151 sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);
152
153 return 0;
154 }
155
156 int
sr_raid6_openings(struct sr_discipline * sd)157 sr_raid6_openings(struct sr_discipline *sd)
158 {
159 return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
160 }
161
/*
 * Transition chunk c of the volume to new_state, enforcing the legal
 * chunk state machine.  Illegal transitions panic.  On a real change
 * the volume state is recomputed and a metadata flush is scheduled.
 */
void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/*
	 * Validate the transition: each case lists the states reachable
	 * from old_state; anything else falls through to die.
	 */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* an offline chunk can only come back via a rebuild */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition %d -> %d",
		    DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	/* Commit the new chunk state and rederive the volume state. */
	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the state change to the on-disk metadata. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
237
/*
 * Recompute the volume state from the per-chunk states and apply it,
 * enforcing the legal volume state machine.  RAID 6 survives up to two
 * missing chunks: fewer than nd-2 online chunks takes the volume
 * offline.  Illegal transitions panic.
 */
void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int states[SR_MAX_STATES];
	int new_state, i, s, nd;
	int old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Histogram the chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/*
	 * Derive the volume state, most-healthy first.  Note the dual
	 * parity threshold: the volume stays usable down to nd-2 online
	 * chunks.
	 */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		/* Unreachable by construction; dump state and panic. */
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the transition before committing it. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
360
361 /* modes:
362 * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
363 * 0, qbuf, NULL, 0);
364 * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
365 * 0, pbuf, NULL, 0);
366 * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
367 * 0, pbuf, qbuf, gf_pow[i]);
368 */
369
/*
 * Entry point for READ/WRITE xfers on a RAID 6 volume.
 *
 * The address space is split into strips of ssd_strip_size bytes,
 * rotated across no_chunk data chunks plus one P (xor) and one Q
 * (GF256) parity chunk per row.  Reads from healthy chunks go straight
 * to disk; reads hitting failed chunks are regenerated from P/Q using
 * the GF(256) recovery equations.  Writes are split into a read phase
 * (old data + old P + old Q, on wu_r) and a deferred write phase (new
 * data + new P + new Q, on wu), chained via swu_collider.
 *
 * Returns 0 on success, 1 on failure (caller unwinds wu).
 */
int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit *wu_r = NULL;
	struct sr_discipline *sd = wu->swu_dis;
	struct scsi_xfer *xs = wu->swu_xs;
	struct sr_chunk *scp;
	int s, fail, i, gxinv, pxinv;
	daddr_t blkno, lba;
	int64_t chunk_offs, lbaoffs, offset, strip_offs;
	int64_t strip_no, strip_size, strip_bits, row_size;
	int64_t fchunk, no_chunk, chunk, qchunk, pchunk;
	long length, datalen;
	void *pbuf, *data, *qbuf;

	/* blkno and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blkno, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	/* number of data chunks: total minus P and Q */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
	/* blocks of user data per full row (all data strips of one row) */
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs = blkno << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* writes need a second wu for the read-modify-write phase */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: can't get wu_r", DEVNAME(sd->sd_sc));
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	/* Walk the xfer strip by strip. */
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		offset = chunk_offs + strip_offs;

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		/*
		 * Q rotates backwards one slot per row; P sits just
		 * before Q (wrapping).  Data chunk indices are then
		 * shifted up to skip over the two parity slots.
		 */
		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = offset >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i=0; i< no_chunk+2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good. issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				/*
				 * NOTE(review): chunk is int64_t but the
				 * format is %llx -- matches only where
				 * int64_t is long long; confirm.
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				gxinv = gf_inv(gf_pow[chunk]);

				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* Read Dz * gz * inv(gx) */
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk || i == pchunk || i == chunk)
						continue;

					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}

				/* data will contain correct value on completion */
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				/* Coefficients for the two-equation solve. */
				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
				pxinv = gf_mul(gf_pow[fchunk], gxinv);

				/* read Q * inv(gx + gy) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* read P * gy * inv(gx + gy) */
				if (sr_raid6_addio(wu, pchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, pxinv))
					goto bad;

				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
				 * Q: sr_raid6_xorp(qbuf, --, length);
				 * P: sr_raid6_xorp(pbuf, --, length);
				 * Dz: sr_raid6_xorp(pbuf, --, length);
				 *     sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
				 */
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk || i == pchunk ||
					    i == chunk || i == fchunk)
						continue;

					/* read Dz * (gz + gy) * inv(gx + gy) */
					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}
			} else {
				/* Two cases: single disk (Dx) or (Dx+Q)
				 * Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/* Calculate: Dx = P^Dz
				 * P: sr_raid6_xorp(data, ---, length);
				 * Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    0, data, NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on completion */
			}
		} else {
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * initialize pbuf with contents of new data to be
			 * written. This will be XORed with old data and old
			 * parity in the intr routine. The result in pbuf
			 * is the new parity data.
			 */
			qbuf = sr_block_get(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_block_get(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
			    SCSI_DATA_IN, 0, pbuf, qbuf, gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
			    SCSI_DATA_IN, 0, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
			    SCSI_DATA_IN, 0, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		/* collide write request with reads */
		wu_r->swu_blk_start = wu->swu_blk_start;
		wu_r->swu_blk_end = wu->swu_blk_end;

		/* defer the write wu until the read wu completes */
		wu->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

		wu = wu_r;
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);
bad:
	/* XXX - can leak pbuf/qbuf on error. */
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
639
640 /* Handle failure I/O completion */
641 int
sr_failio(struct sr_workunit * wu)642 sr_failio(struct sr_workunit *wu)
643 {
644 struct sr_discipline *sd = wu->swu_dis;
645 struct sr_ccb *ccb;
646
647 if (!(wu->swu_flags & SR_WUF_FAIL))
648 return (0);
649
650 /* Wu is a 'fake'.. don't do real I/O just intr */
651 TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
652 TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
653 sr_raid6_intr(&ccb->ccb_buf);
654 return (1);
655 }
656
/*
 * Per-ccb I/O completion handler.  If the ccb carries a
 * sr_raid6_opaque context, fold the transferred data into the P and/or
 * Q accumulators; then release any temporary buffer and account the
 * ccb against its work unit.
 */
void
sr_raid6_intr(struct buf *bp)
{
	struct sr_ccb *ccb = (struct sr_ccb *)bp;
	struct sr_workunit *wu = ccb->ccb_wu;
	struct sr_discipline *sd = wu->swu_dis;
	struct sr_raid6_opaque *pq = ccb->ccb_opaque;
	int s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid6_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XOR data to result. */
	if (ccb->ccb_state == SR_CCB_OK && pq) {
		if (pq->pbuf)
			/* Calculate xor-parity */
			sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount);
		if (pq->qbuf)
			/* Calculate q-parity */
			sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount, pq->gn);
		free(pq, M_DEVBUF, 0);
		ccb->ccb_opaque = NULL;
	}

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
695
/*
 * Work unit completion: decide whether the wu succeeded, should be
 * retried (reads only), or has permanently failed.  Returns one of
 * SR_WU_OK / SR_WU_RESTART / SR_WU_FAILED.
 */
int
sr_raid6_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline *sd = wu->swu_dis;
	struct scsi_xfer *xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & SR_WUF_DISCIPLINE)
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 6. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		/* Reads can be retried from scratch through the rw path. */
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	/* Retry could not be queued (or this was a write): hard failure. */
	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
729
730 int
sr_raid6_addio(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data,int xsflags,int ccbflags,void * pbuf,void * qbuf,int gn)731 sr_raid6_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
732 long len, void *data, int xsflags, int ccbflags, void *pbuf,
733 void *qbuf, int gn)
734 {
735 struct sr_discipline *sd = wu->swu_dis;
736 struct sr_ccb *ccb;
737 struct sr_raid6_opaque *pqbuf;
738
739 DNPRINTF(SR_D_DIS, "sr_raid6_addio: %s %d.%lld %ld %p:%p\n",
740 (xsflags & SCSI_DATA_IN) ? "read" : "write", chunk,
741 (long long)blkno, len, pbuf, qbuf);
742
743 /* Allocate temporary buffer. */
744 if (data == NULL) {
745 data = sr_block_get(sd, len);
746 if (data == NULL)
747 return (-1);
748 ccbflags |= SR_CCBF_FREEBUF;
749 }
750
751 ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
752 if (ccb == NULL) {
753 if (ccbflags & SR_CCBF_FREEBUF)
754 sr_block_put(sd, data, len);
755 return (-1);
756 }
757 if (pbuf || qbuf) {
758 /* XXX - can leak data and ccb on failure. */
759 if (qbuf && gf_premul(gn))
760 return (-1);
761
762 /* XXX - should be preallocated? */
763 pqbuf = malloc(sizeof(struct sr_raid6_opaque),
764 M_DEVBUF, M_ZERO | M_NOWAIT);
765 if (pqbuf == NULL) {
766 sr_ccb_put(ccb);
767 return (-1);
768 }
769 pqbuf->pbuf = pbuf;
770 pqbuf->qbuf = qbuf;
771 pqbuf->gn = gn;
772 ccb->ccb_opaque = pqbuf;
773 }
774 sr_wu_enqueue_ccb(wu, ccb);
775
776 return (0);
777 }
778
779 /* Perform RAID6 parity calculation.
780 * P=xor parity, Q=GF256 parity, D=data, gn=disk# */
781 void
sr_raid6_xorp(void * p,void * d,int len)782 sr_raid6_xorp(void *p, void *d, int len)
783 {
784 uint32_t *pbuf = p, *data = d;
785
786 len >>= 2;
787 while (len--)
788 *pbuf++ ^= *data++;
789 }
790
791 void
sr_raid6_xorq(void * q,void * d,int len,int gn)792 sr_raid6_xorq(void *q, void *d, int len, int gn)
793 {
794 uint32_t *qbuf = q, *data = d, x;
795 uint8_t *gn_map = gf_map[gn];
796
797 len >>= 2;
798 while (len--) {
799 x = *data++;
800 *qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
801 ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
802 ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
803 ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
804 }
805 }
806
807 /* Create GF256 log/pow tables: polynomial = 0x11D */
808 void
gf_init(void)809 gf_init(void)
810 {
811 int i;
812 uint8_t p = 1;
813
814 /* use 2N pow table to avoid using % in multiply */
815 for (i=0; i<256; i++) {
816 gf_log[p] = i;
817 gf_pow[i] = gf_pow[i+255] = p;
818 p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
819 }
820 gf_log[0] = 512;
821 }
822
/*
 * GF(256) multiplicative inverse: inv(a) = g^(255 - log a).
 * a must be non-zero (zero has no inverse; gf_log[0] is a sentinel).
 */
uint8_t
gf_inv(uint8_t a)
{
	return gf_pow[255 - gf_log[a]];
}
828
/*
 * GF(256) multiply via log/pow tables: a*b = g^(log a + log b).
 * The doubled pow table absorbs log sums up to 510 without a mod 255;
 * a zero operand indexes via gf_log[0] == 512 into the never-written
 * (zero-initialized) tail of gf_pow, so multiplying by zero yields 0.
 */
uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	return gf_pow[gf_log[a] + gf_log[b]];
}
834
/*
 * Precalculate the 256-entry multiplication table for GF coefficient gn
 * (table persists in gf_map[gn] once built).  Returns 0 on success or
 * when the table already exists, -1 if the kernel allocation fails.
 */
int
gf_premul(uint8_t gn)
{
	int i;

	/* Already built by an earlier caller. */
	if (gf_map[gn] != NULL)
		return (0);

	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
		return (-1);

	/* map[i] = i * gn; i == 0 maps to 0 via the zeroed pow tail. */
	for (i=0; i<256; i++)
		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
	return (0);
}
851