xref: /openbsd/sys/dev/softraid_raid5.c (revision f6d8fcae)
1 /* $OpenBSD: softraid_raid5.c,v 1.32 2021/05/16 15:12:37 deraadt Exp $ */
2 /*
3  * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
4  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
5  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include "bio.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/buf.h>
25 #include <sys/device.h>
26 #include <sys/ioctl.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/mount.h>
34 #include <sys/sensors.h>
35 #include <sys/stat.h>
36 #include <sys/task.h>
37 #include <sys/pool.h>
38 #include <sys/conf.h>
39 #include <sys/uio.h>
40 
41 #include <scsi/scsi_all.h>
42 #include <scsi/scsiconf.h>
43 #include <scsi/scsi_disk.h>
44 
45 #include <dev/softraidvar.h>
46 
/* RAID 5 functions. */

/* Discipline entry points installed by sr_raid5_discipline_init(). */
int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
	    int, int64_t);
int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
	    int, void *);
int	sr_raid5_init(struct sr_discipline *);
int	sr_raid5_rw(struct sr_workunit *);
int	sr_raid5_openings(struct sr_discipline *);
void	sr_raid5_intr(struct buf *);
int	sr_raid5_wu_done(struct sr_workunit *);
void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid5_set_vol_state(struct sr_discipline *);

/* Internal I/O helpers. */
int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
	    void *, int, int, void *);
int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
	    void *);
int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
	    daddr_t, long, void *, int, int);
void	sr_raid5_xor(void *, void *, int);

/* Background operations (sr_raid5_scrub is currently compiled out). */
void	sr_raid5_rebuild(struct sr_discipline *);
void	sr_raid5_scrub(struct sr_discipline *);
70 
/* discipline initialisation. */
void
sr_raid5_discipline_init(struct sr_discipline *sd)
{
	/* Fill out discipline members. */
	sd->sd_type = SR_MD_RAID5;
	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
	/*
	 * Reserve two work units beyond the normal pool so that a rebuild
	 * or scrub can always obtain its read/write pair; see
	 * sr_raid5_openings() which subtracts this reserve again.
	 */
	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */

	/* Setup discipline specific function pointers. */
	sd->sd_assemble = sr_raid5_assemble;
	sd->sd_create = sr_raid5_create;
	sd->sd_openings = sr_raid5_openings;
	sd->sd_rebuild = sr_raid5_rebuild;
	sd->sd_scsi_rw = sr_raid5_rw;
	sd->sd_scsi_intr = sr_raid5_intr;
	sd->sd_scsi_wu_done = sr_raid5_wu_done;
	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
	sd->sd_set_vol_state = sr_raid5_set_vol_state;
}
93 
94 int
sr_raid5_create(struct sr_discipline * sd,struct bioc_createraid * bc,int no_chunk,int64_t coerced_size)95 sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
96     int no_chunk, int64_t coerced_size)
97 {
98 	if (no_chunk < 3) {
99 		sr_error(sd->sd_sc, "%s requires three or more chunks",
100 		    sd->sd_name);
101 		return EINVAL;
102 	}
103 
104 	/*
105 	 * XXX add variable strip size later even though MAXPHYS is really
106 	 * the clever value, users like to tinker with that type of stuff.
107 	 */
108 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
109 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
110 	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
111 	    DEV_BSHIFT) - 1)) * (no_chunk - 1);
112 
113 	return sr_raid5_init(sd);
114 }
115 
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	/*
	 * Assembling an existing volume needs no validation here - strip
	 * size and chunk count come from the on-disk metadata - so just
	 * derive the runtime values.  bc, no_chunk and data are unused.
	 */
	return sr_raid5_init(sd);
}
122 
123 int
sr_raid5_init(struct sr_discipline * sd)124 sr_raid5_init(struct sr_discipline *sd)
125 {
126 	/* Initialise runtime values. */
127 	sd->mds.mdd_raid5.sr5_strip_bits =
128 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
129 	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
130 		sr_error(sd->sd_sc, "invalid strip size");
131 		return EINVAL;
132 	}
133 
134 	sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
135 
136 	return 0;
137 }
138 
139 int
sr_raid5_openings(struct sr_discipline * sd)140 sr_raid5_openings(struct sr_discipline *sd)
141 {
142 	/* Two work units per I/O, two for rebuild/scrub. */
143 	return ((sd->sd_max_wu - 2) >> 1);
144 }
145 
/*
 * Move chunk c of the volume to new_state, enforcing the legal state
 * transitions (online -> offline/scrub, offline -> rebuild, scrub/rebuild
 * -> online/offline).  Any other transition panics.  On a real change the
 * volume state is recomputed and a metadata save is scheduled.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk may only come back via a rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition %d -> %d",
		    DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the new chunk state via the metadata save task. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
220 
/*
 * Recompute the volume state from the per-chunk states: all chunks
 * online -> online; more than one chunk unavailable -> offline (RAID 5
 * tolerates a single failure); any chunk scrubbing/rebuilding -> that
 * state; exactly one chunk missing -> degraded.  Panics on a volume
 * state transition that should be impossible.
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Histogram the chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Order matters: offline beats scrub/rebuild beats degraded. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the transition from the current volume state. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
345 
346 static inline int
sr_raid5_chunk_online(struct sr_discipline * sd,int chunk)347 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
348 {
349 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
350 	case BIOC_SDONLINE:
351 	case BIOC_SDSCRUB:
352 		return 1;
353 	default:
354 		return 0;
355 	}
356 }
357 
358 static inline int
sr_raid5_chunk_rebuild(struct sr_discipline * sd,int chunk)359 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
360 {
361 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
362 	case BIOC_SDREBUILD:
363 		return 1;
364 	default:
365 		return 0;
366 	}
367 }
368 
/*
 * Main read/write entry point for a RAID 5 volume.  The request is
 * walked strip by strip; each strip is mapped to its data chunk and
 * parity chunk using a left asymmetric layout.  Reads go straight to
 * the data chunk (or are regenerated from the others if it is
 * unavailable); writes go through sr_raid5_write(), which may need a
 * companion read work unit (wu_r) to fetch old data/parity first.
 * Returns 0 once the work has been scheduled, 1 on failure.
 */
int
sr_raid5_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	daddr_t			blkno, lba;
	int64_t			chunk_offs, lbaoffs, offset, strip_offs;
	int64_t			strip_bits, strip_no, strip_size;
	int64_t			chunk, no_chunk;
	int64_t			parity, row_size;
	long			length, datalen;
	void			*data;
	int			s;

	/* blkno and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
		goto bad;

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
	    (long long)blkno, xs->datalen);

	/* no_chunk counts data chunks only; one chunk's worth is parity. */
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blkno << DEV_BSHIFT;

	/* Writes need a second work unit for the preparatory reads. */
	if (xs->flags & SCSI_DATA_OUT) {
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: %s failed to get read work unit",
			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		/* Locate this fragment within its strip and row. */
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		offset = chunk_offs + strip_offs;

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/*
		 * Map disk offset to data and parity chunks, using a left
		 * asymmetric algorithm for the parity assignment.
		 */
		chunk = strip_no % no_chunk;
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
		if (chunk >= parity)
			chunk++;

		lba = offset >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
		    (row_size - 1);

		scp = sd->sd_vol.sv_chunks[chunk];
		if (xs->flags & SCSI_DATA_IN) {
			switch (scp->src_meta.scm_status) {
			case BIOC_SDONLINE:
			case BIOC_SDSCRUB:
				/*
				 * Chunk is online, issue a single read
				 * request.
				 */
				if (sr_raid5_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL))
					goto bad;
				break;
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				/* Reconstruct from the remaining chunks. */
				if (sr_raid5_regenerate(wu, chunk, lba,
				    length, data))
					goto bad;
				break;
			default:
				printf("%s: is offline, can't read\n",
				    DEVNAME(sd->sd_sc));
				goto bad;
			}
		} else {
			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
			    length, data, xs->flags, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		if (wu_r->swu_io_count > 0) {
			/* collide write request with reads */
			wu_r->swu_blk_start = wu->swu_blk_start;
			wu_r->swu_blk_end = wu->swu_blk_end;

			/* Defer the write until the reads complete. */
			wu->swu_state = SR_WU_DEFERRED;
			wu_r->swu_collider = wu;
			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

			wu = wu_r;
		} else {
			/* No reads were needed; release the spare unit. */
			sr_scsi_wu_put(sd, wu_r);
		}
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);

bad:
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
505 
506 int
sr_raid5_regenerate(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data)507 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
508     long len, void *data)
509 {
510 	struct sr_discipline	*sd = wu->swu_dis;
511 	int			i;
512 
513 	/*
514 	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
515 	 * from all of the remaining online chunks. This requires the parity
516 	 * to already be correct.
517 	 */
518 
519 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
520 	    "regenerating block %llu\n",
521 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);
522 
523 	memset(data, 0, len);
524 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
525 		if (i == chunk)
526 			continue;
527 		if (!sr_raid5_chunk_online(sd, i))
528 			goto bad;
529 		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
530 		    0, data))
531 			goto bad;
532 	}
533 	return (0);
534 
535 bad:
536 	return (1);
537 }
538 
/*
 * Queue the reads (on wu_r) and writes (on wu) needed to update one
 * data block and its parity.  Returns 0 on success, 1 on failure
 * (the caller unwinds the work units).
 */
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, long len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	void			*xorbuf;
	int			chunk_online, chunk_rebuild;
	int			parity_online, parity_rebuild;
	int			other_offline = 0, other_rebuild = 0;
	int			i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	/* Classify the target data chunk, the parity chunk and the rest. */
	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): parity unavailable - just write the data. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* xorbuf starts as the new data and accumulates the XOR result. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Case (4): read-modify-write using existing parity. */
		/* Read in existing data and existing parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Cases (1) and (3): reconstruct parity from all data. */
		/* Read in existing data from all other chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity; xorbuf is freed when the ccb completes. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data, unless the data chunk itself is unavailable. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	return (1);
}
658 
/*
 * Per-CCB I/O completion handler.  Runs the deferred XOR for reads that
 * feed a regeneration/parity buffer, releases scratch buffers, and
 * hands the CCB back to the work unit accounting.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	/* ccb_opaque was set by sr_raid5_addio() to the XOR target. */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
689 
/*
 * Decide the fate of a completed work unit: internal (discipline/rebuild)
 * work reports no status to SCSI; failed reads are retried once via a
 * fresh pass through sd_scsi_rw (which can regenerate from other chunks);
 * failed writes are reported as permanent errors.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		/* Drop the failed CCBs and resubmit the whole request. */
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
724 
725 int
sr_raid5_addio(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data,int xsflags,int ccbflags,void * xorbuf)726 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
727     long len, void *data, int xsflags, int ccbflags, void *xorbuf)
728 {
729 	struct sr_discipline	*sd = wu->swu_dis;
730 	struct sr_ccb		*ccb;
731 
732 	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
733 	    "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
734 	    chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");
735 
736 	/* Allocate temporary buffer. */
737 	if (data == NULL) {
738 		data = sr_block_get(sd, len);
739 		if (data == NULL)
740 			return (-1);
741 		ccbflags |= SR_CCBF_FREEBUF;
742 	}
743 
744 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
745 	if (ccb == NULL) {
746 		if (ccbflags & SR_CCBF_FREEBUF)
747 			sr_block_put(sd, data, len);
748 		return (-1);
749 	}
750 	ccb->ccb_opaque = xorbuf;
751 	sr_wu_enqueue_ccb(wu, ccb);
752 
753 	return (0);
754 }
755 
/*
 * XOR len bytes of b into a, 32 bits at a time.  len is expected to be
 * a multiple of four bytes; any trailing remainder is ignored.
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t	*dst = a;
	uint32_t	*src = b;
	int		 words;

	for (words = len >> 2; words > 0; words--)
		*dst++ ^= *src++;
}
765 
/*
 * Rebuild the chunk marked BIOC_SDREBUILD, strip by strip: regenerate
 * each strip from the other chunks into a buffer (read work unit), then
 * write it to the rebuild chunk (write work unit collided behind the
 * reads).  Progress is checkpointed in ssd_rebuild so an interrupted
 * rebuild can resume.
 */
void
sr_raid5_rebuild(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, strip_bits, i, restart;
	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept, percent = 0, old_percent = -1;
	int rebuild_chunk = -1;
	void *xorbuf;

	/* Find the rebuild chunk. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sr_raid5_chunk_rebuild(sd, i)) {
			rebuild_chunk = i;
			break;
		}
	}
	if (rebuild_chunk == -1)
		goto bad;

	/* chunk_count is the number of data chunks (total minus parity). */
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;

	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
	    row_size);

	/* Resume from the checkpoint stored in the metadata, if sane. */
	restart = sd->sd_meta->ssd_rebuild / row_size;
	if (restart > chunk_strips) {
		printf("%s: bogus rebuild restart offset, starting from 0\n",
		    DEVNAME(sd->sd_sc));
		restart = 0;
	}
	if (restart != 0) {
		percent = sr_rebuild_percent(sd);
		printf("%s: resuming rebuild on %s at %d%%\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
	}

	for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);

		wu_w = sr_scsi_wu_get(sd, 0);
		wu_r = sr_scsi_wu_get(sd, 0);

		/*
		 * NOTE(review): on these failure paths wu_w/wu_r are not
		 * returned via sr_scsi_wu_put - looks like a work unit
		 * leak on rebuild abort; confirm against sr_scsi_wu_get()
		 * semantics before changing.
		 */
		xorbuf = sr_block_get(sd, strip_size);
		if (xorbuf == NULL)
			goto bad;
		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
		    strip_size, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
			goto bad;

		/* Collide write work unit with read work unit. */
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_REBUILD;
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		/* Block I/O to this strip while we rebuild it. */
		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
		wu_w->swu_blk_start = wu_r->swu_blk_start;
		wu_w->swu_blk_end = wu_r->swu_blk_end;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    wu_r->swu_blk_start, wu_r->swu_blk_end);

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		sr_schedule_wu(wu_r);

		/* Wait for the collided write to signal completion. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP);
			slept = 1;
		}
		/* Yield briefly so regular I/O is not starved. */
		if (!slept) {
			tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
			    MSEC_TO_NSEC(1));
		}

		sr_scsi_wu_put(sd, wu_r);
		sr_scsi_wu_put(sd, wu_w);

		/* Checkpoint progress in volume blocks. */
		sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;

		/* Save metadata whenever the displayed percentage moves. */
		percent = sr_rebuild_percent(sd);
		if (percent != old_percent && strip_no != chunk_strips - 1) {
			if (sr_meta_save(sd, SR_META_DIRTY))
				printf("%s: could not save metadata to %s\n",
				    DEVNAME(sd->sd_sc),
				    sd->sd_meta->ssd_devname);
			old_percent = percent;
		}

		if (sd->sd_reb_abort)
			goto abort;
	}

	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
	    sd->sd_meta->ssd_devname);

	/* all done */
	sd->sd_meta->ssd_rebuild = 0;
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
		    BIOC_SDREBUILD) {
			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
			break;
		}
	}

	return;

abort:
	if (sr_meta_save(sd, SR_META_DIRTY))
		printf("%s: could not save metadata to %s\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
bad:
	return;
}
905 
#if 0
/*
 * Scrub (rewrite) the parity of every strip by reading all data chunks,
 * xoring them together and writing the result to the parity chunk.
 * Currently compiled out; the 0xBADCAFE block numbers mark the LBA
 * computation as unimplemented.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	/* no_chunk counts data chunks only. */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		/* Same left asymmetric parity placement as sr_raid5_rw(). */
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		/* XOR all data chunks of this strip into xorbuf. */
		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		/* Write the freshly computed parity. */
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		/* Wait for the parity write to complete. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep_nsec(wu_w, PRIBIO, "sr_scrub", INFSLP);
			slept = 1;
		}
		/* Yield briefly so regular I/O is not starved. */
		if (!slept) {
			tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
			    MSEC_TO_NSEC(1));
		}
	}
}
#endif
962