/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

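/*
 * Roughly, with the defaults above (a sketch of the pacing, not a guarantee):
 * each rebuild transaction copies one 1 MiB slab (RAID1E_REBUILD_SLAB);
 * metadata is rewritten every 1024 slabs, i.e. about every 1 GiB of progress;
 * an idle volume processes bursts of 100 slabs (rebuild_cluster_idle), while
 * a busy one squeezes in roughly one rebuild transaction per 20 regular I/Os
 * (rebuild_fair_io).  For odd disk counts a transaction is further limited to
 * one strip and the counters are scaled to compensate.
 */
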
static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}
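
/*
 * Worked example for V2P() (illustration only; assumes a hypothetical
 * 3-disk RAID1E volume with N = 2 copies and a 64 KiB strip):
 * virt = 200 KiB -> nstrip = 3, start = 8 KiB,
 * disk = (3 * 2) % 3 = 0, offset = (6 / 3) * 64 KiB = 128 KiB.
 * The second copy of the same virtual strip is physical strip 7,
 * i.e. disk 1 at offset (7 / 3) * 64 KiB = 128 KiB.
 */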

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}
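
/*
 * P2V() above is the inverse mapping.  Continuing the hypothetical example
 * (3 disks, N = 2, 64 KiB strip): disk = 2, offset = 72 KiB -> start = 8 KiB,
 * physical strip (72 KiB / 64 KiB) * 3 + 2 = 5, copy = 5 % 2 = 1,
 * virt = (5 / 2) * 64 KiB + 8 KiB = 136 KiB.
 */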

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

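/*
 * Volume state is derived differently depending on the disk count.  With an
 * even number of disks every virtual strip maps onto a fixed group of N
 * neighboring disks, so the _even variant below walks the subdisks in fixed
 * groups.  With an odd count the copies rotate across all disks, so the _odd
 * variant has to examine every window of N neighboring subdisks instead.
 */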
static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

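/*
 * One rebuild transaction: map the failed subdisk's rebuild position back to
 * the virtual address space, pick the most up-to-date copy of that range with
 * g_raid_tr_raid1e_select_read_disk(), and read it into trso_buffer.  Ranges
 * for which the failed disk already holds the best copy are simply skipped.
 * The affected virtual range is locked for the duration; the matching rebuild
 * write (and the unlock) is issued from g_raid_tr_iodone_raid1e() once the
 * read completes.
 */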
static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing a stripe boundary, correct the affected virtual
	 * range we need to lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
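/*
 * A rough note on the scoring below: lower score wins.  It starts from the
 * subdisk's current load, error-recovery activity and (for later copies)
 * stale/resync state add penalties in the high bits, and a bonus is applied
 * when the disk head is already at or near the requested offset.  Copies
 * listed in 'mask' are skipped entirely.
 */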
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Cut the new or currently running rebuild round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
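	/*
	 * Regular (non-SYNC) requests carry a bitmask of already tried copies
	 * in bio_caller2; bit 31 flags an in-progress recovery, i.e. data read
	 * from another copy still has to be written back to the failing disk
	 * and the locked range released.
	 */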
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1U << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1U << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1U << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");